Oscar Forner
$(whoami) Projects Resume

ARM C Compiler (ACC) I


Why do you want to create your own compiler? To answer this question I have to give you some background. For Christmas I got a BeagleBone Black, perfect to learn ARM assembly. After a few weeks of doing the usual stuff I decided I wanted a bigger project to improve my knowledge. However, the idea to start the development of a compiler comes from http://www.sigbus.info/how-i-wrote-a-self-hosting-c-compiler-in-40-days.html.

The only aim of this project is to improve my knowledge of the ARM assembly, the C language and Compilers. The project can be found in GitHub.

What is the target of the project?

My target is to have a self-hosting compiler of C. For this reason I want to keep it simple. Moreover, I will only develop the features I need to accomplish the target of the project. As you can imagine the choosen language is C (ANSI C to be specific).

I decided not to use Flex, Bison, Lex or Yacc because I want to work in all stages of the compiler. Yes, it sounds crazy and maybe I will regret it in the future, but at least I will try it.

What is the current state of the project?

Currently, I have implemented a very basic compiler, this version is available in GitHub as v0.1. It just generates code for a file containing the main function without parameters and the body only contains a return statement of a positive integer.

Structure of the project

The project contains three folders:

  • inc: holds the external libraries needed. Right now, it has the files of Unity for unit testing.
  • unittest: has the unit tests for the project. It has just one, but once I find a good framework to do mocks in C, it will have more unit tests.
  • src: contains the actual source code of the project.

This compiler is made of a Lexer that creates Tokens. These Tokens are used by the Grammar to create AST nodes. Finally, the AST nodes will be used by the Generator to generate the assembly.

Lexer and Tokens

The available Tokens are: int_value, int_type, function, open_parenthesis, close_parenthesis, open_bracket, close_bracket, return, semicolon and end_of_file. These can be found in the file tokens.h:

#ifndef TOKEN_H
#define TOKEN_H

#include <stdlib.h>

enum token_type {

 * Tokens for the parser
typedef struct token_base
	enum token_type type;
	void * token_pointer;
} token_base;

typedef struct token_int_type
	token_base base;
} token_int_type;

typedef struct token_int_value
	token_base base;
	int value;
} token_int_value;

typedef struct token_function
	token_base base;
	char * name;
} token_function;

typedef struct token_opar
	token_base base;
} token_opar;

typedef struct token_cpar
	token_base base;
} token_cpar;

typedef struct token_obra
	token_base base;
} token_obra;

typedef struct token_cbra
	token_base base;
} token_cbra;

typedef struct token_return
	token_base base;
} token_return;

typedef struct token_semicolon
	token_base base;
} token_semicolon;

typedef struct token_eof
	token_base base;
} token_eof;

 * Init functions for the tokens
void init_token_int_type(token_int_type * token);
void init_token_int_value(token_int_value * token, int  value);
void init_token_function(token_function * token, char * name);
void init_token_opar(token_opar * token);
void init_token_cpar(token_cpar * token);
void init_token_obra(token_obra * token);
void init_token_cbra(token_cbra * token);
void init_token_return(token_return * token);
void init_token_semicolon(token_semicolon * token);
void init_token_eof(token_eof * token);

 * Release functions for the tokens
void free_token_int_type(token_int_type * token);
void free_token_int_value(token_int_value * token);
void free_token_function(token_function * token);
void free_token_opar(token_opar * token);
void free_token_cpar(token_cpar * token);
void free_token_obra(token_obra * token);
void free_token_cbra(token_cbra * token);
void free_token_return(token_return * token);
void free_token_semicolon(token_semicolon * token);
void free_token_eof(token_eof * token);
#endif //TOKEN_H

These Tokens are returned by the method next from the Lexer in the file lexer.h:

#ifndef LEXER_H
#define LEXER_H

#include <stdio.h>
#include "token.h"

typedef struct lexer
	FILE * f;
} lexer;

void init_lexer(lexer * l, const char * filename);
void destroy_lexer(lexer * l);
struct token_base * next(lexer * l);

#endif //LEXER_H

Grammar and AST nodes

The available AST nodes are: id, int, function and return. These can be found in the file ast.h:

#ifndef AST_H
#define AST_H

#include <stdlib.h>

enum ast_type {

 * AST nodes
typedef struct ast_base
	enum ast_type type;
	void * ast_pointer;
} ast_base;

typedef struct node_id
	ast_base base;
	char * value;
} node_id;

typedef struct node_int
	ast_base base;
	int value;
} node_int;

typedef struct node_function
	ast_base base;
	char * name;
	ast_base * entry_point;
} node_function;

typedef struct node_return
	ast_base base;
	ast_base * value;
} node_return;

 * Init functions for the AST nodes
void init_node_id(node_id * node, char * value);
void init_node_int(node_int * node, int value);
void init_node_function(node_function * node, char * name, ast_base * entry_point);
void init_node_return(node_return * node, ast_base * value);

 * Release functions for the AST nodes
void free_node_id(node_id * node);
void free_node_int(node_int * node);
void free_node_function(node_function * node);
void free_node_return(node_return * node);
#endif //AST_H

The AST is returned by the method build_ast from the Grammar in the file grammar.h:

#ifndef GRAMMAR_H
#define GRAMMAR_H

#include <stdio.h>
#include "lexer.h"
#include "ast.h"

typedef struct grammar
	lexer * l;
} grammar;

void init_grammar(grammar * g, lexer * l);
ast_base * build_ast(grammar * g);
void destroy_grammar(grammar * g);

 * Read functions to build AST parts
ast_base * read_function_ast_node(grammar * g);
ast_base * read_function_body(grammar * g);
#endif //GRAMMAR_H

Generation of assembly

The Generator has a Grammar that will be used to generate the assembly with the method generate_code:


#include <stdio.h>
#include "grammar.h"

typedef struct generator
	grammar * g;
	FILE * f;
} generator;

// API
void init_generator(generator * gen, grammar * gra, const char * out);
void destroy_generator(generator * g);
void generate_code(generator * g);

// Internals
void __generate_code(generator * g, ast_base * ast);
void __generate_code_for_main(generator * g, ast_base * ast);
void __generate_code_for_function(generator * g, node_function * ast);
void __generate_code_for_return(generator * g, node_return * ast);
void __generate_code_for_int(generator * g, node_int * ast);

#endif //GENERATOR_H

Example of the current functionality

The code to be compiled into ARM assembly is:

int main()
	return 2;

Compile the example with our compiler (ACC):

./bin/acc example.c -o example.s

The assembly generated is:

	.global main
	mov r0, #2
	bx lr

Use GCC to translate that assembly into a executable binary:

gcc example.s -o example

Execute and check the result:

$ ./example
$ echo $?


As you can see this project is going to take a long time to be completed. My plan is to work adding small features at a time. The next step is to add conditionals (if and else). Afterwards, I will add operators such as <, <=, >, >=, == and !=. Everytime I achieve a new goal in this project I will create a post like this one to show the feature and how it has been implemented.