Compiler/code generator
A code generator translates the output of the syntax analyzer and/or semantic analyzer into lower level code, either assembly, object, or virtual.
Take the output of the Syntax analyzer task, and convert it to virtual machine code, that can be run by the Virtual machine interpreter.
The program should read input from a file and/or stdin, and write output to a file and/or stdout.
Input format: See Syntax analyzer output
Given the following program:
count = 1; while (count < 10) { print("count is: ", count, "\n"); count = count + 1; }
The output from the [[Compiler/syntax_analyzer|]syntax analyzer] is a parse tree:
Sequence Sequence ; Assign Identifier count Integer 1 While Less Identifier count Integer 10 Sequence Sequence ; Sequence Sequence Sequence ; Prts String "count is: " ; Prti Identifier count ; Prts String "\n" ; Assign Identifier count Add Identifier count Integer 1
- , Identifier, Integer, and String, are terminal nodes.
Loading this data into an internal parse tree should be as simple as:
<lang python> def load_ast()
line = readline() # Each line has at least one token line_list = tokenize the line, respecting double quotes
text = line_list[0] # first token is always the node type
if text == ";" return None
node_type = text # could convert to internal form if desired
# A line with two tokens is a leaf node # Leaf nodes are: Identifier, Integer String # The 2nd token is the value if len(line_list) > 1 return make_leaf(node_type, line_list[1])
left = load_ast() right = load_ast() return make_node(node_type, left, right)
</lang>
Example output of the code generator:
Datasize: 1 Strings: 2 "count is: " "\n" 0 push 1 5 store [0] 10 fetch [0] 15 push 10 20 lt 21 jz (43) 65 26 push 0 31 prts 32 fetch [0] 37 prti 38 push 1 43 prts 44 fetch [0] 49 push 1 54 add 55 store [0] 60 jmp (-51) 10 65 halt
This is a byte-code, 32-bit word stack based virtual machine.
Registers:
sp:
the stack pointer - points to the next top of stack. The stack is a 32-bit integer array.
pc:
the program counter - points to the current instruction to be performed. The code is an array of bytes.
Data:
data string pool
Instructions:
Each instruction is one byte. The following instructions also have a 32-bit integer operand:
fetch [index]
where index is an index into the data array.
store [index]
where index is an index into the data array.
push n
where value is a 32-bit integer that will be pushed onto the stack.
jmp (n) addr
where (n) is a 32-bit integer specifying the distance between the current location and the desired location. addr is an unsigned value of the actual code address.
jz (n) addr
where (n) is a 32-bit integer specifying the distance between the current location and the desired location. addr is an unsigned value of the actual code address.
The following instructions do not have an operand. They perform their operation directly against the stack:
For the following instructions, the operation is performed against the top two entries in the stack:
add sub mul div lt gt le ne and
For the following instructions, the operation is performed against the top entry in the stack:
neg
prtc
Print the word at stack top as a character.
prti
Print the word at stack top as an integer.
prts
Stack top points to an index into the string pool. Print that entry.
halt
Unconditional stop.
C
Tested with gcc 4.81 and later, compiles warning free with -Wall -Wextra <lang C>#include <stdlib.h>
- include <stdio.h>
- include <string.h>
- include <stdarg.h>
- include <stdint.h>
- include <ctype.h>
typedef unsigned char uchar;
typedef enum {
nd_Ident, nd_String, nd_Integer, nd_Sequence, nd_If, nd_Prtc, nd_Prts, nd_Prti, nd_While, nd_Assign, nd_Negate, nd_Not, nd_Mul, nd_Div, nd_Mod, nd_Add, nd_Sub, nd_Lss, nd_Leq, nd_Gtr, nd_Geq, nd_Eql, nd_Neq, nd_And, nd_Or
} NodeType;
typedef enum { FETCH, STORE, PUSH, ADD, SUB, MUL, DIV, MOD, LT, GT, LE, GE, EQ, NE, AND,
OR, NEG, NOT, JMP, JZ, PRTC, PRTS, PRTI, HALT
} Code_t;
typedef uchar code;
typedef struct Tree {
NodeType node_type; struct Tree *left; struct Tree *right; char *value;
} Tree;
- define da_dim(name, type) type *name = NULL; \
int _qy_ ## name ## _p = 0; \ int _qy_ ## name ## _max = 0
- define da_redim(name) do {if (_qy_ ## name ## _p >= _qy_ ## name ## _max) \
name = realloc(name, (_qy_ ## name ## _max += 32) * sizeof(name[0]));} while (0)
- define da_rewind(name) _qy_ ## name ## _p = 0
- define da_append(name, x) do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)
- define da_len(name) _qy_ ## name ## _p
- define da_add(name) do {da_redim(name); _qy_ ## name ## _p++;} while (0)
FILE *source_fp, *dest_fp; static int here; da_dim(object, code); da_dim(globals, const char *); da_dim(string_pool, const char *);
// dependency: Ordered by NodeType, must remain in same order as NodeType enum struct {
char *enum_text; NodeType node_type; Code_t opcode;
} atr[] = {
{"Identifier" , nd_Ident, -1 }, {"String" , nd_String, -1 }, {"Integer" , nd_Integer, -1 }, {"Sequence" , nd_Sequence, -1 }, {"If" , nd_If, -1 }, {"Prtc" , nd_Prtc, -1 }, {"Prts" , nd_Prts, -1 }, {"Prti" , nd_Prti, -1 }, {"While" , nd_While, -1 }, {"Assign" , nd_Assign, -1 }, {"Negate" , nd_Negate, NEG}, {"Not" , nd_Not, NOT}, {"Multiply" , nd_Mul, MUL}, {"Divide" , nd_Div, DIV}, {"Mod" , nd_Mod, MOD}, {"Add" , nd_Add, ADD}, {"Subtract" , nd_Sub, SUB}, {"Less" , nd_Lss, LT }, {"LessEqual" , nd_Leq, LE }, {"Greater" , nd_Gtr, GT }, {"GreaterEqual", nd_Geq, GE }, {"Equal" , nd_Eql, EQ }, {"NotEqual" , nd_Neq, NE }, {"And" , nd_And, AND}, {"Or" , nd_Or, OR },
};
void error(const char *fmt, ... ) {
va_list ap; char buf[1000];
va_start(ap, fmt); vsprintf(buf, fmt, ap); va_end(ap); printf("error: %s\n", buf); exit(1);
}
Code_t type_to_op(NodeType type) {
return atr[type].opcode;
}
Tree *make_node(NodeType node_type, Tree *left, Tree *right) {
Tree *t = calloc(sizeof(Tree), 1); t->node_type = node_type; t->left = left; t->right = right; return t;
}
Tree *make_leaf(NodeType node_type, char *value) {
Tree *t = calloc(sizeof(Tree), 1); t->node_type = node_type; t->value = strdup(value); return t;
}
/*** Code generator ***/
void emit_byte(int c) {
da_append(object, (uchar)c); ++here;
}
void emit_int(int32_t n) {
union { int32_t n; unsigned char c[sizeof(int32_t)]; } x;
x.n = n;
for (size_t i = 0; i < sizeof(x.n); ++i) { emit_byte(x.c[i]); }
}
int hole() {
int t = here; emit_int(0); return t;
}
void fix(int src, int dst) {
*(int32_t *)(object + src) = dst-src;
}
int fetch_var_offset(const char *id) {
for (int i = 0; i < da_len(globals); ++i) { if (strcmp(id, globals[i]) == 0) return i; } da_add(globals); int n = da_len(globals) - 1; globals[n] = strdup(id); return n;
}
int fetch_string_offset(const char *st) {
for (int i = 0; i < da_len(string_pool); ++i) { if (strcmp(st, string_pool[i]) == 0) return i; } da_add(string_pool); int n = da_len(string_pool) - 1; string_pool[n] = strdup(st); return n;
}
void code_gen(Tree *x) {
int p1, p2, n;
if (x == NULL) return; switch (x->node_type) { case nd_Ident: emit_byte(FETCH); n = fetch_var_offset(x->value); emit_int(n); break; case nd_Integer: emit_byte(PUSH); emit_int(atoi(x->value)); break; case nd_String: emit_byte(PUSH); n = fetch_string_offset(x->value); emit_int(n); break; case nd_Assign: n = fetch_var_offset(x->left->value); code_gen(x->right); emit_byte(STORE); emit_int(n); break; case nd_If: code_gen(x->left); // if expr emit_byte(JZ); // if false, jump p1 = hole(); // make room for jump dest code_gen(x->right->left); // if true statements if (x->right->right != NULL) { emit_byte(JMP); p2 = hole(); } fix(p1, here); if (x->right->right != NULL) { code_gen(x->right->right); fix(p2, here); } break; case nd_While: p1 = here; code_gen(x->left); // while expr emit_byte(JZ); // if false, jump p2 = hole(); // make room for jump dest code_gen(x->right); // statements emit_byte(JMP); // back to the top fix(hole(), p1); // plug the top fix(p2, here); // plug the 'if false, jump' break; case nd_Sequence: code_gen(x->left); code_gen(x->right); break; case nd_Prtc: code_gen(x->left); emit_byte(PRTC); break; case nd_Prti: code_gen(x->left); emit_byte(PRTI); break; case nd_Prts: code_gen(x->left); emit_byte(PRTS); break; case nd_Lss: case nd_Gtr: case nd_Leq: case nd_Geq: case nd_Eql: case nd_Neq: case nd_And: case nd_Or: case nd_Sub: case nd_Add: case nd_Div: case nd_Mul: case nd_Mod: code_gen(x->left); code_gen(x->right); emit_byte(type_to_op(x->node_type)); break; case nd_Negate: case nd_Not: code_gen(x->left); emit_byte(type_to_op(x->node_type)); break; default: error("error in code generator - found %d, expecting operator\n", x->node_type); }
}
void code_finish() {
emit_byte(HALT);
}
void list_code() {
fprintf(dest_fp, "Datasize: %d Strings: %d\n", da_len(globals), da_len(string_pool)); for (int i = 0; i < da_len(string_pool); ++i) fprintf(dest_fp, "%s\n", string_pool[i]);
code *pc = object;
again: fprintf(dest_fp, "%5d ", (int)(pc - object)); switch (*pc++) { case FETCH: fprintf(dest_fp, "fetch [%d]\n", *(int32_t *)pc); pc += sizeof(int32_t); goto again; case STORE: fprintf(dest_fp, "store [%d]\n", *(int32_t *)pc); pc += sizeof(int32_t); goto again; case PUSH : fprintf(dest_fp, "push %d\n", *(int32_t *)pc); pc += sizeof(int32_t); goto again; case ADD : fprintf(dest_fp, "add\n"); goto again; case SUB : fprintf(dest_fp, "sub\n"); goto again; case MUL : fprintf(dest_fp, "mul\n"); goto again; case DIV : fprintf(dest_fp, "div\n"); goto again; case MOD : fprintf(dest_fp, "mod\n"); goto again; case LT : fprintf(dest_fp, "lt\n"); goto again; case GT : fprintf(dest_fp, "gt\n"); goto again; case LE : fprintf(dest_fp, "le\n"); goto again; case GE : fprintf(dest_fp, "ge\n"); goto again; case EQ : fprintf(dest_fp, "eq\n"); goto again; case NE : fprintf(dest_fp, "ne\n"); goto again; case AND : fprintf(dest_fp, "and\n"); goto again; case OR : fprintf(dest_fp, "or\n"); goto again; case NOT : fprintf(dest_fp, "not\n"); goto again; case NEG : fprintf(dest_fp, "neg\n"); goto again; case JMP : fprintf(dest_fp, "jmp (%d) %d\n", *(int32_t *)pc, (int32_t)(pc + *(int32_t *)pc - object)); pc += sizeof(int32_t); goto again; case JZ : fprintf(dest_fp, "jz (%d) %d\n", *(int32_t *)pc, (int32_t)(pc + *(int32_t *)pc - object)); pc += sizeof(int32_t); goto again; case PRTC : fprintf(dest_fp, "prtc\n"); goto again; case PRTI : fprintf(dest_fp, "prti\n"); goto again; case PRTS : fprintf(dest_fp, "prts\n"); goto again; case HALT : fprintf(dest_fp, "halt\n"); break; default:error("listcode:Unknown opcode %d\n", *(pc - 1)); }
}
void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {
if (fn[0] == '\0') *fp = std; else if ((*fp = fopen(fn, mode)) == NULL) error(0, 0, "Can't open %s\n", fn);
}
NodeType get_enum_value(const char name[]) {
for (size_t i = 0; i < sizeof(atr) / sizeof(atr[0]); i++) { if (strcmp(atr[i].enum_text, name) == 0) { return atr[i].node_type; } } error("Unknown token %s\n", name); return -1;
}
char *read_line(int *len) {
static char *text = NULL; static int textmax = 0;
for (*len = 0; ; (*len)++) { int ch = fgetc(source_fp); if (ch == EOF || ch == '\n') { if (*len == 0) return NULL; break; } if (*len + 1 >= textmax) { textmax = (textmax == 0 ? 128 : textmax * 2); text = realloc(text, textmax); } text[*len] = ch; } text[*len] = '\0'; return text;
}
char *rtrim(char *text, int *len) { // remove trailing spaces
for (; *len > 0 && isspace(text[*len - 1]); --(*len)) ;
text[*len] = '\0'; return text;
}
Tree *load_ast() {
int len; char *yytext = read_line(&len); yytext = rtrim(yytext, &len);
// get first token char *tok = strtok(yytext, " ");
if (tok[0] == ';') { return NULL; } NodeType node_type = get_enum_value(tok);
// if there is extra data, get it char *p = tok + strlen(tok); if (p != &yytext[len]) { for (++p; isspace(*p); ++p) ; return make_leaf(node_type, p); }
Tree *left = load_ast(); Tree *right = load_ast(); return make_node(node_type, left, right);
}
int main(int argc, char *argv[]) {
init_io(&source_fp, stdin, "r", argc > 1 ? argv[1] : ""); init_io(&dest_fp, stdout, "wb", argc > 2 ? argv[2] : "");
code_gen(load_ast()); code_finish(); list_code();
return 0;
}</lang>
Python
Tested with Python 2.7 and 3.x <lang Python>from __future__ import print_function import sys, struct, shlex, operator
nd_Ident, nd_String, nd_Integer, nd_Sequence, nd_If, nd_Prtc, nd_Prts, nd_Prti, nd_While, \ nd_Assign, nd_Negate, nd_Not, nd_Mul, nd_Div, nd_Mod, nd_Add, nd_Sub, nd_Lss, nd_Leq, \ nd_Gtr, nd_Geq, nd_Eql, nd_Neq, nd_And, nd_Or = range(25)
all_syms = {
"Identifier" : nd_Ident, "String" : nd_String, "Integer" : nd_Integer, "Sequence" : nd_Sequence, "If" : nd_If, "Prtc" : nd_Prtc, "Prts" : nd_Prts, "Prti" : nd_Prti, "While" : nd_While, "Assign" : nd_Assign, "Negate" : nd_Negate, "Not" : nd_Not, "Multiply" : nd_Mul, "Divide" : nd_Div, "Mod" : nd_Mod, "Add" : nd_Add, "Subtract" : nd_Sub, "Less" : nd_Lss, "LessEqual" : nd_Leq, "Greater" : nd_Gtr, "GreaterEqual": nd_Geq, "Equal" : nd_Eql, "NotEqual" : nd_Neq, "And" : nd_And, "Or" : nd_Or}
FETCH, STORE, PUSH, ADD, SUB, MUL, DIV, MOD, LT, GT, LE, GE, EQ, NE, AND, OR, NEG, NOT, \ JMP, JZ, PRTC, PRTS, PRTI, HALT = range(24)
operators = {nd_Lss: LT, nd_Gtr: GT, nd_Leq: LE, nd_Geq: GE, nd_Eql: EQ, nd_Neq: NE,
nd_And: AND, nd_Or: OR, nd_Sub: SUB, nd_Add: ADD, nd_Div: DIV, nd_Mul: MUL, nd_Mod: MOD}
unary_operators = {nd_Negate: NEG, nd_Not: NOT}
input_file = None code = bytearray() string_pool = {} globals = {} string_n = 0 globals_n = 0 word_size = 4
- show error and exit
def error(msg):
print("%s" % (msg)) exit(1)
def int_to_bytes(val):
return struct.pack("<i", val)
def bytes_to_int(bstr):
return struct.unpack("<i", bstr)
class Node:
def __init__(self, node_type, left = None, right = None, value = None): self.node_type = node_type self.left = left self.right = right self.value = value
def make_node(oper, left, right = None):
return Node(oper, left, right)
def make_leaf(oper, n):
return Node(oper, value = n)
def emit_byte(x):
code.append(x)
def emit_word(x):
s = int_to_bytes(x) for x in s: code.append(x)
def fetch_var_offset(value):
global globals_n
n = globals.get(value, None) if n == None: globals[value] = globals_n n = globals_n globals_n += 1 return n
def fetch_string_offset(value):
global string_n
n = string_pool.get(value, None) if n == None: string_pool[value] = string_n n = string_n string_n += 1 return n
def code_gen(x):
global string_n
if x == None: return elif x.node_type == nd_Ident: emit_byte(FETCH) n = fetch_var_offset(x.value) emit_word(n) elif x.node_type == nd_Integer: emit_byte(PUSH) emit_word(x.value) elif x.node_type == nd_String: emit_byte(PUSH) n = fetch_string_offset(x.value) emit_word(n) elif x.node_type == nd_Assign: n = fetch_var_offset(x.left.value) code_gen(x.right) emit_byte(STORE) emit_word(n) elif x.node_type == nd_If: code_gen(x.left) # expr emit_byte(JZ) # if false, jump p1 = len(code) emit_word(0) # make room for jump dest code_gen(x.right.left) # if true statements if (x.right.right != None): emit_byte(JMP) # jump over else statements p2 = len(code) emit_word(0) code[p1:p1+word_size] = int_to_bytes(len(code) - p1) if (x.right.right != None): code_gen(x.right.right) # else statements code[p2:p2+word_size] = int_to_bytes(len(code) - p2) elif x.node_type == nd_While: p1 = len(code) code_gen(x.left) emit_byte(JZ) p2 = len(code) emit_word(0) code_gen(x.right) emit_byte(JMP) # jump back to the top emit_word(p1 - len(code)) code[p2:p2+word_size] = int_to_bytes(len(code) - p2) elif x.node_type == nd_Sequence: code_gen(x.left) code_gen(x.right) elif x.node_type == nd_Prtc: code_gen(x.left) emit_byte(PRTC) elif x.node_type == nd_Prti: code_gen(x.left) emit_byte(PRTI) elif x.node_type == nd_Prts: code_gen(x.left) emit_byte(PRTS) elif x.node_type in operators: code_gen(x.left) code_gen(x.right) emit_byte(operators[x.node_type]) elif x.node_type in unary_operators: code_gen(x.left) emit_byte(unary_operators[x.node_type]) else: error("error in code generator - found %d, expecting operator" % (x.node_type))
def code_finish():
emit_byte(HALT)
def list_code():
print("Datasize: %d Strings: %d" % (len(globals), len(string_pool)))
for k in sorted(string_pool, key=string_pool.get): print('"' + k + '"')
pc = 0 while pc < len(code): print("%4d " % (pc), end=) op = code[pc] pc += 1 if op == FETCH: x = bytes_to_int(code[pc:pc+word_size])[0] print("fetch [%d]" % (x)); pc += word_size elif op == STORE: x = bytes_to_int(code[pc:pc+word_size])[0] print("store [%d]" % (x)); pc += word_size elif op == PUSH: x = bytes_to_int(code[pc:pc+word_size])[0] print("push %d" % (x)); pc += word_size elif op == ADD: print("add") elif op == SUB: print("sub") elif op == MUL: print("mul") elif op == DIV: print("div") elif op == MOD: print("mod") elif op == LT: print("lt") elif op == GT: print("gt") elif op == LE: print("le") elif op == GE: print("ge") elif op == EQ: print("eq") elif op == NE: print("ne") elif op == AND: print("and") elif op == OR: print("or") elif op == NEG: print("neg") elif op == NOT: print("not") elif op == JMP: x = bytes_to_int(code[pc:pc+word_size])[0] print("jmp (%d) %d" % (x, pc + x)); pc += word_size elif op == JZ: x = bytes_to_int(code[pc:pc+word_size])[0] print("jz (%d) %d" % (x, pc + x)); pc += word_size elif op == PRTC: print("prtc") elif op == PRTI: print("prti") elif op == PRTS: print("prts") elif op == HALT: print("halt") else: error("list_code: Unknown opcode %d", (op));
def load_ast():
line = input_file.readline() line_list = shlex.split(line)
text = line_list[0] if text == ";": return None node_type = all_syms[text]
if len(line_list) > 1: value = line_list[1] if value.isdigit(): value = int(value) return make_leaf(node_type, value)
left = load_ast() right = load_ast() return make_node(node_type, left, right)
- main driver
input_file = sys.stdin if len(sys.argv) > 1:
try: input_file = open(sys.argv[1], "r", 4096) except IOError as e: error("Can't open %s" % sys.argv[1])
n = load_ast() code_gen(n) code_finish() list_code()</lang>