28 Aug

Hooking up MPC & LLVM

I’ve been tinkering around with mpc and llvm recently – just to satisfy a few simple questions:

  • How easy is it to use mpc?
  • How easy is the LLVM c api to use?
  • How easy is it to connect a parser generator to an LLVM Module?

So given the above aims, I’ve embarked on creating my own stupid little language, that I’m calling ‘neil‘  – Not Exactly an Intermediate Language. It is utterly by chance that the name ‘neil‘ happens to be my own name, if you’ve believe that gratuitous lie!

A ‘neil‘ program currently consists of one allowed type – a 32 bit integer named i32, function definitions, function calls, and being able to return the results of one function from another. In short, it is (currently, and most likely permanently) a really stupid language.

An example legal ‘neil’ program is:

 i32 foo() { return 42; }

 i32 main() {
   return foo();
 }

So lets get right into how we construct this grammar in mpc. The mpc grammar for our ‘neil‘ language is as follows:

ident : /[a-zA-Z_][a-zA-Z0-9_]*/ ;

literal : /[0-9]+(\\.[0-9]+)?((e|E)(-|\\+)?[0-9]+)?/ ;

factor: <literal> | <ident> '(' <lexp>? (',' <lexp>)* ')' ;

term: <factor> ;

lexp: <term> ;

stmt: \"return\" <lexp>? ';'
    | <ident> '(' <ident>? (',' <ident>)* ')' ';' ;

type: (\"i1\" | /(i|u)(8|16|32|64)/ | /f(16|32|64)/) ;

typeident : <type> <ident> ;

args: <typeident>? (',' <typeident>)* ;

body: '{' <stmt>* '}' ;

procedure : <type> <ident> '(' <args> ')' <body> ;

lang: /^/ <procedure>* /$/ ;

It’s a very simple grammar, I looked at the mpc – smallc example to work out what to do. To parse using mpc, I have the following function:

int parse(
 const char* filename,
 const char* source,
 mpc_result_t* out_result) {
 mpc_parser_t* Ident = mpc_new("ident");
 mpc_parser_t* Number = mpc_new("literal");
 mpc_parser_t* Factor = mpc_new("factor");
 mpc_parser_t* Term = mpc_new("term");
 mpc_parser_t* Lexp = mpc_new("lexp");
 mpc_parser_t* Stmt = mpc_new("stmt");
 mpc_parser_t* Type = mpc_new("type");
 mpc_parser_t* Decls = mpc_new("decls");
 mpc_parser_t* Args = mpc_new("args");
 mpc_parser_t* Body = mpc_new("body");
 mpc_parser_t* Procedure = mpc_new("procedure");
 mpc_parser_t* Lang = mpc_new("lang");

 mpc_err_t* err = mpca_lang(MPCA_LANG_DEFAULT,
  " ident : /[a-zA-Z_][a-zA-Z0-9_]*/ ;                     \n"
  " literal : /[0-9]+(\\.[0-9]+)?((e|E)(-|\\+)?[0-9]+)?/ ; \n"
  " factor : <literal>                                     \n"
  " | <ident> '(' <lexp>? (',' <lexp>)* ')' ;              \n"
  " term : <factor> ;                                      \n"
  " lexp : <term> ;                                        \n"
  " stmt : \"return\" <lexp>? ';'                          \n"
  " | <ident> '(' <ident>? (',' <ident>)* ')' ';' ;        \n"
  " type : (\"i1\" | /(i|u)(8|16|32|64)/ | /f(16|32|64)/) ;\n"
  " typeident : <type> <ident> ;                           \n"
  " args : <typeident>? (',' <typeident>)* ;               \n"
  " body : '{' <stmt>* '}' ;                               \n"
  " procedure : <type> <ident> '(' <args> ')' <body> ;     \n"
  " lang : /^/ <procedure>* /$/ ;                         \n",
 Ident, Number, Factor, Term, Lexp, Stmt, 
 Type, Args, Body, Procedure, Lang, NULL);
 
 if (err != NULL) {
  mpc_err_print(err);
  mpc_err_delete(err);
  exit(1);
 }

 const int result = mpc_parse(filename, source, Lang, out_result);

 mpc_cleanup(11, Ident, Number, Factor, Term, Lexp, Stmt, 
 Type, Args, Body, Procedure, Lang);

 return result;
}

With this, we can then iterate through a successfully parsed ‘neil‘ input using the:

typedef struct mpc_ast_t {
 char *tag;
 char *contents;
 mpc_state_t state;
 int children_num;
 struct mpc_ast_t** children;
} mpc_ast_t;

type that mpc provides. The struct elements are as follows:

  • tag – the string name of the ast node’s type
  • contents – what the ast node is pointing at in the original source input
  • state – the line information for the ast node
  • children_num – the length of the children array
  • children – an array of AST node’s that are children of the current node

Next up, we need to create what we need from LLVM to produce a module for the current file. I’m using the LLVM C API because I’ve never used it before, having only ever used the C++ API in my day to day stuff at work, so I thought it would be useful to have a look at it.

First up we need an LLVM Module – this is an encapsulation of a ‘neil‘ input file for our uses. I use:

LLVMModuleRef module = LLVMModuleCreateWithName(filename);

To get a module to work with. Then, for each AST node that is a procedure, I create a corresponding LLVM function with:

mpc_ast_t *name = procedure->children[1];

LLVMTypeRef funcType = LLVMFunctionType(
 LLVMInt32Type(), 0, 0, false);

LLVMValueRef func = LLVMAddFunction(
 module, name->contents, funcType);

(Note I’m cheating for now because I know my functions return i32, and take no params, don’t do this in production code!).

Then, since I have no control flow within my functions, I can create one basic block to hold the body of the function, and an IR builder to help us make the instructions within the basic block, with:

LLVMBasicBlockRef block = LLVMAppendBasicBlock(func, "");

LLVMBuilderRef builder = LLVMCreateBuilder();

LLVMPositionBuilderAtEnd(builder, block);

And with this we can begin to parse the body of the functions!

I parse the return statement (the only allowed statement within our functions), and check if it returns a literal or the result of a call to a function:

if (weFoundALiteral) {
 LLVMValueRef literal = LLVMConstIntOfString(
  LLVMInt32Type(), stmt->children[1]->contents, 10);

 LLVMBuildRet(builder, literal);
} else {
 LLVMValueRef call = LLVMGetNamedFunction(module, ident_name);

 LLVMValueRef ret = LLVMBuildCall(builder, call, 0, 0, "");

 LLVMBuildRet(builder, ret);
}

And the result is:

; ModuleID = 'foo.neil'

define i32 @foo() {
 ret i32 42
}

define i32 @main() {
 %1 = call i32 @foo()
 ret i32 %1
}

From the example ‘neil‘ file I gave above.

TL;DR mpc is pretty cool, the LLVM C API is very easy to use, and I’ll be fleshing out my ‘neil‘ language in future blog posts once I try handling a more complicated input grammar.

PS the full source for the example is below:

// This is free and unencumbered software released into the public domain.
//
// Anyone is free to copy, modify, publish, use, compile, sell, or
// distribute this software, either in source code form or as a compiled
// binary, for any purpose, commercial or non-commercial, and by any
// means.
//
// In jurisdictions that recognize copyright laws, the author or authors
// of this software dedicate any and all copyright interest in the
// software to the public domain. We make this dedication for the benefit
// of the public at large and to the detriment of our heirs and
// successors. We intend this dedication to be an overt act of
// relinquishment in perpetuity of all present and future rights to this
// software under copyright law.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// For more information, please refer to <http://unlicense.org/>

#include <assert.h>

#define __STDC_CONSTANT_MACROS
#define __STDC_LIMIT_MACROS
#include <llvm-c/Core.h>

extern "C" {
#include <mpc/mpc.h>
}

const char *src =
 "i32 foo() { return 42; }\n"
 "i32 main() {\n"
 " return foo();\n"
 "}\n";

int parse(const char* filename, const char* source, mpc_result_t* out_result) {
 mpc_parser_t* Ident = mpc_new("ident");
 mpc_parser_t* Number = mpc_new("literal");
 mpc_parser_t* Factor = mpc_new("factor");
 mpc_parser_t* Term = mpc_new("term");
 mpc_parser_t* Lexp = mpc_new("lexp");
 mpc_parser_t* Stmt = mpc_new("stmt");
 mpc_parser_t* Type = mpc_new("type");
 mpc_parser_t* Decls = mpc_new("decls");
 mpc_parser_t* Args = mpc_new("args");
 mpc_parser_t* Body = mpc_new("body");
 mpc_parser_t* Procedure = mpc_new("procedure");
 mpc_parser_t* Lang = mpc_new("lang");

 mpc_err_t* err = mpca_lang(MPCA_LANG_DEFAULT,
 " ident : /[a-zA-Z_][a-zA-Z0-9_]*/ ; \n"
 " literal : /[0-9]+(\\.[0-9]+)?((e|E)(-|\\+)?[0-9]+)?/ ; \n"
 " \n"
 " factor : <literal> \n"
 " | <ident> '(' <lexp>? (',' <lexp>)* ')' ; \n"
 " \n"
 " term : <factor> ; \n"
 " lexp : <term> ; \n"
 " \n"
 " stmt : \"return\" <lexp>? ';' \n"
 " | <ident> '(' <ident>? (',' <ident>)* ')' ';' ; \n"
 " \n"
 " type : (\"i1\" | /(i|u)(8|16|32|64)/ | /f(16|32|64)/) ;\n"
 " typeident : <type> <ident> ; \n"
 " args : <typeident>? (',' <typeident>)* ; \n"
 " body : '{' <stmt>* '}' ; \n"
 " procedure : <type> <ident> '(' <args> ')' <body> ; \n"
 " lang : /^/ <procedure>* /$/ ; \n",
 Ident, Number, Factor, Term, Lexp, Stmt, 
 Type, Args, Body, Procedure, Lang, NULL);
 
 if (err != NULL) {
 mpc_err_print(err);
 mpc_err_delete(err);
 exit(1);
 }

 const int result = mpc_parse(filename, source, Lang, out_result);

 mpc_cleanup(11, Ident, Number, Factor, Term, Lexp, Stmt, 
 Type, Args, Body, Procedure, Lang);

 return result;
}

int mpc2llvm(const char* filename, mpc_ast_t *ast) {
 LLVMModuleRef module = LLVMModuleCreateWithName(filename);

 // the root is always called '>'
 assert('>' == ast->tag[0]);

 // the first child is a regex for '^'
 assert(0 == strcmp("regex", ast->children[0]->tag));

 // the last child is a regex for '$'
 assert(0 == strcmp("regex", ast->children[ast->children_num - 1]->tag));

 for (int i = 1; i < (ast->children_num - 1); i++) {
 mpc_ast_t *procedure = ast->children[i];

 // our root nodes are procedures
 assert(0 == strcmp("procedure|>", procedure->tag));

 // we need to have at least 5 children in a procedure
 assert(5 <= procedure->children_num);

 mpc_ast_t *return_type = procedure->children[0];

 assert(0 == strcmp("type|regex", return_type->tag));

 mpc_ast_t *name = procedure->children[1];

 assert(0 == strcmp("ident|regex", name->tag));

 assert(0 == strcmp("char", procedure->children[2]->tag));
 assert('(' == procedure->children[2]->contents[0]);
 assert(0 == strcmp("char", procedure->children[3]->tag));
 assert(')' == procedure->children[3]->contents[0]);

 LLVMTypeRef funcType = LLVMFunctionType(
 LLVMInt32Type(), 0, 0, false);

 LLVMValueRef func = LLVMAddFunction(
 module, name->contents, funcType);

 LLVMBasicBlockRef block = LLVMAppendBasicBlock(func, "");

 LLVMBuilderRef builder = LLVMCreateBuilder();

 LLVMPositionBuilderAtEnd(builder, block);

 mpc_ast_t *body = procedure->children[4];

 assert(0 == strcmp("body|>", body->tag));

 assert(0 == strcmp("char", body->children[0]->tag));
 assert('{' == body->children[0]->contents[0]);
 assert(0 == strcmp("char", body->children[body->children_num - 1]->tag));
 assert('}' == body->children[body->children_num - 1]->contents[0]);

 for (int k = 1; k < (body->children_num - 1); k++) {
 mpc_ast_t *stmt = body->children[k];

 assert(0 == strcmp("stmt|>", stmt->tag));

 assert(0 < stmt->children_num);

 if ((0 == strcmp("string", stmt->children[0]->tag)) &&
 (0 == strcmp("return", stmt->children[0]->contents))) {
 // we found a return statement

 if (0 == strcmp("lexp|term|factor|literal|regex", stmt->children[1]->tag)) {
 LLVMValueRef literal = LLVMConstIntOfString(
 typeMap[return_type->contents],
 stmt->children[1]->contents, 10);

 LLVMBuildRet(builder, literal);
 } else if (0 == strcmp("lexp|term|factor|>", stmt->children[1]->tag)) {
 mpc_ast_t *factor = stmt->children[1];

 if (0 == strcmp("ident|regex", factor->children[0]->tag)) {
 const char * ident_name = factor->children[0]->contents;

 if ('(' == factor->children[1]->contents[0]) {
 // we have a function call!
 assert(')' == factor->children[2]->contents[0]);

 LLVMValueRef called_func = LLVMGetNamedFunction(
 module, ident_name);

 LLVMValueRef ret = LLVMBuildCall(builder, called_func, 0, 0, "");

 LLVMBuildRet(builder, ret);
 }
 }
 }
 }
 }
 }

 LLVMDumpModule(module);

 return 0;
}

int main() {
 const char * filename = "foo.neil";
 mpc_result_t result;

 if (parse(filename, src, &result)) {
 const int r = mpc2llvm(filename, (mpc_ast_t *)result.output);

 mpc_ast_print((mpc_ast_t *)result.output);
 mpc_ast_delete((mpc_ast_t *)result.output);

 if (r) {
 printf("mpc -> llvm error!\n");
 return -1;
 }

 return 0;
 } else {
 mpc_err_print(result.error);
 mpc_err_delete(result.error);

 return -1;
 }
}

One thought on “Hooking up MPC & LLVM

  1. Pingback: Cleaning up the parser (MPC -> LLVM for the Neil Language #2) | Duskborn

Leave a Reply

Your email address will not be published. Required fields are marked *