diff --git a/Makefile b/Makefile index 413254d16f14a82cc0f073a5d7b718366b1d26c9..241c2f6c24ef0e5303ebfb827ef7b988e787aaf9 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,11 @@ all: main.native +include opts.mk + .PHONY: main.native -src/config.ml: configure - ./configure +src/config.ml: configure opts.mk + ./configure ${CONF_OPTS} main.native: src/config.ml make -C alpaga diff --git a/configure b/configure index 42d40d9cc10cf698bc80e95bad227a1d5a6ab5e3..03baefe1f861bb4bf817f223cb27f1af88fa0434 100755 --- a/configure +++ b/configure @@ -1,5 +1,35 @@ #!/bin/bash +LEX_HAND="true" +ALPAGA_PARSER="true" + +while [[ $# -gt 0 ]] +do + key="$1" + + case $key in + -l|--lex-hand) + LEX_HAND="true" + shift + ;; + -L|--lex-ocamllex) + LEX_HAND="false" + shift + ;; + -a|--alpaga-parser) + ALPAGA_PARSER="true" + shift + ;; + -m|--menhir-parser) + ALPAGA_PARSER="false" + shift + ;; + *) # unknown option + shift # past argument + ;; + esac +done + RUNTIME=$(pwd)/runtime RVAS=$(which -a riscv64-unknown-linux-gnu-as riscv64-unknown-elf-as riscv64-linux-gnu-as 2>/dev/null | grep -v "not found" | head -n1) @@ -14,4 +44,5 @@ echo "let qemu64 = \"${QEMU64}\"" >> src/config.ml echo "let rv_as = \"${RVAS}\"" >> src/config.ml echo "let rv_ld = \"${RVLD}\"" >> src/config.ml echo "let os_target = \"linux\"" >> src/config.ml - +echo "let lex_hand = $LEX_HAND" >> src/config.ml +echo "let alpaga_parser = $ALPAGA_PARSER" >> src/config.ml diff --git a/opts.mk b/opts.mk new file mode 100644 index 0000000000000000000000000000000000000000..607c5ddf74dd0d83dd623e4edb733ede96c0d6bf --- /dev/null +++ b/opts.mk @@ -0,0 +1,13 @@ +CONF_OPTS= + +# Use handwritten lexer +CONF_OPTS+=-l + +# Use ocamllex lexer +#CONF_OPTS+=-L + +# Use alpaga parser +CONF_OPTS+=-a + +# Use menhir parser +#CONF_OPTS+=-m diff --git a/src/.merlin b/src/.merlin index fb2c88f2ecabecb8412b129212f14998878bc5b4..5aca64dac0daf41b44358cea1613e02df7820526 100644 --- a/src/.merlin +++ b/src/.merlin @@ -7,4 +7,5 @@ PKG logs.lwt PKG batteries PKG yojson PKG websocket -PKG websocket-lwt-unix \ No newline at end of file +PKG websocket-lwt-unix +PKG menhirLib \ No newline at end of file diff --git a/src/Makefile b/src/Makefile index 02ef8aefa24f7571f3847bceee2bed0711a5e7cb..896e3db636653e3585a9f59e41722327ba81d0da 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,3 +1,5 @@ +include ../opts.mk + SRC=archi.ml ast.ml builtins.ml config.ml cfg_constprop.ml cfg_dead_assign.ml \ cfg.ml cfg_print.ml cfg_gen.ml cfg_liveness.ml cfg_nop_elim.ml cfg_run.ml \ elang.ml elang_print.ml elang_gen.ml elang_run.ml e_regexp.ml \ @@ -22,8 +24,8 @@ test_lexer: archi.ml config.ml e_regexp.ml lexer_generator.ml symbols.ml test_le dot -Tsvg /tmp/dfa.dot -o /tmp/dfa.svg dot -Tsvg /tmp/nfa.dot -o /tmp/nfa.svg -config.ml: ../configure - cd .. && ./configure +config.ml: ../configure ../opts.mk + cd .. && ./configure ${CONF_OPTS} clean: rm -rf _build diff --git a/src/_tags b/src/_tags index 2f8f5422ed5d34c1e9c1f0282b17330f1c40b458..85863b57f6f3a210cf25acdd94cceeb1b6d447d4 100644 --- a/src/_tags +++ b/src/_tags @@ -1,6 +1,7 @@ true: debug true: bin_annot <runtime>: -traverse -<*>: package(lwt lwt.unix logs logs.lwt batteries yojson websocket websocket-lwt-unix) +<*>: package(lwt lwt.unix logs logs.lwt batteries yojson websocket websocket-lwt-unix menhirLib) true: thread -<tykernel>: -traverse \ No newline at end of file +true: use_menhir, explain, table +<tykernel>: -traverse diff --git a/src/lexer.mll b/src/lexer.mll new file mode 100644 index 0000000000000000000000000000000000000000..9aa999daef2ff056c38dcc9864da6727fc7baec6 --- /dev/null +++ b/src/lexer.mll @@ -0,0 +1,96 @@ +{ +open Symbols +exception SyntaxError of string +exception Eof +} + + +let digit = ['0'-'9'] +let letter = ['a'-'z' 'A'-'Z'] +let id = letter (digit|letter|'_')* + +rule token = parse + | [' ' '\t' '\r'] { token lexbuf } + | '\n' { Lexing.new_line lexbuf; token lexbuf } + | ("0x" ['0'-'9''a'-'f''A'-'F']+) as i { SYM_INTEGER (int_of_string i) } + | ['0'-'9']+ as i { SYM_INTEGER (int_of_string i) } + | '+' { SYM_PLUS } + | "->" { SYM_ARROW } + | "&&" { SYM_BOOL_AND } + | "||" { SYM_BOOL_OR } + | "!" { SYM_BOOL_NOT } + | '&' { SYM_BITWISE_AND } + | '~' { SYM_BIT_NOT } + | '-' { SYM_MINUS } + | '*' { SYM_ASTERISK } + | "//" { single_line_comment lexbuf } + | "/*" { multi_line_comment lexbuf } + | '/' { SYM_DIV } + | '.' { SYM_POINT } + | "void" { SYM_VOID } + | "char" { SYM_CHAR } + | "int" { SYM_INT } + | "print" { SYM_PRINT } + | "struct" { SYM_STRUCT } + | "if" { SYM_IF } + | "else" { SYM_ELSE } + | "alloc" { SYM_ALLOC } + | "==" { SYM_EQUALITY } + | "=" { SYM_ASSIGN } + | "while" { SYM_WHILE } + | "return" { SYM_RETURN } + | id as s { SYM_IDENTIFIER s} + | '{' { SYM_LBRACE } + | '}' { SYM_RBRACE } + | '(' { SYM_LPARENTHESIS } + | ')' { SYM_RPARENTHESIS } + | '[' { SYM_LBRACKET } + | ']' { SYM_RBRACKET } + | ';' { SYM_SEMICOLON } + | ',' { SYM_COMMA } + | ">=" { SYM_GEQ } + | "<=" { SYM_LEQ } + | '>' { SYM_GT } + | '<' { SYM_LT } + | "!=" { SYM_NOTEQ } + | '^' { SYM_XOR } + | '%' { SYM_MOD } + | '\'' { parse_char lexbuf } + | '"' { read_string (Buffer.create 17) lexbuf } + | eof { SYM_EOF } + | _ as x { failwith (Printf.sprintf "unexpected char '%c' at %s \n" x (string_of_position (Lexing.lexeme_start_p lexbuf)))} + + +and parse_char = parse + | "\\n'" { SYM_CHARACTER '\n' } + | "\\0'" { SYM_CHARACTER (char_of_int 0) } + | "\\r'" { SYM_CHARACTER '\r' } + | "\\t'" { SYM_CHARACTER '\t' } + | _ as c '\'' { SYM_CHARACTER c} + | _ as x { failwith (Printf.sprintf "unexpected char literal '%c'\n" x) } + +and read_string buf = + parse + | '"' { SYM_STRING (Buffer.contents buf) } + | '\\' '/' { Buffer.add_char buf '/'; read_string buf lexbuf } + | '\\' '\\' { Buffer.add_char buf '\\'; read_string buf lexbuf } + | '\\' 'b' { Buffer.add_char buf '\b'; read_string buf lexbuf } + | '\\' 'f' { Buffer.add_char buf '\012'; read_string buf lexbuf } + | '\\' 'n' { Buffer.add_char buf '\n'; read_string buf lexbuf } + | '\\' 'r' { Buffer.add_char buf '\r'; read_string buf lexbuf } + | '\\' 't' { Buffer.add_char buf '\t'; read_string buf lexbuf } + | [^ '"' '\\']+ + { Buffer.add_string buf (Lexing.lexeme lexbuf); + read_string buf lexbuf + } + | _ { raise (SyntaxError ("Illegal string character: " ^ Lexing.lexeme lexbuf)) } + | eof { raise (SyntaxError ("String is not terminated")) } + +and single_line_comment = parse + | ['\n' '\r'] { Lexing.new_line lexbuf; token lexbuf } + | _ { single_line_comment lexbuf } + +and multi_line_comment = parse + | "*/" { token lexbuf } + | '\n' { Lexing.new_line lexbuf; multi_line_comment lexbuf } + | _ {multi_line_comment lexbuf} diff --git a/src/main.ml b/src/main.ml index c19c01fd1bf5ee85696dd43782870693430edb1b..7df1d41e4df078f4dcbc5e53782f7e805b497dbc 100644 --- a/src/main.ml +++ b/src/main.ml @@ -91,6 +91,8 @@ let speclist = ("-nostart", Arg.Set nostart, "Don't output _start code."); ("-nostats", Arg.Set nostats, "Don't output stats."); ("-nomul", Arg.Unit (fun _ -> has_mul := false), "Target architecture without mul instruction."); + ("-lex-hand", Arg.Unit (fun _ -> Options.handwritten_lexer := true), "Use handwritten lexer generator"); + ("-lex-auto", Arg.Unit (fun _ -> Options.handwritten_lexer := false), "Use OCamlLex lexer"); ("-linux", Arg.Unit (fun _ -> target := Linux), "emit linux syscalls"); ("-xv6", Arg.Unit (fun _ -> target := Xv6), "emit xv6 syscalls"); ("--", Arg.Rest (fun p -> params := int_of_string p::!params), "Run parameters.") diff --git a/src/options.ml b/src/options.ml index aac5b749e7a5610e1c11924b70fd7a0a19b00439..5b3df95e4627c01648dc397a3c6caa4e98c27db3 100644 --- a/src/options.ml +++ b/src/options.ml @@ -37,3 +37,5 @@ let no_linear_dse = ref false let alloc_order_st = ref true let naive_regalloc = ref true let rig_dump : string option ref = ref None +let handwritten_lexer = ref Config.lex_hand +let alpaga_parser = ref Config.alpaga_parser diff --git a/src/parser.ml b/src/parser.ml index 8f0a0bae497d6afb406f5cdf744b04801ee44dcb..2e2c5f02499871c8a8fb311480a970dcbca55089 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -1,13 +1,105 @@ +open Yaccparser open Generated_parser open Report open Utils open Options open Ast +open Symbols + let parse = parse_S +let to_yacc_token = function +| SYM_EOF -> Yaccparser.SYM_EOF +| SYM_IDENTIFIER(s) -> Yaccparser.SYM_IDENTIFIER s +| SYM_INTEGER(i) -> Yaccparser.SYM_INTEGER i +| SYM_VOID -> Yaccparser.SYM_VOID +| SYM_CHAR -> Yaccparser.SYM_CHAR +| SYM_INT -> Yaccparser.SYM_INT +| SYM_STRUCT -> Yaccparser.SYM_STRUCT +| SYM_SEMICOLON -> Yaccparser.SYM_SEMICOLON +| SYM_POINT -> Yaccparser.SYM_POINT +| SYM_IF -> Yaccparser.SYM_IF +| SYM_ELSE -> Yaccparser.SYM_ELSE +| SYM_PLUS -> Yaccparser.SYM_PLUS +| SYM_MINUS -> Yaccparser.SYM_MINUS +| SYM_ASTERISK -> Yaccparser.SYM_ASTERISK +| SYM_DIV -> Yaccparser.SYM_DIV +| SYM_EQUALITY -> Yaccparser.SYM_EQUALITY +| SYM_ASSIGN -> Yaccparser.SYM_ASSIGN +| SYM_LPARENTHESIS -> Yaccparser.SYM_LPARENTHESIS +| SYM_RPARENTHESIS -> Yaccparser.SYM_RPARENTHESIS +| SYM_LBRACE -> Yaccparser.SYM_LBRACE +| SYM_RBRACE -> Yaccparser.SYM_RBRACE +| SYM_WHILE -> Yaccparser.SYM_WHILE +| SYM_RETURN -> Yaccparser.SYM_RETURN +| SYM_COMMA -> Yaccparser.SYM_COMMA +| SYM_LT -> Yaccparser.SYM_LT +| SYM_LEQ -> Yaccparser.SYM_LEQ +| SYM_GT -> Yaccparser.SYM_GT +| SYM_GEQ -> Yaccparser.SYM_GEQ +| SYM_NOTEQ -> Yaccparser.SYM_NOTEQ +| SYM_MOD -> Yaccparser.SYM_MOD +| SYM_BOOL_NOT -> Yaccparser.SYM_BOOL_NOT +| SYM_BOOL_AND -> Yaccparser.SYM_BOOL_AND +| SYM_BOOL_OR -> Yaccparser.SYM_BOOL_OR +| SYM_ARROW -> Yaccparser.SYM_ARROW +| SYM_BITWISE_OR -> Yaccparser.SYM_BITWISE_OR +| SYM_BITWISE_AND -> Yaccparser.SYM_BITWISE_AND +| SYM_BIT_NOT -> Yaccparser.SYM_BIT_NOT +| SYM_XOR -> Yaccparser.SYM_XOR +| SYM_CHARACTER(c) -> Yaccparser.SYM_CHARACTER c +| SYM_STRING(s) -> Yaccparser.SYM_STRING s +| SYM_LBRACKET -> Yaccparser.SYM_LBRACKET +| SYM_RBRACKET -> Yaccparser.SYM_RBRACKET +| SYM_ALLOC -> Yaccparser.SYM_ALLOC +| SYM_PRINT -> Yaccparser.SYM_PRINT +| SYM_EXTERN -> Yaccparser.SYM_EXTERN +| SYM_INCLUDE(s) -> Yaccparser.SYM_INCLUDE s +| SYM_AMPERSAND -> Yaccparser.SYM_AMPERSAND + +let advance (l: ('a list * 'a list)) : ('a * ('a list * 'a list)) option = + let lbef, laft = l in + match laft with + [] -> None + | a::r -> Some (a, (a::lbef, r)) + +let back (l: ('a list * 'a list)) : ('a list * 'a list) option = + let lbef, laft = l in + match lbef with + [] -> None + | a::r -> Some ((r, a::laft)) + + + + +let menhir_parser (toks: (Symbols.token * Lexing.position option) list) () = + let mtoks = ref ([], toks) in + let get_tok () = + match advance !mtoks with + | None -> (Yaccparser.SYM_EOF, Lexing.dummy_pos, Lexing.dummy_pos) + | Some ((t, p), l) -> + mtoks := l; + (to_yacc_token t, Lexing.dummy_pos, Lexing.dummy_pos) + in + let mparser = MenhirLib.Convert.Simplified.traditional2revised Yaccparser.main in + match mparser get_tok with + | ast -> OK (ast, []) + | exception Yaccparser.Error -> + match back !mtoks with + | None -> Error (Printf.sprintf "Parser error while reading '???'\n") + | Some (lbef, laft) -> + Error (Printf.sprintf "Parser error while reading '%s'\n" + (String.concat " " (List.map (fun (t, _) -> string_of_symbol t) (take 20 laft))) + ) + +let parser toks () = + if !Options.alpaga_parser + then parse toks () + else menhir_parser toks () + let pass_parse tokens = - match parse tokens () with + match parser tokens () with | Error msg -> record_compile_result ~error:(Some msg) "Parsing"; Error msg | OK (ast, tokens) -> record_compile_result "Parsing"; diff --git a/src/tokenize.ml b/src/tokenize.ml index 8a1443915ca061f670dd996f081513bcd1264270..e1939cfc1a627c4da50cb08954e4ce9ed20d5136 100644 --- a/src/tokenize.ml +++ b/src/tokenize.ml @@ -5,10 +5,31 @@ open Utils open Options open Symbols -let tokenize file = +let tokenize_handwritten file = + Printf.printf "Handwritten lexer\n"; Lexer_generator.tokenize_file file >>= fun tokens -> OK (List.map (fun tok -> (tok, None)) tokens) +let tokenize_ocamllex file = + Printf.printf "OCamlLex lexer\n"; + let ic = open_in file in + let lexbuf = Lexing.from_channel ic in + lexbuf.Lexing.lex_curr_p <- { lexbuf.Lexing.lex_curr_p with pos_fname = file }; + let rec get_symbols () = + let s = Lexer.token lexbuf in + let ss = (s, Lexing.lexeme_start_p lexbuf) in + if s = SYM_EOF + then [ss] + else ss :: get_symbols () + in + let l = get_symbols () in + close_in ic; + OK (List.map (fun (tok, pos) -> (tok, Some pos)) l) + +let tokenize file = + if !Options.handwritten_lexer + then tokenize_handwritten file + else tokenize_ocamllex file let pass_tokenize file = tokenize file >>* (fun msg -> diff --git a/src/yaccparser.mly b/src/yaccparser.mly new file mode 100644 index 0000000000000000000000000000000000000000..3c9a390a95b44e92e4168327630e1d43af1873a3 --- /dev/null +++ b/src/yaccparser.mly @@ -0,0 +1,93 @@ +%{ + (* open Symbols *) + open Ast +%} + +%token SYM_EOF +%token SYM_VOID SYM_CHAR SYM_INT SYM_STRUCT SYM_POINT SYM_BOOL_NOT SYM_BOOL_AND SYM_BOOL_OR +%token SYM_ARROW SYM_BITWISE_OR SYM_BITWISE_AND SYM_BIT_NOT SYM_XOR SYM_LBRACKET SYM_RBRACKET +%token SYM_ALLOC SYM_EXTERN SYM_AMPERSAND +%token<char> SYM_CHARACTER +%token<string> SYM_STRING +%token<string> SYM_INCLUDE +%token<string> SYM_IDENTIFIER +%token<int> SYM_INTEGER +%token SYM_PLUS SYM_MINUS SYM_ASTERISK SYM_DIV SYM_MOD +%token SYM_LPARENTHESIS SYM_RPARENTHESIS SYM_LBRACE SYM_RBRACE +%token SYM_ASSIGN SYM_SEMICOLON SYM_RETURN SYM_IF SYM_WHILE SYM_ELSE SYM_COMMA SYM_PRINT +%token SYM_EQUALITY SYM_NOTEQ SYM_LT SYM_LEQ SYM_GT SYM_GEQ + +%left SYM_EQUALITY SYM_NOTEQ +%left SYM_GEQ SYM_LEQ SYM_LT SYM_GT +%left SYM_PLUS SYM_MINUS +%left SYM_ASTERISK SYM_DIV SYM_MOD +%nonassoc UMINUS + +%start main +%type <Ast.tree> main + +%% + + main: + | fundefs SYM_EOF { Node(Tlistglobdef, $1) } + ; + fundefs: + | fundef fundefs { $1 :: $2 } + | { [] } + ; + fundef: + identifier SYM_LPARENTHESIS lparams SYM_RPARENTHESIS instr { + let fargs = $3 in + let instr = $5 in + Node (Tfundef, [$1; Node (Tfunargs, fargs) ; instr ]) + } + ; + identifier: + SYM_IDENTIFIER { StringLeaf ($1) } + ; + + integer : SYM_INTEGER { IntLeaf ($1) }; + + lparams : + identifier rest_params { Node (Targ, [$1]) :: $2 } + | { [] }; + rest_params : + SYM_COMMA identifier rest_params { + Node (Targ, [$2]) :: $3 + } + | { [] }; + instrs : + | instr instrs { $1 :: $2 } + | { [] }; + linstrs : + SYM_LBRACE instrs SYM_RBRACE { Node (Tblock, $2) }; + instr : + identifier SYM_ASSIGN expr SYM_SEMICOLON { + Node (Tassign, [Node (Tassignvar,[$1; $3])]) + } + | SYM_IF SYM_LPARENTHESIS expr SYM_RPARENTHESIS linstrs ntelse { Node (Tif, [$3; $5; $6]) } + | SYM_WHILE SYM_LPARENTHESIS expr SYM_RPARENTHESIS instr { Node( Twhile, [$3; $5]) } + | SYM_RETURN expr SYM_SEMICOLON { Node(Treturn, [$2]) } + | SYM_PRINT expr SYM_SEMICOLON { Node(Tprint, [$2]) } + | linstrs { $1 }; + ntelse : + SYM_ELSE linstrs { $2 } + | { Node(Tblock, []) }; + + expr : + | expr SYM_EQUALITY expr { Node (Tceq, [$1; $3]) } + | expr SYM_NOTEQ expr { Node (Tne, [$1; $3]) } + | expr SYM_PLUS expr { Node (Tadd, [$1; $3]) } + | expr SYM_MINUS expr { Node (Tsub, [$1; $3]) } + | expr SYM_ASTERISK expr { Node (Tmul, [$1; $3]) } + | expr SYM_DIV expr { Node (Tdiv, [$1; $3]) } + | expr SYM_MOD expr { Node (Tmod, [$1; $3]) } + | expr SYM_LT expr { Node (Tclt, [$1; $3]) } + | expr SYM_GT expr { Node (Tcgt, [$1; $3]) } + | expr SYM_LEQ expr { Node (Tcle, [$1; $3]) } + | expr SYM_GEQ expr { Node (Tcge, [$1; $3]) } + | SYM_MINUS expr %prec UMINUS { Node (Tneg, [$2])} + | integer { Node(Tint, [$1])} + | identifier { $1 } + | SYM_LPARENTHESIS expr SYM_RPARENTHESIS { $2 } + ;