mescc.scm: parse simple main.c.

This commit is contained in:
Jan Nieuwenhuizen 2016-07-24 23:18:53 +02:00
parent d5fc30d265
commit 60a7c9099f
5 changed files with 821 additions and 0 deletions

2
.gitignore vendored
View file

@ -9,5 +9,7 @@
/syntax.test
/paren.test
/syntax-case.test
/mescc.test
?
?.mes

View file

@ -91,3 +91,9 @@ paren.test: lib/lalr.scm paren.scm
guile-paren: paren.test
echo '___P((()))' | guile -s $^
mescc.test: lib/lalr.scm c-lexer.scm mescc.scm
cat $^ > $@
guile-mescc: mescc.test
cat main.c | guile -s $^

399
c-lexer.scm Normal file
View file

@ -0,0 +1,399 @@
;;; ECMAScript for Guile
;; Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;;; Lesser General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;;; Code:
;; (define-module (language ecmascript tokenize)
;; #:use-module (ice-9 rdelim)
;; #:use-module ((srfi srfi-1) #:select (unfold-right))
;; #:use-module (system base lalr)
;; #:export (next-token make-tokenizer make-tokenizer/1 tokenize tokenize/1))
(cond-expand
(guile
(use-modules ((ice-9 rdelim)))
(define (syntax-error what loc form . args)
(throw 'syntax-error #f what
;;(and=> loc source-location->source-properties)
loc
form #f args))
)
(mes
)
)
(define (port-source-location port)
(make-source-location (port-filename port)
(port-line port)
(port-column port)
(false-if-exception (ftell port))
#f))
;; taken from SSAX, sorta
(define (read-until delims loc)
(if (eof-object? (peek-char))
(syntax-error "EOF while reading a token" loc #f)
(let ((token (read-delimited delims (current-input-port) 'peek)))
(if (eof-object? (peek-char))
(syntax-error "EOF while reading a token" loc token)
token))))
(define (char-hex? c)
(and (not (eof-object? c))
(or (char-numeric? c)
(memv c '(#\a #\b #\c #\d #\e #\f))
(memv c '(#\A #\B #\C #\D #\E #\F)))))
(define (digit->number c)
(- (char->integer c) (char->integer #\0)))
(define (hex->number c)
(if (char-numeric? c)
(digit->number c)
(+ 10 (- (char->integer (char-downcase c)) (char->integer #\a)))))
(define (read-slash loc div?)
(let ((c1 (begin
(read-char)
(peek-char))))
(cond
((eof-object? c1)
;; hmm. error if we're not looking for a div? ?
(make-lexical-token '/ loc #f))
((char=? c1 #\/)
(read-line)
(next-token div?))
((char=? c1 #\*)
(read-char)
(let lp ((c (read-char)))
(cond
((eof-object? c)
(syntax-error "EOF while in multi-line comment" loc #f))
((char=? c #\*)
(if (eqv? (peek-char) #\/)
(begin
(read-char)
(next-token div?))
(lp (read-char))))
(else
(lp (read-char))))))
(div?
(case c1
((#\=) (read-char) (make-lexical-token '/= loc #f))
(else (make-lexical-token '/ loc #f))))
(else
;;;(read-regexp loc)
(make-lexical-token '/ loc #f)))))
(define (read-string loc)
(let ((c (read-char)))
(let ((terms (string c #\\ #\nl #\cr)))
(define (read-escape)
(let ((c (read-char)))
(case c
((#\' #\" #\\) c)
((#\b) #\bs)
((#\f) #\np)
((#\n) #\nl)
((#\r) #\cr)
((#\t) #\tab)
((#\v) #\vt)
((#\0)
(let ((next (peek-char)))
(cond
((eof-object? next) #\nul)
((char-numeric? next)
(syntax-error "octal escape sequences are not supported"
loc #f))
(else #\nul))))
((#\x)
(let* ((a (read-char))
(b (read-char)))
(cond
((and (char-hex? a) (char-hex? b))
(integer->char (+ (* 16 (hex->number a)) (hex->number b))))
(else
(syntax-error "bad hex character escape" loc (string a b))))))
((#\u)
(let* ((a (read-char))
(b (read-char))
(c (read-char))
(d (read-char)))
(integer->char (string->number (string a b c d) 16))))
(else
c))))
(let lp ((str (read-until terms loc)))
(let ((terminator (peek-char)))
(cond
((char=? terminator c)
(read-char)
(make-lexical-token 'StringLiteral loc str))
((char=? terminator #\\)
(read-char)
(let ((echar (read-escape)))
(lp (string-append str (string echar)
(read-until terms loc)))))
(else
(syntax-error "string literals may not contain newlines"
loc str))))))))
(define *keywords*
'(("break" . break)
("case" . case)
("continue" . continue)
("else" . else)
("goto" . goto)
("char" . char)
("double" . double)
("float" . float)
("int" . int)
("long" . long)
("short" . short)
("unsigned" . unsigned)
("return" . return)
("void" . void)
("for" . for)
("switch" . switch)
("while" . while)
("continue" . continue)
("default" . default)
("if" . if)
("do" . do)
;; these aren't exactly keywords, but hey
("true" . true)
("false" . false)))
(define (read-identifier loc)
(let lp ((c (peek-char)) (chars '()))
(if (or (eof-object? c)
(not (or (char-alphabetic? c)
(char-numeric? c)
(char=? c #\$)
(char=? c #\_))))
(let ((word (list->string (reverse chars))))
(cond ((assoc-ref *keywords* word)
(make-lexical-token (assoc-ref *keywords* word) loc #f))
(else (make-lexical-token 'Identifier loc
(string->symbol word)))))
(begin (read-char)
(lp (peek-char) (cons c chars))))))
(define (read-numeric loc)
(let* ((c0 (if (char=? (peek-char) #\.)
#\0
(read-char)))
(c1 (peek-char)))
(cond
((eof-object? c1) (digit->number c0))
((and (char=? c0 #\0) (or (char=? c1 #\x) (char=? c1 #\X)))
(read-char)
(let ((c (peek-char)))
(if (not (char-hex? c))
(syntax-error "bad digit reading hexadecimal number"
loc c))
(let lp ((c c) (acc 0))
(cond ((char-hex? c)
(read-char)
(lp (peek-char)
(+ (* 16 acc) (hex->number c))))
(else
acc)))))
((and (char=? c0 #\0) (char-numeric? c1))
(let lp ((c c1) (acc 0))
(cond ((eof-object? c) acc)
((char-numeric? c)
(if (or (char=? c #\8) (char=? c #\9))
(syntax-error "invalid digit in octal sequence"
loc c))
(read-char)
(lp (peek-char)
(+ (* 8 acc) (digit->number c))))
(else
acc))))
(else
(let lp ((c1 c1) (acc (digit->number c0)))
(cond
((eof-object? c1) acc)
((char-numeric? c1)
(read-char)
(lp (peek-char)
(+ (* 10 acc) (digit->number c1))))
((or (char=? c1 #\e) (char=? c1 #\E))
(read-char)
(let ((add (let ((c (peek-char)))
(cond ((eof-object? c)
(syntax-error "error reading exponent: EOF"
loc #f))
((char=? c #\+) (read-char) +)
((char=? c #\-) (read-char) -)
((char-numeric? c) +)
(else
(syntax-error "error reading exponent: non-digit"
loc c))))))
(let lp ((c (peek-char)) (e 0))
(cond ((and (not (eof-object? c)) (char-numeric? c))
(read-char)
(lp (peek-char) (add (* 10 e) (digit->number c))))
(else
(* (if (negative? e) (* acc 1.0) acc) (expt 10 e)))))))
((char=? c1 #\.)
(read-char)
(let lp2 ((c (peek-char)) (dec 0.0) (n -1))
(cond ((and (not (eof-object? c)) (char-numeric? c))
(read-char)
(lp2 (peek-char)
(+ dec (* (digit->number c) (expt 10 n)))
(1- n)))
(else
;; loop back to catch an exponential part
(lp c (+ acc dec))))))
(else
acc)))))))
(define *punctuation*
'(("{" . lbrace)
("}" . rbrace)
("(" . lparen)
(")" . rparen)
("[" . lbracket)
("]" . rbracket)
("." . dot)
(";" . semicolon)
("," . comma)
("<" . <)
(">" . >)
("<=" . <=)
(">=" . >=)
("==" . ==)
("!=" . !=)
("===" . ===)
("!==" . !==)
("+" . +)
("-" . -)
("*" . *)
("%" . %)
("++" . ++)
("--" . --)
("<<" . <<)
(">>" . >>)
(">>>" . >>>)
("&" . &)
("|" . bor)
("^" . ^)
("!" . !)
("~" . ~)
("&&" . &&)
("||" . or)
("?" . ?)
(":" . colon)
("=" . =)
("+=" . +=)
("-=" . -=)
("*=" . *=)
("%=" . %=)
("<<=" . <<=)
(">>=" . >>=)
(">>>=" . >>>=)
("&=" . &=)
("|=" . bor=)
("^=" . ^=)))
(define *div-punctuation*
'(("/" . /)
("/=" . /=)))
;; node ::= (char (symbol | #f) node*)
(define read-punctuation
(let ((punc-tree (let lp ((nodes '()) (puncs *punctuation*))
(cond ((null? puncs)
nodes)
((assv-ref nodes (string-ref (caar puncs) 0))
(let ((node-tail (assv-ref nodes (string-ref (caar puncs) 0))))
(if (= (string-length (caar puncs)) 1)
(set-car! node-tail (cdar puncs))
(set-cdr! node-tail
(lp (cdr node-tail)
`((,(substring (caar puncs) 1)
. ,(cdar puncs))))))
(lp nodes (cdr puncs))))
(else
(lp (cons (list (string-ref (caar puncs) 0) #f) nodes)
puncs))))))
(lambda (loc)
(let lp ((c (peek-char)) (tree punc-tree) (candidate #f))
(display "read-punctuation c=") (display c) (newline)
(cond
((assv-ref tree c)
(let ((node-tail (assv-ref tree c)))
(read-char)
(lp (peek-char) (cdr node-tail) (car node-tail))))
(candidate
(make-lexical-token candidate loc #f))
(else
(syntax-error "bad syntax: character not allowed" loc c)))))))
(define (next-token div?)
(let ((c (peek-char))
(loc (port-source-location (current-input-port))))
(display "next-token c=") (display c) (newline)
(case c
((#\ht #\vt #\np #\space #\x00A0) ; whitespace
(read-char)
(next-token div?))
((#\newline #\cr) ; line break
(read-char)
(next-token div?))
((#\/)
;; division, single comment, double comment, or regexp
(read-slash loc div?))
((#\" #\') ; string literal
(read-string loc))
(else
(cond
((eof-object? c)
'*eoi*)
((or (char-alphabetic? c)
(char=? c #\$)
(char=? c #\_))
;; reserved word or identifier
(read-identifier loc))
((char-numeric? c)
;; numeric -- also accept . FIXME, requires lookahead
(make-lexical-token 'NumericLiteral loc (read-numeric loc)))
(else
;; punctuation
(read-punctuation loc)))))))
(define (c-lexer errorp)
(let ((div? #f))
(lambda ()
(let ((tok (next-token div?)))
(set! div? (and (lexical-token? tok)
(let ((cat (lexical-token-category tok)))
(or (eq? cat 'Identifier)
(eq? cat 'NumericLiteral)
(eq? cat 'StringLiteral)))))
tok))))

4
main.c Normal file
View file

@ -0,0 +1,4 @@
int main ()
{
return 0;
}

410
mescc.scm Normal file
View file

@ -0,0 +1,410 @@
(cond-expand
(guile
;;(use-modules ((system base lalr)))
)
(mes
))
(define c-parser
(lalr-parser
(lbrace rbrace lparen rparen lbracket rbracket semicolon colon dot comma
=
Identifier NumericLiteral
break case continue goto label
return switch
if else
(left: or && ! * / + -)
(left: bool double float enum void int struct)
(nonassoc: == !=)
)
(program
(translation-unit *eoi*) : `(root ,@$1))
(translation-unit
(external-declaration) : `(,$1)
(translation-unit external-declaration) : `(,@$1 ,@$2))
(external-declaration
(function-definition) : $1
(declaration) : $1
(error semicolon) : (begin (syntax-error "external declaration" @1 $1) '()))
(function-definition
(declarator compound-statement) : `(function ,$1 (signature int (formals)) ,$2)
(declaration-specifiers declarator compound-statement) : `(function ,$2 (signature ,$1 (formals)) ,$3)
(declaration-specifiers declarator declaration-list compound-statement) : `(function ,$2 (signature ,$1 ,$3) ,$4))
(declaration
(declaration-specifiers semicolon) : `(,$1)
;;(declaration-specifiers init-declarator-list semicolon): `(,$1 ,$2)
)
(declaration-list
(declaration) : `(formals $1)
(declaration-list declaration) : `(,@$1 ,@(cdr $2)))
(declaration-specifiers
;;(storage-class-specifier) : `(,$1)
(type-specifier) : `(,$1)
;;(type-qualifier) : `($1)
;;(storage-class-specifier declaration-specifiers) : (cons $1 $2)
;;(type-specifier declaration-specifiers) : (cons $1 $2)
;;(type-qualifier declaration-specifiers) : (cons $1 $2)
)
;; (storage_class_specifier
;; (auto)
;; (extern)
;; (register)
;; (static)
;; (typedef))
(type-specifier
;; (char) : $1
;; (double) : $1
;; (void) : $1
;; (float)
(int) : 'int
;; (long)
;; (short)
;; (unsigned)
;; (struct-or-enum-specifier)
;; (enum-specifier)
;; (type-name)
)
;; (type-qualifier
;; (const)
;; (volatile))
;; struct_or_union_specifier:
;; struct_or_union_ident lbrace struct_declaration_list rbrace
;; | struct_or_union_ident
;; ;
;; struct_or_union_ident: struct_or_union
;; | struct_or_union Identifier
;; ;
;; struct_or_union: STRUCT { ; }
;; | UNION { ; }
;; ;
;; struct_declaration_list: struct_declaration
;; | struct_declaration_list struct_declaration
;; ;
;; init_declarator_list: init_declarator %prec comma
;; | init_declarator_list comma init_declarator
;; ;
;; init_declarator: declarator
;; | declarator EQ initializer
;; | error { yyerror("init declarator error"); }
;; ;
;; struct_declaration: specifier_qualifier_list struct_declarator_list semicolon
;; ;
;; specifier_qualifier_list: type_specifier
;; | type_qualifier
;; | type_specifier specifier_qualifier_list
;; | type_qualifier specifier_qualifier_list
;; ;
;; struct_declarator_list: struct_declarator
;; | struct_declarator_list comma struct_declarator
;; ;
;; struct_declarator: declarator
;; | COLON constant_expression { ; }
;; | declarator COLON constant_expression
;; ;
;; enum_specifier: ENUM Identifier lbrace enumerator_list rbrace { ; }
;; | ENUM lbrace enumerator_list rbrace { ; }
;; | ENUM Identifier { ; }
;; ;
;; enumerator_list: enumerator
;; | enumerator_list comma enumerator
;; ;
;; enumerator: Identifier
;; | Identifier EQ constant_expression
;; ;
(declarator
(direct-declarator) : $1
;;(pointer direct-declarator)
)
(direct-declarator
(Identifier) : $1
;; (lparen declarator rparen)
;; (direct-declarator lbracket rbracket)
;; (direct-declarator lbracket constant-expression rbracket)
;; (lbracket constant-expression rbracket)
;; (direct-declarator lparen parameter-type-list rparen)
(direct-declarator lparen rparen) : $1
;; (direct-declarator lparen identifier-list rparen)
)
;; pointer: STAR { ; }
;; | STAR pointer { ; }
;; | STAR type_qualifier_list { ; }
;; | STAR type_qualifier_list pointer { ; }
;; ;
;; type_qualifier_list: type_qualifier
;; | type_qualifier_list type_qualifier
;; ;
;; parameter_type_list: parameter_list
;; | parameter_list comma ELLIPSIS
;; ;
;; parameter_list: parameter_declaration
;; | parameter_list comma parameter_declaration
;; ;
;; parameter_declaration:
;; declaration_specifiers declarator
;; | declaration_specifiers
;; | declaration_specifiers abstract_declarator
;; ;
;; identifier_list: Identifier
;; | identifier_list comma Identifier
;; | error { yyerror("identifier list error"); }
;; ;
;; initializer: assignment_expression %prec comma
;; | lbrace initializer_list rbrace { ; }
;; | lbrace initializer_list comma rbrace { ; }
;; ;
;; initializer_list: initializer %prec comma
;; | initializer_list comma initializer
;; ;
;; type_name: specifier_qualifier_list
;; | specifier_qualifier_list abstract_declarator
;; ;
;; abstract_declarator: pointer
;; | direct_abstract_declarator
;; | pointer direct_abstract_declarator
;; ;
;; direct_abstract_declarator:
;; lparen abstract_declarator rparen { ; }
;; | lbrace rbrace { ; }
;; | direct_abstract_declarator lbrace rbrace
;; | lbrace constant_expression rbrace { ; }
;; | direct_abstract_declarator lbrace constant_expression rbrace
;; | lparen rparen { ; }
;; | direct_abstract_declarator lparen rparen
;; | lparen parameter_list rparen { ; }
;; | direct_abstract_declarator lparen parameter_list rparen
;; ;
(statement
;;(labeled-statement)
(expression-statement) : $1
(compound-statement) : $1
;;(selection-statement)
;;(iteration-statement)
(jump-statement) : $1
(semicolon) : '()
(error semicolon) : (begin (syntax-error "statement error" @1 $1) '())
(error rbrace) : (begin (syntax-error "statement error" @1 $1) '()))
;; labeled_statement:
;; Identifier COLON statement
;; | CASE x COLON statement { ; }
;; | DEFAULT COLON statement { ; }
;; ;
(expression-statement
(x semicolon) : $1)
(compound-statement
(lbrace rbrace) : '(compound)
(lbrace declaration-list rbrace) : `(compound ,@$2)
(lbrace statement-list rbrace) : `(compound ,@$2)
(lbrace declaration-list statement-list rbrace) : `(compound ,@$2 ,@$3))
(statement-list
(statement) : `(,$1)
(statement-list statement) : `(,@$1 ,@$2))
;; selection_statement:
;; IF lparen x rparen statement { ; }
;; | IF lparen x rparen statement ELSE statement { ; }
;; | SWITCH lparen x rparen statement { ; }
;; ;
;; iteration_statement:
;; WHILE lparen x rparen statement { ; }
;; | DO statement WHILE lparen x rparen semicolon { ; }
;; | FOR lparen forcntrl rparen statement { ; }
;; ;
;; forcntrl: semicolon semicolon { ; }
;; | semicolon semicolon x { ; }
;; | semicolon x semicolon { ; }
;; | semicolon x semicolon x { ; }
;; | x semicolon semicolon
;; | x semicolon semicolon x
;; | x semicolon x semicolon
;; | x semicolon x semicolon x
;; ;
(jump-statement
(goto Identifier semicolon) : `(goto ,$1)
(continue semicolon) : '(continue)
(break semicolon) : '(break)
(return semicolon) : '(return)
(return x semicolon) : `(return ,$2))
(x
(assignment-expression) : $1
(x comma assignment-expression) : `($1 ,@$2))
(assignment-expression
(primary-expression) : $1 ;;(conditional-expression)
(unary-expression assignment-operator assignment-expression) : `(,$2 ,$1 ,$3))
(assignment-operator
(=) : $1)
;; EQ { ; }
;; | PLUSEQ { ; }
;; | MINUSEQ { ; }
;; | MUEQ { ; }
;; | DIVEQ { ; }
;; | MODEQ { ; }
;; | SLEQ { ; }
;; | SREQ { ; }
;; | ANEQ { ; }
;; | OREQ { ; }
;; | XOREQ { ; }
;; ;
;; conditional_expression: logical_or_expression
;; | logical_or_expression IF_THEN x COLON conditional_expression
;; ;
;; constant_expression: conditional_expression
;; ;
;; logical_or_expression: logical_and_expression
;; | logical_or_expression OROR logical_and_expression
;; ;
;; logical_and_expression: inclusive_or_expression
;; | logical_and_expression ANDAND inclusive_or_expression
;; ;
;; inclusive_or_expression: exclusive_or_expression
;; | inclusive_or_expression OR exclusive_or_expression
;; ;
;; exclusive_or_expression: and_expression
;; | exclusive_or_expression XOR and_expression
;; ;
;; and_expression: equality_expression
;; | and_expression AND equality_expression
;; ;
;; equality_expression: relational_expression
;; | equality_expression EQEQ relational_expression
;; | equality_expression NOTEQ relational_expression
;; ;
;; relational_expression: shift_expression
;; | relational_expression LT shift_expression
;; | relational_expression LE shift_expression
;; | relational_expression GT shift_expression
;; | relational_expression GE shift_expression
;; ;
;; shift_expression: additive_expression
;; | shift_expression LTLT additive_expression
;; | shift_expression GTGT additive_expression
;; ;
;; additive_expression: multiplicative_expression
;; | additive_expression PLUS multiplicative_expression
;; | additive_expression MINUS multiplicative_expression
;; ;
;; multiplicative_expression: cast_expression
;; | multiplicative_expression STAR cast_expression
;; | multiplicative_expression DIV cast_expression
;; | multiplicative_expression MOD cast_expression
;; ;
;; cast_expression: unary_expression
;; | lparen type_name rparen cast_expression { ; }
;; ;
(unary-expression
(primary-expression) : $1)
;; unary_expression: postfix_expression
;; | INCOP unary_expression { ; }
;; | DECOP unary_expression { ; }
;; | SIZEOF unary_expression { ; }
;; | SIZEOF lparen type_name rparen %prec SIZEOF { ; }
;; | STAR cast_expression { ; }
;; | AND cast_expression { ; }
;; | MINUS cast_expression { ; }
;; | PLUS cast_expression { ; }
;; | NEG cast_expression { ; }
;; | NOT cast_expression { ; }
;; ;
;; postfix_expression: primary_expression
;; | postfix_expression lbracket x rbracket
;; | postfix_expression lparen rparen
;; | postfix_expression lparen argument_expression_list rparen
;; | postfix_expression FOLLOW Identifier
;; | postfix_expression DOT Identifier
;; | postfix_expression INCOP
;; | postfix_expression DECOP
;; ;
(primary-expression
(Identifier): $1
(NumericLiteral) : $1)
;; primary_expression: Identifier
;; INT_LITERAL
;; CHAR_LITERAL
;; FLOAT_LITERAL
;; STRING_LITERAL
;; lparen x rparen
;;
;; argument_expression_list: assignment_expression
;; | argument_expression_list comma assignment_expression
;; ;
))
(define mescc
(let ((errorp
(lambda args
(for-each display args)
(newline))))
(lambda ()
(c-parser (c-lexer errorp) errorp))))
(display "program: ")
(display (mescc))
(newline)