From 33ac19d7dfa08120f009930cdf8bc605b992a669 Mon Sep 17 00:00:00 2001 From: Jan Nieuwenhuizen Date: Thu, 22 Dec 2016 20:16:44 +0100 Subject: [PATCH] Support PEG from Guile. * module/mes/peg.mes: New file. * module/mes/peg/cache.scm: New file. * module/mes/peg/codegen.scm: New file. * module/mes/peg/simplify-tree.scm: New file. * module/mes/peg/string-peg.scm: New file. * module/mes/peg/using-parsers.scm: New file. * tests/peg.test: New file. * GNUmakefile (TESTS): Add it. --- AUTHORS | 3 + GNUmakefile | 1 + module/mes/peg.mes | 38 ++++ module/mes/peg/cache.scm | 47 ++++ module/mes/peg/codegen.scm | 358 +++++++++++++++++++++++++++++++ module/mes/peg/simplify-tree.scm | 97 +++++++++ module/mes/peg/string-peg.scm | 273 +++++++++++++++++++++++ module/mes/peg/using-parsers.scm | 115 ++++++++++ tests/peg.test | 72 +++++++ 9 files changed, 1004 insertions(+) create mode 100644 module/mes/peg.mes create mode 100644 module/mes/peg/cache.scm create mode 100644 module/mes/peg/codegen.scm create mode 100644 module/mes/peg/simplify-tree.scm create mode 100644 module/mes/peg/string-peg.scm create mode 100644 module/mes/peg/using-parsers.scm create mode 100755 tests/peg.test diff --git a/AUTHORS b/AUTHORS index be66bc9c..625511e5 100644 --- a/AUTHORS +++ b/AUTHORS @@ -26,6 +26,9 @@ module/mes/psyntax.pp [generated] Optargs from Guile module/mes/optargs.scm +PEG from Guile +module/mes/peg/ + Pmatch from Guile module/mes/pmatch.scm diff --git a/GNUmakefile b/GNUmakefile index 9ca776b5..51d577b0 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -67,6 +67,7 @@ TESTS:=\ tests/let-syntax.test\ tests/record.test\ tests/match.test\ + tests/peg.test\ # BASE-0:=module/mes/base-0.mes diff --git a/module/mes/peg.mes b/module/mes/peg.mes new file mode 100644 index 00000000..ac3ba7bf --- /dev/null +++ b/module/mes/peg.mes @@ -0,0 +1,38 @@ +;;; -*-scheme-*- + +;;; Mes --- Maxwell Equations of Software +;;; Copyright © 2016 Jan Nieuwenhuizen +;;; +;;; This file is part of Mes. +;;; +;;; Mes is free software; you can redistribute it and/or modify it +;;; under the terms of the GNU General Public License as published by +;;; the Free Software Foundation; either version 3 of the License, or (at +;;; your option) any later version. +;;; +;;; Mes is distributed in the hope that it will be useful, but +;;; WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with Mes. If not, see . + +;;; Commentary: + +;;; peg.mes is loaded after syntax-case: psyntax. It provides PEG +;;; from Guile-2.1. + +;;; Code: + +(mes-use-module (mes guile)) +(mes-use-module (mes pretty-print)) +(mes-use-module (mes psyntax)) +(mes-use-module (srfi srfi-13)) +(mes-use-module (srfi srfi-9-psyntax)) +(mes-use-module (mes pmatch)) +(include-from-path "mes/peg/cache.scm") +(include-from-path "mes/peg/codegen.scm") +(include-from-path "mes/peg/string-peg.scm") +(include-from-path "mes/peg/using-parsers.scm") +(include-from-path "mes/peg/simplify-tree.scm") diff --git a/module/mes/peg/cache.scm b/module/mes/peg/cache.scm new file mode 100644 index 00000000..c6e52db2 --- /dev/null +++ b/module/mes/peg/cache.scm @@ -0,0 +1,47 @@ +;;; -*-scheme-*- + +;;;; cache.scm --- cache the results of parsing +;;;; +;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +(define-module (ice-9 peg cache) + #:export (cg-cached-parser)) + +;; The results of parsing using a nonterminal are cached. Think of it like a +;; hash with no conflict resolution. Process for deciding on the cache size +;; wasn't very scientific; just ran the benchmarks and stopped a little after +;; the point of diminishing returns on my box. +(define *cache-size* 512) + +(define (make-cache) + (make-vector *cache-size* #f)) + +;; given a syntax object which is a parser function, returns syntax +;; which, if evaluated, will become a parser function that uses a cache. +(define (cg-cached-parser parser) + #`(let ((cache (make-cache))) + (lambda (str strlen at) + (let* ((vref (vector-ref cache (modulo at *cache-size*)))) + ;; Check to see whether the value is cached. + (if (and vref (eq? (car vref) str) (= (cadr vref) at)) + (caddr vref);; If it is return it. + (let ((fres ;; Else calculate it and cache it. + (#,parser str strlen at))) + (vector-set! cache (modulo at *cache-size*) + (list str at fres)) + fres)))))) diff --git a/module/mes/peg/codegen.scm b/module/mes/peg/codegen.scm new file mode 100644 index 00000000..701c5a81 --- /dev/null +++ b/module/mes/peg/codegen.scm @@ -0,0 +1,358 @@ +;;;; codegen.scm --- code generation for composable parsers +;;;; +;;;; Copyright (C) 2011 Free Software Foundation, Inc. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +(define-module (ice-9 peg codegen) + #:export (compile-peg-pattern wrap-parser-for-users add-peg-compiler!) + #:use-module (ice-9 pretty-print) + #:use-module (system base pmatch)) + +(define-syntax single? + (syntax-rules () + ;;"Return #t if X is a list of one element." + ((_ x) + (pmatch x + ((_) #t) + (else #f))))) + +(define-syntax single-filter + (syntax-rules () + ;;"If EXP is a list of one element, return the element. Otherwise return EXP." + ((_ exp) + (pmatch exp + ((,elt) elt) + (,elts elts))))) + +(define-syntax push-not-null! + (syntax-rules () + ;;"If OBJ is non-null, push it onto LST, otherwise do nothing." + ((_ lst obj) + (if (not (null? obj)) + (push! lst obj))))) + +(define-syntax push! + (syntax-rules () + ;;"Push an object onto a list." + ((_ lst obj) + (set! lst (cons obj lst))))) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;; CODE GENERATORS +;; These functions generate scheme code for parsing PEGs. +;; Conventions: +;; accum: (all name body none) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Code we generate will have a certain return structure depending on how we're +;; accumulating (the ACCUM variable). +(define (cg-generic-ret accum name body-uneval at) + ;; name, body-uneval and at are syntax + #`(let ((body #,body-uneval)) + #,(cond + ((and (eq? accum 'all) name) + #`(list #,at + (cond + ((not (list? body)) (list '#,name body)) + ((null? body) '#,name) + ((symbol? (car body)) (list '#,name body)) + (else (cons '#,name body))))) + ((eq? accum 'name) + #`(list #,at '#,name)) + ((eq? accum 'body) + #`(list #,at + (cond + ((single? body) (car body)) + (else body)))) + ((eq? accum 'none) + #`(list #,at '())) + (else + (begin + (pretty-print `(cg-generic-ret-error ,accum ,name ,body-uneval ,at)) + (pretty-print "Defaulting to accum of none.\n") + #`(list #,at '())))))) + +;; The short name makes the formatting below much easier to read. +(define cggr cg-generic-ret) + +;; Generates code that matches a particular string. +;; E.g.: (cg-string syntax "abc" 'body) +(define (cg-string pat accum) + (let ((plen (string-length pat))) + #`(lambda (str len pos) + (let ((end (+ pos #,plen))) + (and (<= end len) + (string= str #,pat pos end) + #,(case accum + ((all) #`(list end (list 'cg-string #,pat))) + ((name) #`(list end 'cg-string)) + ((body) #`(list end #,pat)) + ((none) #`(list end '())) + (else (error "bad accum" accum)))))))) + +;; Generates code for matching any character. +;; E.g.: (cg-peg-any syntax 'body) +(define (cg-peg-any accum) + #`(lambda (str len pos) + (and (< pos len) + #,(case accum + ((all) #`(list (1+ pos) + (list 'cg-peg-any (substring str pos (1+ pos))))) + ((name) #`(list (1+ pos) 'cg-peg-any)) + ((body) #`(list (1+ pos) (substring str pos (1+ pos)))) + ((none) #`(list (1+ pos) '())) + (else (error "bad accum" accum)))))) + +;; Generates code for matching a range of characters between start and end. +;; E.g.: (cg-range syntax #\a #\z 'body) +(define (cg-range pat accum) + (syntax-case pat () + ((start end) + (if (not (and (char? (syntax->datum #'start)) + (char? (syntax->datum #'end)))) + (error "range PEG should have characters after it; instead got" + #'start #'end)) + #`(lambda (str len pos) + (and (< pos len) + (let ((c (string-ref str pos))) + (and (char>=? c start) + (char<=? c end) + #,(case accum + ((all) #`(list (1+ pos) (list 'cg-range (string c)))) + ((name) #`(list (1+ pos) 'cg-range)) + ((body) #`(list (1+ pos) (string c))) + ((none) #`(list (1+ pos) '())) + (else (error "bad accum" accum)))))))))) + +;; Generate code to match a pattern and do nothing with the result +(define (cg-ignore pat accum) + (syntax-case pat () + ((inner) + (compile-peg-pattern #'inner 'none)))) + +(define (cg-capture pat accum) + (syntax-case pat () + ((inner) + (compile-peg-pattern #'inner 'body)))) + +;; Filters the accum argument to compile-peg-pattern for buildings like string +;; literals (since we don't want to tag them with their name if we're doing an +;; "all" accum). +(define (builtin-accum-filter accum) + (cond + ((eq? accum 'all) 'body) + ((eq? accum 'name) 'name) + ((eq? accum 'body) 'body) + ((eq? accum 'none) 'none))) +(define baf builtin-accum-filter) + +;; Top-level function builder for AND. Reduces to a call to CG-AND-INT. +(define (cg-and clauses accum) + #`(lambda (str len pos) + (let ((body '())) + #,(cg-and-int clauses (baf accum) #'str #'len #'pos #'body)))) + +;; Internal function builder for AND (calls itself). +(define (cg-and-int clauses accum str strlen at body) + (syntax-case clauses () + (() + (cggr accum 'cg-and #`(reverse #,body) at)) + ((first rest ...) + #`(let ((res (#,(compile-peg-pattern #'first accum) #,str #,strlen #,at))) + (and res + ;; update AT and BODY then recurse + (let ((newat (car res)) + (newbody (cadr res))) + (set! #,at newat) + (push-not-null! #,body (single-filter newbody)) + #,(cg-and-int #'(rest ...) accum str strlen at body))))))) + +;; Top-level function builder for OR. Reduces to a call to CG-OR-INT. +(define (cg-or clauses accum) + #`(lambda (str len pos) + #,(cg-or-int clauses (baf accum) #'str #'len #'pos))) + +;; Internal function builder for OR (calls itself). +(define (cg-or-int clauses accum str strlen at) + (syntax-case clauses () + (() + #f) + ((first rest ...) + #`(or (#,(compile-peg-pattern #'first accum) #,str #,strlen #,at) + #,(cg-or-int #'(rest ...) accum str strlen at))))) + +(define (cg-* args accum) + (syntax-case args () + ((pat) + #`(lambda (str strlen at) + (let ((body '())) + (let lp ((end at) (count 0)) + (let* ((match (#,(compile-peg-pattern #'pat (baf accum)) + str strlen end)) + (new-end (if match (car match) end)) + (count (if (> new-end end) (1+ count) count))) + (if (> new-end end) + (push-not-null! body (single-filter (cadr match)))) + (if (and (> new-end end) + #,#t) + (lp new-end count) + (let ((success #,#t)) + #,#`(and success + #,(cggr (baf accum) 'cg-body + #'(reverse body) #'new-end))))))))))) + +(define (cg-+ args accum) + (syntax-case args () + ((pat) + #`(lambda (str strlen at) + (let ((body '())) + (let lp ((end at) (count 0)) + (let* ((match (#,(compile-peg-pattern #'pat (baf accum)) + str strlen end)) + (new-end (if match (car match) end)) + (count (if (> new-end end) (1+ count) count))) + (if (> new-end end) + (push-not-null! body (single-filter (cadr match)))) + (if (and (> new-end end) + #,#t) + (lp new-end count) + (let ((success #,#'(>= count 1))) + #,#`(and success + #,(cggr (baf accum) 'cg-body + #'(reverse body) #'new-end))))))))))) + +(define (cg-? args accum) + (syntax-case args () + ((pat) + #`(lambda (str strlen at) + (let ((body '())) + (let lp ((end at) (count 0)) + (let* ((match (#,(compile-peg-pattern #'pat (baf accum)) + str strlen end)) + (new-end (if match (car match) end)) + (count (if (> new-end end) (1+ count) count))) + (if (> new-end end) + (push-not-null! body (single-filter (cadr match)))) + (if (and (> new-end end) + #,#'(< count 1)) + (lp new-end count) + (let ((success #,#t)) + #,#`(and success + #,(cggr (baf accum) 'cg-body + #'(reverse body) #'new-end))))))))))) + +(define (cg-followed-by args accum) + (syntax-case args () + ((pat) + #`(lambda (str strlen at) + (let ((body '())) + (let lp ((end at) (count 0)) + (let* ((match (#,(compile-peg-pattern #'pat (baf accum)) + str strlen end)) + (new-end (if match (car match) end)) + (count (if (> new-end end) (1+ count) count))) + (if (> new-end end) + (push-not-null! body (single-filter (cadr match)))) + (if (and (> new-end end) + #,#'(< count 1)) + (lp new-end count) + (let ((success #,#'(= count 1))) + #,#`(and success + #,(cggr (baf accum) 'cg-body #''() #'at))))))))))) + +(define (cg-not-followed-by args accum) + (syntax-case args () + ((pat) + #`(lambda (str strlen at) + (let ((body '())) + (let lp ((end at) (count 0)) + (let* ((match (#,(compile-peg-pattern #'pat (baf accum)) + str strlen end)) + (new-end (if match (car match) end)) + (count (if (> new-end end) (1+ count) count))) + (if (> new-end end) + (push-not-null! body (single-filter (cadr match)))) + (if (and (> new-end end) + #,#'(< count 1)) + (lp new-end count) + (let ((success #,#'(= count 1))) + #,#`(if success + #f + #,(cggr (baf accum) 'cg-body #''() #'at))))))))))) + +;; Association list of functions to handle different expressions as PEGs +(define peg-compiler-alist '()) + +(define (add-peg-compiler! symbol function) + (set! peg-compiler-alist + (assq-set! peg-compiler-alist symbol function))) + +(add-peg-compiler! 'range cg-range) +(add-peg-compiler! 'ignore cg-ignore) +(add-peg-compiler! 'capture cg-capture) +(add-peg-compiler! 'and cg-and) +(add-peg-compiler! 'or cg-or) +(add-peg-compiler! '* cg-*) +(add-peg-compiler! '+ cg-+) +(add-peg-compiler! '? cg-?) +(add-peg-compiler! 'followed-by cg-followed-by) +(add-peg-compiler! 'not-followed-by cg-not-followed-by) + +;; Takes an arbitrary expressions and accumulation variable, then parses it. +;; E.g.: (compile-peg-pattern syntax '(and "abc" (or "-" (range #\a #\z))) 'all) +(define (compile-peg-pattern pat accum) + (syntax-case pat (peg-any) + (peg-any + (cg-peg-any (baf accum))) + (sym (identifier? #'sym) ;; nonterminal + #'sym) + (str (string? (syntax->datum #'str)) ;; literal string + (cg-string (syntax->datum #'str) (baf accum))) + ((name . args) (let* ((nm (syntax->datum #'name)) + (entry (assq-ref peg-compiler-alist nm))) + (if entry + (entry #'args accum) + (error "Bad peg form" nm #'args + "Not one of" (map car peg-compiler-alist))))))) + +;; Packages the results of a parser +(define (wrap-parser-for-users for-syntax parser accumsym s-syn) + #`(lambda (str strlen at) + (let ((res (#,parser str strlen at))) + ;; Try to match the nonterminal. + (if res + ;; If we matched, do some post-processing to figure out + ;; what data to propagate upward. + (let ((at (car res)) + (body (cadr res))) + #,(cond + ((eq? accumsym 'name) + #`(list at '#,s-syn)) + ((eq? accumsym 'all) + #`(list (car res) + (cond + ((not (list? body)) + (list '#,s-syn body)) + ((null? body) '#,s-syn) + ((symbol? (car body)) + (list '#,s-syn body)) + (else (cons '#,s-syn body))))) + ((eq? accumsym 'none) #`(list (car res) '())) + (else #`(begin res)))) + ;; If we didn't match, just return false. + #f)))) diff --git a/module/mes/peg/simplify-tree.scm b/module/mes/peg/simplify-tree.scm new file mode 100644 index 00000000..82eb004d --- /dev/null +++ b/module/mes/peg/simplify-tree.scm @@ -0,0 +1,97 @@ +;;;; simplify-tree.scm --- utility functions for the PEG parser +;;;; +;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +(define-module (ice-9 peg simplify-tree) + #:export (keyword-flatten context-flatten string-collapse) + #:use-module (system base pmatch)) + +(define-syntax single? + (syntax-rules () + ;;"Return #t if X is a list of one element." + ((_ x) + (pmatch x + ((_) #t) + (else #f))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;; POST-PROCESSING FUNCTIONS (TO CANONICALIZE MATCH TREES) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Is everything in LST true? +(define (andlst lst) + (or (null? lst) + (and (car lst) (andlst (cdr lst))))) + +;; Is LST a list of strings? +(define (string-list? lst) + (and (list? lst) (not (null? lst)) + (andlst (map string? lst)))) + +;; Groups all strings that are next to each other in LST. Used in +;; STRING-COLLAPSE. +(define (string-group lst) + (if (not (list? lst)) + lst + (if (null? lst) + '() + (let ((next (string-group (cdr lst)))) + (if (not (string? (car lst))) + (cons (car lst) next) + (if (and (not (null? next)) + (list? (car next)) + (string? (caar next))) + (cons (cons (car lst) (car next)) (cdr next)) + (cons (list (car lst)) next))))))) + + +;; Collapses all the string in LST. +;; ("a" "b" (c d) "e" "f") -> ("ab" (c d) "ef") +(define (string-collapse lst) + (if (list? lst) + (let ((res (map (lambda (x) (if (string-list? x) + (apply string-append x) + x)) + (string-group (map string-collapse lst))))) + (if (single? res) (car res) res)) + lst)) + +;; If LST is an atom, return (list LST), else return LST. +(define (mklst lst) + (if (not (list? lst)) (list lst) lst)) + +;; Takes a list and "flattens" it, using the predicate TST to know when to stop +;; instead of terminating on atoms (see tutorial). +(define (context-flatten tst lst) + (if (or (not (list? lst)) (null? lst)) + lst + (if (tst lst) + (list lst) + (apply append + (map (lambda (x) (mklst (context-flatten tst x))) + lst))))) + +;; Takes a list and "flattens" it, using the list of keywords KEYWORD-LST to +;; know when to stop at (see tutorial). +(define (keyword-flatten keyword-lst lst) + (context-flatten + (lambda (x) + (if (or (not (list? x)) (null? x)) + #t + (member (car x) keyword-lst))) + lst)) diff --git a/module/mes/peg/string-peg.scm b/module/mes/peg/string-peg.scm new file mode 100644 index 00000000..45ed14bb --- /dev/null +++ b/module/mes/peg/string-peg.scm @@ -0,0 +1,273 @@ +;;;; string-peg.scm --- representing PEG grammars as strings +;;;; +;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +(define-module (ice-9 peg string-peg) + #:export (peg-as-peg + define-peg-string-patterns + peg-grammar) + #:use-module (ice-9 peg using-parsers) + #:use-module (ice-9 peg codegen) + #:use-module (ice-9 peg simplify-tree)) + +;; Gets the left-hand depth of a list. +(define (depth lst) + (if (or (not (list? lst)) (null? lst)) + 0 + (+ 1 (depth (car lst))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;; Parse string PEGs using sexp PEGs. +;; See the variable PEG-AS-PEG for an easier-to-read syntax. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Grammar for PEGs in PEG grammar. +(define peg-as-peg +"grammar <-- (nonterminal ('<--' / '<-' / '<') sp pattern)+ +pattern <-- alternative (SLASH sp alternative)* +alternative <-- ([!&]? sp suffix)+ +suffix <-- primary ([*+?] sp)* +primary <-- '(' sp pattern ')' sp / '.' sp / literal / charclass / nonterminal !'<' +literal <-- ['] (!['] .)* ['] sp +charclass <-- LB (!']' (CCrange / CCsingle))* RB sp +CCrange <-- . '-' . +CCsingle <-- . +nonterminal <-- [a-zA-Z0-9-]+ sp +sp < [ \t\n]* +SLASH < '/' +LB < '[' +RB < ']' +") + +(define-syntax define-sexp-parser + (lambda (x) + (syntax-case x () + ((_ sym accum pat) + (let* ((matchf (compile-peg-pattern #'pat (syntax->datum #'accum))) + (accumsym (syntax->datum #'accum)) + (syn (wrap-parser-for-users x matchf accumsym #'sym))) + #`(define sym #,syn)))))) + +(define-sexp-parser peg-grammar all + (+ (and peg-nonterminal (or "<--" "<-" "<") peg-sp peg-pattern))) +(define-sexp-parser peg-pattern all + (and peg-alternative + (* (and (ignore "/") peg-sp peg-alternative)))) +(define-sexp-parser peg-alternative all + (+ (and (? (or "!" "&")) peg-sp peg-suffix))) +(define-sexp-parser peg-suffix all + (and peg-primary (* (and (or "*" "+" "?") peg-sp)))) +(define-sexp-parser peg-primary all + (or (and "(" peg-sp peg-pattern ")" peg-sp) + (and "." peg-sp) + peg-literal + peg-charclass + (and peg-nonterminal (not-followed-by "<")))) +(define-sexp-parser peg-literal all + (and "'" (* (and (not-followed-by "'") peg-any)) "'" peg-sp)) +(define-sexp-parser peg-charclass all + (and (ignore "[") + (* (and (not-followed-by "]") + (or charclass-range charclass-single))) + (ignore "]") + peg-sp)) +(define-sexp-parser charclass-range all (and peg-any "-" peg-any)) +(define-sexp-parser charclass-single all peg-any) +(define-sexp-parser peg-nonterminal all + (and (+ (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9) "-")) peg-sp)) +(define-sexp-parser peg-sp none + (* (or " " "\t" "\n"))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;; PARSE STRING PEGS +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Takes a string representing a PEG grammar and returns syntax that +;; will define all of the nonterminals in the grammar with equivalent +;; PEG s-expressions. +(define (peg-parser str for-syntax) + (let ((parsed (match-pattern peg-grammar str))) + (if (not parsed) + (begin + ;; (display "Invalid PEG grammar!\n") + #f) + (let ((lst (peg:tree parsed))) + (cond + ((or (not (list? lst)) (null? lst)) + lst) + ((eq? (car lst) 'peg-grammar) + #`(begin + #,@(map (lambda (x) (peg-nonterm->defn x for-syntax)) + (context-flatten (lambda (lst) (<= (depth lst) 2)) + (cdr lst)))))))))) + +;; Macro wrapper for PEG-PARSER. Parses PEG grammars expressed as strings and +;; defines all the appropriate nonterminals. +(define-syntax define-peg-string-patterns + (lambda (x) + (syntax-case x () + ((_ str) + (peg-parser (syntax->datum #'str) x))))) + +;; lst has format (nonterm grabber pattern), where +;; nonterm is a symbol (the name of the nonterminal), +;; grabber is a string (either "<", "<-" or "<--"), and +;; pattern is the parse of a PEG pattern expressed as as string. +(define (peg-nonterm->defn lst for-syntax) + (let* ((nonterm (car lst)) + (grabber (cadr lst)) + (pattern (caddr lst)) + (nonterm-name (datum->syntax for-syntax + (string->symbol (cadr nonterm))))) + #`(define-peg-pattern #,nonterm-name + #,(cond + ((string=? grabber "<--") (datum->syntax for-syntax 'all)) + ((string=? grabber "<-") (datum->syntax for-syntax 'body)) + (else (datum->syntax for-syntax 'none))) + #,(compressor (peg-pattern->defn pattern for-syntax) for-syntax)))) + +;; lst has format ('peg-pattern ...). +;; After the context-flatten, (cdr lst) has format +;; (('peg-alternative ...) ...), where the outer list is a collection +;; of elements from a '/' alternative. +(define (peg-pattern->defn lst for-syntax) + #`(or #,@(map (lambda (x) (peg-alternative->defn x for-syntax)) + (context-flatten (lambda (x) (eq? (car x) 'peg-alternative)) + (cdr lst))))) + +;; lst has format ('peg-alternative ...). +;; After the context-flatten, (cdr lst) has the format +;; (item ...), where each item has format either ("!" ...), ("&" ...), +;; or ('peg-suffix ...). +(define (peg-alternative->defn lst for-syntax) + #`(and #,@(map (lambda (x) (peg-body->defn x for-syntax)) + (context-flatten (lambda (x) (or (string? (car x)) + (eq? (car x) 'peg-suffix))) + (cdr lst))))) + +;; lst has the format either +;; ("!" ('peg-suffix ...)), ("&" ('peg-suffix ...)), or +;; ('peg-suffix ...). +(define (peg-body->defn lst for-syntax) + (cond + ((equal? (car lst) "&") + #`(followed-by #,(peg-suffix->defn (cadr lst) for-syntax))) + ((equal? (car lst) "!") + #`(not-followed-by #,(peg-suffix->defn (cadr lst) for-syntax))) + ((eq? (car lst) 'peg-suffix) + (peg-suffix->defn lst for-syntax)) + (else `(peg-parse-body-fail ,lst)))) + +;; lst has format ('peg-suffix (? (/ "*" "?" "+"))) +(define (peg-suffix->defn lst for-syntax) + (let ((inner-defn (peg-primary->defn (cadr lst) for-syntax))) + (cond + ((null? (cddr lst)) + inner-defn) + ((equal? (caddr lst) "*") + #`(* #,inner-defn)) + ((equal? (caddr lst) "?") + #`(? #,inner-defn)) + ((equal? (caddr lst) "+") + #`(+ #,inner-defn))))) + +;; Parse a primary. +(define (peg-primary->defn lst for-syntax) + (let ((el (cadr lst))) + (cond + ((list? el) + (cond + ((eq? (car el) 'peg-literal) + (peg-literal->defn el for-syntax)) + ((eq? (car el) 'peg-charclass) + (peg-charclass->defn el for-syntax)) + ((eq? (car el) 'peg-nonterminal) + (datum->syntax for-syntax (string->symbol (cadr el)))))) + ((string? el) + (cond + ((equal? el "(") + (peg-pattern->defn (caddr lst) for-syntax)) + ((equal? el ".") + (datum->syntax for-syntax 'peg-any)) + (else (datum->syntax for-syntax + `(peg-parse-any unknown-string ,lst))))) + (else (datum->syntax for-syntax + `(peg-parse-any unknown-el ,lst)))))) + +;; Trims characters off the front and end of STR. +;; (trim-1chars "'ab'") -> "ab" +(define (trim-1chars str) (substring str 1 (- (string-length str) 1))) + +;; Parses a literal. +(define (peg-literal->defn lst for-syntax) + (datum->syntax for-syntax (trim-1chars (cadr lst)))) + +;; Parses a charclass. +(define (peg-charclass->defn lst for-syntax) + #`(or + #,@(map + (lambda (cc) + (cond + ((eq? (car cc) 'charclass-range) + #`(range #,(datum->syntax + for-syntax + (string-ref (cadr cc) 0)) + #,(datum->syntax + for-syntax + (string-ref (cadr cc) 2)))) + ((eq? (car cc) 'charclass-single) + (datum->syntax for-syntax (cadr cc))))) + (context-flatten + (lambda (x) (or (eq? (car x) 'charclass-range) + (eq? (car x) 'charclass-single))) + (cdr lst))))) + +;; Compresses a list to save the optimizer work. +;; e.g. (or (and a)) -> a +(define (compressor-core lst) + (if (or (not (list? lst)) (null? lst)) + lst + (cond + ((and (or (eq? (car lst) 'or) (eq? (car lst) 'and)) + (null? (cddr lst))) + (compressor-core (cadr lst))) + ((and (eq? (car lst) 'body) + (eq? (cadr lst) 'lit) + (eq? (cadddr lst) 1)) + (compressor-core (caddr lst))) + (else (map compressor-core lst))))) + +(define (compressor syn for-syntax) + (datum->syntax for-syntax + (compressor-core (syntax->datum syn)))) + +;; Builds a lambda-expressions for the pattern STR using accum. +(define (peg-string-compile args accum) + (syntax-case args () + ((str-stx) (string? (syntax->datum #'str-stx)) + (let ((string (syntax->datum #'str-stx))) + (compile-peg-pattern + (compressor + (peg-pattern->defn + (peg:tree (match-pattern peg-pattern string)) #'str-stx) + #'str-stx) + (if (eq? accum 'all) 'body accum)))) + (else (error "Bad embedded PEG string" args)))) + +(add-peg-compiler! 'peg peg-string-compile) + diff --git a/module/mes/peg/using-parsers.scm b/module/mes/peg/using-parsers.scm new file mode 100644 index 00000000..d1a9382b --- /dev/null +++ b/module/mes/peg/using-parsers.scm @@ -0,0 +1,115 @@ +;;;; using-parsers.scm --- utilities to make using parsers easier +;;;; +;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +(define-module (ice-9 peg using-parsers) + #:use-module (ice-9 peg simplify-tree) + #:use-module (ice-9 peg codegen) + #:use-module (ice-9 peg cache) + #:export (match-pattern define-peg-pattern search-for-pattern + prec make-prec peg:start peg:end peg:string + peg:tree peg:substring peg-record?)) + +;;; +;;; Helper Macros +;;; + +(define-syntax until + (syntax-rules () + ;;"Evaluate TEST. If it is true, return its value. Otherwise,execute the STMTs and try again." + ((_ test stmt stmt* ...) + (let lp () + (or test + (begin stmt stmt* ... (lp))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;; FOR DEFINING AND USING NONTERMINALS +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Parses STRING using NONTERM +(define (match-pattern nonterm string) + ;; We copy the string before using it because it might have been modified + ;; in-place since the last time it was parsed, which would invalidate the + ;; cache. Guile uses copy-on-write for strings, so this is fast. + (let ((res (nonterm (string-copy string) (string-length string) 0))) + (if (not res) + #f + (make-prec 0 (car res) string (string-collapse (cadr res)))))) + +;; Defines a new nonterminal symbol accumulating with ACCUM. +(define-syntax define-peg-pattern + (lambda (x) + (syntax-case x () + ((_ sym accum pat) + (let ((matchf (compile-peg-pattern #'pat (syntax->datum #'accum))) + (accumsym (syntax->datum #'accum))) + ;; CODE is the code to parse the string if the result isn't cached. + (let ((syn (wrap-parser-for-users x matchf accumsym #'sym))) + #`(define sym #,(cg-cached-parser syn)))))))) + +(define (peg-like->peg pat) + (syntax-case pat () + (str (string? (syntax->datum #'str)) #'(peg str)) + (else pat))) + +;; Searches through STRING for something that parses to PEG-MATCHER. Think +;; regexp search. +(define-syntax search-for-pattern + (lambda (x) + (syntax-case x () + ((_ pattern string-uncopied) + (let ((pmsym (syntax->datum #'pattern))) + (let ((matcher (compile-peg-pattern (peg-like->peg #'pattern) 'body))) + ;; We copy the string before using it because it might have been + ;; modified in-place since the last time it was parsed, which would + ;; invalidate the cache. Guile uses copy-on-write for strings, so + ;; this is fast. + #`(let ((string (string-copy string-uncopied)) + (strlen (string-length string-uncopied)) + (at 0)) + (let ((ret (until (or (>= at strlen) + (#,matcher string strlen at)) + (set! at (+ at 1))))) + (if (eq? ret #t) ;; (>= at strlen) succeeded + #f + (let ((end (car ret)) + (match (cadr ret))) + (make-prec + at end string + (string-collapse match)))))))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;; PMATCH STRUCTURE MUNGING +;; Pretty self-explanatory. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define prec + (make-record-type "peg" '(start end string tree))) +(define make-prec + (record-constructor prec '(start end string tree))) +(define (peg:start pm) + (if pm ((record-accessor prec 'start) pm) #f)) +(define (peg:end pm) + (if pm ((record-accessor prec 'end) pm) #f)) +(define (peg:string pm) + (if pm ((record-accessor prec 'string) pm) #f)) +(define (peg:tree pm) + (if pm ((record-accessor prec 'tree) pm) #f)) +(define (peg:substring pm) + (if pm (substring (peg:string pm) (peg:start pm) (peg:end pm)) #f)) +(define peg-record? (record-predicate prec)) diff --git a/tests/peg.test b/tests/peg.test new file mode 100755 index 00000000..bff07779 --- /dev/null +++ b/tests/peg.test @@ -0,0 +1,72 @@ +#! /bin/sh +# -*-scheme-*- +MES_ARENA=${MES_ARENA-10000000} +export MES_ARENA +echo ' ()' | cat $(dirname $0)/../module/mes/base-0.mes $0 /dev/stdin | $(dirname $0)/../scripts/mes $MES_FLAGS $MES_FLAGS"$@" +#paredit:|| +exit $? +!# + +;;; -*-scheme-*- + +;;; Mes --- Maxwell Equations of Software +;;; Copyright © 2016 Jan Nieuwenhuizen +;;; +;;; This file is part of Mes. +;;; +;;; Mes is free software; you can redistribute it and/or modify it +;;; under the terms of the GNU General Public License as published by +;;; the Free Software Foundation; either version 3 of the License, or (at +;;; your option) any later version. +;;; +;;; Mes is distributed in the hope that it will be useful, but +;;; WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with Mes. If not, see . + + +(cond-expand + (guile-2.2 + (use-modules (ice-9 peg))) + (guile + (use-modules (ice-9 syncase)) + (display "guile 2.0: no PEG\n" (current-error-port)) + (exit 0)) + (mes + (mes-use-module (mes peg)) + (mes-use-module (mes test)))) + +(pass-if "first dummy" #t) +(pass-if-not "second dummy" #f) + +(define *etc-passwd* + "root:x:0:0:root:/root:/bin/bash +daemon:x:1:1:daemon:/usr/sbin:/bin/sh +bin:x:2:2:bin:/bin:/bin/sh +sys:x:3:3:sys:/dev:/bin/sh +nobody:x:65534:65534:nobody:/nonexistent:/bin/sh +messagebus:x:103:107::/var/run/dbus:/bin/false") + +(define-peg-string-patterns + "string-passwd <- entry* !. +entry <-- (! NL .)* NL* +NL < '\n'") + +(pass-if-equal "peg-tree" + (map (lambda (x) (list 'entry x)) (string-split *etc-passwd* #\newline)) + (peg:tree (match-pattern string-passwd *etc-passwd*))) + +(define-peg-pattern passwd body (and (* entry) (not-followed-by peg-any))) +(define-peg-pattern entry all (and (* (and (not-followed-by NL) peg-any)) + (* NL))) +(define-peg-pattern NL none "\n") +(define-peg-pattern passwd body (peg "entry* !.")) + +(pass-if-equal "peg-tree" + (map (lambda (x) (list 'entry x)) (string-split *etc-passwd* #\newline)) + (peg:tree (match-pattern passwd *etc-passwd*))) + +(result 'report)