2017-08-31 01:01:06 -04:00
|
|
|
|
;;; Prebirth Lisp implemented in Prebirth Lisp (self-hosting)
|
|
|
|
|
;;;
|
|
|
|
|
;;; Copyright (C) 2017 Mike Gerwitz
|
|
|
|
|
;;;
|
|
|
|
|
;;; This file is part of Gibble.
|
|
|
|
|
;;;
|
|
|
|
|
;;; Gibble is free software: you can redistribute it and/or modify
|
|
|
|
|
;;; it under the terms of the GNU Affero General Public License as
|
|
|
|
|
;;; published by the Free Software Foundation, either version 3 of the
|
|
|
|
|
;;; License, or (at your option) any later version.
|
|
|
|
|
;;;
|
|
|
|
|
;;; This program is distributed in the hope that it will be useful,
|
|
|
|
|
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
;;; GNU General Public License for more details.
|
|
|
|
|
;;;
|
|
|
|
|
;;; You should have received a copy of the GNU Affero General Public License
|
|
|
|
|
;;; along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
;;;
|
|
|
|
|
;;; THIS IS TEMPORARY CODE that will be REWRITTEN IN GIBBLE LISP ITSELF after
|
|
|
|
|
;;; a very basic bootstrap is complete. It is retained as an important
|
|
|
|
|
;;; artifact for those who wish to build Gibble from scratch without using
|
|
|
|
|
;;; another version of Gibble itself. This is called "self-hosting".
|
|
|
|
|
;;;
|
|
|
|
|
;;; This is the Prebirth Lisp implementation of the JavaScript Prebirth
|
|
|
|
|
;;; compiler, found in `prebirth.js'---that compiler can be used to compile
|
|
|
|
|
;;; this compiler, which can then be used to compile itself, completing the
|
2017-09-02 01:30:13 -04:00
|
|
|
|
;;; bootstrapping process. This process is termed "Birth", and the process
|
|
|
|
|
;;; is successful if the output of Birth compiling itself is byte-for-byte
|
|
|
|
|
;;; identical to the output of compiling Birth with Prebirth.
|
2017-08-31 01:01:06 -04:00
|
|
|
|
;;;
|
2017-09-02 01:30:13 -04:00
|
|
|
|
;;; This is largely a 1:1 translation of `prebirth.js'.
|
|
|
|
|
;;;
|
|
|
|
|
;;; Note that we're dealing with a small subset of Scheme here, so certain
|
|
|
|
|
;;; things might be done differently given a proper implementation. See
|
|
|
|
|
;;; that file for terminology.
|
2017-08-31 01:01:06 -04:00
|
|
|
|
|
|
|
|
|
;; pair selection
(define (cadr xs)
  ;; second element of XS
  (let ((rest (cdr xs)))
    (car rest)))
|
|
|
|
|
(define (caadr xs)
  ;; first element of the second element of XS
  (let ((second (car (cdr xs))))
    (car second)))
|
|
|
|
|
(define (caddr xs)
  ;; third element of XS
  (let ((past-two (cdr (cdr xs))))
    (car past-two)))
|
2017-09-02 01:30:13 -04:00
|
|
|
|
;; fourth element of XS
;;
;; (stray space in `( cdr' normalized to match the sibling selectors)
(define (cadddr xs)
  (car (cdr (cdr (cdr xs)))))
|
2017-08-31 01:01:06 -04:00
|
|
|
|
|
2017-09-02 01:30:13 -04:00
|
|
|
|
;; for convenience
;;
;; Match the regular expression RE (given as a string) against string S,
;; delegating to the host JS `js:regexp' and `js:match' primitives.
;; NOTE(review): the result appears to be a match list or a false value
;; (callers guard it with `or') --- confirm against the primitives'
;; definitions.
(define (js:match-regexp re s)
  (js:match (js:regexp re) s))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;; Convert source input into a string of tokens.
;;
;; This is the lexer.  Whitespace is ignored.  The grammar consists of
;; simple s-expressions.
;;
;; This procedure is mutually recursive with `token'.  It expects that
;; the source SRC will be left-truncated as input is processed.  POS exists
;; for producing metadata for error reporting---it has no impact on
;; parsing.
;;
;; The result is a list of tokens.  See `token' for the format.
(define (lex src pos)
  ;; default to a list holding the empty string when there is no leading
  ;; whitespace, so that `ws-len' is 0 and SRC is left untouched
  (let* ((ws (or (js:match-regexp "^\\s+"
                                  src)
                 (list "")))
         (ws-len (string-length (car ws)))
         (trim (substring src ws-len))  ; ignore whitespace, if any
         (newpos (+ pos ws-len)))

    (if (string=? "" trim)
        (list)  ; EOF and we're done

        ;; normally we'd use `string-ref' here, but then we'd have to
        ;; implement Scheme characters, so let's keep this simple and keep
        ;; with strings
        (let ((ch (substring trim 0 1)))
          (case ch
            ;; comments extend until the end of the line
            ((";") (let ((eol (js:match-regexp "^(.*?)(\\n|$)" trim)))
                     (token "comment" (cadr eol) trim newpos)))

            ;; left and right parenthesis are handled in the same manner:
            ;; they produce distinct tokens with single-character lexemes
            (("(") (token "open" ch trim newpos))
            ((")") (token "close" ch trim newpos))

            ;; strings are delimited by opening and closing ASCII double
            ;; quotes, which can be escaped with a backslash
            (("\"") (let ((str (js:match-regexp "^\"(|.*?[^\\\\])\""
                                                trim)))
                      ;; a false match means the delimiter was never closed
                      (or str (parse-error
                               src pos "missing closing string delimiter"))

                      ;; a string token consists of the entire string
                      ;; including quotes as its lexeme, but its value will
                      ;; be the value of the string without quotes due to
                      ;; the `str' match group (see `token')
                      (token "string" str trim newpos)))

            (else
             ;; anything else is considered a symbol up until whitespace or
             ;; any of the aforementioned delimiters
             (let ((symbol (js:match-regexp "^[^\\s()\"]+"
                                            trim)))
               (token "symbol" symbol trim newpos))))))))
|
|
|
|
|
|
|
|
|
|
|
2017-09-02 01:30:13 -04:00
|
|
|
|
|
2017-08-31 12:44:41 -04:00
|
|
|
|
;; Throw an error with a window of surrounding source code.
;;
;; The "window" is simply ten characters to the left and right of the
;; first character of the source input SRC that resulted in the error.
;; It's a little more than useless.
(define (parse-error src pos msg)
  ;; NOTE(review): (- pos 10) can be negative near the start of input;
  ;; this dialect compiles to JS, whose `String#substring' clamps
  ;; negative indices to 0 --- confirm if this is ever ported to a host
  ;; where `substring' errors on negative indices
  (let ((window (substring src (- pos 10) (+ pos 10))))
    ;; note that `string-append' here relies on the host's implicit
    ;; number-to-string coercion of POS
    (error (string-append msg " (pos " pos "): " window)
           src)))
|
|
|
|
|
|
|
|
|
|
|
2017-09-02 01:30:13 -04:00
|
|
|
|
|
2017-08-31 01:01:06 -04:00
|
|
|
|
;; Produce a token and recurse.
;;
;; The token will be concatenated with the result of the mutually
;; recursive procedure `lex'.
;;
;; For the record: I'm not fond of mutual recursion from a clarity
;; standpoint, but this is how the abstraction evolved to de-duplicate
;; code, and I don't much feel like refactoring it.
;;
;; Unlike the JS Prebirth implementation which uses a key/value object,
;; we're just using a simple list.
;;
;; The expected arguments are: the token type TYPE, the match group or
;; string MATCH, left-truncated source code SRC, and the position POS
;; relative to the original source code.
;;
;; Each produced token has the form (TYPE LEXEME VALUE POS).
(define (token type match src pos)
  ;; MATCH is either a regexp match list or a plain string; a plain
  ;; string is normalized into a (lexeme value) pair with itself
  (let* ((parts (if (list? match) match (list match match)))
         (lexeme (car parts))
         ;; the value is the first group of the match (indicating what we
         ;; are actually interested in), and the lexeme is the full match,
         ;; which might include, for example, string delimiters
         (value (or (and (pair? (cdr parts))
                         (cadr parts))
                    lexeme))
         (len (string-length lexeme)))

    ;; produce token and recurse on `lex', left-truncating the source
    ;; string to discard what we have already processed
    (cons (list type lexeme value pos)
          (lex (substring src len)
               (+ pos len)))))
|
|
|
|
|
|
|
|
|
|
|
2017-09-02 01:30:13 -04:00
|
|
|
|
;; various accessor procedures for token lists (we're Prebirth Lisp here,
;; so no record support or anything fancy!)
;;
;; a token is a list of the form (type lexeme value pos); see `token'
(define (token-type t) (car t))
|
|
|
|
|
;; lexeme is the second item of the token list (see `token')
(define (token-lexeme t)
  (car (cdr t)))
|
|
|
|
|
;; value is the third item of the token list (see `token')
(define (token-value t)
  (car (cdr (cdr t))))
|
|
|
|
|
;; position is the fourth item of the token list (see `token')
(define (token-pos t)
  (car (cdr (cdr (cdr t)))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;; Produce an AST from the given string SRC of sexps
;;
;; This is essentially the CST with whitespace removed.  It first invokes
;; the lexer to produce a token string from the input sexps SRC.  From this,
;; it verifies only proper nesting (that SRC does not close sexps too early
;; and that EOF isn't reached before all sexps are closed) and produces an
;; AST that is an isomorphism of the original sexps.
(define (parse-lisp src)
  ;; accessor methods to make you and me less consfused
  ;; (intermediate results are (depth tree stack) triples)
  (define (ast-depth ast) (car ast))
  (define (ast-tree ast) (cadr ast))
  (define (ast-stack ast) (caddr ast))

  ;; reduce the token string into a (depth tree stack) triple, where
  ;; DEPTH tracks paren nesting, TREE is the list under construction,
  ;; and STACK holds the enclosing lists still being built
  (define (toks->ast toks)
    (fold  ; then a leftmost reduction on the token string
     (lambda (token result)
       (let ((depth (ast-depth result))
             (xs (ast-tree result))
             (stack (ast-stack result))
             (type (token-type token))
             (pos (token-pos token)))

         ;; there are very few token types to deal with (again, this is a
         ;; very simple bootstrap lisp)
         (case type
           ;; ignore comments
           (("comment") result)

           ;; when beginning a new expression, place the expression
           ;; currently being processed onto a stack, allocate a new list,
           ;; and we'll continue processing into that new list
           (("open") (list (+ depth 1)
                           (list)
                           (cons xs stack)))

           ;; once we reach the end of the expression, pop the parent off of
           ;; the stack and append the new list to it
           (("close") (if (zero? depth)
                          (parse-error src pos
                                       "unexpected closing parenthesis")
                          (list (- depth 1)
                                (append (car stack) (list xs))
                                (cdr stack))))

           ;; strings and symbols (we cheat and just consider everything,
           ;; including numbers and such, to be symbols) are just copied
           ;; in place
           (("string" "symbol") (list depth
                                      (append xs (list token))
                                      stack))

           ;; we should never encounter anything else unless there's a bug
           ;; in the tokenizer or we forget a token type above
           (else (parse-error
                  src pos (string-append
                           "unexpected token `" type "'"))))))
     (list 0 (list) (list))  ; initial 0 depth; empty tree; expr stack
     toks))

  ;; lex the input SRC and pass it to `toks->ast' to generate the AST;
  ;; if the depth is non-zero after we're done, then we're unbalanced.
  (let* ((toks (lex src 0))
         (ast (toks->ast toks)))
    (if (zero? (ast-depth ast))
        (ast-tree ast)
        (error (string-append
                "unexpected end of input at depth "
                (ast-depth ast))))))
|
|
|
|
|
|
|
|
|
|
|
2017-08-31 01:01:06 -04:00
|
|
|
|
;; at this point, this program can parse itself and output a CST (sans
;; whitespace)
;;
;; entry point: read all of standard input, parse it, and dump the
;; resulting tree via the host JS console
(js:console
  (parse-lisp (js:stdin->string)))
|