;;; Prebirth Lisp implemented in Prebirth Lisp (self-hosting) ;;; ;;; Copyright (C) 2017 Mike Gerwitz ;;; ;;; This file is part of Gibble. ;;; ;;; Gibble is free software: you can redistribute it and/or modify ;;; it under the terms of the GNU Affero General Public License as ;;; published by the Free Software Foundation, either version 3 of the ;;; License, or (at your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;;; GNU General Public License for more details. ;;; ;;; You should have received a copy of the GNU Affero General Public License ;;; along with this program. If not, see . ;;; ;;; THIS IS TEMPORARY CODE that will be REWRITTEN IN GIBBLE LISP ITSELF after ;;; a very basic bootstrap is complete. It is retained as an important ;;; artifact for those who wish to build Gibble from scratch without using ;;; another version of Gibble itself. This is called "self-hosting". ;;; ;;; This is the Prebirth Lisp implementation of the JavaScript Prebirth ;;; compiler, found in `prebirth.js'---that compiler can be used to compile ;;; this compiler, which can then be used to compile itself, completing the ;;; bootstrapping process. This process is termed "Birth", and the process ;;; is successful if the output of Birth compiling itself is byte-for-byte ;;; identical to the output of compiling Birth with Prebirth. ;;; ;;; This is largely a 1:1 translation of `prebirth.js'. ;;; ;;; Note that we're dealing with a small subset of Scheme here, so certain ;;; things might be done differently given a proper implementation. See ;;; that file for terminology. ;; pair selection (define (cadr xs) (car (cdr xs))) (define (caadr xs) (car (car (cdr xs)))) (define (caddr xs) (car (cdr (cdr xs)))) (define (cadddr xs) (car (cdr (cdr ( cdr xs))))) ;; for convenience (define (js:match-regexp re s) (js:match (js:regexp re) s)) ;; Convert source input into a string of tokens. ;; ;; This is the lexer. Whitespace is ignored. The grammar consists of ;; simple s-expressions. ;; ;; This procedure is mutually recursive with `token'. It expects that ;; the source SRC will be left-truncated as input is processed. POS exists ;; for producing metadata for error reporting---it has no impact on ;; parsing. ;; ;; The result is a list of tokens. See `token' for the format. (define (lex src pos) (let* ((ws (or (js:match-regexp "^\\s+" src) (list ""))) (ws-len (string-length (car ws))) (trim (substring src ws-len)) ; ignore whitespace, if any (newpos (+ pos ws-len))) (if (string=? "" trim) (list) ; EOF and we're done ;; normally we'd use `string-ref' here, but then we'd have to ;; implement Scheme characters, so let's keep this simple and keep ;; with strings (let ((ch (substring trim 0 1))) (case ch ;; comments extend until the end of the line ((";") (let ((eol (js:match-regexp "^(.*?)(\\n|$)" trim))) (token "comment" (cadr eol) trim newpos))) ;; left and right parenthesis are handled in the same manner: ;; they produce distinct tokens with single-character lexemes (("(") (token "open" ch trim newpos)) ((")") (token "close" ch trim newpos)) ;; strings are delimited by opening and closing ASCII double ;; quotes, which can be escaped with a backslash (("\"") (let ((str (js:match-regexp "^\"(|.*?[^\\\\])\"" trim))) (or str (parse-error src pos "missing closing string delimiter")) ;; a string token consists of the entire string ;; including quotes as its lexeme, but its value will ;; be the value of the string without quotes due to ;; the `str' match group (see `token') (token "string" str trim newpos))) (else ;; anything else is considered a symbol up until whitespace or ;; any of the aforementioned delimiters (let ((symbol (js:match-regexp "^[^\\s()\"]+" trim))) (token "symbol" symbol trim newpos)))))))) ;; Throw an error with a window of surrounding source code. ;; ;; The "window" is simply ten characters to the left and right of the ;; first character of the source input SRC that resulted in the error. ;; It's a little more than useless. (define (parse-error src pos msg) (let ((window (substring src (- pos 10) (+ pos 10)))) (error (string-append msg " (pos " pos "): " window) src))) ;; Produce a token and recurse. ;; ;; The token will be concatenated with the result of the mutually ;; recursive procedure `lex'. ;; ;; For the record: I'm not fond of mutual recursion from a clarity ;; standpoint, but this is how the abstraction evolved to de-duplicate ;; code, and I don't much feel like refactoring it. ;; ;; Unlike the JS Prebirth implementation which uses a key/value object, ;; we're just using a simple list. ;; ;; The expected arguments are: the token type TYPE, the match group or ;; string MATCH, left-truncated source code SRC, and the position POS ;; relative to the original source code. (define (token type match src pos) (let* ((parts (if (list? match) match (list match match))) (lexeme (car parts)) ;; the value is the first group of the match (indicating what we ;; are actually interested in), and the lexeme is the full match, ;; which might include, for example, string delimiters (value (or (and (pair? (cdr parts)) (cadr parts)) lexeme)) (len (string-length lexeme))) ;; produce token and recurse on `lex', left-truncating the source ;; string to discard what we have already processed (cons (list type lexeme value pos) (lex (substring src len) (+ pos len))))) ;; various accessor procedures for token lists (we're Prebirth Lisp here, ;; so no record support or anything fancy!) (define (token-type t) (car t)) (define (token-lexeme t) (cadr t)) (define (token-value t) (caddr t)) (define (token-pos t) (cadddr t)) ;; Produce an AST from the given string SRC of sexps ;; ;; This is essentially the CST with whitespace removed. It first invokes ;; the lexer to produce a token string from the input sexps SRC. From this, ;; it verifies only proper nesting (that SRC does not close sexps too early ;; and that EOF isn't reached before all sexps are closed) and produces an ;; AST that is an isomorphism of the original sexps. (define (parse-lisp src) ;; accessor methods to make you and me less consfused (define (ast-depth ast) (car ast)) (define (ast-tree ast) (cadr ast)) (define (ast-stack ast) (caddr ast)) (define (toks->ast toks) (fold ; then a leftmost reduction on the token string (lambda (token result) (let ((depth (ast-depth result)) (xs (ast-tree result)) (stack (ast-stack result)) (type (token-type token)) (pos (token-pos token))) ;; there are very few token types to deal with (again, this is a ;; very simple bootstrap lisp) (case type ;; ignore comments (("comment") result) ;; when beginning a new expression, place the expression ;; currently being processed onto a stack, allocate a new list, ;; and we'll continue processing into that new list (("open") (list (+ depth 1) (list) (cons xs stack))) ;; once we reach the end of the expression, pop the parent off of ;; the stack and append the new list to it (("close") (if (zero? depth) (parse-error src pos "unexpected closing parenthesis") (list (- depth 1) (append (car stack) (list xs)) (cdr stack)))) ;; strings and symbols (we cheat and just consider everything, ;; including numbers and such, to be symbols) are just copied ;; in place (("string" "symbol") (list depth (append xs (list token)) stack)) ;; we should never encounter anything else unless there's a bug ;; in the tokenizer or we forget a token type above (else (parse-error src pos (string-append "unexpected token `" type "'")))))) (list 0 (list) (list)) ; initial 0 depth; empty tree; expr stack toks)) ;; lex the input SRC and pass it to `toks->ast' to generate the AST; ;; if the depth is non-zero after we're done, then we're unbalanced. (let* ((toks (lex src 0)) (ast (toks->ast toks))) (if (zero? (ast-depth ast)) (ast-tree ast) (error (string-append "unexpected end of input at depth " (ast-depth ast)))))) ;; at this point, this program can parse itself and output a CST (sans ;; whitespace) (js:console (parse-lisp (js:stdin->string)))