From 1810fb7c384e0ea4bd4987c45238bbf05d71a3c8 Mon Sep 17 00:00:00 2001 From: Mike Gerwitz Date: Sun, 12 Nov 2017 00:37:09 -0500 Subject: [PATCH] rebirth: Copy from birth.scm See diff for comments as to why this fork exists. build-aux/bootstrap/rebirth.scm: New file, copied from `birth.scm'. --- build-aux/bootstrap/rebirth.scm | 556 ++++++++++++++++++++++++++++++++ 1 file changed, 556 insertions(+) create mode 100644 build-aux/bootstrap/rebirth.scm diff --git a/build-aux/bootstrap/rebirth.scm b/build-aux/bootstrap/rebirth.scm new file mode 100644 index 0000000..f4a8011 --- /dev/null +++ b/build-aux/bootstrap/rebirth.scm @@ -0,0 +1,556 @@ +;;; Rebirth Lisp implemented in Birth Lisp (self-hosting) +;;; +;;; Copyright (C) 2017 Mike Gerwitz +;;; +;;; This file is part of Gibble. +;;; +;;; Gibble is free software: you can redistribute it and/or modify +;;; it under the terms of the GNU Affero General Public License as +;;; published by the Free Software Foundation, either version 3 of the +;;; License, or (at your option) any later version. +;;; +;;; This program is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details. +;;; +;;; You should have received a copy of the GNU Affero General Public License +;;; along with this program. If not, see . +;;; +;;; THIS IS TEMPORARY CODE that will be REWRITTEN IN GIBBLE LISP ITSELF after +;;; a very basic bootstrap is complete. It is retained as an important +;;; artifact for those who wish to build Gibble from scratch without using +;;; another version of Gibble itself. This is called "self-hosting". +;;; +;;; This is the compiler for Rebirth Lisp---it builds off of Birth by +;;; first eliminating the need for libprebirth; this allows _all_ +;;; development to happen in a Lisp dialect, which liberates the last +;;; remaining process that isn't technically self-hosted. So, Rebirth +;;; completes the raw, self-hosting bootstrapping process. +;;; +;;; Of course, bootstrapping can't end there: we need a fully functioning +;;; Scheme compiler. So steps will follow as development continues. +;;; +;;; Note that we're dealing with a small subset of Scheme here, so certain +;;; things might be done differently given a proper implementation. +;;; +;;; This is an exact copy of `birth.csm', modified to introduce additional +;;; features. This is important, since Birth is a 1:1 translation of the +;;; Prebirth compiler and needs to stay that way. This fork allows us to +;;; vary as much as we want from the initial implementation. See the commit +;;; history for this file for more information as to how it evolved (the +;;; first commit is the direct copy before actual code changes). + +;; pair selection +(define (cadr xs) + (car (cdr xs))) +(define (caadr xs) + (car (car (cdr xs)))) +(define (caddr xs) + (car (cdr (cdr xs)))) +(define (cadddr xs) + (car (cdr (cdr (cdr xs))))) +(define (caddddr xs) + (car (cdr (cdr (cdr (cdr xs)))))) +(define (cddr xs) + (cdr (cdr xs))) + +(define (not x) + (if x #f #t)) + +;; for convenience +(define (js:match-regexp re s) + (js:match (js:regexp re) s)) + + + +;; Convert source input into a string of tokens. +;; +;; This is the lexer. Whitespace is ignored. The grammar consists of +;; simple s-expressions. +;; +;; Tokens are produced with `make-token'. The source SRC will be +;; left-truncated as input is processed. POS exists for producing metadata +;; for error reporting---it has no impact on parsing. +;; +;; This implementation was originally recursive and immutable, but the stack +;; was being exhausted, so it was refactored into an inferior +;; implementation. Note the use of `js:while' and `js:break'---these are +;; quick fixes to the problem of stack exhaustion in browsers (where we have +;; no control over the stack limit); proper tail call support will come +;; later when we have a decent architecture in place. +;; +;; The result is a list of tokens. See `token' for the format. +(define (lex src pos) + (let ((toks (list))) + (js:while #t ; browser stack workaround + (let* ((ws (or (js:match-regexp "^\\s+" + src) + (list ""))) + (ws-len (string-length (car ws))) + (trim (substring src ws-len)) ; ignore whitespace, if any + (newpos (+ pos ws-len))) ; adj pos to account for removed ws + + (if (string=? "" trim) + (js:break) ; EOF and we're done + + ;; normally we'd use `string-ref' here, but then we'd have to + ;; implement Scheme characters, so let's keep this simple and keep + ;; with strings + (let* ((ch (substring trim 0 1)) + (t (case ch + ;; comments extend until the end of the line + ((";") (let ((eol (js:match-regexp "^(.*?)(\\n|$)" trim))) + (make-token "comment" (cadr eol) trim newpos))) + + ;; left and right parenthesis are handled in the same + ;; manner: they produce distinct tokens with + ;; single-character lexemes + (("(") (make-token "open" ch trim newpos)) + ((")") (make-token "close" ch trim newpos)) + + ;; strings are delimited by opening and closing ASCII + ;; double quotes, which can be escaped with a + ;; backslash + (("\"") (let ((str (js:match-regexp "^\"(|.*?[^\\\\])\"" + trim))) + (or str (parse-error + src pos + "missing closing string delimiter")) + ;; a string token consists of the entire + ;; string including quotes as its lexeme, + ;; but its value will be the value of the + ;; string without quotes due to the `str' + ;; match group (see `token') + (make-token "string" str trim newpos))) + + (else + ;; anything else is considered a symbol up until + ;; whitespace or any of the aforementioned + ;; delimiters + (let ((symbol (js:match-regexp "^[^\\s()\"]+" + trim))) + (make-token "symbol" symbol trim newpos)))))) + + ;; yikes---see notes in docblock with regards to why + ;; we're using mutators here + (set! toks (append toks (list (car t)))) + (set! src (cadr t)) + (set! pos (caddr t)))))) + toks)) + + + +;; Throw an error with a window of surrounding source code. +;; +;; The "window" is simply ten characters to the left and right of the +;; first character of the source input SRC that resulted in the error. +;; It's a little more than useless. +(define (parse-error src pos msg) + (let ((window (substring src (- pos 10) (+ pos 10)))) + (error (string-append msg " (pos " pos "): " window) + src))) + + + +;; Produce a token, left-truncate src, and update pos. +;; +;; Unlike the JS Prebirth implementation which uses a key/value object, +;; we're just using a simple list. +;; +;; The expected arguments are: the token type TYPE, the match group or +;; string MATCH, left-truncated source code SRC, and the position POS +;; relative to the original source code. +(define (make-token type match src pos) + (let* ((parts (if (list? match) match (list match match))) + (lexeme (car parts)) + ;; the value is the first group of the match (indicating what we + ;; are actually interested in), and the lexeme is the full match, + ;; which might include, for example, string delimiters + (value (or (and (pair? (cdr parts)) + (cadr parts)) + lexeme)) + (len (string-length lexeme))) + + ;; produce token and recurse on `lex', left-truncating the source + ;; string to discard what we have already processed + (list (list (quote token) type lexeme value pos) + (substring src len) + (+ pos len)))) + + +;; various accessor procedures for token lists (we're Prebirth Lisp here, +;; so no record support or anything fancy!) +(define (token? t) (and (pair? t) (symbol=? (quote token) (car t)))) +(define (token-type t) (cadr t)) +(define (token-lexeme t) (caddr t)) +(define (token-value t) (cadddr t)) +(define (token-pos t) (caddddr t)) + + + +;; Produce an AST from the given string SRC of sexps +;; +;; This is essentially the CST with whitespace removed. It first invokes +;; the lexer to produce a token string from the input sexps SRC. From this, +;; it verifies only proper nesting (that SRC does not close sexps too early +;; and that EOF isn't reached before all sexps are closed) and produces an +;; AST that is an isomorphism of the original sexps. +(define (parse-lisp src) + ;; accessor methods to make you and me less consfused + (define (ast-depth ast) (car ast)) + (define (ast-tree ast) (cadr ast)) + (define (ast-stack ast) (caddr ast)) + + ;; perform a leftmost reduction on the token string + (define (toks->ast toks) + (fold + (lambda (token result) + (let ((depth (ast-depth result)) + (xs (ast-tree result)) + (stack (ast-stack result)) + (type (token-type token)) + (pos (token-pos token))) + + ;; there are very few token types to deal with (again, this is a + ;; very simple bootstrap lisp) + (case type + ;; ignore comments + (("comment") result) + + ;; when beginning a new expression, place the expression + ;; currently being processed onto a stack, allocate a new list, + ;; and we'll continue processing into that new list + (("open") (list (+ depth 1) + (list) + (cons xs stack))) + + ;; once we reach the end of the expression, pop the parent off of + ;; the stack and append the new list to it + (("close") (if (zero? depth) + (parse-error src pos + "unexpected closing parenthesis") + (list (- depth 1) + (append (car stack) (list xs)) + (cdr stack)))) + + ;; strings and symbols (we cheat and just consider everything, + ;; including numbers and such, to be symbols) are just copied + ;; in place + (("string" "symbol") (list depth + (append xs (list token)) + stack)) + + ;; we should never encounter anything else unless there's a bug + ;; in the tokenizer or we forget a token type above + (else (parse-error + src pos (string-append + "unexpected token `" type "'")))))) + (list 0 (list) (list)) ; initial 0 depth; empty tree; expr stack + toks)) + + + ;; lex the input SRC and pass it to `toks->ast' to generate the AST; + ;; if the depth is non-zero after we're done, then we're unbalanced. + (let* ((toks (lex src 0)) + (ast (toks->ast toks))) + (if (zero? (ast-depth ast)) + (ast-tree ast) + ;; if we terminate at a non-zero depth, that means there ar still + ;; open sexps + (error (string-append + "unexpected end of input at depth " + (ast-depth ast)))))) + + +;; Compile Prebirth Lisp AST into ECMAScript. +;; +;; The AST can be generated with `parse-lisp'. +(define (prebirth->ecmascript ast) + ;; Generate ECMAScript-friendly name from the given id. + ;; + ;; A subset of special characters that are acceptable in Scheme are + ;; converted in an identifiable manner; others are simply converted to `$' + ;; in a catch-all and therefore could result in conflicts and cannot be + ;; reliably distinguished from one-another. Remember: this is temporary + ;; code. + (define (tname->id name) + (if (js:match (js:regexp "^\\d+$") name) + name + (string-append + "$$" (js:replace (js:regexp "[^a-zA-Z0-9_]" "g") + (lambda (c) + (case c + (("-") "$_$") + (("?") "$7$") + (("@") "$a$") + (("!") "$b$") + ((">") "$g$") + (("#") "$h$") + (("*") "$k$") + (("<") "$l$") + (("&") "$n$") + (("%") "$o$") + (("+") "$p$") + (("=") "$q$") + (("^") "$v$") + (("/") "$w$") + (("$") "$$") + (else "$"))) + name)))) + + ;; Join a list of strings XS on a delimiter DELIM + (define (join delim xs) + (if (pair? xs) + (fold (lambda (x str) + (string-append str delim x)) + (car xs) + (cdr xs)) + "")) + + + ;; Compile parameter list. + ;; + ;; This simply takes the value of the symbol and outputs it (formatted), + ;; delimited by commas. + (define (params->es params) + (join ", " (map (lambda (t) + (tname->id (token-value t))) + params))) + + + ;; Compile body s-expressions into ECMAScript + ;; + ;; This produces a 1:1 mapping of body XS s-expressions to ES statements, + ;; recursively. The heavy lifting is done by `sexp->es'. + (define (body->es xs ret) + ;; recursively process body XS until we're out of pairs + (if (not (pair? xs)) + "" + (let* ((x (car xs)) + (rest (cdr xs)) + (more? (or (not ret) (pair? rest)))) + ;; the result is a semicolon-delimited string of statements, with + ;; the final statement prefixed with `return' unless (not ret) + (string-append + " " + (if more? "" "return ") ; prefix with `return' if last body exp + (sexp->es x) ";" ; process current body expression + (if (pair? rest) "\n" "") + (body->es rest ret))))) ; recurse + + + ;; Compile procedure definition into an ES function definition + ;; + ;; This will fail if the given token is not a `define'. + (define (cdfn t) + ;; e.g. (define (foo ...) body) + (let* ((dfn (cadr t)) + (id (tname->id (token-value (car dfn)))) + (params (params->es (cdr dfn))) + (body (body->es (cddr t) #t))) + ;; this is the final format---each procedure becomes its own function + ;; definition in ES + (string-append + "function " id "(" params ")\n{\n" body "\n};"))) + + + ;; Function/procedure aliases and special forms + ;; + ;; And here we have what is probably the most grotesque part of this file. + ;; + ;; This map allows for a steady transition---items can be removed as they + ;; are written in Prebirth Lisp. This should give us a sane (but still + ;; simple) environment with which we can start to self-host. + ;; + ;; String values are simple function aliases. Function values take over + ;; the compilation of that function and allow for defining special forms + ;; (in place of macro support). The first argument FN is the name of the + ;; function/procedure/form, and ARS is the list of arguments. + ;; + ;; These are by no means meant to be solid implementations; notable + ;; deficiencies are documented, but don't expect this to work properly in + ;; every case. They will be replaced with proper R7RS implementations in + ;; the future (Rebirth). + (define (fnmap fn args t) + (case fn + (("js:console") + (string-append "console.log(" (map sexp->es args) ")")) + + ;; yes, there are more important things to do until we get to the + ;; point where it's worth implementing proper tail calls + (("js:while") + (let ((pred (car args)) + (body (cdr args))) + (string-append + "(function(__whilebrk){" + "while (" (sexp->es pred) "){\n" + (body->es body #f) " if (__whilebrk) break;\n" + "}\n" + "})(false)"))) + (("js:break") "__whilebrk=true") + + ;; fortunately ES6+ has native symbol support :) + ;; we don't (yet?) need list quoting in Prebirth + (("quote") + (if (pair? (cdr args)) + (error "quoting lists is not yet supported; sorry!") + (string-append "Symbol.for('" (sexp->es args) "')"))) + + (("define") (cdfn t)) + + (("lambda") + (let ((fnargs (car args)) + (body (cdr args))) + (string-append + "function(" (join ", " (map sexp->es fnargs)) "){\n" + (body->es body #t) + "}"))) + + ;; simple if statement with optional else, wrapped in a self-executing + ;; function to simplify code generation (e.g. returning an if) + (("if") + (let ((pred (car args)) + (t (cadr args)) + (f (and (pair? (cddr args)) + (caddr args)))) + (string-append + "(function(){" + "if (_truep(" (sexp->es pred) ")){return " (sexp->es t) ";}" + (if (pair? f) + (string-append "else{return " (sexp->es f) ";}") + "") + "})()"))) + + ;; and short-circuits, so we need to implement it as a special form + ;; rather than an alias + (("and") + (string-append + "(function(__and){\n" + (join "" (map (lambda (expr) + (string-append + "__and = " (sexp->es expr) "; " + "if (!_truep(__and)) return false;\n")) + args)) + "return __and;})()")) + + ;; or short-circuits, so we need to implement it as a special form + ;; rather than an alias + (("or") + (string-append + "(function(__or){\n" + (join "" (map (lambda (expr) + (string-append + "__or = " (sexp->es expr) "; " + "if (_truep(__or)) return __or;\n")) + args)) + "return false;})()")) + + ;; (let ((binding val) ...) ...body), compiled as a self-executing + ;; function which allows us to easily represent the return value of + ;; the entire expression while maintaining local scope + (("let*") + (let ((bindings (car args)) + (body (cdr args))) + (string-append + "(function(){\n" + (join "" (map (lambda (binding) + (let ((var (car binding)) + (init (cadr binding))) + (string-append " let " (sexp->es var) + " = " (sexp->es init) ";\n"))) + bindings)) + (body->es body #t) "\n" + " })()"))) + + ;; similar to the above, but variables cannot reference one-another + (("let") + (let* ((bindings (car args)) + (body (cdr args)) + (fparams (join ", " (map sexp->es + (map car bindings)))) + (fargs (join ", " (map sexp->es + (map cadr bindings))))) + (string-append "(function(" fparams "){\n" + (body->es body #t) "\n" + "})(" fargs ")"))) + + ;; and here I thought Prebirth Lisp would be simple...but having + ;; `case' support really keeps things much more tidy, so here we are + ;; (note that it doesn't support the arrow form, nor does it support + ;; expressions as data) + (("case") + (let ((key (car args)) + (clauses (cdr args))) + (string-append + "(function(){const _key=" (sexp->es key) ";\n" + "switch (_key){\n" + (join "" + (map (lambda (data exprs) + (string-append + (if (and (token? data) + (string=? "else" (token-lexeme data))) + "default:\n" + (join "" + (map (lambda (datum) + (string-append + "case " (sexp->es datum) ":\n")) + data))) + (body->es exprs #t) "\n")) + (map car clauses) + (map cdr clauses))) + "}})()"))) + + (("set!") + (let ((varid (car args)) + (val (cadr args))) + (string-append (sexp->es varid) " = " (sexp->es val)))) + + ;; normal procedure application + (else (let* ((idfn (tname->id fn)) + (argstr (join ", " (map sexp->es args)))) + (string-append idfn "(" argstr ")"))))) + + + ;; Convert s-expressions or scalar into ECMAScript + ;; + ;; T may be either an array of tokens or a primitive token (e.g. string, + ;; symbol). This procedure is applied recursively to T as needed if T is + ;; a list. + (define (sexp->es t) + (if (not (list? t)) + (error "unexpected non-list for sexp->es token")) + + (if (token? t) + (case (token-type t) + ;; strings output as-is (note that we don't escape double quotes, + ;; because the method of escaping them is the same in Scheme as it + ;; is in ECMAScript---a backslash) + (("string") (string-append "\"" (token-value t) "\"")) + + ;; symbols have the same concerns as procedure definitions: the + ;; identifiers generated need to be ES-friendly + (("symbol") (tname->id (token-value t))) + + (else (error + (string-append + "cannot compile unknown token `" (token-type t) "'")))) + + ;; otherwise, process the expression + (fnmap (token-value (car t)) + (cdr t) + t))) + + ;; output libprebirth and compiled output, wrapped in a self-executing + ;; function to limit scope + (string-append "(function(){" + (js:file->string "libprebirth.js") "\n\n" + (join "\n\n" (map sexp->es ast)) + "})();")) + + +;; at this point, this program can parse itself and output a CST (sans +;; whitespace) +(js:console (prebirth->ecmascript + (parse-lisp + (js:file->string "/dev/stdin"))))