birth: Prebirth self-hosting

This completes bootstrapping for Prebirth Lisp.  The next step will be
Rebirth, which will replace libprebirth.js, removing hand-written JavaScript
entirely.
master
Mike Gerwitz 2017-09-21 13:37:16 -04:00
parent a5b53fedf5
commit 35fa13a8a0
Signed by: mikegerwitz
GPG Key ID: 8C917B7F5DC51BA2
3 changed files with 341 additions and 46 deletions

View File

@ -29,11 +29,17 @@
;;; is successful if the output of Birth compiling itself is byte-for-byte
;;; identical to the output of compiling Birth with Prebirth.
;;;
;;; This is largely a 1:1 translation of `prebirth.js'.
;;; This is largely a 1:1 translation of `prebirth.js'. See that file for
;;; terminology.
;;;
;;; Note that we're dealing with a small subset of Scheme here, so certain
;;; things might be done differently given a proper implementation. See
;;; that file for terminology.
;;; things might be done differently given a proper implementation.
;;;
;;; The next step after this is ``Rebirth'': both Prebirth and Birth require
;;; the manually written `libprebirth.js' to function. Rebirth will remove
;;; that completely, which bootstraps the runtime in its entirety. At that
;;; point, all development will be exclusively in Scheme and we can get on
;;; with Gibble.
;; pair selection
(define (cadr xs)
@ -43,7 +49,14 @@
(define (caddr xs)
(car (cdr (cdr xs))))
(define (cadddr xs)
(car (cdr (cdr ( cdr xs)))))
(car (cdr (cdr (cdr xs)))))
(define (caddddr xs)
(car (cdr (cdr (cdr (cdr xs))))))
(define (cddr xs)
(cdr (cdr xs)))
(define (not x)
(if x #f #t))
;; for convenience
(define (js:match-regexp re s)
@ -58,8 +71,7 @@
;;
;; This procedure is mutually recursive with `token'. It expects that
;; the source SRC will be left-truncated as input is processed. POS exists
;; for producing metadata for error reporting---it has no impact on
;; parsing.
;; for producing metadata for error reporting---it has no impact on parsing.
;;
;; The result is a list of tokens. See `token' for the format.
(define (lex src pos)
@ -68,7 +80,7 @@
(list "")))
(ws-len (string-length (car ws)))
(trim (substring src ws-len)) ; ignore whitespace, if any
(newpos (+ pos ws-len)))
(newpos (+ pos ws-len))) ; adj pos to account for removed ws
(if (string=? "" trim)
(list) ; EOF and we're done
@ -80,12 +92,12 @@
(case ch
;; comments extend until the end of the line
((";") (let ((eol (js:match-regexp "^(.*?)(\\n|$)" trim)))
(token "comment" (cadr eol) trim newpos)))
(make-token "comment" (cadr eol) trim newpos)))
;; left and right parenthesis are handled in the same manner:
;; they produce distinct tokens with single-character lexemes
(("(") (token "open" ch trim newpos))
((")") (token "close" ch trim newpos))
(("(") (make-token "open" ch trim newpos))
((")") (make-token "close" ch trim newpos))
;; strings are delimited by opening and closing ASCII double
;; quotes, which can be escaped with a backslash
@ -97,14 +109,14 @@
;; including quotes as its lexeme, but its value will
;; be the value of the string without quotes due to
;; the `str' match group (see `token')
(token "string" str trim newpos)))
(make-token "string" str trim newpos)))
(else
;; anything else is considered a symbol up until whitespace or
;; any of the aforementioned delimiters
(let ((symbol (js:match-regexp "^[^\\s()\"]+"
trim)))
(token "symbol" symbol trim newpos))))))))
(make-token "symbol" symbol trim newpos))))))))
@ -135,7 +147,7 @@
;; The expected arguments are: the token type TYPE, the match group or
;; string MATCH, left-truncated source code SRC, and the position POS
;; relative to the original source code.
(define (token type match src pos)
(define (make-token type match src pos)
(let* ((parts (if (list? match) match (list match match)))
(lexeme (car parts))
;; the value is the first group of the match (indicating what we
@ -148,17 +160,18 @@
;; produce token and recurse on `lex', left-truncating the source
;; string to discard what we have already processed
(cons (list type lexeme value pos)
(cons (list (quote token) type lexeme value pos)
(lex (substring src len)
(+ pos len)))))
;; various accessor procedures for token lists (we're Prebirth Lisp here,
;; so no record support or anything fancy!)
(define (token-type t) (car t))
(define (token-lexeme t) (cadr t))
(define (token-value t) (caddr t))
(define (token-pos t) (cadddr t))
(define (token? t) (and (pair? t) (symbol=? (quote token) (car t))))
(define (token-type t) (cadr t))
(define (token-lexeme t) (caddr t))
(define (token-value t) (cadddr t))
(define (token-pos t) (caddddr t))
@ -175,8 +188,9 @@
(define (ast-tree ast) (cadr ast))
(define (ast-stack ast) (caddr ast))
;; perform a leftmost reduction on the token string
(define (toks->ast toks)
(fold ; then a leftmost reduction on the token string
(fold
(lambda (token result)
(let ((depth (ast-depth result))
(xs (ast-tree result))
@ -228,12 +242,287 @@
(ast (toks->ast toks)))
(if (zero? (ast-depth ast))
(ast-tree ast)
;; if we terminate at a non-zero depth, that means there ar still
;; open sexps
(error (string-append
"unexpected end of input at depth "
(ast-depth ast))))))
;; Compile Prebirth Lisp AST into ECMAScript.
;;
;; The AST can be generated with `parse-lisp'.
(define (prebirth->ecmascript ast)
;; Generate ECMAScript-friendly name from the given id.
;;
;; A subset of special characters that are acceptable in Scheme are
;; converted in an identifiable manner; others are simply converted to `$'
;; in a catch-all and therefore could result in conflicts and cannot be
;; reliably distinguished from one-another. Remember: this is temporary
;; code.
(define (tname->id name)
(if (js:match (js:regexp "^\\d+$") name)
name
(string-append
"$$" (js:replace (js:regexp "[^a-zA-Z0-9_]" "g")
(lambda (c)
(case c
(("-") "$_$")
(("?") "$7$")
(("@") "$a$")
(("!") "$b$")
((">") "$g$")
(("#") "$h$")
(("*") "$k$")
(("<") "$l$")
(("&") "$n$")
(("%") "$o$")
(("+") "$p$")
(("=") "$q$")
(("^") "$v$")
(("/") "$w$")
(("$") "$$")
(else "$")))
name))))
;; Join a list of strings XS on a delimiter DELIM
(define (join delim xs)
(if (pair? xs)
(fold (lambda (x str)
(string-append str delim x))
(car xs)
(cdr xs))
""))
;; Compile parameter list.
;;
;; This simply takes the value of the symbol and outputs it (formatted),
;; delimited by commas.
(define (params->es params)
(join ", " (map (lambda (t)
(tname->id (token-value t)))
params)))
;; Compile body s-expressions into ECMAScript
;;
;; This produces a 1:1 mapping of body XS s-expressions to ES statements,
;; recursively. The heavy lifting is done by `sexp->es'.
(define (body->es xs)
;; recursively process body XS until we're out of pairs
(if (not (pair? xs))
""
(let* ((x (car xs))
(rest (cdr xs))
(more? (pair? rest)))
;; the result is a semicolon-delimited string of statements, with
;; the final statement prefixed with `return'
(string-append
" "
(if more? "" "return ") ; prefix with `return' if last body exp
(sexp->es x) ";" ; process current body expression
(if more? "\n" "")
(body->es rest))))) ; recurse
;; Compile procedure definition into an ES function definition
;;
;; This will fail if the given token is not a `define'.
(define (cdfn t)
;; e.g. (define (foo ...) body)
(let* ((dfn (cadr t))
(id (tname->id (token-value (car dfn))))
(params (params->es (cdr dfn)))
(body (body->es (cddr t))))
;; this is the final format---each procedure becomes its own function
;; definition in ES
(string-append
"function " id "(" params ")\n{\n" body "\n};")))
;; Function/procedure aliases and special forms
;;
;; And here we have what is probably the most grotesque part of this file.
;;
;; This map allows for a steady transition---items can be removed as they
;; are written in Prebirth Lisp. This should give us a sane (but still
;; simple) environment with which we can start to self-host.
;;
;; String values are simple function aliases. Function values take over
;; the compilation of that function and allow for defining special forms
;; (in place of macro support). The first argument FN is the name of the
;; function/procedure/form, and ARS is the list of arguments.
;;
;; These are by no means meant to be solid implementations; notable
;; deficiencies are documented, but don't expect this to work properly in
;; every case. They will be replaced with proper R7RS implementations in
;; the future (Rebirth).
(define (fnmap fn args t)
(case fn
(("js:console")
(string-append "console.log(" (map sexp->es args) ")"))
;; fortunately ES6+ has native symbol support :)
;; we don't (yet?) need list quoting in Prebirth
(("quote")
(if (pair? (cdr args))
(error "quoting lists is not yet supported; sorry!")
(string-append "Symbol.for('" (sexp->es args) "')")))
(("define") (cdfn t))
(("lambda")
(let ((fnargs (car args))
(body (cdr args)))
(string-append
"function(" (join ", " (map sexp->es fnargs)) "){\n"
(body->es body)
"}")))
;; simple if statement with optional else, wrapped in a self-executing
;; function to simplify code generation (e.g. returning an if)
(("if")
(let ((pred (car args))
(t (cadr args))
(f (and (pair? (cddr args))
(caddr args))))
(string-append
"(function(){"
"if (_truep(" (sexp->es pred) ")){return " (sexp->es t) ";}"
(if (pair? f)
(string-append "else{return " (sexp->es f) ";}")
"")
"})()")))
;; and short-circuits, so we need to implement it as a special form
;; rather than an alias
(("and")
(string-append
"(function(__and){\n"
(join "" (map (lambda (expr)
(string-append
"__and = " (sexp->es expr) "; "
"if (!_truep(__and)) return false;\n"))
args))
"return __and;})()"))
;; or short-circuits, so we need to implement it as a special form
;; rather than an alias
(("or")
(string-append
"(function(__or){\n"
(join "" (map (lambda (expr)
(string-append
"__or = " (sexp->es expr) "; "
"if (_truep(__or)) return __or;\n"))
args))
"return false;})()"))
;; (let ((binding val) ...) ...body), compiled as a self-executing
;; function which allows us to easily represent the return value of
;; the entire expression while maintaining local scope
(("let*")
(let ((bindings (car args))
(body (cdr args)))
(string-append
"(function(){\n"
(join "" (map (lambda (binding)
(let ((var (car binding))
(init (cadr binding)))
(string-append " const " (sexp->es var)
" = " (sexp->es init) ";\n")))
bindings))
(body->es body) "\n"
" })()")))
;; similar to the above, but variables cannot reference one-another
(("let")
(let* ((bindings (car args))
(body (cdr args))
(fparams (join ", " (map sexp->es
(map car bindings))))
(fargs (join ", " (map sexp->es
(map cadr bindings)))))
(string-append "(function(" fparams "){\n"
(body->es body) "\n"
"})(" fargs ")")))
;; and here I thought Prebirth Lisp would be simple...but having
;; `case' support really keeps things much more tidy, so here we are
;; (note that it doesn't support the arrow form, nor does it support
;; expressions as data)
(("case")
(let ((key (car args))
(clauses (cdr args)))
(string-append
"(function(){const _key=" (sexp->es key) ";\n"
"switch (_key){\n"
(join ""
(map (lambda (data exprs)
(string-append
(if (and (token? data)
(string=? "else" (token-lexeme data)))
"default:\n"
(join ""
(map (lambda (datum)
(string-append
"case " (sexp->es datum) ":\n"))
data)))
(body->es exprs) "\n"))
(map car clauses)
(map cdr clauses)))
"}})()")))
;; normal procedure application
(else (let* ((idfn (tname->id fn))
(argstr (join ", " (map sexp->es args))))
(string-append idfn "(" argstr ")")))))
;; Convert s-expressions or scalar into ECMAScript
;;
;; T may be either an array of tokens or a primitive token (e.g. string,
;; symbol). This procedure is applied recursively to T as needed if T is
;; a list.
;;
;; XXX: We hit stack limits ._. we want to compile in browser too! Until
;; then, increase Node.js' stack when invoking it from the command line
;; (`--stack-size').
(define (sexp->es t)
(if (not (list? t))
(error "unexpected non-list for sexp->es token"))
(if (token? t)
(case (token-type t)
;; strings output as-is (note that we don't escape double quotes,
;; because the method of escaping them is the same in Scheme as it
;; is in ECMAScript---a backslash)
(("string") (string-append "\"" (token-value t) "\""))
;; symbols have the same concerns as procedure definitions: the
;; identifiers generated need to be ES-friendly
(("symbol") (tname->id (token-value t)))
(else (error
(string-append
"cannot compile unknown token `" (token-type t) "'"))))
;; otherwise, process the expression
(fnmap (token-value (car t))
(cdr t)
t)))
;; output libprebirth and compiled output, wrapped in a self-executing
;; function to limit scope
(string-append "(function(){"
(js:file->string "build-aux/bootstrap/libprebirth.js") "\n\n"
(join "\n\n" (map sexp->es ast))
"})();"))
;; at this point, this program can parse itself and output a CST (sans
;; whitespace)
(js:console
(parse-lisp (js:stdin->string)))
(js:console (prebirth->ecmascript
(parse-lisp
(js:file->string "/dev/stdin"))))

View File

@ -34,10 +34,15 @@
* By convention, everything is prefixed with `js:' in Prebirth (not `es:',
* because we're using JavaScript-specific features). For character
* transformation rules, see `Compiler#_idFromName' in `prebirth.js'.
*
* These implementations are largely flawed in some manner, but that's okay,
* because they do their job. Some flaws are noted.
*/
const $h$t = true;
const $h$f = false;
const $$$h$t = true;
const $$$h$f = false;
const $$symbol$q$$7$ = ( a, b ) => ( ( typeof a === 'symbol' ) && ( a === b ) );
const argToArr = args => Array.prototype.slice.call( args );
@ -120,17 +125,25 @@ const $$zero$7$ = x => x === 0;
// warning: fold here only supports one list
const $$fold = ( f, init, xs ) =>
xs.reduce( ( prev, x ) => f( x, prev ), init );
// warning: map here uses the length of the first list, not the shortest
// (we implement this in ES for now so that we don't have to augment
// Prebirth Lisp to support the "rest" procedure definition syntax)
const $$map = ( f, ...xs ) =>
xs[ 0 ].map(
( _, i ) => f.apply( null,
xs.map( x => x[ i ] ) ) );
// Node.js stuff
const fs = require( 'fs' );
// stdin->string
const $$js$stdin$_$$g$string = () =>
fs.readFileSync( '/dev/stdin' ).toString();
const $$js$file$_$$g$string = ( path ) =>
fs.readFileSync( path ).toString();
const $$js$regexp = ( s, opts ) => new RegExp( s, opts );
const $$js$match = ( r, s ) => s.match( r ) || false;
const $$js$regexp = ( s, opts ) => new RegExp( s, opts );
const $$js$match = ( r, s ) => s.match( r ) || false;
const $$js$replace = ( r, repl, s ) => s.replace( r, repl );
/** =============== end of libprebirth =============== **/

View File

@ -330,12 +330,12 @@ class Compiler
// map every definition to a ES function definition and delimit them
// (for readability) by two newlines
return tree.map( this._sexpToEs.bind( this ) )
.join( "\n\n" ) + "\n";
.join( "\n\n" );
}
/**
* Compile function definition into a ES function definition
* Compile procedure definition into a ES function definition
*
* This will fail if the given token is not a `define'.
*
@ -345,13 +345,6 @@ class Compiler
*/
_cdfn( t )
{
// an application must be an s-expression
if ( !Array.isArray( t ) ) {
throw Error(
`\`${name}' application expected, found symbol \`${t.value}'`
);
}
// e.g. (define (foo ...) body)
const [ , [ { value: name }, ...params ], ...body ] = t;
@ -359,7 +352,7 @@ class Compiler
const paramjs = this._paramsToEs( params );
const bodyjs = this._bodyToEs( body );
// this is the final format---each function becomes its own function
// this is the final format---each procedure becomes its own function
// definition in ES
return `function ${id}(${paramjs})\n{\n${bodyjs}\n};`;
}
@ -525,7 +518,7 @@ class Compiler
/**
* Function aliases and special forms
* Function/procedure aliases and special forms
*
* And here we have what is probably the most grotesque part of this
* file. Saved the best for last.
@ -542,7 +535,7 @@ class Compiler
*
* These are by no means meant to be solid implementations; notable
* deficiencies are documented, but don't expect this to work properly in
* any case. They will be replaced with proper R7RS implementations in the
* every case. They will be replaced with proper R7RS implementations in the
* future (after Birth).
*
* @type {Object}
@ -574,20 +567,20 @@ const fnmap = {
// and short-circuits, so we need to implement it as a special form
// rather than an alias
'and': ( args, stoes ) =>
"(function(){\n" +
"(function(__and){\n" +
args.map( ( expr, i ) =>
`const _and${i} = ${stoes(expr)}; ` +
`if (!_truep(_and${i})) return false;\n`
`__and = ${stoes(expr)}; ` +
`if (!_truep(__and)) return false;\n`
).join( '' ) +
`return _and${args.length-1};})()`,
`return __and;})()`,
// or short-circuits, so we need to implement it as a special form
// rather than an alias
'or': ( args, stoes ) =>
"(function(){\n" +
"(function(__or){\n" +
args.map( ( expr, i ) =>
`const _or${i} = ${stoes(expr)}; ` +
`if (_truep(_or${i})) return _or${i};\n`
`__or = ${stoes(expr)}; ` +
`if (_truep(__or)) return __or;\n`
).join( '' ) +
"return false;})()",