diff --git a/build-aux/bootstrap/birth.scm b/build-aux/bootstrap/birth.scm index b36e85b..9055077 100644 --- a/build-aux/bootstrap/birth.scm +++ b/build-aux/bootstrap/birth.scm @@ -69,54 +69,75 @@ ;; This is the lexer. Whitespace is ignored. The grammar consists of ;; simple s-expressions. ;; -;; This procedure is mutually recursive with `token'. It expects that -;; the source SRC will be left-truncated as input is processed. POS exists -;; for producing metadata for error reporting---it has no impact on parsing. +;; Tokens are produced with `make-token'. The source SRC will be +;; left-truncated as input is processed. POS exists for producing metadata +;; for error reporting---it has no impact on parsing. +;; +;; This implementation was originally recursive and immutable, but the stack +;; was being exhausted, so it was refactored into an inferior +;; implementation. Note the use of `js:while' and `js:break'---these are +;; quick fixes to the problem of stack exhaustion in browsers (where we have +;; no control over the stack limit); proper tail call support will come +;; later when we have a decent architecture in place. ;; ;; The result is a list of tokens. See `token' for the format. (define (lex src pos) - (let* ((ws (or (js:match-regexp "^\\s+" - src) - (list ""))) - (ws-len (string-length (car ws))) - (trim (substring src ws-len)) ; ignore whitespace, if any - (newpos (+ pos ws-len))) ; adj pos to account for removed ws + (let ((toks (list))) + (js:while #t ; browser stack workaround + (let* ((ws (or (js:match-regexp "^\\s+" + src) + (list ""))) + (ws-len (string-length (car ws))) + (trim (substring src ws-len)) ; ignore whitespace, if any + (newpos (+ pos ws-len))) ; adj pos to account for removed ws - (if (string=? "" trim) - (list) ; EOF and we're done + (if (string=? "" trim) + (js:break) ; EOF and we're done - ;; normally we'd use `string-ref' here, but then we'd have to - ;; implement Scheme characters, so let's keep this simple and keep - ;; with strings - (let ((ch (substring trim 0 1))) - (case ch - ;; comments extend until the end of the line - ((";") (let ((eol (js:match-regexp "^(.*?)(\\n|$)" trim))) - (make-token "comment" (cadr eol) trim newpos))) + ;; normally we'd use `string-ref' here, but then we'd have to + ;; implement Scheme characters, so let's keep this simple and keep + ;; with strings + (let* ((ch (substring trim 0 1)) + (t (case ch + ;; comments extend until the end of the line + ((";") (let ((eol (js:match-regexp "^(.*?)(\\n|$)" trim))) + (make-token "comment" (cadr eol) trim newpos))) - ;; left and right parenthesis are handled in the same manner: - ;; they produce distinct tokens with single-character lexemes - (("(") (make-token "open" ch trim newpos)) - ((")") (make-token "close" ch trim newpos)) + ;; left and right parenthesis are handled in the same + ;; manner: they produce distinct tokens with + ;; single-character lexemes + (("(") (make-token "open" ch trim newpos)) + ((")") (make-token "close" ch trim newpos)) - ;; strings are delimited by opening and closing ASCII double - ;; quotes, which can be escaped with a backslash - (("\"") (let ((str (js:match-regexp "^\"(|.*?[^\\\\])\"" - trim))) - (or str (parse-error - src pos "missing closing string delimiter")) - ;; a string token consists of the entire string - ;; including quotes as its lexeme, but its value will - ;; be the value of the string without quotes due to - ;; the `str' match group (see `token') - (make-token "string" str trim newpos))) + ;; strings are delimited by opening and closing ASCII + ;; double quotes, which can be escaped with a + ;; backslash + (("\"") (let ((str (js:match-regexp "^\"(|.*?[^\\\\])\"" + trim))) + (or str (parse-error + src pos + "missing closing string delimiter")) + ;; a string token consists of the entire + ;; string including quotes as its lexeme, + ;; but its value will be the value of the + ;; string without quotes due to the `str' + ;; match group (see `token') + (make-token "string" str trim newpos))) - (else - ;; anything else is considered a symbol up until whitespace or - ;; any of the aforementioned delimiters - (let ((symbol (js:match-regexp "^[^\\s()\"]+" - trim))) - (make-token "symbol" symbol trim newpos)))))))) + (else + ;; anything else is considered a symbol up until + ;; whitespace or any of the aforementioned + ;; delimiters + (let ((symbol (js:match-regexp "^[^\\s()\"]+" + trim))) + (make-token "symbol" symbol trim newpos)))))) + + ;; yikes---see notes in docblock with regards to why + ;; we're using mutators here + (set! toks (append toks (list (car t)))) + (set! src (cadr t)) + (set! pos (caddr t)))))) + toks)) @@ -132,14 +153,7 @@ -;; Produce a token and recurse. -;; -;; The token will be concatenated with the result of the mutually -;; recursive procedure `lex'. -;; -;; For the record: I'm not fond of mutual recursion from a clarity -;; standpoint, but this is how the abstraction evolved to de-duplicate -;; code, and I don't much feel like refactoring it. +;; Produce a token, left-truncate src, and update pos. ;; ;; Unlike the JS Prebirth implementation which uses a key/value object, ;; we're just using a simple list. @@ -160,9 +174,9 @@ ;; produce token and recurse on `lex', left-truncating the source ;; string to discard what we have already processed - (cons (list (quote token) type lexeme value pos) - (lex (substring src len) - (+ pos len))))) + (list (list (quote token) type lexeme value pos) + (substring src len) + (+ pos len)))) ;; various accessor procedures for token lists (we're Prebirth Lisp here, @@ -309,21 +323,21 @@ ;; ;; This produces a 1:1 mapping of body XS s-expressions to ES statements, ;; recursively. The heavy lifting is done by `sexp->es'. - (define (body->es xs) + (define (body->es xs ret) ;; recursively process body XS until we're out of pairs (if (not (pair? xs)) "" (let* ((x (car xs)) (rest (cdr xs)) - (more? (pair? rest))) + (more? (or (not ret) (pair? rest)))) ;; the result is a semicolon-delimited string of statements, with - ;; the final statement prefixed with `return' + ;; the final statement prefixed with `return' unless (not ret) (string-append " " - (if more? "" "return ") ; prefix with `return' if last body exp - (sexp->es x) ";" ; process current body expression - (if more? "\n" "") - (body->es rest))))) ; recurse + (if more? "" "return ") ; prefix with `return' if last body exp + (sexp->es x) ";" ; process current body expression + (if (pair? rest) "\n" "") + (body->es rest ret))))) ; recurse ;; Compile procedure definition into an ES function definition @@ -334,7 +348,7 @@ (let* ((dfn (cadr t)) (id (tname->id (token-value (car dfn)))) (params (params->es (cdr dfn))) - (body (body->es (cddr t)))) + (body (body->es (cddr t) #t))) ;; this is the final format---each procedure becomes its own function ;; definition in ES (string-append @@ -363,6 +377,19 @@ (("js:console") (string-append "console.log(" (map sexp->es args) ")")) + ;; yes, there are more important things to do until we get to the + ;; point where it's worth implementing proper tail calls + (("js:while") + (let ((pred (car args)) + (body (cdr args))) + (string-append + "(function(__whilebrk){" + "while (" (sexp->es pred) "){\n" + (body->es body #f) " if (__whilebrk) break;\n" + "}\n" + "})(false)"))) + (("js:break") "__whilebrk=true") + ;; fortunately ES6+ has native symbol support :) ;; we don't (yet?) need list quoting in Prebirth (("quote") @@ -377,7 +404,7 @@ (body (cdr args))) (string-append "function(" (join ", " (map sexp->es fnargs)) "){\n" - (body->es body) + (body->es body #t) "}"))) ;; simple if statement with optional else, wrapped in a self-executing @@ -430,10 +457,10 @@ (join "" (map (lambda (binding) (let ((var (car binding)) (init (cadr binding))) - (string-append " const " (sexp->es var) + (string-append " let " (sexp->es var) " = " (sexp->es init) ";\n"))) bindings)) - (body->es body) "\n" + (body->es body #t) "\n" " })()"))) ;; similar to the above, but variables cannot reference one-another @@ -445,7 +472,7 @@ (fargs (join ", " (map sexp->es (map cadr bindings))))) (string-append "(function(" fparams "){\n" - (body->es body) "\n" + (body->es body #t) "\n" "})(" fargs ")"))) ;; and here I thought Prebirth Lisp would be simple...but having @@ -469,11 +496,16 @@ (string-append "case " (sexp->es datum) ":\n")) data))) - (body->es exprs) "\n")) + (body->es exprs #t) "\n")) (map car clauses) (map cdr clauses))) "}})()"))) + (("set!") + (let ((varid (car args)) + (val (cadr args))) + (string-append (sexp->es varid) " = " (sexp->es val)))) + ;; normal procedure application (else (let* ((idfn (tname->id fn)) (argstr (join ", " (map sexp->es args)))) @@ -516,7 +548,7 @@ ;; output libprebirth and compiled output, wrapped in a self-executing ;; function to limit scope (string-append "(function(){" - (js:file->string "build-aux/bootstrap/libprebirth.js") "\n\n" + (js:file->string "libprebirth.js") "\n\n" (join "\n\n" (map sexp->es ast)) "})();")) diff --git a/build-aux/bootstrap/libprebirth.js b/build-aux/bootstrap/libprebirth.js index 7b90c55..6c180c1 100644 --- a/build-aux/bootstrap/libprebirth.js +++ b/build-aux/bootstrap/libprebirth.js @@ -83,6 +83,11 @@ const $$cons = ( item, list ) => _assertList( list ) && [ item ].concat( list ) const $$car = xs => _assertPair( xs ) && xs[ 0 ]; const $$cdr = xs => _assertPair( xs ) && xs.slice( 1 ); +// warning: this should technically set the cdr to the next element, and +// should accept any number of arguments, but that's not what we're doing +// here (note that an empty list is not a pair and therefore has no cdr) +const $$append$b$ = ( dest, xs ) => ( dest.length === 0 ) ? [] : dest.push( xs ); + // warning: blows up if any items are non-lists, whereas the proper RnRS // implementation will set the cdr to the final item even if it's not a pair function $$append() @@ -134,16 +139,18 @@ const $$map = ( f, ...xs ) => xs.map( x => x[ i ] ) ) ); -// Node.js stuff -const fs = require( 'fs' ); - -// stdin->string -const $$js$file$_$$g$string = ( path ) => - fs.readFileSync( path ).toString(); - const $$js$regexp = ( s, opts ) => new RegExp( s, opts ); const $$js$match = ( r, s ) => s.match( r ) || false; const $$js$replace = ( r, repl, s ) => s.replace( r, repl ); +const fs = ( typeof require !== 'undefined' ) + ? require( 'fs' ) + : { readFileSync: path => window.fsdata[ path ] }; + +// stdin->string +const $$js$file$_$$g$string = ( path ) => + fs.readFileSync( path ).toString(); + + /** =============== end of libprebirth =============== **/ diff --git a/build-aux/bootstrap/prebirth.js b/build-aux/bootstrap/prebirth.js index cea604b..a77a6c0 100644 --- a/build-aux/bootstrap/prebirth.js +++ b/build-aux/bootstrap/prebirth.js @@ -177,10 +177,13 @@ class Parser * This is the lexer. Whitespace is ignored. The grammar consists of * simple s-expressions. * - * This function is mutually recursive with `#_token'. It expects that - * the source SRC will be left-truncated as input is - * processed. POS exists for producing metadata for error - * reporting---it has no impact on parsing. + * Tokens are produced with `#_token'. The source SRC will be + * left-truncated as input is processed. POS exists for producing + * metadata for error reporting---it has no impact on parsing. + * + * This implementation was originally recursive and immutable, but the + * stack was being exhausted, so it was refactored into an inferior + * implementation. * * @param {string} src source code * @param {number} pos position (character offset) in source @@ -189,63 +192,74 @@ class Parser */ _lex( src, pos = 0 ) { - // ignore whitespace, if any - const ws = src.match( /^\s+/ ) || [ "" ]; - const trim = src.substr( ws[ 0 ].length ); + const toks = []; - // adjust position to account for any removed whitespace - pos += ws[ 0 ].length; + // process until EOF (see break) + while ( true ) { + // ignore whitespace, if any + const ws = src.match( /^\s+/ ) || [ "" ]; + const trim = src.substr( ws[ 0 ].length ); - // EOF and we're done - if ( trim === '' ) { - return []; - } + // adjust position to account for any removed whitespace + pos += ws[ 0 ].length; - // comment until end of line - if ( trim[ 0 ] === ';' ) { - const eol = trim.match( /^(.*?)(\n|$)/ ); - return this._token( 'comment', eol[ 1 ], trim, pos ); - } - - // left and right parenthesis are handled in the same manner: they - // produce distinct tokens with single-character lexemes - if ( trim[ 0 ] === '(' ) { - return this._token( 'open', '(', trim, pos ); - } - if ( trim[ 0 ] === ')' ) { - return this._token( 'close', ')', trim, pos ); - } - - // strings are delimited by opening and closing ASCII double quotes, - // which can be escaped with a backslash - if ( trim[ 0 ] === '"' ) { - const str = trim.match( /^"(|.*?[^\\])"/ ); - if ( !str ) { - this._error( src, pos, "missing closing string delimiter" ); + // EOF and we're done + if ( trim === '' ) { + break; } - // a string token consists of the entire string including quotes - // as its lexeme, but its value will be the value of the string - // without quotes due to the `str' match group (see `#_token') - return this._token( 'string', str, trim, pos ); + let t = null; + + // comment until end of line + if ( trim[ 0 ] === ';' ) { + const eol = trim.match( /^(.*?)(\n|$)/ ); + t = this._token( 'comment', eol[ 1 ], trim, pos ); + } + + // left and right parenthesis are handled in the same manner: they + // produce distinct tokens with single-character lexemes + else if ( trim[ 0 ] === '(' ) { + t = this._token( 'open', '(', trim, pos ); + } + else if ( trim[ 0 ] === ')' ) { + t = this._token( 'close', ')', trim, pos ); + } + + // strings are delimited by opening and closing ASCII double quotes, + // which can be escaped with a backslash + else if ( trim[ 0 ] === '"' ) { + const str = trim.match( /^"(|.*?[^\\])"/ ); + if ( !str ) { + this._error( src, pos, "missing closing string delimiter" ); + } + + // a string token consists of the entire string including quotes + // as its lexeme, but its value will be the value of the string + // without quotes due to the `str' match group (see `#_token') + t = this._token( 'string', str, trim, pos ); + } + + // anything else is considered a symbol up until whitespace or any + // of the aforementioned delimiters + else { + const symbol = trim.match( /^[^\s()"]+/ ); + t = this._token( 'symbol', symbol, trim, pos ); + } + + const [ tok, newsrc, newpos ] = t; + + // add token, left-truncate src, update pos + toks.push( tok ); + src = newsrc; + pos = newpos; } - // anything else is considered a symbol up until whitespace or any - // of the aforementioned delimiters - const symbol = trim.match( /^[^\s()"]+/ ); - return this._token( 'symbol', symbol, trim, pos ); + return toks; } /** - * Produce a token and recurse - * - * The token will be concatenated with the result of the mutually - * recursive method `_lex'. - * - * For the record: I'm not fond of mutual recursion from a clarity - * standpoint, but this is how the abstraction evolved to de-duplicate - * code, and I don't much feel like refactoring it. + * Produce a token, left-truncate src, and update pos * * @param {string} type token type * @param {string|Array} match lexeme match @@ -272,14 +286,13 @@ class Parser pos: pos }; - // continue producing tokens by recursing, left-truncating the - // source string to discard what we have already processed - return [ token ].concat( - this._lex( - src.substr( lexeme.length ), - ( pos + lexeme.length ) - ) - ); + // produce token, left-truncating the source string to discard what + // we have already processed + return [ + token, + src.substr( lexeme.length ), + ( pos + lexeme.length ), + ]; } }; @@ -431,7 +444,7 @@ class Compiler * * @return {string} compiled BODY */ - _bodyToEs( body ) + _bodyToEs( body, ret = true ) { // the body must be an array of expressions (this should always be // the case unless we have a bug in the compiler) @@ -447,8 +460,8 @@ class Compiler // the final expression return js.map( ( s, i ) => { - const ret = ( i === ( js.length - 1 ) ) ? "return " : ""; - return ` ${ret}${s};`; + const retstmt = ( ret && i === ( js.length - 1 ) ) ? "return " : ""; + return ` ${retstmt}${s};`; } ).join( '\n' ); } @@ -543,6 +556,16 @@ class Compiler const fnmap = { 'js:console': 'console.log', + // yes, there are more important things to do until we get to the point + // where it's worth implementing proper tail calls + 'js:while': ( [ pred, ...body ], stoes, btoes ) => + "(function(__whilebrk){" + + `while (${stoes(pred)}){\n` + + `${btoes(body, false)} if (__whilebrk) break;\n` + + "}\n" + + "})(false)", + 'js:break': () => '__whilebrk=true', + // fortunately ES6+ has native symbol support :) // we don't (yet?) need list quoting in Prebirth 'quote': ( x, stoes ) => Array.isArray( x.value ) @@ -590,7 +613,7 @@ const fnmap = { 'let*': ( [ bindings, ...body ], stoes, btoes ) => "(function(){\n" + bindings - .map( ([ x, val ]) => ` const ${stoes(x)} = ${stoes(val)};\n` ) + .map( ([ x, val ]) => ` let ${stoes(x)} = ${stoes(val)};\n` ) .join( '' ) + btoes( body ) + "\n" + " })()", @@ -624,7 +647,11 @@ const fnmap = { btoes( exprs ) + "\n" ).join( '' ) + "}" + - "})()" + "})()", + + // basic mutator (variable assignment) + 'set!': ( [ varid, val ], stoes ) => + `${stoes(varid)} = ${stoes(val)}`, };