birth,prebirth: Non-recursive lexing to prevent stack exhaustion

This needs to run in the browser too, where we have no control over stack
limits.

* build-aux/bootstrap/birth.scm
  (lex): Non-recursive strategy (loop with mutable list).
  (make-token): Update doc.  Produce list of token, new string, and
    position.  Don't recurse.
  (body->es): Add `ret' param.  Only produce `return' statement if new param
    is set.
  (cdfn): Use it.
  (fnmap)
    [js:while, js:break]: Add forms.
    [lambda, let, case]: Use new `body->es' `ret' param.
    [let*]: Define JS variables in output using `let' instead of `const' to
      permit mutating with new `set!' form.  Use new `body->es' `ret' param.
    [set!]: Add form.
  (prebirth->ecmascript): Adjust libprebirth path to be relative to self.

* build-aux/bootstrap/libprebirth.js
  ($$append$b$): Add `append!' procedure.
  ($$js$regexp, $$js$match, $$js$replace): Move a few lines up.
  (fs): Provide stub if `require' is not defined.

* build-aux/bootstrap/prebirth.js
  (_lex): Non-recursive strategy (loop with array-appending).
  (_token): No more mutual recursion with `#_lex'.  Return new string
    and position.
  (_bodyToEs): Add `ret' param.  Only produce `return' statement if new
    param is set.
  (fnmap) [js:while, js:break]: Add forms.
    [let*]: Define JS variables in output using `let' instead of `const' to
      permit mutating with new `set!' form.  Use new `_bodyToEs' `ret' param.
    [set!]: Add form.
master
Mike Gerwitz 2017-10-09 00:59:11 -04:00
parent 35fa13a8a0
commit 431b18e1df
Signed by: mikegerwitz
GPG Key ID: 8C917B7F5DC51BA2
3 changed files with 201 additions and 135 deletions

View File

@ -69,12 +69,21 @@
;; This is the lexer. Whitespace is ignored. The grammar consists of ;; This is the lexer. Whitespace is ignored. The grammar consists of
;; simple s-expressions. ;; simple s-expressions.
;; ;;
;; This procedure is mutually recursive with `token'. It expects that ;; Tokens are produced with `make-token'. The source SRC will be
;; the source SRC will be left-truncated as input is processed. POS exists ;; left-truncated as input is processed. POS exists for producing metadata
;; for producing metadata for error reporting---it has no impact on parsing. ;; for error reporting---it has no impact on parsing.
;;
;; This implementation was originally recursive and immutable, but the stack
;; was being exhausted, so it was refactored into an inferior
;; implementation. Note the use of `js:while' and `js:break'---these are
;; quick fixes to the problem of stack exhaustion in browsers (where we have
;; no control over the stack limit); proper tail call support will come
;; later when we have a decent architecture in place.
;; ;;
;; The result is a list of tokens. See `token' for the format. ;; The result is a list of tokens. See `token' for the format.
(define (lex src pos) (define (lex src pos)
(let ((toks (list)))
(js:while #t ; browser stack workaround
(let* ((ws (or (js:match-regexp "^\\s+" (let* ((ws (or (js:match-regexp "^\\s+"
src) src)
(list ""))) (list "")))
@ -83,40 +92,52 @@
(newpos (+ pos ws-len))) ; adj pos to account for removed ws (newpos (+ pos ws-len))) ; adj pos to account for removed ws
(if (string=? "" trim) (if (string=? "" trim)
(list) ; EOF and we're done (js:break) ; EOF and we're done
;; normally we'd use `string-ref' here, but then we'd have to ;; normally we'd use `string-ref' here, but then we'd have to
;; implement Scheme characters, so let's keep this simple and keep ;; implement Scheme characters, so let's keep this simple and keep
;; with strings ;; with strings
(let ((ch (substring trim 0 1))) (let* ((ch (substring trim 0 1))
(case ch (t (case ch
;; comments extend until the end of the line ;; comments extend until the end of the line
((";") (let ((eol (js:match-regexp "^(.*?)(\\n|$)" trim))) ((";") (let ((eol (js:match-regexp "^(.*?)(\\n|$)" trim)))
(make-token "comment" (cadr eol) trim newpos))) (make-token "comment" (cadr eol) trim newpos)))
;; left and right parenthesis are handled in the same manner: ;; left and right parenthesis are handled in the same
;; they produce distinct tokens with single-character lexemes ;; manner: they produce distinct tokens with
;; single-character lexemes
(("(") (make-token "open" ch trim newpos)) (("(") (make-token "open" ch trim newpos))
((")") (make-token "close" ch trim newpos)) ((")") (make-token "close" ch trim newpos))
;; strings are delimited by opening and closing ASCII double ;; strings are delimited by opening and closing ASCII
;; quotes, which can be escaped with a backslash ;; double quotes, which can be escaped with a
;; backslash
(("\"") (let ((str (js:match-regexp "^\"(|.*?[^\\\\])\"" (("\"") (let ((str (js:match-regexp "^\"(|.*?[^\\\\])\""
trim))) trim)))
(or str (parse-error (or str (parse-error
src pos "missing closing string delimiter")) src pos
;; a string token consists of the entire string "missing closing string delimiter"))
;; including quotes as its lexeme, but its value will ;; a string token consists of the entire
;; be the value of the string without quotes due to ;; string including quotes as its lexeme,
;; the `str' match group (see `token') ;; but its value will be the value of the
;; string without quotes due to the `str'
;; match group (see `token')
(make-token "string" str trim newpos))) (make-token "string" str trim newpos)))
(else (else
;; anything else is considered a symbol up until whitespace or ;; anything else is considered a symbol up until
;; any of the aforementioned delimiters ;; whitespace or any of the aforementioned
;; delimiters
(let ((symbol (js:match-regexp "^[^\\s()\"]+" (let ((symbol (js:match-regexp "^[^\\s()\"]+"
trim))) trim)))
(make-token "symbol" symbol trim newpos)))))))) (make-token "symbol" symbol trim newpos))))))
;; yikes---see notes in docblock with regards to why
;; we're using mutators here
(set! toks (append toks (list (car t))))
(set! src (cadr t))
(set! pos (caddr t))))))
toks))
@ -132,14 +153,7 @@
;; Produce a token and recurse. ;; Produce a token, left-truncate src, and update pos.
;;
;; The token will be concatenated with the result of the mutually
;; recursive procedure `lex'.
;;
;; For the record: I'm not fond of mutual recursion from a clarity
;; standpoint, but this is how the abstraction evolved to de-duplicate
;; code, and I don't much feel like refactoring it.
;; ;;
;; Unlike the JS Prebirth implementation which uses a key/value object, ;; Unlike the JS Prebirth implementation which uses a key/value object,
;; we're just using a simple list. ;; we're just using a simple list.
@ -160,9 +174,9 @@
;; produce token and recurse on `lex', left-truncating the source ;; produce token and recurse on `lex', left-truncating the source
;; string to discard what we have already processed ;; string to discard what we have already processed
(cons (list (quote token) type lexeme value pos) (list (list (quote token) type lexeme value pos)
(lex (substring src len) (substring src len)
(+ pos len))))) (+ pos len))))
;; various accessor procedures for token lists (we're Prebirth Lisp here, ;; various accessor procedures for token lists (we're Prebirth Lisp here,
@ -309,21 +323,21 @@
;; ;;
;; This produces a 1:1 mapping of body XS s-expressions to ES statements, ;; This produces a 1:1 mapping of body XS s-expressions to ES statements,
;; recursively. The heavy lifting is done by `sexp->es'. ;; recursively. The heavy lifting is done by `sexp->es'.
(define (body->es xs) (define (body->es xs ret)
;; recursively process body XS until we're out of pairs ;; recursively process body XS until we're out of pairs
(if (not (pair? xs)) (if (not (pair? xs))
"" ""
(let* ((x (car xs)) (let* ((x (car xs))
(rest (cdr xs)) (rest (cdr xs))
(more? (pair? rest))) (more? (or (not ret) (pair? rest))))
;; the result is a semicolon-delimited string of statements, with ;; the result is a semicolon-delimited string of statements, with
;; the final statement prefixed with `return' ;; the final statement prefixed with `return' unless (not ret)
(string-append (string-append
" " " "
(if more? "" "return ") ; prefix with `return' if last body exp (if more? "" "return ") ; prefix with `return' if last body exp
(sexp->es x) ";" ; process current body expression (sexp->es x) ";" ; process current body expression
(if more? "\n" "") (if (pair? rest) "\n" "")
(body->es rest))))) ; recurse (body->es rest ret))))) ; recurse
;; Compile procedure definition into an ES function definition ;; Compile procedure definition into an ES function definition
@ -334,7 +348,7 @@
(let* ((dfn (cadr t)) (let* ((dfn (cadr t))
(id (tname->id (token-value (car dfn)))) (id (tname->id (token-value (car dfn))))
(params (params->es (cdr dfn))) (params (params->es (cdr dfn)))
(body (body->es (cddr t)))) (body (body->es (cddr t) #t)))
;; this is the final format---each procedure becomes its own function ;; this is the final format---each procedure becomes its own function
;; definition in ES ;; definition in ES
(string-append (string-append
@ -363,6 +377,19 @@
(("js:console") (("js:console")
(string-append "console.log(" (map sexp->es args) ")")) (string-append "console.log(" (map sexp->es args) ")"))
;; yes, there are more important things to do until we get to the
;; point where it's worth implementing proper tail calls
(("js:while")
(let ((pred (car args))
(body (cdr args)))
(string-append
"(function(__whilebrk){"
"while (" (sexp->es pred) "){\n"
(body->es body #f) " if (__whilebrk) break;\n"
"}\n"
"})(false)")))
(("js:break") "__whilebrk=true")
;; fortunately ES6+ has native symbol support :) ;; fortunately ES6+ has native symbol support :)
;; we don't (yet?) need list quoting in Prebirth ;; we don't (yet?) need list quoting in Prebirth
(("quote") (("quote")
@ -377,7 +404,7 @@
(body (cdr args))) (body (cdr args)))
(string-append (string-append
"function(" (join ", " (map sexp->es fnargs)) "){\n" "function(" (join ", " (map sexp->es fnargs)) "){\n"
(body->es body) (body->es body #t)
"}"))) "}")))
;; simple if statement with optional else, wrapped in a self-executing ;; simple if statement with optional else, wrapped in a self-executing
@ -430,10 +457,10 @@
(join "" (map (lambda (binding) (join "" (map (lambda (binding)
(let ((var (car binding)) (let ((var (car binding))
(init (cadr binding))) (init (cadr binding)))
(string-append " const " (sexp->es var) (string-append " let " (sexp->es var)
" = " (sexp->es init) ";\n"))) " = " (sexp->es init) ";\n")))
bindings)) bindings))
(body->es body) "\n" (body->es body #t) "\n"
" })()"))) " })()")))
;; similar to the above, but variables cannot reference one-another ;; similar to the above, but variables cannot reference one-another
@ -445,7 +472,7 @@
(fargs (join ", " (map sexp->es (fargs (join ", " (map sexp->es
(map cadr bindings))))) (map cadr bindings)))))
(string-append "(function(" fparams "){\n" (string-append "(function(" fparams "){\n"
(body->es body) "\n" (body->es body #t) "\n"
"})(" fargs ")"))) "})(" fargs ")")))
;; and here I thought Prebirth Lisp would be simple...but having ;; and here I thought Prebirth Lisp would be simple...but having
@ -469,11 +496,16 @@
(string-append (string-append
"case " (sexp->es datum) ":\n")) "case " (sexp->es datum) ":\n"))
data))) data)))
(body->es exprs) "\n")) (body->es exprs #t) "\n"))
(map car clauses) (map car clauses)
(map cdr clauses))) (map cdr clauses)))
"}})()"))) "}})()")))
(("set!")
(let ((varid (car args))
(val (cadr args)))
(string-append (sexp->es varid) " = " (sexp->es val))))
;; normal procedure application ;; normal procedure application
(else (let* ((idfn (tname->id fn)) (else (let* ((idfn (tname->id fn))
(argstr (join ", " (map sexp->es args)))) (argstr (join ", " (map sexp->es args))))
@ -516,7 +548,7 @@
;; output libprebirth and compiled output, wrapped in a self-executing ;; output libprebirth and compiled output, wrapped in a self-executing
;; function to limit scope ;; function to limit scope
(string-append "(function(){" (string-append "(function(){"
(js:file->string "build-aux/bootstrap/libprebirth.js") "\n\n" (js:file->string "libprebirth.js") "\n\n"
(join "\n\n" (map sexp->es ast)) (join "\n\n" (map sexp->es ast))
"})();")) "})();"))

View File

@ -83,6 +83,11 @@ const $$cons = ( item, list ) => _assertList( list ) && [ item ].concat( list )
const $$car = xs => _assertPair( xs ) && xs[ 0 ]; const $$car = xs => _assertPair( xs ) && xs[ 0 ];
const $$cdr = xs => _assertPair( xs ) && xs.slice( 1 ); const $$cdr = xs => _assertPair( xs ) && xs.slice( 1 );
// warning: this should technically set the cdr to the next element, and
// should accept any number of arguments, but that's not what we're doing
// here (note that an empty list is not a pair and therefore has no cdr)
const $$append$b$ = ( dest, xs ) => ( dest.length === 0 ) ? [] : dest.push( xs );
// warning: blows up if any items are non-lists, whereas the proper RnRS // warning: blows up if any items are non-lists, whereas the proper RnRS
// implementation will set the cdr to the final item even if it's not a pair // implementation will set the cdr to the final item even if it's not a pair
function $$append() function $$append()
@ -134,16 +139,18 @@ const $$map = ( f, ...xs ) =>
xs.map( x => x[ i ] ) ) ); xs.map( x => x[ i ] ) ) );
// Node.js stuff
const fs = require( 'fs' );
// stdin->string
const $$js$file$_$$g$string = ( path ) =>
fs.readFileSync( path ).toString();
const $$js$regexp = ( s, opts ) => new RegExp( s, opts ); const $$js$regexp = ( s, opts ) => new RegExp( s, opts );
const $$js$match = ( r, s ) => s.match( r ) || false; const $$js$match = ( r, s ) => s.match( r ) || false;
const $$js$replace = ( r, repl, s ) => s.replace( r, repl ); const $$js$replace = ( r, repl, s ) => s.replace( r, repl );
const fs = ( typeof require !== 'undefined' )
? require( 'fs' )
: { readFileSync: path => window.fsdata[ path ] };
// stdin->string
const $$js$file$_$$g$string = ( path ) =>
fs.readFileSync( path ).toString();
/** =============== end of libprebirth =============== **/ /** =============== end of libprebirth =============== **/

View File

@ -177,10 +177,13 @@ class Parser
* This is the lexer. Whitespace is ignored. The grammar consists of * This is the lexer. Whitespace is ignored. The grammar consists of
* simple s-expressions. * simple s-expressions.
* *
* This function is mutually recursive with `#_token'. It expects that * Tokens are produced with `#_token'. The source SRC will be
* the source SRC will be left-truncated as input is * left-truncated as input is processed. POS exists for producing
* processed. POS exists for producing metadata for error * metadata for error reporting---it has no impact on parsing.
* reporting---it has no impact on parsing. *
* This implementation was originally recursive and immutable, but the
* stack was being exhausted, so it was refactored into an inferior
* implementation.
* *
* @param {string} src source code * @param {string} src source code
* @param {number} pos position (character offset) in source * @param {number} pos position (character offset) in source
@ -189,6 +192,10 @@ class Parser
*/ */
_lex( src, pos = 0 ) _lex( src, pos = 0 )
{ {
const toks = [];
// process until EOF (see break)
while ( true ) {
// ignore whitespace, if any // ignore whitespace, if any
const ws = src.match( /^\s+/ ) || [ "" ]; const ws = src.match( /^\s+/ ) || [ "" ];
const trim = src.substr( ws[ 0 ].length ); const trim = src.substr( ws[ 0 ].length );
@ -198,27 +205,29 @@ class Parser
// EOF and we're done // EOF and we're done
if ( trim === '' ) { if ( trim === '' ) {
return []; break;
} }
let t = null;
// comment until end of line // comment until end of line
if ( trim[ 0 ] === ';' ) { if ( trim[ 0 ] === ';' ) {
const eol = trim.match( /^(.*?)(\n|$)/ ); const eol = trim.match( /^(.*?)(\n|$)/ );
return this._token( 'comment', eol[ 1 ], trim, pos ); t = this._token( 'comment', eol[ 1 ], trim, pos );
} }
// left and right parenthesis are handled in the same manner: they // left and right parenthesis are handled in the same manner: they
// produce distinct tokens with single-character lexemes // produce distinct tokens with single-character lexemes
if ( trim[ 0 ] === '(' ) { else if ( trim[ 0 ] === '(' ) {
return this._token( 'open', '(', trim, pos ); t = this._token( 'open', '(', trim, pos );
} }
if ( trim[ 0 ] === ')' ) { else if ( trim[ 0 ] === ')' ) {
return this._token( 'close', ')', trim, pos ); t = this._token( 'close', ')', trim, pos );
} }
// strings are delimited by opening and closing ASCII double quotes, // strings are delimited by opening and closing ASCII double quotes,
// which can be escaped with a backslash // which can be escaped with a backslash
if ( trim[ 0 ] === '"' ) { else if ( trim[ 0 ] === '"' ) {
const str = trim.match( /^"(|.*?[^\\])"/ ); const str = trim.match( /^"(|.*?[^\\])"/ );
if ( !str ) { if ( !str ) {
this._error( src, pos, "missing closing string delimiter" ); this._error( src, pos, "missing closing string delimiter" );
@ -227,25 +236,30 @@ class Parser
// a string token consists of the entire string including quotes // a string token consists of the entire string including quotes
// as its lexeme, but its value will be the value of the string // as its lexeme, but its value will be the value of the string
// without quotes due to the `str' match group (see `#_token') // without quotes due to the `str' match group (see `#_token')
return this._token( 'string', str, trim, pos ); t = this._token( 'string', str, trim, pos );
} }
// anything else is considered a symbol up until whitespace or any // anything else is considered a symbol up until whitespace or any
// of the aforementioned delimiters // of the aforementioned delimiters
else {
const symbol = trim.match( /^[^\s()"]+/ ); const symbol = trim.match( /^[^\s()"]+/ );
return this._token( 'symbol', symbol, trim, pos ); t = this._token( 'symbol', symbol, trim, pos );
}
const [ tok, newsrc, newpos ] = t;
// add token, left-truncate src, update pos
toks.push( tok );
src = newsrc;
pos = newpos;
}
return toks;
} }
/** /**
* Produce a token and recurse * Produce a token, left-truncate src, and update pos
*
* The token will be concatenated with the result of the mutually
* recursive method `_lex'.
*
* For the record: I'm not fond of mutual recursion from a clarity
* standpoint, but this is how the abstraction evolved to de-duplicate
* code, and I don't much feel like refactoring it.
* *
* @param {string} type token type * @param {string} type token type
* @param {string|Array} match lexeme match * @param {string|Array} match lexeme match
@ -272,14 +286,13 @@ class Parser
pos: pos pos: pos
}; };
// continue producing tokens by recursing, left-truncating the // produce token, left-truncating the source string to discard what
// source string to discard what we have already processed // we have already processed
return [ token ].concat( return [
this._lex( token,
src.substr( lexeme.length ), src.substr( lexeme.length ),
( pos + lexeme.length ) ( pos + lexeme.length ),
) ];
);
} }
}; };
@ -431,7 +444,7 @@ class Compiler
* *
* @return {string} compiled BODY * @return {string} compiled BODY
*/ */
_bodyToEs( body ) _bodyToEs( body, ret = true )
{ {
// the body must be an array of expressions (this should always be // the body must be an array of expressions (this should always be
// the case unless we have a bug in the compiler) // the case unless we have a bug in the compiler)
@ -447,8 +460,8 @@ class Compiler
// the final expression // the final expression
return js.map( ( s, i ) => return js.map( ( s, i ) =>
{ {
const ret = ( i === ( js.length - 1 ) ) ? "return " : ""; const retstmt = ( ret && i === ( js.length - 1 ) ) ? "return " : "";
return ` ${ret}${s};`; return ` ${retstmt}${s};`;
} ).join( '\n' ); } ).join( '\n' );
} }
@ -543,6 +556,16 @@ class Compiler
const fnmap = { const fnmap = {
'js:console': 'console.log', 'js:console': 'console.log',
// yes, there are more important things to do until we get to the point
// where it's worth implementing proper tail calls
'js:while': ( [ pred, ...body ], stoes, btoes ) =>
"(function(__whilebrk){" +
`while (${stoes(pred)}){\n` +
`${btoes(body, false)} if (__whilebrk) break;\n` +
"}\n" +
"})(false)",
'js:break': () => '__whilebrk=true',
// fortunately ES6+ has native symbol support :) // fortunately ES6+ has native symbol support :)
// we don't (yet?) need list quoting in Prebirth // we don't (yet?) need list quoting in Prebirth
'quote': ( x, stoes ) => Array.isArray( x.value ) 'quote': ( x, stoes ) => Array.isArray( x.value )
@ -590,7 +613,7 @@ const fnmap = {
'let*': ( [ bindings, ...body ], stoes, btoes ) => 'let*': ( [ bindings, ...body ], stoes, btoes ) =>
"(function(){\n" + "(function(){\n" +
bindings bindings
.map( ([ x, val ]) => ` const ${stoes(x)} = ${stoes(val)};\n` ) .map( ([ x, val ]) => ` let ${stoes(x)} = ${stoes(val)};\n` )
.join( '' ) + .join( '' ) +
btoes( body ) + "\n" + btoes( body ) + "\n" +
" })()", " })()",
@ -624,7 +647,11 @@ const fnmap = {
btoes( exprs ) + "\n" btoes( exprs ) + "\n"
).join( '' ) + ).join( '' ) +
"}" + "}" +
"})()" "})()",
// basic mutator (variable assignment)
'set!': ( [ varid, val ], stoes ) =>
`${stoes(varid)} = ${stoes(val)}`,
}; };