birth,prebirth: Non-recursive lexing to prevent stack exhaustion

This needs to run in the browser too, where we have no control over stack
limits.

* build-aux/bootstrap/birth.scm
  (lex): Non-recursive strategy (loop with mutable list).
  (make-token): Update doc.  Produce list of token, new string, and
    position.  Don't recurse.
  (body->es): Add `ret' param.  Only produce `return' statement if new param
    is set.
  (cdfn): Use it.
  (fnmap)
    [js:while, js:break]: Add forms.
    [lambda, let, case]: Use new `body->es' `ret' param.
    [let*]: Define JS variables in output using `let' instead of `const' to
      permit mutating with new `set!' form.  Use new `body->es' `ret' param.
    [set!]: Add form.
  (prebirth->ecmascript): Adjust libprebirth path to be relative to self.

* build-aux/bootstrap/libprebirth.js
  ($$append$b$): Add `append!' procedure.
  ($$js$regexp, $$js$match, $$js$replace): Move a few lines up.
  (fs): Provide stub if `require' is not defined.

* build-aux/bootstrap/prebirth.js
  (_lex): Non-recursive strategy (loop with array-appending).
  (_token): No more mutual recursion with `#_lex'.  Return new string
    and position.
  (_bodyToEs): Add `ret' param.  Only produce `return' statement if new
    param is set.
  (fnmap) [js:while, js:break]: Add forms.
    [let*]: Define JS variables in output using `let' instead of `const' to
      permit mutating with new `set!' form.  Use new `body->es' `ret' param.
    [set!]: Add form.
master
Mike Gerwitz 2017-10-09 00:59:11 -04:00
parent 35fa13a8a0
commit 431b18e1df
Signed by: mikegerwitz
GPG Key ID: 8C917B7F5DC51BA2
3 changed files with 201 additions and 135 deletions

View File

@ -69,54 +69,75 @@
;; This is the lexer. Whitespace is ignored. The grammar consists of
;; simple s-expressions.
;;
;; This procedure is mutually recursive with `token'. It expects that
;; the source SRC will be left-truncated as input is processed. POS exists
;; for producing metadata for error reporting---it has no impact on parsing.
;; Tokens are produced with `make-token'. The source SRC will be
;; left-truncated as input is processed. POS exists for producing metadata
;; for error reporting---it has no impact on parsing.
;;
;; This implementation was originally recursive and immutable, but the stack
;; was being exhausted, so it was refactored into an inferior
;; implementation. Note the use of `js:while' and `js:break'---these are
;; quick fixes to the problem of stack exhaustion in browsers (where we have
;; no control over the stack limit); proper tail call support will come
;; later when we have a decent architecture in place.
;;
;; The result is a list of tokens. See `token' for the format.
(define (lex src pos)
(let* ((ws (or (js:match-regexp "^\\s+"
src)
(list "")))
(ws-len (string-length (car ws)))
(trim (substring src ws-len)) ; ignore whitespace, if any
(newpos (+ pos ws-len))) ; adj pos to account for removed ws
(let ((toks (list)))
(js:while #t ; browser stack workaround
(let* ((ws (or (js:match-regexp "^\\s+"
src)
(list "")))
(ws-len (string-length (car ws)))
(trim (substring src ws-len)) ; ignore whitespace, if any
(newpos (+ pos ws-len))) ; adj pos to account for removed ws
(if (string=? "" trim)
(list) ; EOF and we're done
(if (string=? "" trim)
(js:break) ; EOF and we're done
;; normally we'd use `string-ref' here, but then we'd have to
;; implement Scheme characters, so let's keep this simple and keep
;; with strings
(let ((ch (substring trim 0 1)))
(case ch
;; comments extend until the end of the line
((";") (let ((eol (js:match-regexp "^(.*?)(\\n|$)" trim)))
(make-token "comment" (cadr eol) trim newpos)))
;; normally we'd use `string-ref' here, but then we'd have to
;; implement Scheme characters, so let's keep this simple and keep
;; with strings
(let* ((ch (substring trim 0 1))
(t (case ch
;; comments extend until the end of the line
((";") (let ((eol (js:match-regexp "^(.*?)(\\n|$)" trim)))
(make-token "comment" (cadr eol) trim newpos)))
;; left and right parenthesis are handled in the same manner:
;; they produce distinct tokens with single-character lexemes
(("(") (make-token "open" ch trim newpos))
((")") (make-token "close" ch trim newpos))
;; left and right parenthesis are handled in the same
;; manner: they produce distinct tokens with
;; single-character lexemes
(("(") (make-token "open" ch trim newpos))
((")") (make-token "close" ch trim newpos))
;; strings are delimited by opening and closing ASCII double
;; quotes, which can be escaped with a backslash
(("\"") (let ((str (js:match-regexp "^\"(|.*?[^\\\\])\""
trim)))
(or str (parse-error
src pos "missing closing string delimiter"))
;; a string token consists of the entire string
;; including quotes as its lexeme, but its value will
;; be the value of the string without quotes due to
;; the `str' match group (see `token')
(make-token "string" str trim newpos)))
;; strings are delimited by opening and closing ASCII
;; double quotes, which can be escaped with a
;; backslash
(("\"") (let ((str (js:match-regexp "^\"(|.*?[^\\\\])\""
trim)))
(or str (parse-error
src pos
"missing closing string delimiter"))
;; a string token consists of the entire
;; string including quotes as its lexeme,
;; but its value will be the value of the
;; string without quotes due to the `str'
;; match group (see `token')
(make-token "string" str trim newpos)))
(else
;; anything else is considered a symbol up until whitespace or
;; any of the aforementioned delimiters
(let ((symbol (js:match-regexp "^[^\\s()\"]+"
trim)))
(make-token "symbol" symbol trim newpos))))))))
(else
;; anything else is considered a symbol up until
;; whitespace or any of the aforementioned
;; delimiters
(let ((symbol (js:match-regexp "^[^\\s()\"]+"
trim)))
(make-token "symbol" symbol trim newpos))))))
;; yikes---see notes in docblock with regards to why
;; we're using mutators here
(set! toks (append toks (list (car t))))
(set! src (cadr t))
(set! pos (caddr t))))))
toks))
@ -132,14 +153,7 @@
;; Produce a token and recurse.
;;
;; The token will be concatenated with the result of the mutually
;; recursive procedure `lex'.
;;
;; For the record: I'm not fond of mutual recursion from a clarity
;; standpoint, but this is how the abstraction evolved to de-duplicate
;; code, and I don't much feel like refactoring it.
;; Produce a token, left-truncate src, and update pos.
;;
;; Unlike the JS Prebirth implementation which uses a key/value object,
;; we're just using a simple list.
@ -160,9 +174,9 @@
;; produce token and recurse on `lex', left-truncating the source
;; string to discard what we have already processed
(cons (list (quote token) type lexeme value pos)
(lex (substring src len)
(+ pos len)))))
(list (list (quote token) type lexeme value pos)
(substring src len)
(+ pos len))))
;; various accessor procedures for token lists (we're Prebirth Lisp here,
@ -309,21 +323,21 @@
;;
;; This produces a 1:1 mapping of body XS s-expressions to ES statements,
;; recursively. The heavy lifting is done by `sexp->es'.
(define (body->es xs)
(define (body->es xs ret)
;; recursively process body XS until we're out of pairs
(if (not (pair? xs))
""
(let* ((x (car xs))
(rest (cdr xs))
(more? (pair? rest)))
(more? (or (not ret) (pair? rest))))
;; the result is a semicolon-delimited string of statements, with
;; the final statement prefixed with `return'
;; the final statement prefixed with `return' unless (not ret)
(string-append
" "
(if more? "" "return ") ; prefix with `return' if last body exp
(sexp->es x) ";" ; process current body expression
(if more? "\n" "")
(body->es rest))))) ; recurse
(if more? "" "return ") ; prefix with `return' if last body exp
(sexp->es x) ";" ; process current body expression
(if (pair? rest) "\n" "")
(body->es rest ret))))) ; recurse
;; Compile procedure definition into an ES function definition
@ -334,7 +348,7 @@
(let* ((dfn (cadr t))
(id (tname->id (token-value (car dfn))))
(params (params->es (cdr dfn)))
(body (body->es (cddr t))))
(body (body->es (cddr t) #t)))
;; this is the final format---each procedure becomes its own function
;; definition in ES
(string-append
@ -363,6 +377,19 @@
(("js:console")
(string-append "console.log(" (map sexp->es args) ")"))
;; yes, there are more important things to do until we get to the
;; point where it's worth implementing proper tail calls
(("js:while")
(let ((pred (car args))
(body (cdr args)))
(string-append
"(function(__whilebrk){"
"while (" (sexp->es pred) "){\n"
(body->es body #f) " if (__whilebrk) break;\n"
"}\n"
"})(false)")))
(("js:break") "__whilebrk=true")
;; fortunately ES6+ has native symbol support :)
;; we don't (yet?) need list quoting in Prebirth
(("quote")
@ -377,7 +404,7 @@
(body (cdr args)))
(string-append
"function(" (join ", " (map sexp->es fnargs)) "){\n"
(body->es body)
(body->es body #t)
"}")))
;; simple if statement with optional else, wrapped in a self-executing
@ -430,10 +457,10 @@
(join "" (map (lambda (binding)
(let ((var (car binding))
(init (cadr binding)))
(string-append " const " (sexp->es var)
(string-append " let " (sexp->es var)
" = " (sexp->es init) ";\n")))
bindings))
(body->es body) "\n"
(body->es body #t) "\n"
" })()")))
;; similar to the above, but variables cannot reference one-another
@ -445,7 +472,7 @@
(fargs (join ", " (map sexp->es
(map cadr bindings)))))
(string-append "(function(" fparams "){\n"
(body->es body) "\n"
(body->es body #t) "\n"
"})(" fargs ")")))
;; and here I thought Prebirth Lisp would be simple...but having
@ -469,11 +496,16 @@
(string-append
"case " (sexp->es datum) ":\n"))
data)))
(body->es exprs) "\n"))
(body->es exprs #t) "\n"))
(map car clauses)
(map cdr clauses)))
"}})()")))
(("set!")
(let ((varid (car args))
(val (cadr args)))
(string-append (sexp->es varid) " = " (sexp->es val))))
;; normal procedure application
(else (let* ((idfn (tname->id fn))
(argstr (join ", " (map sexp->es args))))
@ -516,7 +548,7 @@
;; output libprebirth and compiled output, wrapped in a self-executing
;; function to limit scope
(string-append "(function(){"
(js:file->string "build-aux/bootstrap/libprebirth.js") "\n\n"
(js:file->string "libprebirth.js") "\n\n"
(join "\n\n" (map sexp->es ast))
"})();"))

View File

@ -83,6 +83,11 @@ const $$cons = ( item, list ) => _assertList( list ) && [ item ].concat( list )
// First element (car) and rest (cdr) of a Prebirth list (a JS array).
// `_assertPair' is defined elsewhere in this file; presumably it throws
// on a non-pair and returns a truthy value otherwise, so the `&&' only
// ever yields the element / slice — TODO confirm against its definition.
// Note `$$cdr' copies (`slice'); it does not share structure with XS.
const $$car = xs => _assertPair( xs ) && xs[ 0 ];
const $$cdr = xs => _assertPair( xs ) && xs.slice( 1 );
// warning: this should technically set the cdr to the next element, and
// should accept any number of arguments, but that's not what we're doing
// here (note that an empty list is not a pair and therefore has no cdr)
// NOTE(review): `Array#push' returns the new array LENGTH, not the list,
// and an empty DEST yields a fresh `[]' rather than XS, so the return
// value is inconsistent with RnRS `append!' — confirm that no call site
// relies on the return value (only on the in-place mutation of DEST).
// NOTE(review): XS is pushed as a single element, not concatenated.
const $$append$b$ = ( dest, xs ) => ( dest.length === 0 ) ? [] : dest.push( xs );
// warning: blows up if any items are non-lists, whereas the proper RnRS
// implementation will set the cdr to the final item even if it's not a pair
function $$append()
@ -134,16 +139,18 @@ const $$map = ( f, ...xs ) =>
xs.map( x => x[ i ] ) ) );
// Node.js stuff
const fs = require( 'fs' );
// stdin->string
const $$js$file$_$$g$string = ( path ) =>
fs.readFileSync( path ).toString();
// Thin wrappers exposing ECMAScript regex facilities to compiled
// Prebirth Lisp.  `$$js$match' substitutes `false' for JavaScript's
// `null' so a failed match is usable as Scheme #f in boolean contexts.
const $$js$regexp = ( s, opts ) => new RegExp( s, opts );
const $$js$match = ( r, s ) =>
{
    const m = s.match( r );
    return ( m === null ) ? false : m;
};
const $$js$replace = ( r, repl, s ) => s.replace( r, repl );
// Node.js `fs' when available; in the browser (where `require' is
// undefined) fall back to a stub that resolves file contents from a
// `window.fsdata' map keyed by path.  NOTE(review): the stub assumes
// the host page populates `window.fsdata' before any read — confirm.
const fs = ( typeof require !== 'undefined' )
    ? require( 'fs' )
    : { readFileSync: path => window.fsdata[ path ] };

// file->string: slurp an entire file into a string by path
const $$js$file$_$$g$string = ( path ) =>
    fs.readFileSync( path ).toString();
/** =============== end of libprebirth =============== **/

View File

@ -177,10 +177,13 @@ class Parser
* This is the lexer. Whitespace is ignored. The grammar consists of
* simple s-expressions.
*
* This function is mutually recursive with `#_token'. It expects that
* the source SRC will be left-truncated as input is
* processed. POS exists for producing metadata for error
* reporting---it has no impact on parsing.
* Tokens are produced with `#_token'. The source SRC will be
* left-truncated as input is processed. POS exists for producing
* metadata for error reporting---it has no impact on parsing.
*
* This implementation was originally recursive and immutable, but the
* stack was being exhausted, so it was refactored into an inferior
* implementation.
*
* @param {string} src source code
* @param {number} pos position (character offset) in source
@ -189,63 +192,74 @@ class Parser
*/
_lex( src, pos = 0 )
{
// ignore whitespace, if any
const ws = src.match( /^\s+/ ) || [ "" ];
const trim = src.substr( ws[ 0 ].length );
const toks = [];
// adjust position to account for any removed whitespace
pos += ws[ 0 ].length;
// process until EOF (see break)
while ( true ) {
// ignore whitespace, if any
const ws = src.match( /^\s+/ ) || [ "" ];
const trim = src.substr( ws[ 0 ].length );
// EOF and we're done
if ( trim === '' ) {
return [];
}
// adjust position to account for any removed whitespace
pos += ws[ 0 ].length;
// comment until end of line
if ( trim[ 0 ] === ';' ) {
const eol = trim.match( /^(.*?)(\n|$)/ );
return this._token( 'comment', eol[ 1 ], trim, pos );
}
// left and right parenthesis are handled in the same manner: they
// produce distinct tokens with single-character lexemes
if ( trim[ 0 ] === '(' ) {
return this._token( 'open', '(', trim, pos );
}
if ( trim[ 0 ] === ')' ) {
return this._token( 'close', ')', trim, pos );
}
// strings are delimited by opening and closing ASCII double quotes,
// which can be escaped with a backslash
if ( trim[ 0 ] === '"' ) {
const str = trim.match( /^"(|.*?[^\\])"/ );
if ( !str ) {
this._error( src, pos, "missing closing string delimiter" );
// EOF and we're done
if ( trim === '' ) {
break;
}
// a string token consists of the entire string including quotes
// as its lexeme, but its value will be the value of the string
// without quotes due to the `str' match group (see `#_token')
return this._token( 'string', str, trim, pos );
let t = null;
// comment until end of line
if ( trim[ 0 ] === ';' ) {
const eol = trim.match( /^(.*?)(\n|$)/ );
t = this._token( 'comment', eol[ 1 ], trim, pos );
}
// left and right parenthesis are handled in the same manner: they
// produce distinct tokens with single-character lexemes
else if ( trim[ 0 ] === '(' ) {
t = this._token( 'open', '(', trim, pos );
}
else if ( trim[ 0 ] === ')' ) {
t = this._token( 'close', ')', trim, pos );
}
// strings are delimited by opening and closing ASCII double quotes,
// which can be escaped with a backslash
else if ( trim[ 0 ] === '"' ) {
const str = trim.match( /^"(|.*?[^\\])"/ );
if ( !str ) {
this._error( src, pos, "missing closing string delimiter" );
}
// a string token consists of the entire string including quotes
// as its lexeme, but its value will be the value of the string
// without quotes due to the `str' match group (see `#_token')
t = this._token( 'string', str, trim, pos );
}
// anything else is considered a symbol up until whitespace or any
// of the aforementioned delimiters
else {
const symbol = trim.match( /^[^\s()"]+/ );
t = this._token( 'symbol', symbol, trim, pos );
}
const [ tok, newsrc, newpos ] = t;
// add token, left-truncate src, update pos
toks.push( tok );
src = newsrc;
pos = newpos;
}
// anything else is considered a symbol up until whitespace or any
// of the aforementioned delimiters
const symbol = trim.match( /^[^\s()"]+/ );
return this._token( 'symbol', symbol, trim, pos );
return toks;
}
/**
* Produce a token and recurse
*
* The token will be concatenated with the result of the mutually
* recursive method `_lex'.
*
* For the record: I'm not fond of mutual recursion from a clarity
* standpoint, but this is how the abstraction evolved to de-duplicate
* code, and I don't much feel like refactoring it.
* Produce a token, left-truncate src, and update pos
*
* @param {string} type token type
* @param {string|Array} match lexeme match
@ -272,14 +286,13 @@ class Parser
pos: pos
};
// continue producing tokens by recursing, left-truncating the
// source string to discard what we have already processed
return [ token ].concat(
this._lex(
src.substr( lexeme.length ),
( pos + lexeme.length )
)
);
// produce token, left-truncating the source string to discard what
// we have already processed
return [
token,
src.substr( lexeme.length ),
( pos + lexeme.length ),
];
}
};
@ -431,7 +444,7 @@ class Compiler
*
* @return {string} compiled BODY
*/
_bodyToEs( body )
_bodyToEs( body, ret = true )
{
// the body must be an array of expressions (this should always be
// the case unless we have a bug in the compiler)
@ -447,8 +460,8 @@ class Compiler
// the final expression
return js.map( ( s, i ) =>
{
const ret = ( i === ( js.length - 1 ) ) ? "return " : "";
return ` ${ret}${s};`;
const retstmt = ( ret && i === ( js.length - 1 ) ) ? "return " : "";
return ` ${retstmt}${s};`;
} ).join( '\n' );
}
@ -543,6 +556,16 @@ class Compiler
const fnmap = {
'js:console': 'console.log',
// yes, there are more important things to do until we get to the point
// where it's worth implementing proper tail calls
'js:while': ( [ pred, ...body ], stoes, btoes ) =>
"(function(__whilebrk){" +
`while (${stoes(pred)}){\n` +
`${btoes(body, false)} if (__whilebrk) break;\n` +
"}\n" +
"})(false)",
'js:break': () => '__whilebrk=true',
// fortunately ES6+ has native symbol support :)
// we don't (yet?) need list quoting in Prebirth
'quote': ( x, stoes ) => Array.isArray( x.value )
@ -590,7 +613,7 @@ const fnmap = {
'let*': ( [ bindings, ...body ], stoes, btoes ) =>
"(function(){\n" +
bindings
.map( ([ x, val ]) => ` const ${stoes(x)} = ${stoes(val)};\n` )
.map( ([ x, val ]) => ` let ${stoes(x)} = ${stoes(val)};\n` )
.join( '' ) +
btoes( body ) + "\n" +
" })()",
@ -624,7 +647,11 @@ const fnmap = {
btoes( exprs ) + "\n"
).join( '' ) +
"}" +
"})()"
"})()",
// basic mutator (variable assignment)
'set!': ( [ varid, val ], stoes ) =>
`${stoes(varid)} = ${stoes(val)}`,
};