;;;; ECMAScript compiler for Rebirth Lisp ;;;; ;;;; Copyright (C) 2017, 2018 Mike Gerwitz ;;;; ;;;; This file is part of Ulambda Scheme. ;;;; ;;;; Ulambda Scheme is free software: you can redistribute it and/or modify ;;;; it under the terms of the GNU Affero General Public License as ;;;; published by the Free Software Foundation, either version 3 of the ;;;; License, or (at your option) any later version. ;;;; ;;;; This program is distributed in the hope that it will be useful, ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;;;; GNU General Public License for more details. ;;;; ;;;; You should have received a copy of the GNU Affero General Public License ;;;; along with this program. If not, see . ;;;; ;;; Commentary: ;;; THIS IS BOOTSTRAP CODE INTENDED FOR USE ONLY IN REBIRTH. ;;; ;;; ;;; This defines the lexer, parser, and core of the compiler. Note that ;;; most of the actual compiling happens with the macros defined in ;;; `es.scm'. ;;; ;;; The core structure persists from Birth, along with much of the original ;;; code. Certain parts have been simplified or moved into other files, ;;; while other parts have been extended. ;;; ;;; Code: ;; Convert source input into a string of tokens. ;; ;; This is the lexer. Whitespace is ignored. The grammar consists of ;; simple s-expressions. ;; ;; Tokens are produced with `make-token'. The source SRC will be ;; left-truncated as input is processed. POS exists for producing metadata ;; for error reporting---it has no impact on parsing. ;; ;; This implementation was originally recursive and immutable, but the stack ;; was being exhausted, so it was refactored into an inferior ;; implementation. Note the use of `es:while' and `es:break'---these are ;; quick fixes to the problem of stack exhaustion in browsers (where we have ;; no control over the stack limit); proper tail call support will come ;; later when we have a decent architecture in place. ;; ;; The result is a list of tokens. See `token' for the format. (define (lex src pos) (define (es:match-regexp re s) (es:match (es:regexp re) s)) (let ((toks (list))) (es:while #t ; browser stack workaround (let* ((ws (or (es:match-regexp "^\\s+" src) (list ""))) (ws-len (string-length (car ws))) (trim (substring src ws-len)) ; ignore whitespace, if any (newpos (+ pos ws-len))) ; adj pos to account for removed ws (if (string=? "" trim) (es:break) ; EOF and we're done ;; normally we'd use `string-ref' here, but then we'd have to ;; implement Scheme characters, so let's keep this simple and keep ;; with strings (let* ((ch (substring trim 0 1)) (t (case ch ;; comments extend until the end of the line ((";") (let ((eol (es:match-regexp "^(.*?)(\\n|$)" trim))) (make-token "comment" (cadr eol) trim newpos))) ;; left and right parenthesis are handled in the same ;; manner: they produce distinct tokens with ;; single-character lexemes (("(") (make-token "open" ch trim newpos)) ((")") (make-token "close" ch trim newpos)) ;; strings are delimited by opening and closing ASCII ;; double quotes, which can be escaped with a ;; backslash (("\"") (let ((str (es:match-regexp "^\"(|(?:.|\\\n)*?[^\\\\])\"" trim))) (or str (parse-error src pos "missing closing string delimiter")) ;; a string token consists of the entire ;; string including quotes as its lexeme, ;; but its value will be the value of the ;; string without quotes due to the `str' ;; match group (see `token') (make-token "string" str trim newpos))) (else ;; anything else is considered a symbol up until ;; whitespace or any of the aforementioned ;; delimiters (let ((symbol (es:match-regexp "^[^\\s()\"]+" trim))) (make-token "symbol" symbol trim newpos)))))) ;; yikes---see notes in docblock with regards to why ;; we're using mutators here (set! toks (append toks (list (car t)))) (set! src (cadr t)) (set! pos (caddr t)))))) toks)) ;; Throw an error with a window of surrounding source code. ;; ;; The "window" is simply ten characters to the left and right of the ;; first character of the source input SRC that resulted in the error. ;; It's a little more than useless. (define (parse-error src pos msg) (let ((window (substring src (- pos 10) (+ pos 10)))) (error (string-append msg " (pos " pos "): " window) src))) ;; Produce a token, left-truncate src, and update pos. ;; ;; Unlike the JS Prebirth implementation which uses a key/value object, ;; we're just using a simple list. ;; ;; The expected arguments are: the token type TYPE, the match group or ;; string MATCH, left-truncated source code SRC, and the position POS ;; relative to the original source code. (define (make-token type match src pos) (let* ((parts (if (list? match) match (list match match))) (lexeme (car parts)) ;; the value is the first group of the match (indicating what we ;; are actually interested in), and the lexeme is the full match, ;; which might include, for example, string delimiters (value (or (and (pair? (cdr parts)) (cadr parts)) lexeme)) (len (string-length lexeme))) ;; produce token and recurse on `lex', left-truncating the source ;; string to discard what we have already processed (list (list (quote token) type lexeme value pos) (substring src len) (+ pos len)))) ;; various accessor procedures for token lists (we're Birth Lisp here, ;; so no record support or anything fancy!) (define (token? t) (and (pair? t) (symbol=? (quote token) (car t)))) (define (token-type t) (cadr t)) (define (token-lexeme t) (caddr t)) (define (token-value t) (cadddr t)) (define (token-pos t) (caddddr t)) ;; Produce an AST from the given string SRC of sexps ;; ;; This is essentially the CST with whitespace removed. It first invokes ;; the lexer to produce a token string from the input sexps SRC. From this, ;; it verifies only proper nesting (that SRC does not close sexps too early ;; and that EOF isn't reached before all sexps are closed) and produces an ;; AST that is an isomorphism of the original sexps. (define (parse-lisp src) ;; accessor methods to make you and me less confused (define (ast-depth ast) (car ast)) (define (ast-tree ast) (cadr ast)) (define (ast-stack ast) (caddr ast)) ;; perform a leftmost reduction on the token string (define (toks->ast toks) (fold (lambda (token result) (let ((depth (ast-depth result)) (xs (ast-tree result)) (stack (ast-stack result)) (type (token-type token)) (pos (token-pos token))) ;; there are very few token types to deal with (again, this is a ;; very simple bootstrap lisp) (case type ;; ignore comments (("comment") result) ;; when beginning a new expression, place the expression ;; currently being processed onto a stack, allocate a new list, ;; and we'll continue processing into that new list (("open") (list (+ depth 1) (list) (cons xs stack))) ;; once we reach the end of the expression, pop the parent off of ;; the stack and append the new list to it (("close") (if (zero? depth) (parse-error src pos "unexpected closing parenthesis") (list (- depth 1) (append (car stack) (list xs)) (cdr stack)))) ;; strings and symbols (we cheat and just consider everything, ;; including numbers and such, to be symbols) are just copied ;; in place (("string" "symbol") (list depth (append xs (list token)) stack)) ;; we should never encounter anything else unless there's a bug ;; in the tokenizer or we forget a token type above (else (parse-error src pos (string-append "unexpected token `" type "'")))))) (list 0 (list) (list)) ; initial 0 depth; empty tree; expr stack toks)) ;; lex the input SRC and pass it to `toks->ast' to generate the AST; ;; if the depth is non-zero after we're done, then we're unbalanced. (let* ((toks (lex src 0)) (ast (toks->ast toks))) (if (zero? (ast-depth ast)) (ast-tree ast) ;; if we terminate at a non-zero depth, that means there ar still ;; open sexps (error (string-append "unexpected end of input at depth " (ast-depth ast)))))) ;; Generate ECMAScript-friendly parameter name for the given token T. ;; ;; The generated name will not have any environment references and is ;; suitable only for the immediate scope. (define (tparam->es t) (tname->id (token-value t))) ;; Predicate determining whether NAME should be output verbatim as an ;; ECMAScript identifier. ;; ;; This only returns #t for numbers. (define (tname-verbatim? name) (es:match (es:regexp "^-?(\\d+(\\.\\d+)?|\\.\\d+)$") name)) ;; Generate ECMAScript to reference the variable associated with the token T ;; in the current environment. ;; ;; The "current" environment is relative to whatever context into which the ;; caller places this generated code---that is, the environment is ;; resolved by the runtime environment. ;; ;; If macro support is not yet compiled in, then this returns the identifier ;; name _without_ the environment, just as Birth. (define (env-ref t) (let ((name (if (token? t) (token-value t) t))) (if (tname-verbatim? name) name (cond-expand (cdfn-macro (string-append "_env." (tname->id name))) (else (tname->id name)))))) ;; Generate ECMAScript-friendly name from the given id. ;; ;; A subset of special characters that are acceptable in Scheme are ;; converted in an identifiable manner; others are simply converted to `$' ;; in a catch-all and therefore could result in conflicts and cannot be ;; reliably distinguished from one-another. Remember: this is temporary ;; code. (define (tname->id name) (if (tname-verbatim? name) name (string-append "$$" (es:replace (es:regexp "[^a-zA-Z0-9_]" "g") (lambda (c) (case c (("-") "$_$") (("?") "$7$") (("@") "$a$") (("!") "$b$") ((">") "$g$") (("#") "$h$") (("*") "$k$") (("<") "$l$") (("&") "$n$") (("%") "$o$") (("+") "$p$") (("=") "$q$") (("^") "$v$") (("/") "$w$") (("$") "$$") (else "$"))) name)))) ;; Join a list of strings XS on a delimiter DELIM (define (join delim xs) (if (pair? xs) (fold (lambda (x str) (string-append str delim x)) (car xs) (cdr xs)) "")) ;; Compile parameter list. ;; ;; This takes the value of the symbol and outputs it (formatted), delimited ;; by commas. ;; ;; Since we do not support actual pairs (yet), the "." syntax that normally ;; denotes the cdr is retained and presents itself here. The form "(arg1, ;; arg2 . rest)" creates a list `rest' containing all remaining arguments ;; after that point. Conveniently, ECMAScript Harmony supports this ;; natively with the "..." syntax. (define (params->es params) (define (%param-conv params) (let* ((param (car params)) (name (token-value param)) (id (tname->id name)) (rest (cdr params))) (if (string=? name ".") (list (string-append "..." (car (%param-conv rest)))) (if (pair? rest) (cons id (%param-conv rest)) (list id))))) (if (pair? params) (join "," (%param-conv params)) "")) ;; Compile body s-expressions into ECMAScript ;; ;; This produces a 1:1 mapping of body XS s-expressions to ES statements, ;; recursively. The heavy lifting is done by `sexp->es'. (define (body->es xs ret) ;; recursively process body XS until we're out of pairs (if (not (pair? xs)) "" (let* ((x (car xs)) (rest (cdr xs)) (more? (or (not ret) (pair? rest)))) ;; the result is a semicolon-delimited string of statements, with ;; the final statement prefixed with `return' unless (not ret) (string-append " " (if more? "" "return ") ; prefix with `return' if last body exp (sexp->es x) ";" ; process current body expression (if (pair? rest) "\n" "") (body->es rest ret))))) ; recurse ;; Place parameters PARAMS into the current environment. ;; ;; This is ugly so that Rebirth can support multiple implementations at ;; once---those with environment support and those without. (define (env-params params) (join "\n" (map (lambda (param) (if (string=? (token-value param) ".") "" ; next param is the cdr (string-append (env-ref param) " = " (tparam->es param) ";"))) params))) ;; Compile variable or procedure definition into ES ;; ;; This performs a crude check to determine whether a procedure definition ;; was supplied: if the cadr of the given token T is itself token, then it ;; is considered to be a variable. (define (cdfn t) (if (token? (cadr t)) (cdfn-var t) ;; (define foo ...) (cdfn-proc t #f))) ;; (define (foo ...) ...) ;; Compile variable definition into ES ;; ;; This compiles the token T into a simple let-assignment. (define (cdfn-var t) (let* ((dfn (cadr t)) (id (tname->id (token-value dfn))) (value (sexp->es (caddr t)))) (string-append "let " id "=" value ";_env." id " = " id))) ;; Compile procedure definition into an ES function definition ;; ;; This will fail if the given token is not a `define'. ;; ;; The output does something peculiar: it not only assigns to the active ;; scope, but also to the root of the environment, which has the effect of ;; making the procedure available to _everything_. The reason for this is a ;; kluge to make procedures available to macros during compilation without ;; having to wait for a rebirth repass. But this does have its issues and ;; it's important to understand that this is a temporary solution until ;; Ulambda has some level of static analysis. ;; ;; Care needs to be taken to make sure, regardless of scope, procedures of ;; the same name are not defined if used within macros, otherwise the latter ;; (again, regardless of scope) in the file will take precedence. This ;; behavior will not be observed by execution of compiled code, though, ;; because the scope will have the correct version. However, if a procedure ;; is _not_ in scope, then rather than being undefined, the one assigned to ;; root would be available. ;; ;; The generated ECMAScript is evaluated immediately to make it available to ;; macros during the compilation process. (define (cdfn-proc t id-override) ;; e.g. (define (foo ...) body) (let* ((dfn (cadr t)) (id (or id-override (tname->id (token-value (car dfn))))) (named? (not (string=? id ""))) (params (cdr dfn)) (fparams (params->es params)) (fenv (env-params params)) (body (body->es (cddr t) #t))) ;; this is the final format---each procedure becomes its own function ;; definition in ES (let ((es (string-append "function " id "(" fparams ")\n{" "return (function(_env){\n" fenv "\n" body "\n})(Object.create(_env));}" (if named? (string-append ";_env." id " = " id ";_env.root." id " = " id ";") "")))) ;; Immediately evaluate to make available to macros during ;; compilation. See procedure notes above. (cond-expand (string->es (if named? (string->es "eval($$es)")))) es))) ;; Quote an expression ;; ;; If SEXP is a token, produce an ECMAScript Symbol. Otherwise, ;; recursively apply to each element in the list. ;; ;; TODO: This implementation isn't wholly correct---numbers, for example, ;; should not be converted to symbols, as they already are one. (define (quote-sexp sexp) (if (token? sexp) (case (token-type sexp) (("string") (sexp->es sexp)) (else (string-append "Symbol.for('" (token-value sexp) "')"))) (string-append "[" (join "," (map quote-sexp sexp)) "]"))) ;; Quasiquote an expression ;; ;; A quasiquoted expression acts just like a quoted expression with one ;; notable exception---quoting can be escaped using special forms. For ;; example, each of these are equivalent: ;; ;; (quasiquote (a 1 2 (unquote (eq? 3 4)))) ;; (list (quote a) 1 2 (eq? 3 4)) ;; (quasiquote (a (unquote-splicing (list 1 2)) (unquote (eq? 3 4)))) ;; ;; TODO/WARNING: Normally "(quasiquote a (unquote-splicing b))" would ;; produce "(a . b)" in a proper Lisp, but we do not yet support proper ;; pairs at the time that this procedure was written; all cdrs are assumed ;; to be lists. So do not do that---always splice lists. (define (quasiquote-sexp sexp) ;; get type of token at car of pair, unless not a pair (define (%sexp-maybe-type sexp) (and (pair? sexp) (token? (car sexp)) (token-value (car sexp)))) ;; recursively process the sexp, handling various types of unquoting (define (%quote-maybe sexp delim) (if (pair? sexp) (let* ((item (car sexp)) (rest (cdr sexp)) (type (%sexp-maybe-type item)) (add-delim (not (or (string=? type "unquote-splicing") (string=? type "unquote@"))))) (string-append (case type ;; escape quoting, nest within (("unquote") (string-append (if delim "," "") (sexp->es (cadr item)))) ;; escape quoting, splice list into parent expression ;; (lazy kluge warning), along with an alias for brevity ;; given that we lack the ",@" syntax right now (("unquote-splicing" "unquote@") (string-append "]).concat(" (sexp->es (cadr item)) ").concat([")) ;; anything else, we're still quasiquoting recursively (else (string-append (if delim "," "") (quasiquote-sexp item)))) ;; continue processing this list (%quote-maybe rest add-delim))) "")) ;; tokens fall back to normal quoting (if (token? sexp) (quote-sexp sexp) (string-append "([" (%quote-maybe sexp #f) "])"))) ;; Statically expand expressions based on implementation features ;; ;; Support for `cond-expand' allows Rebirth to introduce new features each ;; time that it is compiled. If matched, expressions will be evaluated as ;; if they were entered in place of the `cond-expand' itself; otherwise, ;; the entire `cond-expand' expression as a whole will be discarded. ;; ;; Birth will always discard `cond-expand' expressions unless they contain ;; an `else' clause, which permits us to compile on the first pass without ;; error. (define (expand-cond-expand args) (if (pair? args) (let* ((clause (car args)) (feature (token-value (car clause))) (body (cdr clause))) ;; now we get meta (cond-expand (string->es (case feature (("string->es" "else") (body->es body #f)) (else (if (es:defined? feature) (body->es body #f) (expand-cond-expand (cdr args)))))) ;; if we're not yet compiled with Rebirth, then string->es will ;; not yet be available---but it _will_ be in Rebirth, so ;; compile cond-expand such that it marks it as supported (else (case feature ;; these two are always supported in Rebirth Lisp (("string->es" "else") (body->es body #f)) ;; keep recursing until we find something (this allows us to ;; short-circuit, most notably with "else") (else (expand-cond-expand (cdr args))))))) "")) ;; Determine whether the given name NAME represents a macro. ;; ;; If `string->es' is not yet supported, then this procedure always ;; yields `#f'. Otherwise, the compiler runtime `_env.macros' is consulted. ;; ;; See `cdfn-macro' for more information. (define (macro? name) (cond-expand (string->es (string->es "_env.macros[$$name] !== undefined")) (else #f))) ;; Determine if FN is a procedure or macro and apply it accordingly with ;; arguments ARGS. ;; ;; These actions represent two separate environments: If a macro, then the ;; call needs to be executed immediately within the context of the compiler ;; runtime. Otherwise, procedure applications are simply compiled to be ;; produced with the rest of the compiler output and will be run at a later ;; time within the context of the compiled program. (define (apply-proc-or-macro fn args) (if (macro? fn) (string->es "_env.macros[$$fn].apply(null,$$args)") ;; Procedures are produced as part of the compiler output. (let ((argstr (join ", " (map sexp->es args)))) (string-append (env-ref fn) "(" argstr ")")))) ;; Primitive special forms. ;; ;; These are forms that cannot be re-written as macros because of ;; chicken-and-egg issues. Since the Rebirth compiler is temporary, we're ;; not going to worry about getting rid of the rest of these. ;; ;; String values are simple function aliases. Function values take over ;; the compilation of that function and allow for defining special forms ;; (in place of macro support). The first argument FN is the name of the ;; function/procedure/form, and ARGS is the list of arguments. ;; ;; For the forms previously defined here in Birth, see `fnmap-premacro' in ;; `es.scm'. (define (fnmap fn args t) (case fn ;; output raw code into the compiled ECMAScript (what could go wrong?) (("string->es") (token-value (car args))) ;; very primitive cond-expand (("cond-expand") (expand-cond-expand args)) ;; Note that the unquote forms are only valid within a quasiquote; see ;; that procedure for the handling of those forms. Since we do not ;; support the special prefix form, we also offer "`quote" as a ;; shorthand for quasiquote. (("quote") (quote-sexp (car args))) (("quasiquote" "`quote") (quasiquote-sexp (car args))) (("define") (cdfn t)) (("define-macro") (cdfn-macro t)) ; not defined until string->es cond ;; Defining `include' is trivial right now since we're not doing any ;; sort of static analysis---we just need to start compilation of the ;; requested file and inline it right where we are. Note that this ;; doesn't enclose the file in `begin', so this isn't yet proper. (("include") (rebirth->ecmascript (parse-lisp (es:file->string (token-value (car args)))))) ;; If we have macro support (`cdfn-macro'), then assume that they exist ;; and try to use them; otherwise, continue to use built-in forms, which ;; have been moved into `fnmap-premacro'). (else (cond-expand (cdfn-macro (apply-proc-or-macro fn args)) (else (fnmap-premacro fn args t)))))) ;; Convert s-expressions or scalar into ECMAScript ;; ;; T may be either an array of tokens or a primitive token (e.g. string, ;; symbol). This procedure is applied recursively to T as needed if T is ;; a list. (define (sexp->es t) (if (not (list? t)) (error "unexpected non-list for sexp->es token")) (if (token? t) (case (token-type t) ;; strings output as-is (note that we don't escape double quotes, ;; because the method of escaping them is the same in Scheme as it ;; is in ECMAScript---a backslash) (("string") (string-append "\"" (token-value t) "\"")) ;; symbols have the same concerns as procedure definitions: the ;; identifiers generated need to be ES-friendly (("symbol") (env-ref t)) (else (error (string-append "cannot compile unknown token `" (token-type t) "'")))) ;; otherwise, process the expression (fnmap (token-value (car t)) (cdr t) t)))