ulambda/bootstrap/prebirth.js

/**
 * Bootstrap Ulambda Scheme ("Prebirth")
 *
 *  Copyright (C) 2017 Mike Gerwitz
 *
 *  This file is part of Ulambda Scheme.
 *
 *  Ulambda Scheme is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as
 *  published by the Free Software Foundation, either version 3 of the
 *  License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * THIS IS TEMPORARY CODE that will be REWRITTEN IN ULAMBDA SCHEME ITSELF
 * after a very basic bootstrap is complete.  It is retained as an important
 * artifact for those who wish to build Ulambda Scheme from scratch without
 * using another version of Ulambda itself.  This is called "self-hosting".
 *
 * Rather than producing a sophisticated self-hosting language, this
 * language will be a terribly incomplete and inadequate version of what
 * will ultimately become a formidable and competent language.
 *
 * I refer to this entire complication process as "Prebirth".¹ The "Birth"
 * of Ulambda Scheme is the act of reimplementing this Prebirth in a
 * Prebirth version of Ulambda itself.  It's the chicken-and-egg paradox,
 * without the paradox.²
 *
 * Some minor terminology:
 *   - AST: Abstract Syntax Tree, a processed form of the CST.
 *   - CST: Concrete Syntax Tree, a 1-1 conversion of source input to
 *          tokens.
 *   - token: an object produced by the lexer that represents a portion of
 *            the input language
 *   - lexer: sometimes called a ``tokenizer''---produces tokens by applying
 *            the grammar to a string of input.
 *   - grammar: a definition of the language (syntax).
 *   - lexeme: the portion of the original source string associated with a
 *             given token.
 *   - LL(0): Left-to-right, Leftmost derivation, 0 tokens lookahead
 *   - sexp: symbolic expression, (involving (lots (of (((parentheses))))))
 *
 * Excited?  Great!  My extemporaneous rambling is costing me more time than
 * I spent making this damn thing!  (No, really, it is.)
 */

'use strict';


/**
 * A very rudimentary (and extremely permissive) LL(0) Lisp parser
 *
 * This provides just enough to get by.  It transforms lists into nested
 * arrays of tokens with some very basic error checking (e.g. for proper
 * nesting).  This is not a general-purpose lisp parser.
 */
class Parser
{
    /**
     * Produce an AST from the given string SRC of sexps
     *
     * This is essentially the CST with whitespace removed.  It first
     * invokes the lexer to produce a token string from the input
     * sexps SRC.  From this, it verifies only proper nesting (that SRC does
     * not close sexps too early and that EOF isn't reached before all sexps
     * are closed) and produces an AST that is an isomorphism of the
     * original sexps.
     *
     * @param {string} src input Lisp
     *
     * @throws {SyntaxError} on improper sexp nesting
     *
     * @return {Array} primitive abstract syntax tree of SRC
     */
    parseLisp( src )
    {
        // token string from lexing
        const toks = this._lex( src );

        // perform a leftmost reduction on the token string
        const [ depth, ast ] = toks.reduce( ( result, token ) =>
        {
            const [ depth, xs, stack ] = result;
            const { type, pos }        = token;

            // there are very few token types to deal with (again, this is
            // a very simple bootstrap lisp)
            switch ( type )
            {
                // ignore comments
                case 'comment':
                   return result;

                // closing parenthesis (end of sexp)
                case 'close':
                    if ( depth === 0 ) {
                        this._error(
                            src, pos, `unexpected closing parenthesis`
                        );
                    }

                    // the sexp is complete; add to the AST, reduce depth
                    const top = stack.pop();
                    top.push( xs );

                    return [ ( depth - 1 ), top, stack ];

                // opening parenthesis (start of sexp)
                case 'open':
                    stack.push( xs );
                    return [ ( depth + 1 ), [], stack ];

                // symbol or primitive; just copy the token in place
                case 'string':
                case 'symbol':
                    xs.push( token );
                    return [ depth, xs, stack ];

                // should never happen unless there's a bug in the tokenizer
                // or we forget a token type above
                default:
                    this._error( src, pos, `unexpected token '${type}'` );
            }
        }, [ 0, [], [] ] );

        // if we terminate at a non-zero depth, that means there
        // are still open sexps
        if ( depth > 0 ) {
            throw SyntaxError(
                `unexpected end of input at depth ${depth}`
            );
        }

        // the result is a set of tokens organized into ES arrays
        // isomorphic to the original sexp structure (the same structure)
        return ast;
    }


    /**
     * Throw a SyntaxError with a window of surrounding source code
     *
     * The "window" is simply ten characters to the left and right of the
     * first character of the source input SRC that resulted in the error.
     * It's a little more than useless.
     *
     * @param {string} src source code (sexps)
     * @param {number} pos position of error
     * @param {string} msg error message
     *
     * @throws {SyntaxError}}
     *
     * @return {undefined}
     */
    _error( src, pos, msg )
    {
        const window = src.substr( pos - 10, pos + 10 )
              .replace( "\n", " " );

        throw new SyntaxError( `${msg}: '${window}'` );
    }


    /**
     * Convert source input into a string of tokens
     *
     * This is the lexer.  Whitespace is ignored.  The grammar consists of
     * simple s-expressions.
     *
     * Tokens are produced with `#_token'. The source SRC will be
     * left-truncated as input is processed.  POS exists for producing
     * metadata for error reporting---it has no impact on parsing.
     *
     * This implementation was originally recursive and immutable, but the
     * stack was being exhausted, so it was refactored into an inferior
     * implementation.
     *
     * @param {string} src source code
     * @param {number} pos position (character offset) in source
     *
     * @return {Array} string of tokens
     */
    _lex( src, pos = 0 )
    {
        const toks = [];

        // process until EOF (see break)
        while ( true ) {
            // ignore whitespace, if any
            const ws   = src.match( /^\s+/ ) || [ "" ];
            const trim = src.substr( ws[ 0 ].length );

            // adjust position to account for any removed whitespace
            pos += ws[ 0 ].length;

            // EOF and we're done
            if ( trim === '' ) {
                break;
            }

            let t = null;

            // comment until end of line
            if ( trim[ 0 ] === ';' ) {
                const eol = trim.match( /^(.*?)(\n|$)/ );
                t = this._token( 'comment', eol[ 1 ], trim, pos );
            }

            // left and right parenthesis are handled in the same manner: they
            // produce distinct tokens with single-character lexemes
            else if ( trim[ 0 ] === '(' ) {
                t = this._token( 'open', '(', trim, pos );
            }
            else if ( trim[ 0 ] === ')' ) {
                t = this._token( 'close', ')', trim, pos );
            }

            // strings are delimited by opening and closing ASCII double quotes,
            // which can be escaped with a backslash
            else if ( trim[ 0 ] === '"' ) {
                const str = trim.match( /^"(|(?:.|\n)*?[^\\])"/ );
                if ( !str ) {
                    this._error( src, pos, "missing closing string delimiter" );
                }

                // a string token consists of the entire string including quotes
                // as its lexeme, but its value will be the value of the string
                // without quotes due to the `str' match group (see `#_token')
                t = this._token( 'string', str, trim, pos );
            }

            // anything else is considered a symbol up until whitespace or any
            // of the aforementioned delimiters
            else {
                const symbol = trim.match( /^[^\s()"]+/ );
                t = this._token( 'symbol', symbol, trim, pos );
            }

            const [ tok, newsrc, newpos ] = t;

            // add token, left-truncate src, update pos
            toks.push( tok );
            src = newsrc;
            pos = newpos;
        }

        return toks;
    }


    /**
     * Produce a token, left-truncate src, and update pos
     *
     * @param {string}       type  token type
     * @param {string|Array} match lexeme match
     * @param {string}       src   source code string, left-truncated
     * @param {number}       pos   offset relative to original src
     *
     * @return {Array} string of tokens
     */
    _token( type, match, src, pos )
    {
        const parts = ( Array.isArray( match ) )
            ? match
            : [ match ];

        // the value is the first group of the match (indicating what we
        // are actually interested in), and the lexeme is the full match,
        // which might include, for example, string delimiters
        const [ lexeme, value ] = parts;

        const token = {
            type:   type,
            lexeme: lexeme,
            value:  ( value === undefined ) ? lexeme : value,
            pos:    pos
        };

        // produce token, left-truncating the source string to discard what
        // we have already processed
        return [
            token,
            src.substr( lexeme.length ),
            ( pos + lexeme.length ),
        ];
    }
};


/**
 * Dumb compiler to transform AST into ECMAScript
 *
 * This is a really dumb code generator: it takes the AST and essentially
 * transforms it 1:1 wherever possible into the target language.  There is
 * no intermediate representation (e.g. an ES AST).
 *
 * This is nothing like what we actually want the _ultimate_ compiler to do
 * after Birth, but it gets us to a point where we can self-host on a basic
 * Prebirth language and evolve from there.
 *
 * The code generation can be pretty much summed up by the last line of
 * `Compiler#_cdfn'.
 */
class Compiler
{
    /**
     * Initialize with function map
     *
     * The function map will be used to map certain functions into other
     * names or forms.  For example, `es:console' may map to `console.log'
     * and `if' to an `if' statement+expression.
     *
     * @param {Object} fnmap function map
     */
    constructor( fnmap )
    {
        this._fnmap = fnmap;
    }


    /**
     * Compile AST into ECMAScript
     *
     * Every function is mapped 1:1 to a function in ECMAScript.  So, we
     * just map all root children (which are expected to be Scheme-style
     * shorthand function definitions) to functions.
     *
     * @param {Array} tree root containing top-level function definitions
     */
    compile( tree )
    {
        // map every definition to a ES function definition and delimit them
        // (for readability) by two newlines
        return tree.map( this._sexpToEs.bind( this ) )
            .join( "\n\n" );
    }


    /**
     * Compile procedure definition into a ES function definition
     *
     * This will fail if the given token is not a `define'.
     *
     * @param {Object} t token
     *
     * @return {string} compiled function definition
     */
    _cdfn( t )
    {
        // e.g. (define (foo ...) body)
        const [ , [ { value: name }, ...params ], ...body ] = t;

        const id      = this._idFromName( name );
        const paramjs = this._paramsToEs( params );
        const bodyjs  = this._bodyToEs( body );

        // this is the final format---each procedure becomes its own function
        // definition in ES
        return `function ${id}(${paramjs})\n{\n${bodyjs}\n};`;
    }


    /**
     * Compile parameter list
     *
     * This simply takes the value of the symbol and outputs it (formatted),
     * delimited by commas.
     *
     * @param {Array} args token parameter list
     *
     * @return {string} compiled parameter list
     */
    _paramsToEs( args )
    {
        return args.map( ({ value: name }) => this._idFromName( name ) )
            .join( ", " );
    }


    /**
     * Generate ECMAScript-friendly name from the given id
     *
     * A subset of special characters that are acceptable in Scheme are
     * converted in an identifiable manner; others are simply converted to
     * `$' in a catch-all and therefore could result in conflicts and cannot
     * be reliably distinguished from one-another.  Remember: this is
     * temporary code.
     *
     * @param {string} name source name
     *
     * @return {string} ES-friendly identifier
     */
    _idFromName( name )
    {
        if ( /^\d+$/.test( name ) ) {
            return name;
        }

        // just some common ones; will fall back to `$' below
        const conv = {
            '-': '$_$',
            '?': '$7$',
            '@': '$a$',
            '!': '$b$',
            '>': '$g$',
            '#': '$h$',
            '*': '$k$',
            '<': '$l$',
            '&': '$n$',
            '%': '$o$',
            '+': '$p$',
            '=': '$q$',
            '^': '$v$',
            '/': '$w$',
            '$': '$$',
        };

        if ( name === undefined ) {
            throw SyntaxError( "Missing identifier name" );
        }

        return '$$' + name.replace( /[^a-zA-Z0-9_]/g, c => conv[ c ] || '$' );
    }


    /**
     * Compile body s-expressions into ECMAScript
     *
     * This produces a 1:1 mapping of BODY s-expressions to ES statements,
     * recursively.  The heavy lifting is done by `#_sexpToEs'.
     *
     * @param {Array} body s-expressions representing function body
     *
     * @return {string} compiled BODY
     */
    _bodyToEs( body, ret = true )
    {
        // the body must be an array of expressions (this should always be
        // the case unless we have a bug in the compiler)
        if ( !Array.isArray( body ) ) {
            throw Error( "body must be an Array" );
        }

        // process each s-expression in BODY
        const js = body.map( this._sexpToEs.bind( this ) );

        // the result (that is, an array of compiled s-expressions) is
        // joined semicolon-delimited, with a `return' statement preceding
        // the final expression
        return js.map( ( s, i ) =>
        {
            const retstmt = ( ret && i === ( js.length - 1 ) ) ? "return " : "";
            return `    ${retstmt}${s};`;
        } ).join( '\n' );
    }


    /**
     * Convert s-expression or scalar into ECMAScript
     *
     * T may be either an array of tokens or a primitive token (e.g. string,
     * symbol).  This method is applied recursively to T as needed if T is
     * an array.
     *
     * @param {Array|Object} t tokens representing s-expressions/scalars
     *
     * @return {string} compiled s-expression/scalar
     */
    _sexpToEs( t )
    {
        // just output symbols as identifiers as-is for now
        if ( !Array.isArray( t ) ) {
            switch ( t.type )
            {
                // strings are output as-is (note that we don't escape
                // double quotes, because the method of escaping them is the
                // same in Scheme as it is in ECMAScript---a backslash)
                case 'string':
                    return `"${t.value}"`;

                // symbols have the same concerns as function definitions: the
                // identifiers generated need to be ES-friendly
                case 'symbol':
                    return this._idFromName( t.value );

                default:
                    throw Error( `Cannot compile unknown token \`${t.type}'` );
            }
        }

        if ( t[ 0 ].value === 'define' ) {
            return this._cdfn( t );
        }

        // simple function application (fn ...args)
        const [ { value: fn }, ...args ] = t;

        const mapentry = this._fnmap[ fn ];

        // if the fnmap contains a function entry, then it will handle the
        // remaining processing
        if ( mapentry && ( typeof mapentry === 'function' ) ) {
            return mapentry(
                args,
                this._sexpToEs.bind( this ),
                this._bodyToEs.bind( this )
            );
        }

        // convert all remaining symbols (after the symbol representing the
        // function application) into arguments by parsing their sexps or
        // scalar values
        const idfn   = mapentry || this._idFromName( fn, true );
        const argstr = args.map( arg => this._sexpToEs( arg ) ).join( ", " );

        // final function application
        return `${idfn}(${argstr})`;
    }
}


/**
 * Function/procedure aliases and special forms
 *
 * And here we have what is probably the most grotesque part of this
 * file.  Saved the best for last.
 *
 * This map allows for a steady transition---items can be removed as they
 * are written in Prebirth Lisp.  This should give us a sane (but still
 * simple) environment with which we can start to self-host.
 *
 * String values are simple function aliases.  Function values take over the
 * compilation of that function and allow for defining special forms (in
 * place of macro support).  The first argument to the function is the list
 * of raw arguments (not yet compiled); the second argument is
 * `Compiler#_sexpToEs'; and the third is `Compiler#bodyToEs'.
 *
 * These are by no means meant to be solid implementations; notable
 * deficiencies are documented, but don't expect this to work properly in
 * every case.  They will be replaced with proper R7RS implementations in the
 * future (after Birth).
 *
 * @type {Object}
 */
const fnmap = {
    'es:console': 'console.log',

    // yes, there are more important things to do until we get to the point
    // where it's worth implementing proper tail calls
    'es:while': ( [ pred, ...body ], stoes, btoes ) =>
        "(function(__whilebrk){" +
          `while (${stoes(pred)}){\n` +
            `${btoes(body, false)} if (__whilebrk) break;\n` +
          "}\n" +
        "})(false)",
    'es:break': () => '__whilebrk=true',

    // fortunately ES6+ has native symbol support :)
    // we don't (yet?) need list quoting in Prebirth
    'quote': ( x, stoes ) => Array.isArray( x.value )
        ? (function () { throw SyntaxError(
            "quoting lists is not yet supported; sorry!" )
        } )()
        : `Symbol.for('${stoes(x)}')`,

    'lambda': ( [ args, ...body ], stoes, btoes ) =>
        "function(" + args.map( stoes ).join( ", " ) + "){\n" +
          btoes( body ) +
        "}",

    // simple if statement with optional else, wrapped in a self-executing
    // function to simplify code generation (e.g. returning an if)
    'if': ( [ pred, t, f ], stoes ) =>
        "(function(){" +
          `if (_truep(${stoes(pred)})){return ${stoes(t)};}` +
          ( ( f === undefined ) ? '' : `else{return ${stoes(f)};}` ) +
        "})()",

    // and short-circuits, so we need to implement it as a special form
    // rather than an alias
    'and': ( args, stoes ) =>
        "(function(__and){\n" +
            args.map( ( expr, i ) =>
                `__and = ${stoes(expr)}; ` +
                `if (!_truep(__and)) return false;\n`
            ).join( '' ) +
        `return __and;})()`,

    // or short-circuits, so we need to implement it as a special form
    // rather than an alias
    'or': ( args, stoes ) =>
        "(function(__or){\n" +
            args.map( ( expr, i ) =>
                `__or = ${stoes(expr)}; ` +
                `if (_truep(__or)) return __or;\n`
            ).join( '' ) +
        "return false;})()",

    // (let ((binding val) ...) ...body), compiled as a self-executing
    // function which allows us to easily represent the return value of the
    // entire expression while maintaining local scope
    'let*': ( [ bindings, ...body ], stoes, btoes ) =>
        "(function(){\n" +
            bindings
                .map( ([ x, val ]) => `    let ${stoes(x)} = ${stoes(val)};\n` )
                .join( '' ) +
            btoes( body ) + "\n" +
        "    })()",

    // similar to the above, but variables cannot reference one-another
    'let': ( [ bindings, ...body ], stoes, btoes ) =>
        "(function(" +
            bindings.map( ([ x ]) => stoes( x ) ).join( ", " ) +
        "){\n" +
            btoes( body ) + "\n" +
        "})(" +
            bindings.map( ([ , val ]) => stoes( val ) ).join( ", " ) +
        ")",

    // and here I thought Prebirth Lisp would be simple...but having `case'
    // support really keeps things much more tidy, so here we are (note that
    // it doesn't support the arrow form, nor does it support expressions as
    // data)
    'case': ( [ key, ...clauses ], stoes, btoes ) =>
        "(function(){" +
            `const _key=${stoes(key)};\n` +
            "switch (_key){\n" +
                clauses.map( ([ data, ...exprs ]) =>
                    // warning: doesn't support expressions as data!
                    ( ( data.lexeme === "else" )
                        ? "default:\n"
                        : data.map(
                            datum => `case ${stoes(datum)}:\n`
                        ).join( '' )
                    ) +
                    btoes( exprs ) + "\n"
                ).join( '' ) +
            "}" +
        "})()",

    // basic mutator (variable assignment)
    'set!': ( [ varid, val ], stoes ) =>
        `${stoes(varid)} = ${stoes(val)}`,
};


/**
 * Facade to make compilation as easy as `Prebirth#compile'
 */
class Prebirth
{
    /**
     * Parse and compile SRC.
     *
     * The output will be wrapped in a self-executing function to restrict
     * scope, and will be prefixed with LIB (which should be libprebirth.js).
     *
     * @param {string} src Prebirth Lisp source to compile
     * @param {string} lib libprebirth.js
     *
     * @return {string} compiler output with trailing newline
     */
    compile( src, lib )
    {
        const p = new Parser();
        const c = new Compiler( fnmap );

        const tree = p.parseLisp( src );

        // output libprebirth and compiled output, wrapped in a self-executing
        // function to limit scope
        return "(function(){" +
            lib + '\n\n' +
            c.compile( tree ) +
            "})();\n";
    }
}


/* If we're running in a CommonJS environment (e.g. Node.js), export our
/* entrypoint.
 */
if ( typeof module !== 'undefined' ) {
   module.exports = Prebirth;
}


/*
 * Prebirth was originally intended to be run via the command line using
 * Node.js.  But it doesn't have to be.  If you want, feel free to run it in
 * your web browser; you'll just have to instantiate your own Prebirth.
 */
( function ()
{
    if ( ( typeof process === 'undefined' )
       || ( ( typeof module !== 'undefined' ) && ( module.id !== '.' ) ) )
          return;

    const fs  = require( 'fs' );
    const src = fs.readFileSync( '/dev/stdin' ).toString();
    const lib = fs.readFileSync( __dirname + '/libprebirth.js' ).toString();

    const prebirth = new Prebirth();

    // compile and output to stdout
    process.stdout.write( prebirth.compile( src, lib ) );
} )();


/*
 * Now that we have output, the next step is the hard part: rewriting this
 * file in Prebirth Lisp.  As I mentioned, this process is called
 * "Birth".  It's at this point that we have to decide on basic
 * abstractions---we are starting from scratch.  The initial implementation
 * is therefore unlikely to be as concise and elegant as Prebirth
 * itself---it will be refactored.
 *
 * Here is an example Hello, World!:
 *
 *  (define (hello x)
 *    (es:console "Hello," x, "!"))
 *
 *
 * ¹ This term should invoke visuals of an abstract being entering existence
 *   in some strange nonlinear-time² kind of way.  If you thought of
 *   something less pleasant, well, I'm sorry you went through that.
 *
 * ² Because we're dealing with nonlinear time!¹  This would be some bizarre
 *   recursive footnote crap if it weren't for that.²
 */