Add prebirth.js

This is hopefully the beginning of a good thing that I'll actually
finish.  I began planning this project formally just before the beginning of
Aug 2017.

* build-aux/bootstrap/prebirth.js: New file.
master
Mike Gerwitz 2017-08-21 02:20:03 -04:00
parent ecd8b6d9e7
commit 7998296a20
Signed by: mikegerwitz
GPG Key ID: 8C917B7F5DC51BA2
1 changed file with 538 additions and 0 deletions

/**
* Bootstrap Gibble Lisp ("Prebirth")
*
* Copyright (C) 2017 Mike Gerwitz
*
* This file is part of Gibble.
*
* Gibble is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* THIS IS TEMPORARY CODE that will be REWRITTEN IN GIBBLE LISP ITSELF after
* a very basic bootstrap is complete. It is retained as an important
* artifact for those who wish to build Gibble from scratch without using
* another version of Gibble itself. This is called "self-hosting".
*
* Rather than producing a sophisticated self-hosting language, this
* language will be a terribly incomplete and inadequate version of what
* will ultimately become a formidable and competent language.
*
* I refer to this entire compilation process as "Prebirth".¹ The "Birth"
* of Gibble is the act of reimplementing this Prebirth in a Prebirth
* version of Gibble Lisp itself. It's the chicken-and-egg paradox, without
* the paradox.²
*
* Gibble Lisp is _not_ the most primitive language that will be understood
* by the system---it is too high-level. After Birth, the language can
* devolve into something more powerful and workable.
*
* Some minor terminology:
* - AST: Abstract Syntax Tree, a processed form of the CST.
* - CST: Concrete Syntax Tree, a 1-1 conversion of source input to
* tokens.
* - token: an object produced by the lexer that represents a portion of
* the input language
* - lexer: sometimes called a ``tokenizer''---produces tokens by applying
* the grammar to a string of input.
* - grammar: a definition of the language (syntax).
* - lexeme: the portion of the original source string associated with a
* given token.
* - LL(0): Left-to-right, Leftmost derivation, 0 tokens lookahead
* - sexp: symbolic expression, (involving (lots (of (((parentheses))))))
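*
* To make the terminology concrete (an illustrative sketch of the
* token shape produced by `Parser#_token' below, not output copied
* from a run), the input `(foo "bar")' lexes into the token string:
*
*   { type: 'open',   lexeme: '(',     value: '(',   pos: 0  }
*   { type: 'symbol', lexeme: 'foo',   value: 'foo', pos: 1  }
*   { type: 'string', lexeme: '"bar"', value: 'bar', pos: 5  }
*   { type: 'close',  lexeme: ')',     value: ')',   pos: 10 }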
*
* Excited? Great! My extemporaneous rambling is costing me more time than
* I spent making this damn thing! (No, really, it is.)
*/
'use strict';
/**
* A very rudimentary (and extremely permissive) LL(0) Lisp parser
*
* This provides just enough to get by. It transforms lists into nested
* arrays of tokens with some very basic error checking (e.g. for proper
* nesting). This is not a general-purpose lisp parser.
*/
class Parser
{
/**
* Produce an AST from the given string SRC of sexps
*
* This is essentially the CST with whitespace removed. It first
* invokes the lexer to produce a token string from the input
* sexps SRC. From this, it verifies only proper nesting (that SRC does
* not close sexps too early and that EOF isn't reached before all sexps
* are closed) and produces an AST that is an isomorphism of the
* original sexps.
*
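* For example (hand-derived from the reduction below, so consider it
* a sketch), the source `(foo (bar))' yields an AST shaped like
* `[ [ foo-token, [ bar-token ] ] ]'---one ES array per sexp, with
* tokens in place of atoms.
*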
* @param {string} src input Lisp
*
* @throws {SyntaxError} on improper sexp nesting
*
* @return {Array} primitive abstract syntax tree of SRC
*/
parseLisp( src )
{
// token string from lexing
const toks = this._lex( src );
// perform a leftmost reduction on the token string
const [ depth, ast ] = toks.reduce( ( result, token ) =>
{
const [ depth, xs, stack ] = result;
const { type, pos } = token;
// there are very few token types to deal with (again, this is
// a very simple bootstrap lisp)
switch ( type )
{
// closing parenthesis (end of sexp)
case 'close':
if ( depth === 0 ) {
this._error(
src, pos, `Unexpected closing parenthesis`
);
}
// the sexp is complete; add to the AST, reduce depth
const top = stack.pop();
top.push( xs );
return [ ( depth - 1 ), top, stack ];
// opening parenthesis (start of sexp)
case 'open':
stack.push( xs );
return [ ( depth + 1 ), [], stack ];
// symbol or primitive; just copy the token in place
case 'string':
case 'symbol':
xs.push( token );
return [ depth, xs, stack ];
// should never happen unless there's a bug in the tokenizer
// or we forget a token type above
default:
this._error( src, pos, `Unexpected token '${type}'` );
}
}, [ 0, [], [] ] );
// if we terminate at a non-zero depth, that means there
// are still open sexps
if ( depth > 0 ) {
throw new SyntaxError(
`Unexpected end of input at depth ${depth}`
);
}
// the result is a set of tokens organized into ES arrays
// isomorphic to the original sexp structure (the same structure)
return ast;
}
/**
* Throw a SyntaxError with a window of surrounding source code
*
* The "window" is simply ten characters to the left and right of the
* first character of the source input SRC that resulted in the error.
* It's little more than useless.
*
* @param {string} src source code (sexps)
* @param {number} pos position of error
* @param {string} msg error message
*
* @throws {SyntaxError}
*
* @return {undefined}
*/
_error( src, pos, msg )
{
const window = src.substr( Math.max( pos - 10, 0 ), 20 )
.replace( /\n/g, " " );
throw new SyntaxError( `${msg}: '${window}'` );
}
/**
* Convert source input into a string of tokens
*
* This is the lexer. Whitespace is ignored. The grammar consists of
* simple s-expressions.
*
* This function is mutually recursive with `#_token'. It expects that
* the source SRC will be left-truncated as input is
* processed. POS exists for producing metadata for error
* reporting---it has no impact on parsing.
*
* @param {string} src source code
* @param {number} pos position (character offset) in source
*
* @return {Array} string of tokens
*/
_lex( src, pos = 0 )
{
// ignore whitespace, if any
const ws = src.match( /^\s+/ ) || [ "" ];
const trim = src.substr( ws[ 0 ].length );
// adjust position to account for any removed whitespace
pos += ws[ 0 ].length;
// EOF and we're done
if ( trim === '' ) {
return [];
}
// left and right parenthesis are handled in the same manner: they
// produce distinct tokens with single-character lexemes
if ( trim[ 0 ] === '(' ) {
return this._token( 'open', '(', trim, pos );
}
if ( trim[ 0 ] === ')' ) {
return this._token( 'close', ')', trim, pos );
}
// strings are delimited by opening and closing ASCII double quotes,
// which can be escaped with a backslash
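// e.g. `"say \"hi\""' yields a single string token whose lexeme
// keeps the outer quotes and whose value keeps the backslash
// escapes (see `#_token')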
if ( trim[ 0 ] === '"' ) {
const str = trim.match( /^"(|.*?[^\\])"/ );
if ( !str ) {
this._error( src, pos, "missing closing string delimiter" );
}
// a string token consists of the entire string including quotes
// as its lexeme, but its value will be the value of the string
// without quotes due to the `str' match group (see `#_token')
return this._token( 'string', str, trim, pos );
}
// anything else is considered a symbol up until whitespace or any
// of the aforementioned delimiters
const symbol = trim.match( /^[^\s()"]+/ );
return this._token( 'symbol', symbol, trim, pos );
}
/**
* Produce a token and recurse
*
* The token will be concatenated with the result of the mutually
* recursive method `_lex'.
*
* For the record: I'm not fond of mutual recursion from a clarity
* standpoint, but this is how the abstraction evolved to de-duplicate
* code, and I don't much feel like refactoring it.
*
* @param {string} type token type
* @param {string|Array} match lexeme match
* @param {string} src source code string, left-truncated
* @param {number} pos offset relative to original src
*
* @return {Array} string of tokens
*/
_token( type, match, src, pos )
{
const parts = ( Array.isArray( match ) )
? match
: [ match ];
// the value is the first group of the match (indicating what we
// are actually interested in), and the lexeme is the full match,
// which might include, for example, string delimiters
const [ lexeme, value ] = parts;
const token = {
type: type,
lexeme: lexeme,
value: value || lexeme,
pos: pos
};
// continue producing tokens by recursing, left-truncating the
// source string to discard what we have already processed
return [ token ].concat(
this._lex(
src.substr( lexeme.length ),
( pos + lexeme.length )
)
);
}
}
/**
* Dumb compiler to transform AST into ECMAScript
*
* This is a really dumb code generator: it takes the AST and essentially
* transforms it 1:1 wherever possible into the target language.
*
* This is nothing like what we actually want the _ultimate_ compiler to do
* after Birth, but it gets us to a point where we can self-host on a basic
* Prebirth language and evolve from there.
*
* The code generation can be pretty much summed up by the last line of
* `Compiler#_cdfn'.
*/
class Compiler
{
/**
* Compile AST into ECMAScript
*
* Every block is mapped 1:1 to a function in ECMAScript. So, we just
* map all root children (which are expected to be block definitions) to
* functions.
*
* @param {Array} tree root of tree containing top-level block definitions
*
* @return {string} compiled ECMAScript source
*/
compile( tree )
{
// map every definition to an ES function definition and delimit them
// (for readability) by two newlines
return tree.map( this._cdfn.bind( this ) )
.join( "\n\n" ) + "\n";
}
/**
* Compile block definition into an ES function definition
*
* This will fail if the given token is not a `define-block'.
*
* @param {Array} t token list representing a `define-block' sexp
*
* @return {string} compiled block definition
*/
_cdfn( t )
{
this.assertApply( t, 'define-block' );
// e.g. (define-block <foo> ((input ...)) body)
const [ , { value: name }, desc, ...body ] = t;
const id = this._idFromName( name );
const bodyjs = this._bodyToEs( body );
// this is the final format---each block becomes its own function
// definition
return `function ${id}()\n{\n${bodyjs}\n};`;
}
/**
* Generate ECMAScript-friendly name from the given id
*
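* For example, `<js:console>' becomes `$js$console$': every
* character outside of `[a-zA-Z0-9_]' is replaced with `$'.
*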
* @param {string} name source name
*
* @return {string} ES-friendly identifier
*/
_idFromName( name )
{
return name.replace( /[^a-zA-Z0-9_]/g, '$' );
}
/**
* Compile body s-expressions into ECMAScript
*
* This produces a 1:1 mapping of BODY s-expressions to ES statements,
* recursively. The heavy lifting is done by `#_sexpToEs'.
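*
* For instance (a hand-derived sketch), a two-expression body
* compiles into something shaped like ` f(); return g();'---only
* the final expression's value is returned.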
*
* @param {Array} body s-expressions representing block body
*
* @return {string} compiled BODY
*/
_bodyToEs( body )
{
// the body must be an array of expressions (this should always be
// the case unless we have a bug in the compiler)
if ( !Array.isArray( body ) ) {
throw Error( "body must be an Array" );
}
// process each s-expression in BODY
const js = body.map( this._sexpToEs.bind( this ) );
// the result (that is, an array of compiled s-expressions) is
// joined semicolon-delimited, with a `return' statement preceding
// the final expression
return js.reduce( ( result, s, i ) =>
{
const ret = ( i === ( js.length - 1 ) ) ? "return " : "";
return result + " " + ret + s + ";";
}, "" );
}
/**
* Convert s-expression or scalar into ECMAScript
*
* T may be either an array of tokens or a primitive token (e.g. string,
* symbol). This method is applied recursively to T as needed if T is
* an array.
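*
* As an illustration (hand-derived, not from a run), the block form
* `(<foo> ((x "a") (y "b")))' compiles into `$foo$("a", "b")'.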
*
* @param {Array|Object} t tokens representing s-expressions/scalars
*
* @return {string} compiled s-expression/scalar
*/
_sexpToEs( t )
{
// just output symbols as identifiers as-is for now
if ( !Array.isArray( t ) ) {
switch ( t.type )
{
// strings are output as-is (note that we don't escape
// double quotes, because the method of escaping them is the
// same in Scheme as it is in ECMAScript---a backslash)
case 'string':
return `"${t.value}"`;
// symbols have the same concerns as block definitions: the
// identifiers generated need to be ES-friendly
case 'symbol':
return this._idFromName( t.value );
default:
throw Error( "Cannot compile unknown token `${t.type}'" );
}
}
// only support block form for now, and assume that `fn' is a
// string value (in the future, this doesn't have to be the
// case---fn should be able to be an arbitrary sexp)
const [ { value: fn }, argmap ] = t;
if ( !this._isBlockForm( t ) ) {
throw Error( `\`${fn}' application is not in block form` );
}
// convert all remaining symbols (after the symbol representing the
// function application) into arguments by parsing their sexps or
// scalar values; we're not going to worry about mapping them for
// now; they will be compiled in the order in which they appear
const idfn = this._idFromName( fn );
const args = argmap.map( ([ , v ]) => this._sexpToEs( v ) );
const argstr = args.join( ", " );
// make the dangerous assumption that arguments are ordered
// for now
return `${idfn}(${argstr})`;
}
/**
* Determine whether T represents a block form
*
* Block form is an application of a block, which has a certain
* syntax. Specifically: `(<block> ((key value) ...))'.
*
* @param {*} t hopefully a token list
*
* @return {boolean} whether T represents a block form
*/
_isBlockForm( t )
{
// a block form must be an s-expression (this check must precede
// the destructuring below, which would throw on a non-array)
if ( !Array.isArray( t ) ) {
return false;
}
// the first symbol is the function name, second is an sexp
// containing each of the key/value argument mappings
const [ fn, argmap ] = t;
// enforce block id convention (at least for now)
const isblockid = ( fn !== undefined )
&& /^<[^>]+>$/.test( fn.value );
return ( isblockid && Array.isArray( argmap ) );
}
/**
* Determine whether T is an application of a symbol NAME, or error
*
* @param {*} t hopefully a token or token list
* @param {string} name block name to assert against
*
* @throws {Error} if T is not an application of NAME
*
* @return {undefined}
*/
assertApply( t, name )
{
// an application must be an s-expression
if ( !Array.isArray( t ) ) {
throw Error(
`\`${name}' application expected, found symbol \`${t.value}'`
);
}
// if there's a match, we can stop here
if ( t[ 0 ].value === name ) {
return;
}
// otherwise, provide an informative error of what we found and what
// we should have found
throw Error(
`\`${name}' expected, found \`${t[ 0 ].value}'`
);
}
}
/*
* Prebirth was originally intended to be run via the command line using
* Node.js. But it doesn't have to be. If you want, feel free to run it in
* your web browser; you'll just have to instantiate your own objects.
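*
* For example (file name hypothetical; the script reads Lisp from
* stdin and writes compiled ECMAScript to stdout):
*
*   $ node prebirth.js < hello.pre > hello.js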
*/
( function ()
{
if ( typeof process === 'undefined' )
{
return;
}
const p = new Parser();
const c = new Compiler();
const src = require( 'fs' ).readFileSync( '/dev/stdin' ).toString();
const tree = p.parseLisp( src );
process.stdout.write( c.compile( tree ) );
} )();
/*
* Now that we have output, the next step is the hard part: rewriting this
* file in Prebirth Lisp. As I mentioned, this process is called
* "Birth". It's at this point that we have to decide on basic
* abstractions---we are starting from scratch. The initial implementation
* is therefore unlikely to be as concise and elegant as Prebirth
* itself---it will be refactored.
*
* Here is an example Hello, World!:
*
* (define-block <hello-world>
* ()
* (<js:console> ((message "Hello, world!"))))
*
*
* ¹ This term should invoke visuals of an abstract being entering existence
* in some strange nonlinear-time² kind of way. If you thought of
* something less pleasant, well, I'm sorry you went through that.
*
* ² Because we're dealing with nonlinear time!¹ This would be some bizarre
* recursive footnote crap if it weren't for that.²
*/