diff --git a/build-aux/bootstrap/prebirth.js b/build-aux/bootstrap/prebirth.js new file mode 100644 index 0000000..b648cb6 --- /dev/null +++ b/build-aux/bootstrap/prebirth.js @@ -0,0 +1,538 @@ +/** + * Bootstrap Gibble Lisp ("Prebirth") + * + * Copyright (C) 2017 Mike Gerwitz + * + * This file is part of Gibble. + * + * Gibble is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * THIS IS TEMPORARY CODE that will be REWRITTEN IN GIBBLE LISP ITSELF after + * a very basic bootstrap is complete. It is retained as an important + * artifact for those who wish to build Gibble from scratch without using + * another version of Gibble itself. This is called "self-hosting". + * + * Rather than producing a sophisticated self-hosting language, this + * language will be a terribly incomplete and inadequate version of what + * will ultimately become a formidable and competent language. + * + * I refer to this entire complication process as "Prebirth".¹ The "Birth" + * of Gibble is the act of reimplementing this Prebirth in a Prebirth + * version of Gibble Lisp itself. It's the chicken-and-egg paradox, without + * the paradox.² + * + * Gibble Lisp is _not_ the most primitive language that will be understood + * by the system---it is too high-level. After Birth, the language can + * devolve into something more powerful and workable. 
/*
 * Some minor terminology:
 *   - AST: Abstract Syntax Tree, a processed form of the CST.
 *   - CST: Concrete Syntax Tree, a 1-1 conversion of source input to
 *     tokens.
 *   - token: an object produced by the lexer that represents a portion
 *     of the input language.
 *   - lexer: sometimes called a ``tokenizer''---produces tokens by
 *     applying the grammar to a string of input.
 *   - grammar: a definition of the language (syntax).
 *   - lexeme: the portion of the original source string associated with
 *     a given token.
 *   - LL(0): Left-to-right, Leftmost derivation, 0 tokens lookahead.
 *   - sexp: symbolic expression, (involving (lots (of (((parentheses))))))
 *
 * Excited?  Great!  My extemporaneous rambling is costing me more time
 * than I spent making this damn thing!  (No, really, it is.)
 */

'use strict';


/**
 * A very rudimentary (and extremely permissive) LL(0) Lisp parser
 *
 * This provides just enough to get by.  It transforms lists into nested
 * arrays of tokens with some very basic error checking (e.g. for proper
 * nesting).  This is not a general-purpose lisp parser.
 */
class Parser
{
    /**
     * Produce an AST from the given string SRC of sexps
     *
     * This is essentially the CST with whitespace removed.  It first
     * invokes the lexer to produce a token string from the input
     * sexps SRC.  From this, it verifies only proper nesting (that SRC
     * does not close sexps too early and that EOF isn't reached before
     * all sexps are closed) and produces an AST that is an isomorphism
     * of the original sexps.
     *
     * @param {string} src input Lisp
     *
     * @throws {SyntaxError} on improper sexp nesting
     *
     * @return {Array} primitive abstract syntax tree of SRC
     */
    parseLisp( src )
    {
        // token string from lexing
        const toks = this._lex( src );

        // perform a leftmost reduction on the token string; the
        // accumulator is [ current depth, sexp under construction,
        // stack of enclosing (incomplete) sexps ]
        const [ depth, ast ] = toks.reduce( ( result, token ) =>
        {
            const [ depth, xs, stack ] = result;
            const { type, pos }        = token;

            // there are very few token types to deal with (again, this
            // is a very simple bootstrap lisp)
            switch ( type )
            {
                // closing parenthesis (end of sexp)
                case 'close': {
                    if ( depth === 0 ) {
                        this._error(
                            src, pos, `Unexpected closing parenthesis`
                        );
                    }

                    // the sexp is complete; add to the AST, reduce depth
                    const top = stack.pop();
                    top.push( xs );

                    return [ ( depth - 1 ), top, stack ];
                }

                // opening parenthesis (start of sexp)
                case 'open':
                    stack.push( xs );
                    return [ ( depth + 1 ), [], stack ];

                // symbol or primitive; just copy the token in place
                case 'string':
                case 'symbol':
                    xs.push( token );
                    return [ depth, xs, stack ];

                // should never happen unless there's a bug in the
                // tokenizer or we forget a token type above
                default:
                    this._error( src, pos, `Unexpected token '${type}'` );
            }
        }, [ 0, [], [] ] );

        // if we terminate at a non-zero depth, that means there are
        // still open sexps
        if ( depth > 0 ) {
            throw new SyntaxError(
                `Unexpected end of input at depth ${depth}`
            );
        }

        // the result is a set of tokens organized into ES arrays
        // isomorphic to the original sexp structure (the same structure)
        return ast;
    }


    /**
     * Throw a SyntaxError with a window of surrounding source code
     *
     * The "window" is roughly ten characters to either side of the
     * character at position POS of the source input SRC.
     *
     * @param {string} src source code (sexps)
     * @param {number} pos position of error within SRC
     * @param {string} msg error message
     *
     * @throws {SyntaxError}
     *
     * @return {undefined}
     */
    _error( src, pos, msg )
    {
        // note `substring', not `substr': the second argument is an end
        // index, not a length, and the start is clamped at zero so a
        // near-beginning POS does not index from the end of the string
        const window = src.substring( Math.max( 0, pos - 10 ), pos + 10 )
              .replace( /\n/g, " " );   // flatten _all_ newlines

        throw new SyntaxError( `${msg}: '${window}'` );
    }


    /**
     * Convert source input into a string of tokens
     *
     * This is the lexer.  Whitespace is ignored.  The grammar consists
     * of simple s-expressions.
     *
     * This function is mutually recursive with `#_token'.  It expects
     * that the source SRC will be left-truncated as input is processed.
     * POS exists for producing metadata for error reporting---it has no
     * impact on parsing.
     *
     * @param {string} src source code (left-truncated during recursion)
     * @param {number} pos position (character offset) in original source
     *
     * @return {Array} string of tokens
     */
    _lex( src, pos = 0 )
    {
        // ignore whitespace, if any
        const ws   = src.match( /^\s+/ ) || [ "" ];
        const trim = src.slice( ws[ 0 ].length );

        // adjust position to account for any removed whitespace
        pos += ws[ 0 ].length;

        // EOF and we're done
        if ( trim === '' ) {
            return [];
        }

        // left and right parenthesis are handled in the same manner:
        // they produce distinct tokens with single-character lexemes
        if ( trim[ 0 ] === '(' ) {
            return this._token( 'open', '(', trim, pos );
        }
        if ( trim[ 0 ] === ')' ) {
            return this._token( 'close', ')', trim, pos );
        }

        // strings are delimited by opening and closing ASCII double
        // quotes, which can be escaped with a backslash
        if ( trim[ 0 ] === '"' ) {
            const str = trim.match( /^"(|.*?[^\\])"/ );
            if ( !str ) {
                // SRC here is left-truncated, so index relative to the
                // offending quote rather than the global offset POS
                this._error( trim, 0, "missing closing string delimiter" );
            }

            // a string token consists of the entire string including
            // quotes as its lexeme, but its value will be the value of
            // the string without quotes due to the `str' match group
            // (see `#_token')
            return this._token( 'string', str, trim, pos );
        }

        // anything else is considered a symbol up until whitespace or
        // any of the aforementioned delimiters
        const symbol = trim.match( /^[^\s()"]+/ );
        return this._token( 'symbol', symbol, trim, pos );
    }


    /**
     * Produce a token and recurse
     *
     * The token will be concatenated with the result of the mutually
     * recursive method `_lex'.
     *
     * For the record: I'm not fond of mutual recursion from a clarity
     * standpoint, but this is how the abstraction evolved to
     * de-duplicate code, and I don't much feel like refactoring it.
     *
     * @param {string}       type  token type
     * @param {string|Array} match lexeme match (string or match array)
     * @param {string}       src   source code string, left-truncated
     * @param {number}       pos   offset relative to original src
     *
     * @return {Array} string of tokens
     */
    _token( type, match, src, pos )
    {
        const parts = ( Array.isArray( match ) )
            ? match
            : [ match ];

        // the value is the first group of the match (indicating what we
        // are actually interested in), and the lexeme is the full match,
        // which might include, for example, string delimiters
        const [ lexeme, value ] = parts;

        const token = {
            type:   type,
            lexeme: lexeme,
            value:  value || lexeme,
            pos:    pos,
        };

        // continue producing tokens by recursing, left-truncating the
        // source string to discard what we have already processed
        return [ token ].concat(
            this._lex(
                src.slice( lexeme.length ),
                ( pos + lexeme.length )
            )
        );
    }
};
/**
 * Dumb compiler to transform AST into ECMAScript
 *
 * This is a really dumb code generator: it takes the AST and essentially
 * transforms it 1:1 wherever possible into the target language.
 *
 * This is nothing like what we actually want the _ultimate_ compiler to
 * do after Birth, but it gets us to a point where we can self-host on a
 * basic Prebirth language and evolve from there.
 *
 * The code generation can be pretty much summed up by the last line of
 * `Compiler#_cdfn'.
 */
class Compiler
{
    /**
     * Compile AST into ECMAScript
     *
     * Every block is mapped 1:1 to a function in ECMAScript.  So, we
     * just map all root children (which are expected to be block
     * definitions) to functions.
     *
     * @param {Array} tree root of tree containing top-level block
     *                     definitions
     *
     * @return {string} compiled ECMAScript source
     */
    compile( tree )
    {
        // map every definition to a ES function definition and delimit
        // them (for readability) by two newlines
        return tree.map( this._cdfn.bind( this ) )
            .join( "\n\n" ) + "\n";
    }


    /**
     * Compile block definition into a ES function definition
     *
     * This will fail if the given token is not a `define-block'.
     *
     * @param {Object} t token
     *
     * @return {string} compiled block definition
     */
    _cdfn( t )
    {
        this.assertApply( t, 'define-block' );

        // e.g. (define-block ((input ...)) body); DESC is currently
        // unused but is destructured to skip past it positionally
        const [ , { value: name }, desc, ...body ] = t;

        const id     = this._idFromName( name );
        const bodyjs = this._bodyToEs( body );

        // this is the final format---each block becomes its own
        // function definition
        return `function ${id}()\n{\n${bodyjs}\n};`;
    }


    /**
     * Generate ECMAScript-friendly name from the given id
     *
     * @param {string} name source name
     *
     * @return {string} ES-friendly identifier
     */
    _idFromName( name )
    {
        return name.replace( /[^a-zA-Z0-9_]/g, '$' );
    }


    /**
     * Compile body s-expressions into ECMAScript
     *
     * This produces a 1:1 mapping of BODY s-expressions to ES
     * statements, recursively.  The heavy lifting is done by
     * `#_sexpToEs'.
     *
     * @param {Array} body s-expressions representing block body
     *
     * @return {string} compiled BODY
     */
    _bodyToEs( body )
    {
        // the body must be an array of expressions (this should always
        // be the case unless we have a bug in the compiler)
        if ( !Array.isArray( body ) ) {
            throw new Error( "body must be an Array" );
        }

        // process each s-expression in BODY
        const js = body.map( this._sexpToEs.bind( this ) );

        // the result (that is, an array of compiled s-expressions) is
        // joined semicolon-delimited, with a `return' statement
        // preceding the final expression
        return js.reduce( ( result, s, i ) =>
        {
            const ret = ( i === ( js.length - 1 ) ) ? "return " : "";
            return result + " " + ret + s + ";";
        }, "" );
    }


    /**
     * Convert s-expression or scalar into ECMAScript
     *
     * T may be either an array of tokens or a primitive token
     * (e.g. string, symbol).  This method is applied recursively to T
     * as needed if T is an array.
     *
     * @param {Array|Object} t tokens representing s-expressions/scalars
     *
     * @return {string} compiled s-expression/scalar
     */
    _sexpToEs( t )
    {
        // just output symbols as identifiers as-is for now
        if ( !Array.isArray( t ) ) {
            switch ( t.type )
            {
                // strings are output as-is (note that we don't escape
                // double quotes, because the method of escaping them is
                // the same in Scheme as it is in ECMAScript---a
                // backslash)
                case 'string':
                    return `"${t.value}"`;

                // symbols have the same concerns as block definitions:
                // the identifiers generated need to be ES-friendly
                case 'symbol':
                    return this._idFromName( t.value );

                default:
                    // must be a template literal: the original used a
                    // plain double-quoted string, so `${t.type}' was
                    // emitted literally instead of being interpolated
                    throw new Error(
                        `Cannot compile unknown token \`${t.type}'`
                    );
            }
        }

        // only support block form for now, and assume that `fn' is a
        // string value (in the future, this doesn't have to be the
        // case---fn should be able to be an arbitrary sexp)
        const [ { value: fn }, argmap ] = t;

        if ( !this._isBlockForm( t ) ) {
            throw new Error( `\`${fn}' application is not in block form` );
        }

        // convert all remaining symbols (after the symbol representing
        // the function application) into arguments by parsing their
        // sexps or scalar values; we're not going to worry about mapping
        // them for now; they will be compiled in the order in which they
        // appear
        const idfn   = this._idFromName( fn );
        const args   = argmap.map( ( [ , v ] ) => this._sexpToEs( v ) );
        const argstr = args.join( ", " );

        // make the dangerous assumption that arguments are ordered
        // for now
        return `${idfn}(${argstr})`;
    }


    /**
     * Determine whether T represents a block form
     *
     * Block form is an application of a block, which has a certain
     * syntax.  Specifically: `(<name> ((key value) ...))'.
     *
     * @param {*} t hopefully a token list
     *
     * @return {boolean} whether T represents a block form
     */
    _isBlockForm( t )
    {
        // check _before_ destructuring: destructuring a non-array would
        // throw a TypeError rather than answering the question
        if ( !Array.isArray( t ) ) {
            return false;
        }

        // the first symbol is the function name, second is an sexp
        // containing each of the key/value argument mappings
        const [ fn, argmap ] = t;

        // enforce block id convention (at least for now)
        const isblockid = ( fn !== undefined )
              && /^<[^>]+>$/.test( fn.value );

        return isblockid && Array.isArray( argmap );
    }


    /**
     * Determine whether T is an application of a symbol NAME, or error
     *
     * @param {*}      t    hopefully a token or token list
     * @param {string} name block name to assert against
     *
     * @throws {Error} if T is not an application of NAME
     *
     * @return {undefined}
     */
    assertApply( t, name )
    {
        // an application must be an s-expression
        if ( !Array.isArray( t ) ) {
            throw new Error(
                `\`${name}' application expected, found symbol \`${t.value}'`
            );
        }

        // if there's a match, we can stop here
        if ( t[ 0 ].value === name ) {
            return;
        }

        // otherwise, provide an informative error of what we found and
        // what we should have found
        throw new Error(
            `\`${name}' expected, found \`${t[ 0 ].value}'`
        );
    }
}
/*
 * Prebirth was originally intended to be run via the command line using
 * Node.js.  But it doesn't have to be.  If you want, feel free to run it
 * in your web browser; you'll just have to instantiate your own objects.
 */
( function ()
{
    // not running under Node.js (e.g. in a web browser); the caller is
    // expected to instantiate Parser/Compiler manually
    if ( typeof process === 'undefined' )
    {
        return;
    }

    const parser   = new Parser();
    const compiler = new Compiler();

    // read from file descriptor 0 (stdin) rather than the path
    // `/dev/stdin', which is not available on all platforms
    const src  = require( 'fs' ).readFileSync( 0 ).toString();
    const tree = parser.parseLisp( src );

    process.stdout.write( compiler.compile( tree ) );
} )();



/*
 * Now that we have output, the next step is the hard part: rewriting
 * this file in Prebirth Lisp.  As I mentioned, this process is called
 * "Birth".  It's at this point that we have to decide on basic
 * abstractions---we are starting from scratch.  The initial
 * implementation is therefore unlikely to be as concise and elegant as
 * Prebirth itself---it will be refactored.
 *
 * Here is an example Hello, World! (NOTE(review): the block identifiers
 * in the original comment appear to have been stripped by markup
 * escaping---the `<...>' names below are reconstructed; confirm against
 * the upstream source):
 *
 *   (define-block <hello>
 *     ()
 *     (<print> ((message "Hello, world!"))))
 *
 *
 * ¹ This term should invoke visuals of an abstract being entering
 *   existence in some strange nonlinear-time² kind of way.  If you
 *   thought of something less pleasant, well, I'm sorry you went
 *   through that.
 *
 * ² Because we're dealing with nonlinear time!¹ This would be some
 *   bizarre recursive footnote crap if it weren't for that.²
 */