/** * Bootstrap Gibble Lisp ("Prebirth") * * Copyright (C) 2017 Mike Gerwitz * * This file is part of Gibble. * * Gibble is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . * * THIS IS TEMPORARY CODE that will be REWRITTEN IN GIBBLE LISP ITSELF after * a very basic bootstrap is complete. It is retained as an important * artifact for those who wish to build Gibble from scratch without using * another version of Gibble itself. This is called "self-hosting". * * Rather than producing a sophisticated self-hosting language, this * language will be a terribly incomplete and inadequate version of what * will ultimately become a formidable and competent language. * * I refer to this entire complication process as "Prebirth".¹ The "Birth" * of Gibble is the act of reimplementing this Prebirth in a Prebirth * version of Gibble Lisp itself. It's the chicken-and-egg paradox, without * the paradox.² * * Gibble Lisp is _not_ the most primitive language that will be understood * by the system---it is too high-level. After Birth, the language can * devolve into something more powerful and workable. * * Some minor terminology: * - AST: Abstract Syntax Tree, a processed form of the CST. * - CST: Concrete Syntax Tree, a 1-1 conversion of source input to * tokens. * - token: an object produced by the lexer that represents a portion of * the input language * - lexer: sometimes called a ``tokenizer''---produces tokens by applying * the grammar to a string of input. * - grammar: a definition of the language (syntax). * - lexeme: the portion of the original source string associated with a * given token. * - LL(0): Left-to-right, Leftmost derivation, 0 tokens lookahead * - sexp: symbolic expression, (involving (lots (of (((parentheses)))))) * * Excited? Great! My extemporaneous rambling is costing me more time than * I spent making this damn thing! (No, really, it is.) */ 'use strict'; /** * A very rudimentary (and extremely permissive) LL(0) Lisp parser * * This provides just enough to get by. It transforms lists into nested * arrays of tokens with some very basic error checking (e.g. for proper * nesting). This is not a general-purpose lisp parser. */ class Parser { /** * Produce an AST from the given string SRC of sexps * * This is essentially the CST with whitespace removed. It first * invokes the lexer to produce a token string from the input * sexps SRC. From this, it verifies only proper nesting (that SRC does * not close sexps too early and that EOF isn't reached before all sexps * are closed) and produces an AST that is an isomorphism of the * original sexps. * * @param {string} src input Lisp * * @throws {SyntaxError} on improper sexp nesting * * @return {Array} primitive abstract syntax tree of SRC */ parseLisp( src ) { // token string from lexing const toks = this._lex( src ); // perform a leftmost reduction on the token string const [ depth, ast ] = toks.reduce( ( result, token ) => { const [ depth, xs, stack ] = result; const { type, pos } = token; // there are very few token types to deal with (again, this is // a very simple bootstrap lisp) switch ( type ) { // ignore comments case 'comment': return result; // closing parenthesis (end of sexp) case 'close': if ( depth === 0 ) { this._error( src, pos, `Unexpected closing parenthesis` ); } // the sexp is complete; add to the AST, reduce depth const top = stack.pop(); top.push( xs ); return [ ( depth - 1 ), top, stack ]; // opening parenthesis (start of sexp) case 'open': stack.push( xs ); return [ ( depth + 1 ), [], stack ]; // symbol or primitive; just copy the token in place case 'string': case 'symbol': xs.push( token ); return [ depth, xs, stack ]; // should never happen unless there's a bug in the tokenizer // or we forget a token type above default: this._error( src, pos, `Unexpected token '${type}'` ); } }, [ 0, [], [] ] ); // if we terminate at a non-zero depth, that means there // are still open sexps if ( depth > 0 ) { throw SyntaxError( `Unexpected end of input at depth ${depth}` ); } // the result is a set of tokens organized into ES arrays // isomorphic to the original sexp structure (the same structure) return ast; } /** * Throw a SyntaxError with a window of surrounding source code * * The "window" is simply ten characters to the left and right of the * first character of the source input SRC that resulted in the error. * It's a little more than useless. * * @param {string} src source code (sexps) * @param {number} pos position of error * @param {string} msg error message * * @throws {SyntaxError}} * * @return {undefined} */ _error( src, pos, msg ) { const window = src.substr( pos - 10, pos + 10 ) .replace( "\n", " " ); throw new SyntaxError( `${msg}: '${window}'` ); } /** * Convert source input into a string of tokens * * This is the lexer. Whitespace is ignored. The grammar consists of * simple s-expressions. * * This function is mutually recursive with `#_token'. It expects that * the source SRC will be left-truncated as input is * processed. POS exists for producing metadata for error * reporting---it has no impact on parsing. * * @param {string} src source code * @param {number} pos position (character offset) in source * * @return {Array} string of tokens */ _lex( src, pos = 0 ) { // ignore whitespace, if any const ws = src.match( /^\s+/ ) || [ "" ]; const trim = src.substr( ws[ 0 ].length ); // adjust position to account for any removed whitespace pos += ws[ 0 ].length; // EOF and we're done if ( trim === '' ) { return []; } // comment until end of line if ( trim[ 0 ] === ';' ) { const eol = trim.match( /^(.*?)(\n|$)/ ); return this._token( 'comment', eol[ 1 ], trim, pos ); } // left and right parenthesis are handled in the same manner: they // produce distinct tokens with single-character lexemes if ( trim[ 0 ] === '(' ) { return this._token( 'open', '(', trim, pos ); } if ( trim[ 0 ] === ')' ) { return this._token( 'close', ')', trim, pos ); } // strings are delimited by opening and closing ASCII double quotes, // which can be escaped with a backslash if ( trim[ 0 ] === '"' ) { const str = trim.match( /^"(|.*?[^\\])"/ ); if ( !str ) { this._error( src, pos, "missing closing string delimiter" ); } // a string token consists of the entire string including quotes // as its lexeme, but its value will be the value of the string // without quotes due to the `str' match group (see `#_token') return this._token( 'string', str, trim, pos ); } // anything else is considered a symbol up until whitespace or any // of the aforementioned delimiters const symbol = trim.match( /^[^\s()"]+/ ); return this._token( 'symbol', symbol, trim, pos ); } /** * Produce a token and recurse * * The token will be concatenated with the result of the mutually * recursive method `_lex'. * * For the record: I'm not fond of mutual recursion from a clarity * standpoint, but this is how the abstraction evolved to de-duplicate * code, and I don't much feel like refactoring it. * * @param {string} type token type * @param {string|Array} match lexeme match * @param {string} src source code string, left-truncated * @param {number} pos offset relative to original src * * @return {Array} string of tokens */ _token( type, match, src, pos ) { const parts = ( Array.isArray( match ) ) ? match : [ match ]; // the value is the first group of the match (indicating what we // are actually interested in), and the lexeme is the full match, // which might include, for example, string delimiters const [ lexeme, value ] = parts; const token = { type: type, lexeme: lexeme, value: value || lexeme, pos: pos }; // continue producing tokens by recursing, left-truncating the // source string to discard what we have already processed return [ token ].concat( this._lex( src.substr( lexeme.length ), ( pos + lexeme.length ) ) ); } }; /** * Dumb compiler to transform AST into ECMAScript * * This is a really dumb code generator: it takes the AST and essentially * transforms it 1:1 wherever possible into the target language. * * This is nothing like what we actually want the _ultimate_ compiler to do * after Birth, but it gets us to a point where we can self-host on a basic * Prebirth language and evolve from there. * * The code generation can be pretty much summed up by the last line of * `Compiler#_cdfn'. */ class Compiler { /** * Initialize with function map * * The function map will be used to map certain functions into other * names or forms. For example, `js:console' may map to `console.log' * and `if' to an `if' statement+expression. * * @param {Object} fnmap function map */ constructor( fnmap ) { this._fnmap = fnmap; } /** * Compile AST into ECMAScript * * Every function is mapped 1:1 to a function in ECMAScript. So, we * just map all root children (which are expected to be Scheme-style * shorthand function definitions) to functions. * * @param {Array} tree root containing top-level function definitions */ compile( tree ) { // map every definition to a ES function definition and delimit them // (for readability) by two newlines return tree.map( this._cdfn.bind( this ) ) .join( "\n\n" ) + "\n"; } /** * Compile function definition into a ES function definition * * This will fail if the given token is not a `define'. * * @param {Object} t token * * @return {string} compiled function definition */ _cdfn( t ) { // an application must be an s-expression if ( !Array.isArray( t ) ) { throw Error( `\`${name}' application expected, found symbol \`${t.value}'` ); } // if it's not a definition, then it's a top-level application if ( t[ 0 ].value !== 'define' ) { return this._sexpToEs( t ) + ';'; } // e.g. (define (foo ...) body) const [ , [ { value: name }, ...params ], ...body ] = t; const id = this._idFromName( name, true ); const paramjs = this._paramsToEs( params ); const bodyjs = this._bodyToEs( body ); // this is the final format---each function becomes its own function // definition in ES return `function ${id}(${paramjs})\n{\n${bodyjs}\n};`; } /** * Compile parameter list * * This simply takes the value of the symbol and outputs it (formatted), * delimited by commas. * * @param {Array} args token parameter list * * @return {string} compiled parameter list */ _paramsToEs( args ) { return args.map( ({ value: name }) => this._idFromName( name ) ) .join( ", " ); } /** * Generate ECMAScript-friendly name from the given id * * A subset of special characters that are acceptable in Scheme are * converted in an identifiable manner; others are simply converted to * `$' in a catch-all and therefore could result in conflicts and cannot * be reliably distinguished from one-another. Remember: this is * temporary code. * * @param {string} name source name * @param {boolean} global whether identifier should be globally unique * * @return {string} ES-friendly identifier */ _idFromName( name, global ) { // just some common ones; will fall back to `$' below const conv = { '-': '$_$', '?': '$7$', '@': '$a$', '!': '$b$', '>': '$g$', '#': '$h$', '*': '$k$', '<': '$l$', '&': '$n$', '%': '$o$', '+': '$p$', '=': '$q$', '^': '$v$', '/': '$w$', '$': '$$', }; if ( name === undefined ) { throw SyntaxError( "Missing identifier name" ); } return ( global ? '$$' : '' ) + name.replace( /[^a-zA-Z0-9_]/g, c => conv[ c ] || '$' ); } /** * Compile body s-expressions into ECMAScript * * This produces a 1:1 mapping of BODY s-expressions to ES statements, * recursively. The heavy lifting is done by `#_sexpToEs'. * * @param {Array} body s-expressions representing function body * * @return {string} compiled BODY */ _bodyToEs( body ) { // the body must be an array of expressions (this should always be // the case unless we have a bug in the compiler) if ( !Array.isArray( body ) ) { throw Error( "body must be an Array" ); } // process each s-expression in BODY const js = body.map( this._sexpToEs.bind( this ) ); // the result (that is, an array of compiled s-expressions) is // joined semicolon-delimited, with a `return' statement preceding // the final expression return js.map( ( s, i ) => { const ret = ( i === ( js.length - 1 ) ) ? "return " : ""; return ` ${ret}${s};`; } ).join( '\n' ); } /** * Convert s-expression or scalar into ECMAScript * * T may be either an array of tokens or a primitive token (e.g. string, * symbol). This method is applied recursively to T as needed if T is * an array. * * @param {Array|Object} t tokens representing s-expressions/scalars * * @return {string} compiled s-expression/scalar */ _sexpToEs( t ) { // just output symbols as identifiers as-is for now if ( !Array.isArray( t ) ) { switch ( t.type ) { // strings are output as-is (note that we don't escape // double quotes, because the method of escaping them is the // same in Scheme as it is in ECMAScript---a backslash) case 'string': return `"${t.value}"`; // symbols have the same concerns as function definitions: the // identifiers generated need to be ES-friendly case 'symbol': return this._idFromName( t.value ); default: throw Error( `Cannot compile unknown token \`${t.type}'` ); } } // simple function application (fn ...args) const [ { value: fn }, ...args ] = t; const mapentry = this._fnmap[ fn ]; // if the fnmap contains a function entry, then it will handle the // remaining processing if ( mapentry && ( typeof mapentry === 'function' ) ) { return mapentry( args, this._sexpToEs.bind( this ), this._bodyToEs.bind( this ) ); } // convert all remaining symbols (after the symbol representing the // function application) into arguments by parsing their sexps or // scalar values const idfn = mapentry || this._idFromName( fn, true ); const argstr = args.map( arg => this._sexpToEs( arg ) ).join( ", " ); // final function application return `${idfn}(${argstr})`; } } /** * Function aliases and special forms * * This map allows for a steady transition---items can be removed as they * are written in Prebirth Lisp. This should give us a sane (but still * simple) environment with which we can start to self-host. * * String values are simple function aliases. Function values take over the * compilation of that function and allow for defining special forms. The * first argument to the function is the list of raw arguments (not yet * compiled); the second argument is `Compiler#_sexpToEs'; and the third is * `Compiler#bodyToEs'. * * These are by no means meant to be solid implementations. * * @type {Object} */ const fnmap = { 'js:console': 'console.log', // simple if statement with optional else, wrapped in a self-executing // function to simplify code generation (e.g. returning an if) 'if': ( [ pred, t, f ], stoes ) => "(function(){" + `if (${stoes(pred)}){return ${stoes(t)};}` + ( ( f === undefined ) ? '' : `else{return ${stoes(f)};}` ) + "})()", // (let ((binding val) ...) ...body), compiled as a self-executing // function which allows us to easily represent the return value of the // entire expression while maintaining local scope 'let*': ( [ bindings, ...body ], stoes, btoes ) => "(function(){\n" + bindings .map( ([ x, val ]) => ` const ${stoes(x)} = ${stoes(val)};\n` ) .join( '' ) + btoes( body ) + "\n" + " })()", // similar to the above, but variables cannot reference one-another 'let': ( [ bindings, ...body ], stoes, btoes ) => "(function(" + bindings.map( ([ x ]) => stoes( x ) ).join( ", " ) + "){\n" + btoes( body ) + "\n" + "})(" + bindings.map( ([ , val ]) => stoes( val ) ).join( ", " ) + ")", }; /* * Prebirth was originally intended to be run via the command line using * Node.js. But it doesn't have to be. If you want, feel free to run it in * your web browser; you'll just have to instantiate your own objects. */ ( function () { if ( typeof process === 'undefined' ) { return; } const p = new Parser(); const c = new Compiler( fnmap ); const fs = require( 'fs' ); const src = fs.readFileSync( '/dev/stdin' ).toString(); const tree = p.parseLisp( src ); const lib = fs.readFileSync( __dirname + '/libprebirth.js' ).toString(); // output libprebirth and compiled output, wrapped in a self-executing // function to limit scope process.stdout.write( "(function(){" ); process.stdout.write( lib + '\n\n' ); process.stdout.write( c.compile( tree ) ); process.stdout.write( "})();\n" ); } )(); /* * Now that we have output, the next step is the hard part: rewriting this * file in Prebirth Lisp. As I mentioned, this process is called * "Birth". It's at this point that we have to decide on basic * abstractions---we are starting from scratch. The initial implementation * is therefore unlikely to be as concise and elegant as Prebirth * itself---it will be refactored. * * Here is an example Hello, World!: * * (define (hello x) * (js:console "Hello," x, "!")) * * * ¹ This term should invoke visuals of an abstract being entering existence * in some strange nonlinear-time² kind of way. If you thought of * something less pleasant, well, I'm sorry you went through that. * * ² Because we're dealing with nonlinear time!¹ This would be some bizarre * recursive footnote crap if it weren't for that.² */