ulambda/build-aux/bootstrap/prebirth.js

/**
 * Bootstrap Gibble Lisp ("Prebirth")
 *
 * Copyright (C) 2017 Mike Gerwitz
 *
 * This file is part of Gibble.
 *
 * Gibble is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * THIS IS TEMPORARY CODE that will be REWRITTEN IN GIBBLE LISP ITSELF after
 * a very basic bootstrap is complete. It is retained as an important
 * artifact for those who wish to build Gibble from scratch without using
 * another version of Gibble itself. This is called "self-hosting".
 *
 * Rather than producing a sophisticated self-hosting language, this
 * language will be a terribly incomplete and inadequate version of what
 * will ultimately become a formidable and competent language.
 *
 * I refer to this entire compilation process as "Prebirth".¹ The "Birth"
 * of Gibble is the act of reimplementing this Prebirth in a Prebirth
 * version of Gibble Lisp itself. It's the chicken-and-egg paradox, without
 * the paradox.²
 *
 * Gibble Lisp is _not_ the most primitive language that will be understood
 * by the system---it is too high-level. After Birth, the language can
 * devolve into something more powerful and workable.
 *
 * Some minor terminology:
 *   - AST: Abstract Syntax Tree, a processed form of the CST.
 *   - CST: Concrete Syntax Tree, a 1-1 conversion of source input to
 *       tokens.
 *   - token: an object produced by the lexer that represents a portion of
 *       the input language.
 *   - lexer: sometimes called a ``tokenizer''---produces tokens by
 *       applying the grammar to a string of input.
 *   - grammar: a definition of the language (syntax).
 *   - lexeme: the portion of the original source string associated with a
 *       given token.
 *   - LL(0): Left-to-right, Leftmost derivation, 0 tokens of lookahead.
 *   - sexp: symbolic expression, (involving (lots (of (((parentheses)))))).
 *
 * Excited? Great! My extemporaneous rambling is costing me more time than
 * I spent making this damn thing! (No, really, it is.)
 */
'use strict';

/**
 * A very rudimentary (and extremely permissive) LL(0) Lisp parser
 *
 * This provides just enough to get by. It transforms lists into nested
 * arrays of tokens with some very basic error checking (e.g. for proper
 * nesting). This is not a general-purpose lisp parser.
 */
class Parser
{
    /**
     * Produce an AST from the given string SRC of sexps
     *
     * This is essentially the CST with whitespace removed. It first
     * invokes the lexer to produce a token string from the input
     * sexps SRC. From this, it verifies only proper nesting (that SRC does
     * not close sexps too early and that EOF isn't reached before all sexps
     * are closed) and produces an AST that is an isomorphism of the
     * original sexps.
     *
     * @param {string} src input Lisp
     *
     * @throws {SyntaxError} on improper sexp nesting
     *
     * @return {Array} primitive abstract syntax tree of SRC
     */
    parseLisp( src )
    {
        // token string from lexing
        const toks = this._lex( src );

        // perform a leftmost reduction on the token string
        const [ depth, ast ] = toks.reduce( ( result, token ) =>
        {
            const [ depth, xs, stack ] = result;
            const { type, pos }        = token;

            // there are very few token types to deal with (again, this is
            // a very simple bootstrap lisp)
            switch ( type )
            {
                // ignore comments
                case 'comment':
                    return result;

                // closing parenthesis (end of sexp)
                case 'close': {
                    if ( depth === 0 ) {
                        this._error(
                            src, pos, `Unexpected closing parenthesis`
                        );
                    }

                    // the sexp is complete; add to the AST, reduce depth
                    const top = stack.pop();
                    top.push( xs );

                    return [ ( depth - 1 ), top, stack ];
                }

                // opening parenthesis (start of sexp)
                case 'open':
                    stack.push( xs );
                    return [ ( depth + 1 ), [], stack ];

                // symbol or primitive; just copy the token in place
                case 'string':
                case 'symbol':
                    xs.push( token );
                    return [ depth, xs, stack ];

                // should never happen unless there's a bug in the tokenizer
                // or we forget a token type above
                default:
                    this._error( src, pos, `Unexpected token '${type}'` );
            }
        }, [ 0, [], [] ] );

        // if we terminate at a non-zero depth, that means there
        // are still open sexps
        if ( depth > 0 ) {
            throw SyntaxError(
                `Unexpected end of input at depth ${depth}`
            );
        }

        // the result is a set of tokens organized into ES arrays
        // isomorphic to the original sexp structure (the same structure)
        return ast;
    }
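
    /*
     * To illustrate (this example is not part of the original file):
     * `parseLisp( '(add 1 "x")' )' yields roughly
     *
     *   [ [ { type: 'symbol', lexeme: 'add', value: 'add', pos: 1 },
     *       { type: 'symbol', lexeme: '1',   value: '1',   pos: 5 },
     *       { type: 'string', lexeme: '"x"', value: 'x',   pos: 7 } ] ]
     *
     * ---an array of sexps, each sexp an array of tokens and nested sexps.
     */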

    /**
     * Throw a SyntaxError with a window of surrounding source code
     *
     * The "window" is simply ten characters to the left and right of the
     * first character of the source input SRC that resulted in the error.
     * It's a little more than useless.
     *
     * @param {string} src source code (sexps)
     * @param {number} pos position of error
     * @param {string} msg error message
     *
     * @throws {SyntaxError}
     *
     * @return {undefined}
     */
    _error( src, pos, msg )
    {
        // `substr' takes a (start, length) pair, and a negative start
        // would count from the end of the string, so clamp at 0
        const start  = Math.max( 0, ( pos - 10 ) );
        const window = src.substr( start, 20 )
              .replace( /\n/g, " " );

        throw new SyntaxError( `${msg}: '${window}'` );
    }

    /**
     * Convert source input into a string of tokens
     *
     * This is the lexer. Whitespace is ignored. The grammar consists of
     * simple s-expressions.
     *
     * This function is mutually recursive with `#_token'. It expects that
     * the source SRC will be left-truncated as input is
     * processed. POS exists for producing metadata for error
     * reporting---it has no impact on parsing.
     *
     * @param {string} src source code
     * @param {number} pos position (character offset) in source
     *
     * @return {Array} string of tokens
     */
    _lex( src, pos = 0 )
    {
        // ignore whitespace, if any
        const ws   = src.match( /^\s+/ ) || [ "" ];
        const trim = src.substr( ws[ 0 ].length );

        // adjust position to account for any removed whitespace
        pos += ws[ 0 ].length;

        // EOF and we're done
        if ( trim === '' ) {
            return [];
        }

        // comment until end of line
        if ( trim[ 0 ] === ';' ) {
            const eol = trim.match( /^(.*?)(\n|$)/ );
            return this._token( 'comment', eol[ 1 ], trim, pos );
        }

        // left and right parentheses are handled in the same manner: they
        // produce distinct tokens with single-character lexemes
        if ( trim[ 0 ] === '(' ) {
            return this._token( 'open', '(', trim, pos );
        }
        if ( trim[ 0 ] === ')' ) {
            return this._token( 'close', ')', trim, pos );
        }

        // strings are delimited by opening and closing ASCII double quotes,
        // which can be escaped with a backslash
        if ( trim[ 0 ] === '"' ) {
            const str = trim.match( /^"(|.*?[^\\])"/ );
            if ( !str ) {
                this._error( src, pos, "missing closing string delimiter" );
            }

            // a string token consists of the entire string including quotes
            // as its lexeme, but its value will be the value of the string
            // without quotes due to the `str' match group (see `#_token')
            return this._token( 'string', str, trim, pos );
        }

        // anything else is considered a symbol up until whitespace or any
        // of the aforementioned delimiters
        const symbol = trim.match( /^[^\s()"]+/ );
        return this._token( 'symbol', symbol, trim, pos );
    }
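
    /*
     * For example (illustrative only, not part of the original file):
     * `_lex( '(hi "x") ; c' )' produces a flat token string along the
     * lines of
     *
     *   open `(' at 0;  symbol `hi' at 1;  string `"x"' (value `x') at 4;
     *   close `)' at 7;  comment `; c' at 9
     *
     * with each entry an object of the shape produced by `#_token' below.
     */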

    /**
     * Produce a token and recurse
     *
     * The token will be concatenated with the result of the mutually
     * recursive method `_lex'.
     *
     * For the record: I'm not fond of mutual recursion from a clarity
     * standpoint, but this is how the abstraction evolved to de-duplicate
     * code, and I don't much feel like refactoring it.
     *
     * @param {string}       type  token type
     * @param {string|Array} match lexeme match
     * @param {string}       src   source code string, left-truncated
     * @param {number}       pos   offset relative to original src
     *
     * @return {Array} string of tokens
     */
    _token( type, match, src, pos )
    {
        const parts = ( Array.isArray( match ) )
            ? match
            : [ match ];

        // the value is the first group of the match (indicating what we
        // are actually interested in), and the lexeme is the full match,
        // which might include, for example, string delimiters
        const [ lexeme, value ] = parts;

        const token = {
            type:   type,
            lexeme: lexeme,
            value:  value || lexeme,
            pos:    pos
        };

        // continue producing tokens by recursing, left-truncating the
        // source string to discard what we have already processed
        return [ token ].concat(
            this._lex(
                src.substr( lexeme.length ),
                ( pos + lexeme.length )
            )
        );
    }
}

/**
 * Dumb compiler to transform AST into ECMAScript
 *
 * This is a really dumb code generator: it takes the AST and essentially
 * transforms it 1:1 wherever possible into the target language.
 *
 * This is nothing like what we actually want the _ultimate_ compiler to do
 * after Birth, but it gets us to a point where we can self-host on a basic
 * Prebirth language and evolve from there.
 *
 * The code generation can be pretty much summed up by the last line of
 * `Compiler#_cdfn'.
 */
class Compiler
{
    /**
     * Compile AST into ECMAScript
     *
     * Every function is mapped 1:1 to a function in ECMAScript. So, we
     * just map all root children (which are expected to be Scheme-style
     * shorthand function definitions) to functions.
     *
     * @param {Array} tree root containing top-level function definitions
     *
     * @return {string} compiled ECMAScript
     */
    compile( tree )
    {
        // map every definition to a ES function definition and delimit them
        // (for readability) by two newlines
        return tree.map( this._cdfn.bind( this ) )
            .join( "\n\n" ) + "\n";
    }

    /**
     * Compile function definition into a ES function definition
     *
     * This will fail if the given token is not a `define'.
     *
     * @param {Object} t token
     *
     * @return {string} compiled function definition
     */
    _cdfn( t )
    {
        // an application must be an s-expression
        if ( !Array.isArray( t ) ) {
            throw Error(
                `s-expression expected, found symbol \`${t.value}'`
            );
        }

        // if it's not a definition, then it's a top-level application
        if ( t[ 0 ].value !== 'define' )
        {
            return this._sexpToEs( t ) + ';';
        }

        // e.g. (define (foo ...) body)
        const [ , [ { value: name }, ...params ], ...body ] = t;

        const id      = this._idFromName( name, true );
        const paramjs = this._paramsToEs( params );
        const bodyjs  = this._bodyToEs( body );

        // this is the final format---each function becomes its own function
        // definition in ES
        return `function ${id}(${paramjs})\n{\n${bodyjs}\n};`;
    }
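
    /*
     * To illustrate (this example is not part of the original file): the
     * Hello, World! definition from the comments at the end of this file,
     *
     *   (define (hello x)
     *     (js:console "Hello," x "!"))
     *
     * compiles into
     *
     *   function $$hello(x)
     *   {
     *    return $$js$console("Hello,", x, "!");
     *   };
     *
     * where `js:console' is assumed to be provided by libprebirth.
     */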

    /**
     * Compile parameter list
     *
     * This simply takes the value of the symbol and outputs it (formatted),
     * delimited by commas.
     *
     * @param {Array} args token parameter list
     *
     * @return {string} compiled parameter list
     */
    _paramsToEs( args )
    {
        return args.map( ({ value: name }) => this._idFromName( name ) )
            .join( ", " );
    }

    /**
     * Generate ECMAScript-friendly name from the given id
     *
     * @param {string}  name   source name
     * @param {boolean} global whether identifier should be globally unique
     *
     * @return {string} ES-friendly identifier
     */
    _idFromName( name, global )
    {
        return ( global ? '$$' : '' )
            + name.replace( /[^a-zA-Z0-9_]/g, '$' );
    }
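
    /*
     * For instance (illustrative only): `_idFromName( "string->es" )'
     * yields `string$$es', and `_idFromName( "hello!", true )' yields
     * `$$hello$', since every character that is not alphanumeric or an
     * underscore becomes a dollar sign.
     */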

    /**
     * Compile body s-expressions into ECMAScript
     *
     * This produces a 1:1 mapping of BODY s-expressions to ES statements,
     * recursively. The heavy lifting is done by `#_sexpToEs'.
     *
     * @param {Array} body s-expressions representing function body
     *
     * @return {string} compiled BODY
     */
    _bodyToEs( body )
    {
        // the body must be an array of expressions (this should always be
        // the case unless we have a bug in the compiler)
        if ( !Array.isArray( body ) ) {
            throw Error( "body must be an Array" );
        }

        // process each s-expression in BODY
        const js = body.map( this._sexpToEs.bind( this ) );

        // the result (that is, an array of compiled s-expressions) is
        // joined semicolon-delimited, with a `return' statement preceding
        // the final expression
        return js.reduce( ( result, s, i ) =>
        {
            const ret = ( i === ( js.length - 1 ) ) ? "return " : "";
            return result + " " + ret + s + ";";
        }, "" );
    }
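
    /*
     * Illustration (not part of the original file): a two-expression body
     * `(f) (g)' compiles to ` $$f(); return $$g();'---only the final
     * expression's value is returned.
     */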

    /**
     * Convert s-expression or scalar into ECMAScript
     *
     * T may be either an array of tokens or a primitive token (e.g. string,
     * symbol). This method is applied recursively to T as needed if T is
     * an array.
     *
     * @param {Array|Object} t tokens representing s-expressions/scalars
     *
     * @return {string} compiled s-expression/scalar
     */
    _sexpToEs( t )
    {
        // just output symbols as identifiers as-is for now
        if ( !Array.isArray( t ) ) {
            switch ( t.type )
            {
                // strings are output as-is (note that we don't escape
                // double quotes, because the method of escaping them is the
                // same in Scheme as it is in ECMAScript---a backslash)
                case 'string':
                    return `"${t.value}"`;

                // symbols have the same concerns as function definitions:
                // the identifiers generated need to be ES-friendly
                case 'symbol':
                    return this._idFromName( t.value );

                default:
                    throw Error( `Cannot compile unknown token \`${t.type}'` );
            }
        }

        // simple function application (fn ...args)
        const [ { value: fn }, ...args ] = t;

        // convert all remaining symbols (after the symbol representing the
        // function application) into arguments by parsing their sexps or
        // scalar values
        const idfn   = this._idFromName( fn, true );
        const argstr = args.map( arg => this._sexpToEs( arg ) ).join( ", " );

        // final function application
        return `${idfn}(${argstr})`;
    }
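
    /*
     * For example (illustrative only): the application `(fn x "y")'
     * compiles to `$$fn(x, "y")'.
     */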
}

/*
 * Prebirth was originally intended to be run via the command line using
 * Node.js. But it doesn't have to be. If you want, feel free to run it in
 * your web browser; you'll just have to instantiate your own objects.
 */
( function ()
{
    if ( typeof process === 'undefined' )
    {
        return;
    }

    const p = new Parser();
    const c = new Compiler();

    const fs   = require( 'fs' );
    const src  = fs.readFileSync( '/dev/stdin' ).toString();
    const tree = p.parseLisp( src );

    const lib = fs.readFileSync( __dirname + '/libprebirth.js' ).toString();

    // output libprebirth and compiled output, wrapped in a self-executing
    // function to limit scope
    process.stdout.write( "(function(){" );
    process.stdout.write( lib + '\n\n' );
    process.stdout.write( c.compile( tree ) );
    process.stdout.write( "})();\n" );
} )();
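
/*
 * A usage sketch (not part of the original file): assuming Node.js on a
 * system that exposes `/dev/stdin', with `libprebirth.js' alongside this
 * script, compilation reads Prebirth Lisp on stdin and writes ES to
 * stdout:
 *
 *   $ node prebirth.js < hello.scm > hello.js
 */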

/*
 * Now that we have output, the next step is the hard part: rewriting this
 * file in Prebirth Lisp. As I mentioned, this process is called
 * "Birth". It's at this point that we have to decide on basic
 * abstractions---we are starting from scratch. The initial implementation
 * is therefore unlikely to be as concise and elegant as Prebirth
 * itself---it will be refactored.
 *
 * Here is an example Hello, World!:
 *
 *   (define (hello x)
 *     (js:console "Hello," x "!"))
 *
 *
 * ¹ This term should evoke visuals of an abstract being entering existence
 *   in some strange nonlinear-time² kind of way. If you thought of
 *   something less pleasant, well, I'm sorry you went through that.
 *
 * ² Because we're dealing with nonlinear time!¹ This would be some bizarre
 *   recursive footnote crap if it weren't for that.²
 */