ulambda/bootstrap/prebirth.js

739 lines
24 KiB
JavaScript
Raw Permalink Blame History

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

/**
* Bootstrap Ulambda Scheme ("Prebirth")
*
* Copyright (C) 2017 Mike Gerwitz
*
* This file is part of Ulambda Scheme.
*
* Ulambda Scheme is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* THIS IS TEMPORARY CODE that will be REWRITTEN IN ULAMBDA SCHEME ITSELF
* after a very basic bootstrap is complete. It is retained as an important
* artifact for those who wish to build Ulambda Scheme from scratch without
* using another version of Ulambda itself. This is called "self-hosting".
*
* Rather than producing a sophisticated self-hosting language, this
* language will be a terribly incomplete and inadequate version of what
* will ultimately become a formidable and competent language.
*
* I refer to this entire complication process as "Prebirth".¹ The "Birth"
* of Ulambda Scheme is the act of reimplementing this Prebirth in a
* Prebirth version of Ulambda itself. It's the chicken-and-egg paradox,
* without the paradox.²
*
* Some minor terminology:
* - AST: Abstract Syntax Tree, a processed form of the CST.
* - CST: Concrete Syntax Tree, a 1-1 conversion of source input to
* tokens.
* - token: an object produced by the lexer that represents a portion of
* the input language
* - lexer: sometimes called a ``tokenizer''---produces tokens by applying
* the grammar to a string of input.
* - grammar: a definition of the language (syntax).
* - lexeme: the portion of the original source string associated with a
* given token.
* - LL(0): Left-to-right, Leftmost derivation, 0 tokens lookahead
* - sexp: symbolic expression, (involving (lots (of (((parentheses))))))
*
* Excited? Great! My extemporaneous rambling is costing me more time than
* I spent making this damn thing! (No, really, it is.)
*/
'use strict';
/**
* A very rudimentary (and extremely permissive) LL(0) Lisp parser
*
* This provides just enough to get by. It transforms lists into nested
* arrays of tokens with some very basic error checking (e.g. for proper
* nesting). This is not a general-purpose lisp parser.
*/
class Parser
{
/**
* Produce an AST from the given string SRC of sexps
*
* This is essentially the CST with whitespace removed. It first
* invokes the lexer to produce a token string from the input
* sexps SRC. From this, it verifies only proper nesting (that SRC does
* not close sexps too early and that EOF isn't reached before all sexps
* are closed) and produces an AST that is an isomorphism of the
* original sexps.
*
* @param {string} src input Lisp
*
* @throws {SyntaxError} on improper sexp nesting
*
* @return {Array} primitive abstract syntax tree of SRC
*/
parseLisp( src )
{
// token string from lexing
const toks = this._lex( src );
// perform a leftmost reduction on the token string
const [ depth, ast ] = toks.reduce( ( result, token ) =>
{
const [ depth, xs, stack ] = result;
const { type, pos } = token;
// there are very few token types to deal with (again, this is
// a very simple bootstrap lisp)
switch ( type )
{
// ignore comments
case 'comment':
return result;
// closing parenthesis (end of sexp)
case 'close':
if ( depth === 0 ) {
this._error(
src, pos, `unexpected closing parenthesis`
);
}
// the sexp is complete; add to the AST, reduce depth
const top = stack.pop();
top.push( xs );
return [ ( depth - 1 ), top, stack ];
// opening parenthesis (start of sexp)
case 'open':
stack.push( xs );
return [ ( depth + 1 ), [], stack ];
// symbol or primitive; just copy the token in place
case 'string':
case 'symbol':
xs.push( token );
return [ depth, xs, stack ];
// should never happen unless there's a bug in the tokenizer
// or we forget a token type above
default:
this._error( src, pos, `unexpected token '${type}'` );
}
}, [ 0, [], [] ] );
// if we terminate at a non-zero depth, that means there
// are still open sexps
if ( depth > 0 ) {
throw SyntaxError(
`unexpected end of input at depth ${depth}`
);
}
// the result is a set of tokens organized into ES arrays
// isomorphic to the original sexp structure (the same structure)
return ast;
}
/**
* Throw a SyntaxError with a window of surrounding source code
*
* The "window" is simply ten characters to the left and right of the
* first character of the source input SRC that resulted in the error.
* It's a little more than useless.
*
* @param {string} src source code (sexps)
* @param {number} pos position of error
* @param {string} msg error message
*
* @throws {SyntaxError}}
*
* @return {undefined}
*/
_error( src, pos, msg )
{
const window = src.substr( pos - 10, pos + 10 )
.replace( "\n", " " );
throw new SyntaxError( `${msg}: '${window}'` );
}
/**
* Convert source input into a string of tokens
*
* This is the lexer. Whitespace is ignored. The grammar consists of
* simple s-expressions.
*
* Tokens are produced with `#_token'. The source SRC will be
* left-truncated as input is processed. POS exists for producing
* metadata for error reporting---it has no impact on parsing.
*
* This implementation was originally recursive and immutable, but the
* stack was being exhausted, so it was refactored into an inferior
* implementation.
*
* @param {string} src source code
* @param {number} pos position (character offset) in source
*
* @return {Array} string of tokens
*/
_lex( src, pos = 0 )
{
const toks = [];
// process until EOF (see break)
while ( true ) {
// ignore whitespace, if any
const ws = src.match( /^\s+/ ) || [ "" ];
const trim = src.substr( ws[ 0 ].length );
// adjust position to account for any removed whitespace
pos += ws[ 0 ].length;
// EOF and we're done
if ( trim === '' ) {
break;
}
let t = null;
// comment until end of line
if ( trim[ 0 ] === ';' ) {
const eol = trim.match( /^(.*?)(\n|$)/ );
t = this._token( 'comment', eol[ 1 ], trim, pos );
}
// left and right parenthesis are handled in the same manner: they
// produce distinct tokens with single-character lexemes
else if ( trim[ 0 ] === '(' ) {
t = this._token( 'open', '(', trim, pos );
}
else if ( trim[ 0 ] === ')' ) {
t = this._token( 'close', ')', trim, pos );
}
// strings are delimited by opening and closing ASCII double quotes,
// which can be escaped with a backslash
else if ( trim[ 0 ] === '"' ) {
const str = trim.match( /^"(|(?:.|\n)*?[^\\])"/ );
if ( !str ) {
this._error( src, pos, "missing closing string delimiter" );
}
// a string token consists of the entire string including quotes
// as its lexeme, but its value will be the value of the string
// without quotes due to the `str' match group (see `#_token')
t = this._token( 'string', str, trim, pos );
}
// anything else is considered a symbol up until whitespace or any
// of the aforementioned delimiters
else {
const symbol = trim.match( /^[^\s()"]+/ );
t = this._token( 'symbol', symbol, trim, pos );
}
const [ tok, newsrc, newpos ] = t;
// add token, left-truncate src, update pos
toks.push( tok );
src = newsrc;
pos = newpos;
}
return toks;
}
/**
* Produce a token, left-truncate src, and update pos
*
* @param {string} type token type
* @param {string|Array} match lexeme match
* @param {string} src source code string, left-truncated
* @param {number} pos offset relative to original src
*
* @return {Array} string of tokens
*/
_token( type, match, src, pos )
{
const parts = ( Array.isArray( match ) )
? match
: [ match ];
// the value is the first group of the match (indicating what we
// are actually interested in), and the lexeme is the full match,
// which might include, for example, string delimiters
const [ lexeme, value ] = parts;
const token = {
type: type,
lexeme: lexeme,
value: ( value === undefined ) ? lexeme : value,
pos: pos
};
// produce token, left-truncating the source string to discard what
// we have already processed
return [
token,
src.substr( lexeme.length ),
( pos + lexeme.length ),
];
}
};
/**
* Dumb compiler to transform AST into ECMAScript
*
* This is a really dumb code generator: it takes the AST and essentially
* transforms it 1:1 wherever possible into the target language. There is
* no intermediate representation (e.g. an ES AST).
*
* This is nothing like what we actually want the _ultimate_ compiler to do
* after Birth, but it gets us to a point where we can self-host on a basic
* Prebirth language and evolve from there.
*
* The code generation can be pretty much summed up by the last line of
* `Compiler#_cdfn'.
*/
class Compiler
{
/**
* Initialize with function map
*
* The function map will be used to map certain functions into other
* names or forms. For example, `es:console' may map to `console.log'
* and `if' to an `if' statement+expression.
*
* @param {Object} fnmap function map
*/
constructor( fnmap )
{
this._fnmap = fnmap;
}
/**
* Compile AST into ECMAScript
*
* Every function is mapped 1:1 to a function in ECMAScript. So, we
* just map all root children (which are expected to be Scheme-style
* shorthand function definitions) to functions.
*
* @param {Array} tree root containing top-level function definitions
*/
compile( tree )
{
// map every definition to a ES function definition and delimit them
// (for readability) by two newlines
return tree.map( this._sexpToEs.bind( this ) )
.join( "\n\n" );
}
/**
* Compile procedure definition into a ES function definition
*
* This will fail if the given token is not a `define'.
*
* @param {Object} t token
*
* @return {string} compiled function definition
*/
_cdfn( t )
{
// e.g. (define (foo ...) body)
const [ , [ { value: name }, ...params ], ...body ] = t;
const id = this._idFromName( name );
const paramjs = this._paramsToEs( params );
const bodyjs = this._bodyToEs( body );
// this is the final format---each procedure becomes its own function
// definition in ES
return `function ${id}(${paramjs})\n{\n${bodyjs}\n};`;
}
/**
* Compile parameter list
*
* This simply takes the value of the symbol and outputs it (formatted),
* delimited by commas.
*
* @param {Array} args token parameter list
*
* @return {string} compiled parameter list
*/
_paramsToEs( args )
{
return args.map( ({ value: name }) => this._idFromName( name ) )
.join( ", " );
}
/**
* Generate ECMAScript-friendly name from the given id
*
* A subset of special characters that are acceptable in Scheme are
* converted in an identifiable manner; others are simply converted to
* `$' in a catch-all and therefore could result in conflicts and cannot
* be reliably distinguished from one-another. Remember: this is
* temporary code.
*
* @param {string} name source name
*
* @return {string} ES-friendly identifier
*/
_idFromName( name )
{
if ( /^\d+$/.test( name ) ) {
return name;
}
// just some common ones; will fall back to `$' below
const conv = {
'-': '$_$',
'?': '$7$',
'@': '$a$',
'!': '$b$',
'>': '$g$',
'#': '$h$',
'*': '$k$',
'<': '$l$',
'&': '$n$',
'%': '$o$',
'+': '$p$',
'=': '$q$',
'^': '$v$',
'/': '$w$',
'$': '$$',
};
if ( name === undefined ) {
throw SyntaxError( "Missing identifier name" );
}
return '$$' + name.replace( /[^a-zA-Z0-9_]/g, c => conv[ c ] || '$' );
}
/**
* Compile body s-expressions into ECMAScript
*
* This produces a 1:1 mapping of BODY s-expressions to ES statements,
* recursively. The heavy lifting is done by `#_sexpToEs'.
*
* @param {Array} body s-expressions representing function body
*
* @return {string} compiled BODY
*/
_bodyToEs( body, ret = true )
{
// the body must be an array of expressions (this should always be
// the case unless we have a bug in the compiler)
if ( !Array.isArray( body ) ) {
throw Error( "body must be an Array" );
}
// process each s-expression in BODY
const js = body.map( this._sexpToEs.bind( this ) );
// the result (that is, an array of compiled s-expressions) is
// joined semicolon-delimited, with a `return' statement preceding
// the final expression
return js.map( ( s, i ) =>
{
const retstmt = ( ret && i === ( js.length - 1 ) ) ? "return " : "";
return ` ${retstmt}${s};`;
} ).join( '\n' );
}
/**
* Convert s-expression or scalar into ECMAScript
*
* T may be either an array of tokens or a primitive token (e.g. string,
* symbol). This method is applied recursively to T as needed if T is
* an array.
*
* @param {Array|Object} t tokens representing s-expressions/scalars
*
* @return {string} compiled s-expression/scalar
*/
_sexpToEs( t )
{
// just output symbols as identifiers as-is for now
if ( !Array.isArray( t ) ) {
switch ( t.type )
{
// strings are output as-is (note that we don't escape
// double quotes, because the method of escaping them is the
// same in Scheme as it is in ECMAScript---a backslash)
case 'string':
return `"${t.value}"`;
// symbols have the same concerns as function definitions: the
// identifiers generated need to be ES-friendly
case 'symbol':
return this._idFromName( t.value );
default:
throw Error( `Cannot compile unknown token \`${t.type}'` );
}
}
if ( t[ 0 ].value === 'define' ) {
return this._cdfn( t );
}
// simple function application (fn ...args)
const [ { value: fn }, ...args ] = t;
const mapentry = this._fnmap[ fn ];
// if the fnmap contains a function entry, then it will handle the
// remaining processing
if ( mapentry && ( typeof mapentry === 'function' ) ) {
return mapentry(
args,
this._sexpToEs.bind( this ),
this._bodyToEs.bind( this )
);
}
// convert all remaining symbols (after the symbol representing the
// function application) into arguments by parsing their sexps or
// scalar values
const idfn = mapentry || this._idFromName( fn, true );
const argstr = args.map( arg => this._sexpToEs( arg ) ).join( ", " );
// final function application
return `${idfn}(${argstr})`;
}
}
/**
* Function/procedure aliases and special forms
*
* And here we have what is probably the most grotesque part of this
* file. Saved the best for last.
*
* This map allows for a steady transition---items can be removed as they
* are written in Prebirth Lisp. This should give us a sane (but still
* simple) environment with which we can start to self-host.
*
* String values are simple function aliases. Function values take over the
* compilation of that function and allow for defining special forms (in
* place of macro support). The first argument to the function is the list
* of raw arguments (not yet compiled); the second argument is
* `Compiler#_sexpToEs'; and the third is `Compiler#bodyToEs'.
*
* These are by no means meant to be solid implementations; notable
* deficiencies are documented, but don't expect this to work properly in
* every case. They will be replaced with proper R7RS implementations in the
* future (after Birth).
*
* @type {Object}
*/
const fnmap = {
'es:console': 'console.log',
// yes, there are more important things to do until we get to the point
// where it's worth implementing proper tail calls
'es:while': ( [ pred, ...body ], stoes, btoes ) =>
"(function(__whilebrk){" +
`while (${stoes(pred)}){\n` +
`${btoes(body, false)} if (__whilebrk) break;\n` +
"}\n" +
"})(false)",
'es:break': () => '__whilebrk=true',
// fortunately ES6+ has native symbol support :)
// we don't (yet?) need list quoting in Prebirth
'quote': ( x, stoes ) => Array.isArray( x.value )
? (function () { throw SyntaxError(
"quoting lists is not yet supported; sorry!" )
} )()
: `Symbol.for('${stoes(x)}')`,
'lambda': ( [ args, ...body ], stoes, btoes ) =>
"function(" + args.map( stoes ).join( ", " ) + "){\n" +
btoes( body ) +
"}",
// simple if statement with optional else, wrapped in a self-executing
// function to simplify code generation (e.g. returning an if)
'if': ( [ pred, t, f ], stoes ) =>
"(function(){" +
`if (_truep(${stoes(pred)})){return ${stoes(t)};}` +
( ( f === undefined ) ? '' : `else{return ${stoes(f)};}` ) +
"})()",
// and short-circuits, so we need to implement it as a special form
// rather than an alias
'and': ( args, stoes ) =>
"(function(__and){\n" +
args.map( ( expr, i ) =>
`__and = ${stoes(expr)}; ` +
`if (!_truep(__and)) return false;\n`
).join( '' ) +
`return __and;})()`,
// or short-circuits, so we need to implement it as a special form
// rather than an alias
'or': ( args, stoes ) =>
"(function(__or){\n" +
args.map( ( expr, i ) =>
`__or = ${stoes(expr)}; ` +
`if (_truep(__or)) return __or;\n`
).join( '' ) +
"return false;})()",
// (let ((binding val) ...) ...body), compiled as a self-executing
// function which allows us to easily represent the return value of the
// entire expression while maintaining local scope
'let*': ( [ bindings, ...body ], stoes, btoes ) =>
"(function(){\n" +
bindings
.map( ([ x, val ]) => ` let ${stoes(x)} = ${stoes(val)};\n` )
.join( '' ) +
btoes( body ) + "\n" +
" })()",
// similar to the above, but variables cannot reference one-another
'let': ( [ bindings, ...body ], stoes, btoes ) =>
"(function(" +
bindings.map( ([ x ]) => stoes( x ) ).join( ", " ) +
"){\n" +
btoes( body ) + "\n" +
"})(" +
bindings.map( ([ , val ]) => stoes( val ) ).join( ", " ) +
")",
// and here I thought Prebirth Lisp would be simple...but having `case'
// support really keeps things much more tidy, so here we are (note that
// it doesn't support the arrow form, nor does it support expressions as
// data)
'case': ( [ key, ...clauses ], stoes, btoes ) =>
"(function(){" +
`const _key=${stoes(key)};\n` +
"switch (_key){\n" +
clauses.map( ([ data, ...exprs ]) =>
// warning: doesn't support expressions as data!
( ( data.lexeme === "else" )
? "default:\n"
: data.map(
datum => `case ${stoes(datum)}:\n`
).join( '' )
) +
btoes( exprs ) + "\n"
).join( '' ) +
"}" +
"})()",
// basic mutator (variable assignment)
'set!': ( [ varid, val ], stoes ) =>
`${stoes(varid)} = ${stoes(val)}`,
};
/**
* Facade to make compilation as easy as `Prebirth#compile'
*/
class Prebirth
{
/**
* Parse and compile SRC.
*
* The output will be wrapped in a self-executing function to restrict
* scope, and will be prefixed with LIB (which should be libprebirth.js).
*
* @param {string} src Prebirth Lisp source to compile
* @param {string} lib libprebirth.js
*
* @return {string} compiler output with trailing newline
*/
compile( src, lib )
{
const p = new Parser();
const c = new Compiler( fnmap );
const tree = p.parseLisp( src );
// output libprebirth and compiled output, wrapped in a self-executing
// function to limit scope
return "(function(){" +
lib + '\n\n' +
c.compile( tree ) +
"})();\n";
}
}
/* If we're running in a CommonJS environment (e.g. Node.js), export our
/* entrypoint.
*/
if ( typeof module !== 'undefined' ) {
module.exports = Prebirth;
}
/*
* Prebirth was originally intended to be run via the command line using
* Node.js. But it doesn't have to be. If you want, feel free to run it in
* your web browser; you'll just have to instantiate your own Prebirth.
*/
( function ()
{
if ( ( typeof process === 'undefined' )
|| ( ( typeof module !== 'undefined' ) && ( module.id !== '.' ) ) )
return;
const fs = require( 'fs' );
const src = fs.readFileSync( '/dev/stdin' ).toString();
const lib = fs.readFileSync( __dirname + '/libprebirth.js' ).toString();
const prebirth = new Prebirth();
// compile and output to stdout
process.stdout.write( prebirth.compile( src, lib ) );
} )();
/*
* Now that we have output, the next step is the hard part: rewriting this
* file in Prebirth Lisp. As I mentioned, this process is called
* "Birth". It's at this point that we have to decide on basic
* abstractions---we are starting from scratch. The initial implementation
* is therefore unlikely to be as concise and elegant as Prebirth
* itself---it will be refactored.
*
* Here is an example Hello, World!:
*
* (define (hello x)
* (es:console "Hello," x, "!"))
*
*
* ¹ This term should invoke visuals of an abstract being entering existence
* in some strange nonlinear-time² kind of way. If you thought of
* something less pleasant, well, I'm sorry you went through that.
*
* ² Because we're dealing with nonlinear time!¹ This would be some bizarre
* recursive footnote crap if it weren't for that.²
*/