Add prebirth.js
This is hopefully the beginning of a good thing that I'll actually finish. I began planning this project formally just before the beginning of Aug 2017. * build-aux/bootstrap/prebirth.js: New file.master
parent
ecd8b6d9e7
commit
7998296a20
|
@ -0,0 +1,538 @@
|
|||
/**
|
||||
* Bootstrap Gibble Lisp ("Prebirth")
|
||||
*
|
||||
* Copyright (C) 2017 Mike Gerwitz
|
||||
*
|
||||
* This file is part of Gibble.
|
||||
*
|
||||
* Gibble is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* THIS IS TEMPORARY CODE that will be REWRITTEN IN GIBBLE LISP ITSELF after
|
||||
* a very basic bootstrap is complete. It is retained as an important
|
||||
* artifact for those who wish to build Gibble from scratch without using
|
||||
* another version of Gibble itself. This is called "self-hosting".
|
||||
*
|
||||
* Rather than producing a sophisticated self-hosting language, this
|
||||
* language will be a terribly incomplete and inadequate version of what
|
||||
* will ultimately become a formidable and competent language.
|
||||
*
|
||||
* I refer to this entire compilation process as "Prebirth".¹ The "Birth"
|
||||
* of Gibble is the act of reimplementing this Prebirth in a Prebirth
|
||||
* version of Gibble Lisp itself. It's the chicken-and-egg paradox, without
|
||||
* the paradox.²
|
||||
*
|
||||
* Gibble Lisp is _not_ the most primitive language that will be understood
|
||||
* by the system---it is too high-level. After Birth, the language can
|
||||
* devolve into something more powerful and workable.
|
||||
*
|
||||
* Some minor terminology:
|
||||
* - AST: Abstract Syntax Tree, a processed form of the CST.
|
||||
* - CST: Concrete Syntax Tree, a 1-1 conversion of source input to
|
||||
* tokens.
|
||||
* - token: an object produced by the lexer that represents a portion of
|
||||
* the input language
|
||||
* - lexer: sometimes called a ``tokenizer''---produces tokens by applying
|
||||
* the grammar to a string of input.
|
||||
* - grammar: a definition of the language (syntax).
|
||||
* - lexeme: the portion of the original source string associated with a
|
||||
* given token.
|
||||
* - LL(0): Left-to-right, Leftmost derivation, 0 tokens lookahead
|
||||
* - sexp: symbolic expression, (involving (lots (of (((parentheses))))))
|
||||
*
|
||||
* Excited? Great! My extemporaneous rambling is costing me more time than
|
||||
* I spent making this damn thing! (No, really, it is.)
|
||||
*/
|
||||
|
||||
'use strict';
|
||||
|
||||
|
||||
/**
 * A very rudimentary (and extremely permissive) LL(0) Lisp parser
 *
 * This provides just enough to get by.  It transforms lists into nested
 * arrays of tokens with some very basic error checking (e.g. for proper
 * nesting).  This is not a general-purpose lisp parser.
 */
class Parser
{
    /**
     * Produce an AST from the given string SRC of sexps
     *
     * This is essentially the CST with whitespace removed.  It first
     * invokes the lexer to produce a token string from the input
     * sexps SRC.  From this, it verifies only proper nesting (that SRC does
     * not close sexps too early and that EOF isn't reached before all sexps
     * are closed) and produces an AST that is an isomorphism of the
     * original sexps.
     *
     * @param {string} src input Lisp
     *
     * @throws {SyntaxError} on improper sexp nesting
     *
     * @return {Array} primitive abstract syntax tree of SRC
     */
    parseLisp( src )
    {
        // token string from lexing
        const toks = this._lex( src );

        // perform a leftmost reduction on the token string; the accumulator
        // is [ nesting depth, list being built, stack of enclosing lists ]
        const [ depth, ast ] = toks.reduce( ( result, token ) =>
        {
            const [ level, xs, stack ] = result;
            const { type, pos }        = token;

            // there are very few token types to deal with (again, this is
            // a very simple bootstrap lisp)
            switch ( type )
            {
                // closing parenthesis (end of sexp)
                case 'close': {
                    if ( level === 0 ) {
                        this._error(
                            src, pos, `Unexpected closing parenthesis`
                        );
                    }

                    // the sexp is complete; add it to the enclosing list,
                    // which becomes the list being built once again
                    const top = stack.pop();
                    top.push( xs );

                    return [ ( level - 1 ), top, stack ];
                }

                // opening parenthesis (start of sexp); remember the
                // enclosing list and begin a fresh one
                case 'open':
                    stack.push( xs );
                    return [ ( level + 1 ), [], stack ];

                // symbol or primitive; just copy the token in place
                case 'string':
                case 'symbol':
                    xs.push( token );
                    return [ level, xs, stack ];

                // should never happen unless there's a bug in the tokenizer
                // or we forget a token type above (`_error' always throws)
                default:
                    return this._error(
                        src, pos, `Unexpected token '${type}'`
                    );
            }
        }, [ 0, [], [] ] );

        // if we terminate at a non-zero depth, that means there
        // are still open sexps
        if ( depth > 0 ) {
            throw new SyntaxError(
                `Unexpected end of input at depth ${depth}`
            );
        }

        // the result is a set of tokens organized into ES arrays
        // isomorphic to the original sexp structure (the same structure)
        return ast;
    }


    /**
     * Throw a SyntaxError with a window of surrounding source code
     *
     * The "window" is simply ten characters to the left and right of the
     * character at offset POS of SRC, clamped to the bounds of SRC, with
     * every newline replaced by a space so that the message stays on a
     * single line.  POS must be an offset into the very string given as
     * SRC.
     *
     * @param {string} src source code (sexps)
     * @param {number} pos position of error within SRC
     * @param {string} msg error message
     *
     * @throws {SyntaxError}
     *
     * @return {undefined} never returns normally
     */
    _error( src, pos, msg )
    {
        // clamp the left edge: a negative index would otherwise be
        // interpreted relative to the end of the string
        const start  = Math.max( 0, pos - 10 );
        const window = src.slice( start, pos + 10 )
            .replace( /\n/g, " " );

        throw new SyntaxError( `${msg}: '${window}'` );
    }


    /**
     * Convert source input into a string of tokens
     *
     * This is the lexer.  Whitespace is ignored.  The grammar consists of
     * simple s-expressions: parentheses, double-quoted strings (in which a
     * backslash escapes the character that follows it, and which may not
     * span lines), and symbols (anything else up to whitespace or one of
     * the aforementioned delimiters).
     *
     * SRC is scanned iteratively from left to right, so the size of the
     * input is not bounded by the call stack.  SRC may be a left-truncated
     * portion of some larger input, in which case POS is the offset of its
     * first character within that input; POS only affects the `pos'
     * metadata of the tokens produced---it has no impact on parsing.
     *
     * @param {string} src source code
     * @param {number} pos position (character offset) of SRC in source
     *
     * @throws {SyntaxError} if a string is missing its closing delimiter
     *
     * @return {Array} string of tokens
     */
    _lex( src, pos = 0 )
    {
        // sticky (`y') expressions match only at `lastIndex', which lets us
        // scan in place rather than repeatedly copying the remaining input
        const wsRe     = /\s+/y;
        const stringRe = /"((?:[^"\\\n\r\u2028\u2029]|\\.)*)"/y;
        const symbolRe = /[^\s()"]+/y;

        const matchAt = ( re, i ) =>
        {
            re.lastIndex = i;
            return re.exec( src );
        };

        const toks = [];
        let i      = 0;

        while ( i < src.length ) {
            // ignore whitespace, if any
            const ws = matchAt( wsRe, i );
            if ( ws ) {
                i += ws[ 0 ].length;
                continue;
            }

            let token;

            // left and right parenthesis are handled in the same manner:
            // they produce distinct tokens with single-character lexemes
            if ( src[ i ] === '(' ) {
                token = this._makeToken( 'open', '(', pos + i );
            }
            else if ( src[ i ] === ')' ) {
                token = this._makeToken( 'close', ')', pos + i );
            }
            // strings are delimited by opening and closing ASCII double
            // quotes, which can be escaped with a backslash
            else if ( src[ i ] === '"' ) {
                const str = matchAt( stringRe, i );

                if ( !str ) {
                    // I is an offset into SRC itself, which is what
                    // `_error' needs to produce a meaningful window
                    this._error(
                        src, i, "missing closing string delimiter"
                    );
                }

                // a string token consists of the entire string including
                // quotes as its lexeme, but its value will be the value of
                // the string without quotes due to the match group
                token = this._makeToken( 'string', str, pos + i );
            }
            // anything else is considered a symbol up until whitespace or
            // any of the aforementioned delimiters (this always matches,
            // since none of them is the character at I)
            else {
                token = this._makeToken(
                    'symbol', matchAt( symbolRe, i ), pos + i
                );
            }

            toks.push( token );
            i += token.lexeme.length;
        }

        return toks;
    }


    /**
     * Produce a single token
     *
     * MATCH is either the lexeme itself or a regular expression match
     * whose first element is the lexeme and whose (optional) first group
     * is the value.  If there is no such group, the value is the lexeme.
     * Note that an empty group is a perfectly good value (the empty
     * string).
     *
     * @param {string}       type  token type
     * @param {string|Array} match lexeme match
     * @param {number}       pos   offset relative to original src
     *
     * @return {Object} token with `type', `lexeme', `value' and `pos'
     */
    _makeToken( type, match, pos )
    {
        const [ lexeme, value ] = ( Array.isArray( match ) )
            ? match
            : [ match ];

        return {
            type:   type,
            lexeme: lexeme,
            value:  ( value === undefined ) ? lexeme : value,
            pos:    pos
        };
    }


    /**
     * Produce a token and lex the remainder of the input
     *
     * The token is concatenated with the result of applying `_lex' to SRC
     * with the token's lexeme removed from its left.  `_lex' itself no
     * longer goes through this method (it scans iteratively); it is
     * retained with its original contract for anything that still calls
     * it.
     *
     * @param {string}       type  token type
     * @param {string|Array} match lexeme match
     * @param {string}       src   source code string, left-truncated
     * @param {number}       pos   offset relative to original src
     *
     * @return {Array} string of tokens
     */
    _token( type, match, src, pos )
    {
        const token = this._makeToken( type, match, pos );
        const len   = token.lexeme.length;

        // continue producing tokens, left-truncating the source string to
        // discard what we have already processed
        return [ token ].concat(
            this._lex( src.slice( len ), ( pos + len ) )
        );
    }
};
|
||||
|
||||
|
||||
|
||||
/**
 * Dumb compiler to transform AST into ECMAScript
 *
 * This is a really dumb code generator: it takes the AST and essentially
 * transforms it 1:1 wherever possible into the target language.
 *
 * This is nothing like what we actually want the _ultimate_ compiler to do
 * after Birth, but it gets us to a point where we can self-host on a basic
 * Prebirth language and evolve from there.
 *
 * The code generation can be pretty much summed up by the last line of
 * `Compiler#_cdfn'.
 */
class Compiler
{
    /**
     * Compile AST into ECMAScript
     *
     * Every block is mapped 1:1 to a function in ECMAScript.  So, we just
     * map all root children (which are expected to be block definitions) to
     * functions.
     *
     * @param {Array} tree root of tree containing top-level block definitions
     *
     * @throws {Error} if a root child is not a valid block definition
     *
     * @return {string} compiled ECMAScript
     */
    compile( tree )
    {
        // map every definition to a ES function definition and delimit them
        // (for readability) by two newlines
        return tree.map( t => this._cdfn( t ) )
            .join( "\n\n" ) + "\n";
    }


    /**
     * Compile block definition into a ES function definition
     *
     * This will fail if the given token is not a `define-block'.  The
     * third element of the definition (the block's input description) is
     * currently ignored.
     *
     * @param {Object} t token
     *
     * @throws {Error} if T is not a `define-block' or has no block name
     *
     * @return {string} compiled block definition
     */
    _cdfn( t )
    {
        this.assertApply( t, 'define-block' );

        // e.g. (define-block <foo> ((input ...)) body)
        const [ , nameToken, , ...body ] = t;

        if ( !nameToken || ( typeof nameToken.value !== 'string' ) ) {
            throw new Error( "`define-block' is missing a block name" );
        }

        const id     = this._idFromName( nameToken.value );
        const bodyjs = this._bodyToEs( body );

        // this is the final format---each block becomes its own function
        // definition
        return `function ${id}()\n{\n${bodyjs}\n};`;
    }


    /**
     * Generate ECMAScript-friendly name from the given id
     *
     * Every character that is not an ASCII letter, digit or underscore is
     * replaced with `$'.
     *
     * @param {string} name source name
     *
     * @return {string} ES-friendly identifier
     */
    _idFromName( name )
    {
        return name.replace( /[^a-zA-Z0-9_]/g, '$' );
    }


    /**
     * Compile body s-expressions into ECMAScript
     *
     * This produces a 1:1 mapping of BODY s-expressions to ES statements,
     * recursively.  The heavy lifting is done by `#_sexpToEs'.
     *
     * @param {Array} body s-expressions representing block body
     *
     * @throws {Error} if BODY is not an array
     *
     * @return {string} compiled BODY
     */
    _bodyToEs( body )
    {
        // the body must be an array of expressions (this should always be
        // the case unless we have a bug in the compiler)
        if ( !Array.isArray( body ) ) {
            throw new Error( "body must be an Array" );
        }

        // process each s-expression in BODY
        const js   = body.map( s => this._sexpToEs( s ) );
        const last = js.length - 1;

        // the result (that is, an array of compiled s-expressions) is
        // joined semicolon-delimited, with a `return' statement preceding
        // the final expression
        return js
            .map( ( s, i ) =>
                " " + ( ( i === last ) ? "return " : "" ) + s + ";"
            )
            .join( "" );
    }


    /**
     * Convert s-expression or scalar into ECMAScript
     *
     * T may be either an array of tokens or a primitive token (e.g. string,
     * symbol).  This method is applied recursively to T as needed if T is
     * an array.
     *
     * @param {Array|Object} t tokens representing s-expressions/scalars
     *
     * @throws {Error} on an unknown token type, an application that is not
     *                 in block form, or a malformed argument mapping
     *
     * @return {string} compiled s-expression/scalar
     */
    _sexpToEs( t )
    {
        if ( !Array.isArray( t ) ) {
            const type = ( t || {} ).type;

            switch ( type )
            {
                // strings are output as-is (note that we don't escape
                // double quotes, because the method of escaping them is the
                // same in Scheme as it is in ECMAScript---a backslash)
                case 'string':
                    return `"${t.value}"`;

                // symbols have the same concerns as block definitions: the
                // identifiers generated need to be ES-friendly
                case 'symbol':
                    return this._idFromName( t.value );

                default:
                    throw new Error(
                        `Cannot compile unknown token \`${type}'`
                    );
            }
        }

        // only support block form for now, and assume that `fn' is a
        // string value (in the future, this doesn't have to be the
        // case---fn should be able to be an arbitrary sexp)
        if ( !this._isBlockForm( t ) ) {
            const fn = ( t[ 0 ] || {} ).value;

            throw new Error( `\`${fn}' application is not in block form` );
        }

        const [ { value: fn }, argmap ] = t;

        // convert all remaining symbols (after the symbol representing the
        // function application) into arguments by parsing their sexps or
        // scalar values; we're not going to worry about mapping them for
        // now; they will be compiled in the order in which they appear
        const idfn = this._idFromName( fn );
        const args = argmap.map( pair =>
        {
            if ( !Array.isArray( pair ) || ( pair.length < 2 ) ) {
                throw new Error(
                    `\`${fn}' arguments must be (key value) pairs`
                );
            }

            return this._sexpToEs( pair[ 1 ] );
        } );

        // make the dangerous assumption that arguments are ordered
        // for now
        return `${idfn}(${args.join( ", " )})`;
    }


    /**
     * Determine whether T represents a block form
     *
     * Block form is an application of a block, which has a certain
     * syntax.  Specifically: `(<block> ((key value) ...))'.  Anything that
     * is not a list, including the empty list, is not in block form.
     *
     * @param {*} t hopefully a token list
     *
     * @return {boolean} whether T represents a block form
     */
    _isBlockForm( t )
    {
        // check this first: T is only _hopefully_ a list
        if ( !Array.isArray( t ) ) {
            return false;
        }

        // the first symbol is the function name, second is an sexp
        // containing each of the key/value argument mappings
        const [ fn, argmap ] = t;

        // enforce block id convention (at least for now)
        const isblockid = (
            !!fn
            && ( typeof fn.value === 'string' )
            && /^<[^>]+>$/.test( fn.value )
        );

        return isblockid && Array.isArray( argmap );
    }


    /**
     * Determine whether T is an application of a symbol NAME, or error
     *
     * @param {*}      t    hopefully a token or token list
     * @param {string} name block name to assert against
     *
     * @throws {Error} if T is not an application of NAME
     *
     * @return {undefined}
     */
    assertApply( t, name )
    {
        // an application must be an s-expression
        if ( !Array.isArray( t ) ) {
            const found = ( t || {} ).value;

            throw new Error(
                `\`${name}' application expected, found symbol \`${found}'`
            );
        }

        // the list may be empty, or may begin with another list, in which
        // case there is no value to compare against
        const head  = t[ 0 ];
        const found = ( head === undefined )
            ? "()"
            : ( Array.isArray( head ) ? "(...)" : head.value );

        // if there's a match, we can stop here
        if ( ( head !== undefined ) && ( head.value === name ) ) {
            return;
        }

        // otherwise, provide an informative error of what we found and what
        // we should have found
        throw new Error(
            `\`${name}' expected, found \`${found}'`
        );
    }
}
|
||||
|
||||
|
||||
|
||||
/*
 * Command-line entry point.  Under Node.js (detected by the presence of the
 * `process' global), the Lisp source is read from standard input, parsed,
 * compiled, and the resulting ECMAScript is written to standard output.
 *
 * Anywhere else (a web browser, for example) this does nothing; instantiate
 * `Parser' and `Compiler' yourself.
 */
( () =>
{
    if ( typeof process !== 'undefined' ) {
        const parser   = new Parser();
        const compiler = new Compiler();

        // slurp all of stdin before doing anything with it
        const fs    = require( 'fs' );
        const input = fs.readFileSync( '/dev/stdin' );

        const ast = parser.parseLisp( input.toString() );
        const es  = compiler.compile( ast );

        process.stdout.write( es );
    }
} )();
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Now that we have output, the next step is the hard part: rewriting this
|
||||
* file in Prebirth Lisp. As I mentioned, this process is called
|
||||
* "Birth". It's at this point that we have to decide on basic
|
||||
* abstractions---we are starting from scratch. The initial implementation
|
||||
* is therefore unlikely to be as concise and elegant as Prebirth
|
||||
* itself---it will be refactored.
|
||||
*
|
||||
* Here is an example Hello, World!:
|
||||
*
|
||||
* (define-block <hello-world>
|
||||
* ()
|
||||
* (<js:console> ((message "Hello, world!"))))
|
||||
*
|
||||
*
|
||||
* ¹ This term should invoke visuals of an abstract being entering existence
|
||||
* in some strange nonlinear-time² kind of way. If you thought of
|
||||
* something less pleasant, well, I'm sorry you went through that.
|
||||
*
|
||||
* ² Because we're dealing with nonlinear time!¹ This would be some bizarre
|
||||
* recursive footnote crap if it weren't for that.²
|
||||
*/
|
Loading…
Reference in New Issue