From b5f70fd8a6214cd8144b25a93f2df9cf3c9e4467 Mon Sep 17 00:00:00 2001 From: Mike Gerwitz Date: Thu, 31 Aug 2017 01:01:06 -0400 Subject: [PATCH] birth: Add birth.scm, which can parse itself and output a CST Exciting first step! Though it required a much more complicated Prebirth Lisp than I was hoping to create. And I never intended to go into a full Scheme implementation, but that's the route this is headed in. I just can't stomach creating this full system in a block language. With that said, the block language will still be able to work with all Lisp code; you'll see. * build-aux/bootstrap/birth.scm: Add beginning of Birth, capable of parsing itself! Baby steps! --- build-aux/bootstrap/birth.scm | 139 ++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 build-aux/bootstrap/birth.scm diff --git a/build-aux/bootstrap/birth.scm b/build-aux/bootstrap/birth.scm new file mode 100644 index 0000000..90b9b7f --- /dev/null +++ b/build-aux/bootstrap/birth.scm @@ -0,0 +1,139 @@ +;;; Prebirth Lisp implemented in Prebirth Lisp (self-hosting) +;;; +;;; Copyright (C) 2017 Mike Gerwitz +;;; +;;; This file is part of Gibble. +;;; +;;; Gibble is free software: you can redistribute it and/or modify +;;; it under the terms of the GNU Affero General Public License as +;;; published by the Free Software Foundation, either version 3 of the +;;; License, or (at your option) any later version. +;;; +;;; This program is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details. +;;; +;;; You should have received a copy of the GNU Affero General Public License +;;; along with this program. If not, see <http://www.gnu.org/licenses/>. +;;; +;;; THIS IS TEMPORARY CODE that will be REWRITTEN IN GIBBLE LISP ITSELF after +;;; a very basic bootstrap is complete. 
It is retained as an important
+;;; artifact for those who wish to build Gibble from scratch without using
+;;; another version of Gibble itself.  This is called "self-hosting".
+;;;
+;;; This is the Prebirth Lisp implementation of the JavaScript Prebirth
+;;; compiler, found in `prebirth.js'---that compiler can be used to compile
+;;; this compiler, which can then be used to compile itself, completing the
+;;; bootstrapping process.  This process is termed "Birth".
+;;;
+;;; This is largely a 1:1 translation of `prebirth.js'.  Note that we're
+;;; dealing with a small subset of Scheme here, so certain things might be
+;;; done differently given a proper implementation.  See that file for
+;;; terminology.
+
+;; pair selection
+;;
+;; These are compositions of `car' and `cdr' that are defined here rather
+;; than assumed to exist.  Each expects XS to be a list long enough for
+;; the selection to make sense; no checking is performed.
+
+;; Second element of XS.
+(define (cadr xs)
+  (car (cdr xs)))
+
+;; First element of the second element of XS (which must itself be a pair).
+(define (caadr xs)
+  (car (car (cdr xs))))
+
+;; Third element of XS.
+(define (caddr xs)
+  (car (cdr (cdr xs))))
+
+;; Match the string S against the regular expression whose source is the
+;; string RE.
+;;
+;; RE is first passed through `js:regexp' and the result is handed to
+;; `js:match' along with S.  The callers in this file treat the result as
+;; false when there is no match, and otherwise as a list whose first
+;; element is the entire match followed by any match groups.
+(define (js:match-regexp re s)
+  (js:match (js:regexp re) s))
+
+
+
+;; Convert source input into a list of tokens.
+;;
+;; This is the lexer.  Whitespace is ignored.  The grammar consists of
+;; simple s-expressions.
+;;
+;; This procedure is mutually recursive with `token'.  It expects that
+;; the source SRC will be left-truncated as input is processed.  POS exists
+;; for producing metadata for error reporting---it has no impact on
+;; parsing.
+;;
+;; Leading whitespace is stripped from SRC and its length added to POS
+;; before the next token is recognized, so the position recorded for a
+;; token is that of its first character.
+;;
+;; The result is a list of tokens (empty once SRC is exhausted).  See
+;; `token' for the format.  An error is raised, with the position of the
+;; opening quote as its irritant, if a string is never closed.
+(define (lex src pos)
+  (let* ((ws     (or (js:match-regexp "^\\s+"
+                                      src)
+                     (list "")))        ; no match: zero-length whitespace
+         (ws-len (string-length (car ws)))
+         (trim   (substring src ws-len)) ; ignore whitespace, if any
+         (newpos (+ pos ws-len)))
+
+    (if (string=? "" trim)
+        (list)                          ; EOF and we're done
+
+        ;; normally we'd use `string-ref' here, but then we'd have to
+        ;; implement Scheme characters, so let's keep this simple and keep
+        ;; with strings
+        (let ((ch (substring trim 0 1)))
+          (case ch
+            ;; comments extend until the end of the line; only the first
+            ;; match group is passed on, so the line terminator itself is
+            ;; left in the source and consumed as whitespace afterward
+            ((";") (let ((eol (js:match-regexp "^(.*?)(\\n|$)" trim)))
+                     (token "comment" (cadr eol) trim newpos)))
+
+            ;; left and right parenthesis are handled in the same manner:
+            ;; they produce distinct tokens with single-character lexemes
+            (("(") (token "open" ch trim newpos))
+            ((")") (token "close" ch trim newpos))
+
+            ;; strings are delimited by opening and closing ASCII double
+            ;; quotes, which can be escaped with a backslash
+            (("\"") (let ((str (js:match-regexp "^\"(|.*?[^\\\\])\""
+                                                trim)))
+                      ;; `str' is false when we get here without a match,
+                      ;; so it is useless as an irritant; report where the
+                      ;; unterminated string begins instead
+                      (or str (error "missing closing string delimiter"
+                                     newpos))
+                      ;; a string token consists of the entire string
+                      ;; including quotes as its lexeme, but its value will
+                      ;; be the value of the string without quotes due to
+                      ;; the `str' match group (see `token')
+                      (token "string" str trim newpos)))
+
+            (else
+             ;; anything else is considered a symbol up until whitespace or
+             ;; any of the aforementioned delimiters
+             (let ((symbol (js:match-regexp "^[^\\s()\"]+"
+                                            trim)))
+               (token "symbol" symbol trim newpos))))))))
+
+
+;; Produce a token and recurse.
+;;
+;; The token will be concatenated with the result of the mutually
+;; recursive procedure `lex'.
+;;
+;; For the record: I'm not fond of mutual recursion from a clarity
+;; standpoint, but this is how the abstraction evolved to de-duplicate
+;; code, and I don't much feel like refactoring it.
+;;
+;; Unlike the JS Prebirth implementation which uses a key/value object,
+;; we're just using a simple list.
+;;
+;; The expected arguments are: the token type TYPE, the match group or
+;; string MATCH, left-truncated source code SRC, and the position POS
+;; relative to the original source code.
+(define (token type match src pos)
+  ;; MATCH is either a match list (full match first, groups after) or a
+  ;; bare string; a bare string is normalized into a two-element list so
+  ;; that both shapes can be handled the same way below
+  (let* ((groups   (if (list? match)
+                       match
+                       (list match match)))
+         ;; the lexeme is always the full match, which might include, for
+         ;; example, string delimiters
+         (lexeme   (car groups))
+         (rest     (cdr groups))
+         ;; the value is the first match group when there is one (that is
+         ;; the part we are actually interested in), otherwise the lexeme
+         (value    (or (and (pair? rest)
+                            (car rest))
+                       lexeme))
+         ;; number of source characters this token accounts for
+         (consumed (string-length lexeme)))
+
+    ;; a token is the list (TYPE LEXEME VALUE POS); prepend it to whatever
+    ;; `lex' produces for the remainder of the source, which is SRC with
+    ;; the lexeme dropped from the left and POS advanced to match
+    (cons (list type lexeme value pos)
+          (lex (substring src consumed)
+               (+ pos consumed)))))
+
+
+;; entry point: read all of standard input, lex it from position zero,
+;; and print the resulting token list; this is enough for the program to
+;; parse itself and output a CST (sans whitespace)
+(js:console
+  (lex (js:stdin->string) 0))