ulambda/build-aux/bootstrap/birth.scm

;;; Prebirth Lisp implemented in Prebirth Lisp (self-hosting)
;;;
;;; Copyright (C) 2017 Mike Gerwitz
;;;
;;; This file is part of Gibble.
;;;
;;; Gibble is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU Affero General Public License as
;;; published by the Free Software Foundation, either version 3 of the
;;; License, or (at your option) any later version.
;;;
;;; This program is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU Affero General Public License
;;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;;
;;; THIS IS TEMPORARY CODE that will be REWRITTEN IN GIBBLE LISP ITSELF after
;;; a very basic bootstrap is complete. It is retained as an important
;;; artifact for those who wish to build Gibble from scratch without using
;;; another version of Gibble itself. This is called "self-hosting".
;;;
;;; This is the Prebirth Lisp implementation of the JavaScript Prebirth
;;; compiler, found in `prebirth.js'---that compiler can be used to compile
;;; this compiler, which can then be used to compile itself, completing the
;;; bootstrapping process. This process is termed "Birth", and the process
;;; is successful if the output of Birth compiling itself is byte-for-byte
;;; identical to the output of compiling Birth with Prebirth.
;;;
;;; This is largely a 1:1 translation of `prebirth.js'.
;;;
;;; Note that we're dealing with a small subset of Scheme here, so certain
;;; things might be done differently given a proper implementation. See
;;; that file for terminology.

;; pair selection
(define (cadr xs)
  (car (cdr xs)))

(define (caadr xs)
  (car (car (cdr xs))))

(define (caddr xs)
  (car (cdr (cdr xs))))

(define (cadddr xs)
  (car (cdr (cdr (cdr xs)))))

;; for convenience
(define (js:match-regexp re s)
  (js:match (js:regexp re) s))
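
;; Note on `js:match': judging from how `lex' and `token' below use its
;; result (`(car ws)' for the full match, `(cadr eol)' for a capture group,
;; and `(or str ...)' to detect failure), it appears to mirror JavaScript's
;; String#match, returning a false value on no match and otherwise a list
;; of the full match followed by any capture groups.  The illustrative
;; comments below assume that behaviour.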

;; Convert source input into a string of tokens.
;;
;; This is the lexer. Whitespace is ignored. The grammar consists of
;; simple s-expressions.
;;
;; This procedure is mutually recursive with `token'. It expects that
;; the source SRC will be left-truncated as input is processed. POS exists
;; for producing metadata for error reporting---it has no impact on
;; parsing.
;;
;; The result is a list of tokens. See `token' for the format.
(define (lex src pos)
  (let* ((ws      (or (js:match-regexp "^\\s+" src)
                      (list "")))
         (ws-len  (string-length (car ws)))
         (trim    (substring src ws-len))  ; ignore whitespace, if any
         (newpos  (+ pos ws-len)))
    (if (string=? "" trim)
        (list)                             ; EOF and we're done
        ;; normally we'd use `string-ref' here, but then we'd have to
        ;; implement Scheme characters, so let's keep this simple and keep
        ;; with strings
        (let ((ch (substring trim 0 1)))
          (case ch
            ;; comments extend until the end of the line
            ((";") (let ((eol (js:match-regexp "^(.*?)(\\n|$)" trim)))
                     (token "comment" (cadr eol) trim newpos)))

            ;; left and right parentheses are handled in the same manner:
            ;; they produce distinct tokens with single-character lexemes
            (("(") (token "open" ch trim newpos))
            ((")") (token "close" ch trim newpos))

            ;; strings are delimited by opening and closing ASCII double
            ;; quotes, which can be escaped with a backslash
            (("\"") (let ((str (js:match-regexp "^\"(|.*?[^\\\\])\"" trim)))
                      (or str (parse-error
                               src pos "missing closing string delimiter"))
                      ;; a string token consists of the entire string
                      ;; including quotes as its lexeme, but its value will
                      ;; be the value of the string without quotes due to
                      ;; the `str' match group (see `token')
                      (token "string" str trim newpos)))

            (else
             ;; anything else is considered a symbol up until whitespace or
             ;; any of the aforementioned delimiters
             (let ((symbol (js:match-regexp "^[^\\s()\"]+" trim)))
               (token "symbol" symbol trim newpos))))))))

;; Throw an error with a window of surrounding source code.
;;
;; The "window" is simply ten characters to the left and right of the
;; first character of the source input SRC that resulted in the error.
;; It's a little more than useless.
(define (parse-error src pos msg)
  (let ((window (substring src (- pos 10) (+ pos 10))))
    (error (string-append msg " (pos " pos "): " window)
           src)))

;; Produce a token and recurse.
;;
;; The token will be concatenated with the result of the mutually
;; recursive procedure `lex'.
;;
;; For the record: I'm not fond of mutual recursion from a clarity
;; standpoint, but this is how the abstraction evolved to de-duplicate
;; code, and I don't much feel like refactoring it.
;;
;; Unlike the JS Prebirth implementation which uses a key/value object,
;; we're just using a simple list.
;;
;; The expected arguments are: the token type TYPE, the match group or
;; string MATCH, left-truncated source code SRC, and the position POS
;; relative to the original source code.
(define (token type match src pos)
  (let* ((parts  (if (list? match) match (list match match)))
         (lexeme (car parts))
         ;; the value is the first group of the match (indicating what we
         ;; are actually interested in), and the lexeme is the full match,
         ;; which might include, for example, string delimiters
         (value  (or (and (pair? (cdr parts))
                          (cadr parts))
                     lexeme))
         (len    (string-length lexeme)))
    ;; produce token and recurse on `lex', left-truncating the source
    ;; string to discard what we have already processed
    (cons (list type lexeme value pos)
          (lex (substring src len)
               (+ pos len)))))
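
;; To illustrate the lexeme/value distinction with a hypothetical input:
;; for the string literal "hi", `lex' passes the match ("\"hi\"" "hi"),
;; giving the token ("string" "\"hi\"" "hi" pos), quotes kept in the
;; lexeme but stripped from the value; a bare symbol such as foo matches
;; without a capture group, so its lexeme and value are both "foo".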

;; various accessor procedures for token lists (we're Prebirth Lisp here,
;; so no record support or anything fancy!)
(define (token-type t)   (car t))
(define (token-lexeme t) (cadr t))
(define (token-value t)  (caddr t))
(define (token-pos t)    (cadddr t))

;; Produce an AST from the given string SRC of sexps
;;
;; This is essentially the CST with whitespace removed. It first invokes
;; the lexer to produce a token string from the input sexps SRC. From this,
;; it verifies only proper nesting (that SRC does not close sexps too early
;; and that EOF isn't reached before all sexps are closed) and produces an
;; AST that is an isomorphism of the original sexps.
(define (parse-lisp src)
  ;; accessor methods to make you and me less consfused
  (define (ast-depth ast) (car ast))
  (define (ast-tree ast)  (cadr ast))
  (define (ast-stack ast) (caddr ast))

  (define (toks->ast toks)
    (fold  ; then a leftmost reduction on the token string
     (lambda (token result)
       (let ((depth (ast-depth result))
             (xs    (ast-tree result))
             (stack (ast-stack result))
             (type  (token-type token))
             (pos   (token-pos token)))
         ;; there are very few token types to deal with (again, this is a
         ;; very simple bootstrap lisp)
         (case type
           ;; ignore comments
           (("comment") result)

           ;; when beginning a new expression, place the expression
           ;; currently being processed onto a stack, allocate a new list,
           ;; and we'll continue processing into that new list
           (("open") (list (+ depth 1)
                           (list)
                           (cons xs stack)))

           ;; once we reach the end of the expression, pop the parent off of
           ;; the stack and append the new list to it
           (("close") (if (zero? depth)
                          (parse-error src pos
                                       "unexpected closing parenthesis")
                          (list (- depth 1)
                                (append (car stack) (list xs))
                                (cdr stack))))

           ;; strings and symbols (we cheat and just consider everything,
           ;; including numbers and such, to be symbols) are just copied
           ;; in place
           (("string" "symbol") (list depth
                                      (append xs (list token))
                                      stack))

           ;; we should never encounter anything else unless there's a bug
           ;; in the tokenizer or we forget a token type above
           (else (parse-error
                  src pos (string-append
                           "unexpected token `" type "'"))))))
     (list 0 (list) (list))  ; initial 0 depth; empty tree; expr stack
     toks))

  ;; lex the input SRC and pass it to `toks->ast' to generate the AST;
  ;; if the depth is non-zero after we're done, then we're unbalanced.
  (let* ((toks (lex src 0))
         (ast  (toks->ast toks)))
    (if (zero? (ast-depth ast))
        (ast-tree ast)
        (error (string-append
                "unexpected end of input at depth "
                (ast-depth ast))))))
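
;; For instance (again assuming the `js:match' behaviour noted earlier),
;; parsing the hypothetical input "(foo)" would produce the tree
;;
;;   ((("symbol" "foo" "foo" 1)))
;;
;; that is, a list of one sexp containing one symbol token; nesting in the
;; input maps directly onto nesting of these token lists.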

;; at this point, this program can parse itself and output a CST (sans
;; whitespace)
(js:console
 (parse-lisp (js:stdin->string)))