tame/tamer/src/parse.rs

1301 lines
44 KiB
Rust
Raw Blame History

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

// Basic streaming parsing framework
//
// Copyright (C) 2014-2022 Ryan Specialty Group, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//! Basic streaming parser framework for lowering operations.
//!
//! _TODO: Some proper docs and examples!_
use crate::diagnose::{Annotate, AnnotatedSpan, Diagnostic};
use crate::iter::{TripIter, TrippableIterator};
use crate::span::{Span, UNKNOWN_SPAN};
use std::fmt::Debug;
use std::hint::unreachable_unchecked;
use std::iter::{self, Empty};
use std::mem::take;
use std::ops::{ControlFlow, Deref, DerefMut, FromResidual, Try};
use std::{convert::Infallible, error::Error, fmt::Display};
/// Result of applying a [`Token`] to a [`ParseState`],
/// with any error having been wrapped in a [`ParseError`].
pub type ParsedResult<S> = ParseResult<S, Parsed<<S as ParseState>::Object>>;
/// Result of some non-parsing operation on a [`Parser`],
/// with any error having been wrapped in a [`ParseError`].
pub type ParseResult<S, T> =
Result<T, ParseError<<S as ParseState>::Token, <S as ParseState>::Error>>;
/// A single datum from a streaming IR with an associated [`Span`].
///
/// A token may be a lexeme with associated data,
/// or a more structured object having been lowered from other IRs.
pub trait Token: Display + Debug + PartialEq {
/// Retrieve the [`Span`] representing the source location of the token.
fn span(&self) -> Span;
}
impl<T: Token> From<T> for Span {
fn from(tok: T) -> Self {
tok.span()
}
}
/// An IR object produced by a lowering operation on one or more [`Token`]s.
///
/// Note that an [`Object`] may also be a [`Token`] if it will be in turn
/// fed to another [`Parser`] for lowering.
///
/// This trait exists to disambiguate an otherwise unbounded type for
/// [`From`] conversions,
/// used in the [`Transition`] API to provide greater flexibility.
pub trait Object: Debug + PartialEq {}
/// An infallible [`Token`] stream.
///
/// If the token stream originates from an operation that could potentially
/// fail and ought to be propagated,
/// use [`TokenResultStream`].
///
/// The name "stream" in place of "iterator" is intended to convey that this
/// type is expected to be processed in real-time as a stream,
/// not read into memory.
pub trait TokenStream<T: Token> = Iterator<Item = T>;
/// A [`Token`] stream that may encounter errors during parsing.
///
/// If the stream cannot fail,
/// consider using [`TokenStream`].
pub trait TokenResultStream<T: Token, E: Error> = Iterator<Item = Result<T, E>>;
/// A [`ParseState`] capable of being automatically stitched together with
/// a parent [`ParseState`] `SP` to create a composite parser.
///
/// Conceptually,
/// this can be visualized as combining the state machines of multiple
/// parsers into one larger state machine.
///
/// The term _state stitching_ refers to a particular pattern able to be
/// performed automatically by this parsing framework;
/// it is not necessary for parser composition,
/// provided that you perform the necessary wiring yourself in absence
/// of state stitching.
pub trait StitchableParseState<SP: ParseState> = ParseState
where
SP: ParseState<Token = <Self as ParseState>::Token>,
<Self as ParseState>::Object: Into<<SP as ParseState>::Object>,
<Self as ParseState>::Error: Into<<SP as ParseState>::Error>;
/// A parsing automaton.
///
/// These states are utilized by a [`Parser`].
///
/// A [`ParseState`] is also responsible for storing data about the
/// accepted input,
/// and handling appropriate type conversions into the final type.
/// That is---an
/// automaton may store metadata that is subsequently emitted once an
/// accepting state has been reached.
/// Whatever the underlying automaton,
/// a `(state, token, context)` triple must uniquely determine the next
/// parser action.
pub trait ParseState: Default + PartialEq + Eq + Debug {
/// Input tokens to the parser.
type Token: Token;
/// Objects produced by a parser utilizing these states.
type Object: Object;
/// Errors specific to this set of states.
type Error: Debug + Diagnostic + PartialEq;
/// Object provided to parser alongside each token.
///
/// This may be used in situations where Rust/LLVM are unable to
/// optimize away moves of interior data associated with the
/// otherwise-immutable [`ParseState`].
type Context: Debug = EmptyContext;
/// Construct a parser.
///
/// Whether this method is helpful or provides any clarity depends on
/// the context and the types that are able to be inferred.
fn parse<I: TokenStream<Self::Token>>(toks: I) -> Parser<Self, I>
where
Self::Context: Default,
{
Parser::from(toks)
}
/// Construct a parser with a non-default [`ParseState::Context`].
///
/// This is useful in two ways:
///
/// 1. To allow for parsing using a context that does not implement
/// [`Default`],
/// or whose default is not sufficient; and
/// 2. To re-use a context from a previous [`Parser`].
///
/// If neither of these apply to your situation,
/// consider [`ParseState::parse`] instead.
///
/// To retrieve a context from a parser for re-use,
/// see [`Parser::finalize`].
fn parse_with_context<I: TokenStream<Self::Token>>(
toks: I,
ctx: Self::Context,
) -> Parser<Self, I> {
Parser::from((toks, ctx))
}
/// Parse a single [`Token`] and optionally perform a state transition.
///
/// The current state is represented by `self`.
/// The result of a parsing operation is a state transition with
/// associated [`ParseStatus`] data.
///
/// Note that `self` is owned,
/// for a couple primary reasons:
///
/// 1. This forces the parser to explicitly consider and document all
/// state transitions,
/// rather than potentially missing unintended behavior through
/// implicit behavior; and
/// 2. It allows for more natural functional composition of state,
/// which in turn makes it easier to compose parsers
/// (which conceptually involves stitching together state
/// machines).
///
/// Since a [`ParseState`] produces a new version of itself with each
/// invocation,
/// it is functionally pure.
/// Generally,
/// Rust/LLVM are able to optimize moves into direct assignments.
/// However,
/// there are circumstances where this is _not_ the case,
/// in which case [`Context`] can be used to provide a mutable context
/// owned by the caller (e.g. [`Parser`]) to store additional
/// information that is not subject to Rust's move semantics.
/// If this is not necessary,
/// see [`NoContext`].
fn parse_token(
self,
tok: Self::Token,
ctx: &mut Self::Context,
) -> TransitionResult<Self>;
/// Whether the current state represents an accepting state.
///
/// An accepting state represents a valid state to stop parsing.
/// If parsing stops at a state that is _not_ accepting,
/// then the [`TokenStream`] has ended unexpectedly and should produce
/// a [`ParseError::UnexpectedEof`].
///
/// It makes sense for there to be exist multiple accepting states for a
/// parser.
/// For example:
/// A parser that parses a list of attributes may be used to parse one
/// or more attributes,
/// or the entire list of attributes.
/// It is acceptable to attempt to parse just one of those attributes,
/// or it is acceptable to parse all the way until the end.
fn is_accepting(&self) -> bool;
/// Delegate parsing from a compatible, stitched [`ParseState`]~`SP`.
///
/// This helps to combine two state machines that speak the same input
/// language
/// (share the same [`Self::Token`]),
/// handling the boilerplate of delegating [`Self::Token`] from a
/// parent state~`SP` to `Self`.
///
/// Token delegation happens after [`Self`] has been entered from a
/// parent [`ParseState`] context~`SP`,
/// so stitching the start and accepting states must happen elsewhere
/// (for now).
///
/// This assumes that no lookahead token from [`ParseStatus::Dead`] will
/// need to be handled by the parent state~`SP`.
/// To handle a token of lookahead,
/// use [`Self::delegate_lookahead`] instead.
///
/// _TODO: More documentation once this is finalized._
fn delegate<SP, C>(
self,
mut context: C,
tok: <Self as ParseState>::Token,
into: impl FnOnce(Self) -> SP,
) -> TransitionResult<SP>
where
Self: StitchableParseState<SP>,
C: AsMut<<Self as ParseState>::Context>,
{
use ParseStatus::{Dead, Incomplete, Object as Obj};
let (Transition(newst), result) =
self.parse_token(tok, context.as_mut()).into();
// This does not use `delegate_lookahead` so that we can have
// `into: impl FnOnce` instead of `Fn`.
Transition(into(newst)).result(match result {
Ok(Incomplete) => Ok(Incomplete),
Ok(Obj(obj)) => Ok(Obj(obj.into())),
Ok(Dead(tok)) => Ok(Dead(tok)),
Err(e) => Err(e.into()),
})
}
/// Delegate parsing from a compatible, stitched [`ParseState`]~`SP` with
/// support for a lookahead token.
///
/// This does the same thing as [`Self::delegate`],
/// but allows for the handling of a lookahead token from [`Self`]
/// rather than simply proxying [`ParseStatus::Dead`].
///
/// _TODO: More documentation once this is finalized._
fn delegate_lookahead<SP, C>(
self,
mut context: C,
tok: <Self as ParseState>::Token,
into: impl FnOnce(Self) -> SP,
) -> ControlFlow<TransitionResult<SP>, (Self, <Self as ParseState>::Token, C)>
where
Self: StitchableParseState<SP>,
C: AsMut<<Self as ParseState>::Context>,
{
use ControlFlow::*;
use ParseStatus::{Dead, Incomplete, Object as Obj};
// NB: Rust/LLVM are generally able to elide these moves into direct
// assignments,
// but sometimes this does not work
// (e.g. XIRF's use of `ArrayVec`).
// If your [`ParseState`] has a lot of `memcpy`s or other
// performance issues,
// move heavy objects into `context`.
let (Transition(newst), result) =
self.parse_token(tok, context.as_mut()).into();
match result {
Ok(Incomplete) => Break(Transition(into(newst)).incomplete()),
Ok(Obj(obj)) => Break(Transition(into(newst)).ok(obj.into())),
Ok(Dead(tok)) => Continue((newst, tok, context)),
Err(e) => Break(Transition(into(newst)).err(e)),
}
}
}
/// Empty [`Context`] for [`ParseState`]s with pure functional
/// implementations with no mutable state.
///
/// Using this value means that a [`ParseState`] does not require a
/// context.
/// All [`Context`]s implement [`AsMut<EmptyContext>`](AsMut),
/// and so all pure [`ParseState`]s have contexts compatible with every
/// other parser for composition
/// (provided that the other invariants in [`StitchableParseState`] are
/// met).
///
/// This can be clearly represented in function signatures using
/// [`EmptyContext`].
#[derive(Debug, PartialEq, Eq, Default)]
pub struct EmptyContext;
impl AsMut<EmptyContext> for EmptyContext {
fn as_mut(&mut self) -> &mut EmptyContext {
self
}
}
/// A [`ParseState`] does not require any mutable [`Context`].
///
/// A [`ParseState`] using this context is pure
/// (has no mutable state),
/// returning a new version of itself on each state change.
///
/// This type is intended to be self-documenting:
/// `_: EmptyContext` is nicer to readers than `_: &mut EmptyContext`.
///
/// See [`EmptyContext`] for more information.
pub type NoContext<'a> = &'a mut EmptyContext;
/// Mutable context for [`ParseState`].
///
/// [`ParseState`]s are immutable and pure---they
/// are invoked via [`ParseState::parse_token`] and return a new version
/// of themselves representing their new state.
/// Rust/LLVM are generally able to elide intermediate values and moves,
/// optimizing these parsers away into assignments.
///
/// However,
/// there are circumstances where moves may not be elided and may retain
/// their `memcpy` equivalents.
/// To work around this,
/// [`ParseState::parse_token`] accepts a mutable [`Context`] reference
/// which is held by the parent [`Parser`],
/// which can be mutated in-place without worrying about Rust's move
/// semantics.
///
/// Plainly: you should only use this if you have to.
/// This was added because certain parsers may be invoked millions of times
/// for each individual token in systems with many source packages,
/// which may otherwise result in millions of `memcpy`s.
///
/// When composing two [`ParseState`]s `A<B, C>`,
/// a [`Context<B, C>`](Context) must be contravariant over `B` and~`C`.
/// Concretely,
/// this means that [`AsMut<B::Context>`](AsMut) and
/// [`AsMut<C::Context>`](AsMut) must be implemented for `A::Context`.
/// This almost certainly means that `A::Context` is a product type.
/// Consequently,
/// a single [`Parser`] is able to hold a composite [`Context`] in a
/// single memory location.
///
/// [`Context<T>`](Context) implements [`Deref<T>`](Deref) for convenience.
///
/// If your [`ParseState`] does not require a mutable [`Context`],
/// see [`NoContext`].
#[derive(Debug, Default)]
pub struct Context<T: Debug + Default>(T, EmptyContext);
impl<T: Debug + Default> AsMut<EmptyContext> for Context<T> {
fn as_mut(&mut self) -> &mut EmptyContext {
&mut self.1
}
}
impl<T: Debug + Default> Deref for Context<T> {
type Target = T;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<T: Debug + Default> DerefMut for Context<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl<T: Debug + Default> From<T> for Context<T> {
fn from(x: T) -> Self {
Context(x, EmptyContext)
}
}
/// Result of applying a [`Token`] to a [`ParseState`].
///
/// This is used by [`ParseState::parse_token`];
/// see that function for rationale.
pub type ParseStateResult<S> = Result<ParseStatus<S>, <S as ParseState>::Error>;
/// A state transition with associated data.
///
/// Conceptually,
/// imagine the act of a state transition producing data.
/// See [`Transition`] for convenience methods for producing this tuple.
#[derive(Debug, PartialEq)]
pub struct TransitionResult<S: ParseState>(
pub Transition<S>,
pub ParseStateResult<S>,
);
/// Denotes a state transition.
///
/// This newtype was created to produce clear, self-documenting code;
/// parsers can get confusing to read with all of the types involved,
/// so this provides a mental synchronization point.
///
/// This also provides some convenience methods to help remote boilerplate
/// and further improve code clarity.
#[derive(Debug, PartialEq, Eq)]
pub struct Transition<S: ParseState>(pub S);
impl<S: ParseState> Transition<S> {
/// A state transition with corresponding data.
///
/// This allows [`ParseState::parse_token`] to emit a parsed object and
/// corresponds to [`ParseStatus::Object`].
pub fn ok<T>(self, obj: T) -> TransitionResult<S>
where
T: Into<ParseStatus<S>>,
{
TransitionResult(self, Ok(obj.into()))
}
/// A transition with corresponding error.
///
/// This indicates a parsing failure.
/// The state ought to be suitable for error recovery.
pub fn err<E: Into<S::Error>>(self, err: E) -> TransitionResult<S> {
TransitionResult(self, Err(err.into()))
}
/// A state transition with corresponding [`Result`].
///
/// This translates the provided [`Result`] in a manner equivalent to
/// [`Transition::ok`] and [`Transition::err`].
pub fn result<T, E>(self, result: Result<T, E>) -> TransitionResult<S>
where
T: Into<ParseStatus<S>>,
E: Into<S::Error>,
{
TransitionResult(self, result.map(Into::into).map_err(Into::into))
}
/// A state transition indicating that more data is needed before an
/// object can be emitted.
///
/// This corresponds to [`ParseStatus::Incomplete`].
pub fn incomplete(self) -> TransitionResult<S> {
TransitionResult(self, Ok(ParseStatus::Incomplete))
}
/// A dead state transition.
///
/// This corresponds to [`ParseStatus::Dead`],
/// and a calling parser should use the provided [`Token`] as
/// lookahead.
pub fn dead(self, tok: S::Token) -> TransitionResult<S> {
TransitionResult(self, Ok(ParseStatus::Dead(tok)))
}
}
impl<S: ParseState> Into<(Transition<S>, ParseStateResult<S>)>
for TransitionResult<S>
{
fn into(self) -> (Transition<S>, ParseStateResult<S>) {
(self.0, self.1)
}
}
impl<S: ParseState> Try for TransitionResult<S> {
type Output = (Transition<S>, ParseStateResult<S>);
type Residual = (Transition<S>, ParseStateResult<S>);
fn from_output(output: Self::Output) -> Self {
match output {
(st, result) => Self(st, result),
}
}
fn branch(self) -> ControlFlow<Self::Residual, Self::Output> {
match self.into() {
(st, Ok(x)) => ControlFlow::Continue((st, Ok(x))),
(st, Err(e)) => ControlFlow::Break((st, Err(e))),
}
}
}
impl<S: ParseState> FromResidual<(Transition<S>, ParseStateResult<S>)>
for TransitionResult<S>
{
fn from_residual(residual: (Transition<S>, ParseStateResult<S>)) -> Self {
match residual {
(st, result) => Self(st, result),
}
}
}
impl<S: ParseState> FromResidual<Result<Infallible, TransitionResult<S>>>
for TransitionResult<S>
{
fn from_residual(
residual: Result<Infallible, TransitionResult<S>>,
) -> Self {
match residual {
Err(e) => e,
// SAFETY: This match arm doesn't seem to be required in
// core::result::Result's FromResidual implementation,
// but as of 1.61 nightly it is here.
// Since this is Infallable,
// it cannot occur.
Ok(_) => unsafe { unreachable_unchecked() },
}
}
}
impl<S: ParseState> FromResidual<ControlFlow<TransitionResult<S>, Infallible>>
for TransitionResult<S>
{
fn from_residual(
residual: ControlFlow<TransitionResult<S>, Infallible>,
) -> Self {
match residual {
ControlFlow::Break(result) => result,
// SAFETY: Infallible, so cannot hit.
ControlFlow::Continue(_) => unsafe { unreachable_unchecked() },
}
}
}
/// An object able to be used as data for a state [`Transition`].
///
/// This flips the usual order of things:
/// rather than using a method of [`Transition`] to provide data,
/// this starts with the data and produces a transition from it.
/// This is sometimes necessary to satisfy ownership/borrowing rules.
///
/// This trait simply removes boilerplate associated with storing
/// intermediate values and translating into the resulting type.
pub trait Transitionable<S: ParseState> {
/// Perform a state transition to `S` using [`Self`] as the associated
/// data.
///
/// This may be necessary to satisfy ownership/borrowing rules when
/// state data from `S` is used to compute [`Self`].
fn transition(self, to: S) -> TransitionResult<S>;
}
impl<S, E> Transitionable<S> for Result<ParseStatus<S>, E>
where
S: ParseState,
<S as ParseState>::Error: From<E>,
{
fn transition(self, to: S) -> TransitionResult<S> {
Transition(to).result(self)
}
}
impl<S, E> Transitionable<S> for Result<(), E>
where
S: ParseState,
<S as ParseState>::Error: From<E>,
{
fn transition(self, to: S) -> TransitionResult<S> {
Transition(to).result(self.map(|_| ParseStatus::Incomplete))
}
}
/// A streaming parser defined by a [`ParseState`] with exclusive
/// mutable access to an underlying [`TokenStream`].
///
/// This parser handles operations that are common among all types of
/// parsers,
/// such that specialized parsers need only implement logic that is
/// unique to their operation.
/// This also simplifies combinators,
/// since there is more uniformity among distinct parser types.
///
/// After you have finished with a parser,
/// if you have not consumed the entire iterator,
/// call [`finalize`](Parser::finalize) to ensure that parsing has
/// completed in an accepting state.
#[derive(Debug, PartialEq, Eq)]
pub struct Parser<S: ParseState, I: TokenStream<S::Token>> {
toks: I,
state: S,
last_span: Span,
ctx: S::Context,
}
impl<S: ParseState, I: TokenStream<S::Token>> Parser<S, I> {
/// Indicate that no further parsing will take place using this parser,
/// retrieve any final aggregate state (the context),
/// and [`drop`] it.
///
/// Invoking the method is equivalent to stating that the stream has
/// ended,
/// since the parser will have no later opportunity to continue
/// parsing.
/// Consequently,
/// the caller should expect [`ParseError::UnexpectedEof`] if the
/// parser is not in an accepting state.
///
/// To re-use the context returned by this method,
/// see [`ParseState::parse_with_context`].
/// Note that whether the context is permitted to be reused,
/// or is useful independently to the caller,
/// is a decision made by the [`ParseState`].
pub fn finalize(
self,
) -> Result<S::Context, (Self, ParseError<S::Token, S::Error>)> {
match self.assert_accepting() {
Ok(()) => Ok(self.ctx),
Err(err) => Err((self, err)),
}
}
/// Return [`Ok`] if the parser is in an accepting state,
/// otherwise [`Err`] with [`ParseError::UnexpectedEof`].
///
/// See [`finalize`](Self::finalize) for the public-facing method.
fn assert_accepting(&self) -> Result<(), ParseError<S::Token, S::Error>> {
if self.state.is_accepting() {
Ok(())
} else {
let endpoints = self.last_span.endpoints();
Err(ParseError::UnexpectedEof(
endpoints.1.unwrap_or(endpoints.0),
))
}
}
/// Feed an input token to the parser.
///
/// This _pushes_ data into the parser,
/// rather than the typical pull system used by [`Parser`]'s
/// [`Iterator`] implementation.
/// The pull system also uses this method to provided data to the
/// parser.
///
/// This method is intentionally private,
/// since push parsers are currently supported only internally.
/// The only thing preventing this being public is formalization and a
/// commitment to maintain it.
fn feed_tok(&mut self, tok: S::Token) -> ParsedResult<S> {
// Store the most recently encountered Span for error
// reporting in case we encounter an EOF.
self.last_span = tok.span();
let result;
TransitionResult(Transition(self.state), result) =
take(&mut self.state).parse_token(tok, &mut self.ctx);
use ParseStatus::*;
match result {
// Nothing handled this dead state,
// and we cannot discard a lookahead token,
// so we have no choice but to produce an error.
Ok(Dead(invalid)) => Err(ParseError::UnexpectedToken(invalid)),
Ok(parsed @ (Incomplete | Object(..))) => Ok(parsed.into()),
Err(e) => Err(e.into()),
}
}
/// Lower the IR produced by this [`Parser`] into another IR by piping
/// the output to a new parser defined by the [`ParseState`] `LS`.
///
/// This parser consumes tokens `S::Token` and produces the IR
/// `S::Output`.
/// If there is some other [`ParseState`] `LS` such that
/// `LS::Token == S::Output`
/// (that is—the output of this parser is the input to another),
/// then this method will wire the two together into a new iterator
/// that produces `LS::Output`.
///
/// Visually, we have,
/// within the provided closure `f`,
/// a [`LowerIter`] that acts as this pipeline:
///
/// ```text
/// (S::Token) -> (S::Output == LS::Token) -> (LS::Output)
/// ```
///
/// The new iterator is a [`LowerIter`],
/// and scoped to the provided closure `f`.
/// The outer [`Result`] of `Self`'s [`ParsedResult`] is stripped by
/// a [`TripIter`] before being provided as input to a new push
/// [`Parser`] utilizing `LS`.
/// A push parser,
/// rather than pulling tokens from a [`TokenStream`],
/// has tokens pushed into it;
/// this parser is created automatically for you.
///
/// _TODO_: There's no way to access the inner parser for error recovery
/// after tripping the [`TripIter`].
/// Consequently,
/// this API (likely the return type) will change.
#[inline]
pub fn lower_while_ok<LS, U>(
&mut self,
f: impl FnOnce(&mut LowerIter<S, I, LS>) -> U,
) -> Result<U, ParseError<S::Token, S::Error>>
where
LS: ParseState<Token = S::Object>,
<S as ParseState>::Object: Token,
<LS as ParseState>::Context: Default,
{
self.while_ok(|toks| {
// TODO: This parser is not accessible after error recovery!
let lower = LS::parse(iter::empty());
f(&mut LowerIter { lower, toks })
})
}
}
/// An IR lowering operation that pipes the output of one [`Parser`] to the
/// input of another.
///
/// This is produced by [`Parser::lower_while_ok`].
pub struct LowerIter<'a, 'b, S, I, LS>
where
S: ParseState,
I: TokenStream<S::Token>,
LS: ParseState<Token = S::Object>,
<S as ParseState>::Object: Token,
{
/// A push [`Parser`].
lower: Parser<LS, Empty<LS::Token>>,
/// Source tokens from higher-level [`Parser`],
/// with the outer [`Result`] having been stripped by a [`TripIter`].
toks: &'a mut TripIter<
'b,
Parser<S, I>,
Parsed<S::Object>,
ParseError<S::Token, S::Error>,
>,
}
impl<'a, 'b, S, I, LS> Iterator for LowerIter<'a, 'b, S, I, LS>
where
S: ParseState,
I: TokenStream<S::Token>,
LS: ParseState<Token = S::Object>,
<S as ParseState>::Object: Token,
{
type Item = ParsedResult<LS>;
/// Pull a token through the higher-level [`Parser`],
/// push it to the lowering parser,
/// and yield the resulting [`ParseResult`].
#[inline]
fn next(&mut self) -> Option<Self::Item> {
match self.toks.next() {
None => None,
Some(Parsed::Incomplete) => Some(Ok(Parsed::Incomplete)),
Some(Parsed::Object(obj)) => Some(self.lower.feed_tok(obj)),
}
}
}
impl<S: ParseState, I: TokenStream<S::Token>> Iterator for Parser<S, I> {
type Item = ParsedResult<S>;
/// Parse a single [`Token`] according to the current
/// [`ParseState`],
/// if available.
///
/// If the underlying [`TokenStream`] yields [`None`],
/// then the [`ParseState`] must be in an accepting state;
/// otherwise, [`ParseError::UnexpectedEof`] will occur.
///
/// This is intended to be invoked by [`Iterator::next`].
/// Accepting a token rather than the [`TokenStream`] allows the caller
/// to inspect the token first
/// (e.g. to store a copy of the [`Span`][crate::span::Span]).
#[inline]
fn next(&mut self) -> Option<Self::Item> {
let otok = self.toks.next();
match otok {
None => match self.assert_accepting() {
Ok(()) => None,
Err(e) => Some(Err(e)),
},
Some(tok) => Some(self.feed_tok(tok)),
}
}
}
/// Common parsing errors produced by [`Parser`].
///
/// These errors are common enough that they are handled in a common way,
/// such that individual parsers needn't check for these situations
/// themselves.
///
/// Having a common type also allows combinators to handle error types in a
/// consistent way when composing parsers.
///
/// Parsers may return their own unique errors via the
/// [`StateError`][ParseError::StateError] variant.
#[derive(Debug, PartialEq)]
pub enum ParseError<T: Token, E: Diagnostic + PartialEq> {
/// Token stream ended unexpectedly.
///
/// This error means that the parser was expecting more input before
/// reaching an accepting state.
/// This could represent a truncated file,
/// a malformed stream,
/// or maybe just a user that's not done typing yet
/// (e.g. in the case of an LSP implementation).
///
/// If no span is available,
/// then parsing has not even had the chance to begin.
/// If this parser follows another,
/// then the combinator ought to substitute a missing span with
/// whatever span preceded this invocation.
UnexpectedEof(Span),
/// The parser reached an unhandled dead state.
///
/// Once a parser returns [`ParseStatus::Dead`],
/// a parent context must use that provided token as a lookahead.
/// If that does not occur,
/// [`Parser`] produces this error.
///
/// In the future,
/// it may be desirable to be able to query [`ParseState`] for what
/// tokens are acceptable at this point,
/// to provide better error messages.
UnexpectedToken(T),
/// A parser-specific error associated with an inner
/// [`ParseState`].
StateError(E),
}
impl<T: Token, EA: Diagnostic + PartialEq> ParseError<T, EA> {
pub fn inner_into<EB: Diagnostic + PartialEq + Eq>(
self,
) -> ParseError<T, EB>
where
EA: Into<EB>,
{
use ParseError::*;
match self {
UnexpectedEof(x) => UnexpectedEof(x),
UnexpectedToken(x) => UnexpectedToken(x),
StateError(e) => StateError(e.into()),
}
}
}
impl<T: Token, E: Diagnostic + PartialEq> From<E> for ParseError<T, E> {
fn from(e: E) -> Self {
Self::StateError(e)
}
}
impl<T: Token, E: Diagnostic + PartialEq> Display for ParseError<T, E> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::UnexpectedEof(_) => {
write!(f, "unexpected end of input")
}
Self::UnexpectedToken(_tok) => {
write!(f, "unexpected input")
}
Self::StateError(e) => Display::fmt(e, f),
}
}
}
impl<T: Token, E: Diagnostic + PartialEq + 'static> Error for ParseError<T, E> {
fn source(&self) -> Option<&(dyn Error + 'static)> {
match self {
Self::StateError(e) => Some(e),
_ => None,
}
}
}
impl<T: Token, E: Diagnostic + PartialEq + 'static> Diagnostic
for ParseError<T, E>
{
fn describe(&self) -> Vec<AnnotatedSpan> {
use ParseError::*;
match self {
// TODO: More information from the underlying parser on what was expected.
UnexpectedEof(span) => {
span.error("unexpected end of input here").into()
}
UnexpectedToken(tok) => {
tok.span().error("this was unexpected").into()
}
// TODO: Is there any additional useful context we can augment
// this with?
StateError(e) => e.describe(),
}
}
}
impl<S, I> From<I> for Parser<S, I>
where
S: ParseState,
I: TokenStream<S::Token>,
<S as ParseState>::Context: Default,
{
/// Create a new parser with a default context.
///
/// This can only be used if the associated [`ParseState::Context`] does
/// not implement [`Default`];
/// otherwise,
/// consider instantiating from a `(TokenStream, Context)` pair.
/// See also [`ParseState::parse`] and
/// [`ParseState::parse_with_context`].
fn from(toks: I) -> Self {
Self {
toks,
state: Default::default(),
last_span: UNKNOWN_SPAN,
ctx: Default::default(),
}
}
}
impl<S, I, C> From<(I, C)> for Parser<S, I>
where
S: ParseState<Context = C>,
I: TokenStream<S::Token>,
{
/// Create a new parser with a provided context.
///
/// For more information,
/// see [`ParseState::parse_with_context`].
///
/// See also [`ParseState::parse`].
fn from((toks, ctx): (I, C)) -> Self {
Self {
toks,
state: Default::default(),
last_span: UNKNOWN_SPAN,
ctx,
}
}
}
/// Result of a parsing operation.
#[derive(Debug, PartialEq, Eq)]
pub enum ParseStatus<S: ParseState> {
/// Additional tokens are needed to complete parsing of the next object.
Incomplete,
/// Parsing of an object is complete.
///
/// This does not indicate that the parser is complete,
/// as more objects may be able to be emitted.
Object(S::Object),
/// Parser encountered a dead state relative to the given token.
///
/// A dead state is an empty accepting state that has no state
/// transition for the given token.
/// A state is empty if a [`ParseStatus::Object`] will not be lost if
/// parsing ends at this point
/// (that is---there is no partially-built object).
/// This could simply mean that the parser has completed its job and
/// that control must be returned to a parent context.
///
/// If a parser is _not_ in an accepting state,
/// then an error ought to occur rather than a dead state;
/// the difference between the two is that the token associated with
/// a dead state can be used as a lookahead token in order to
/// produce a state transition at a higher level,
/// whereas an error indicates that parsing has failed.
/// Intuitively,
/// this means that a [`ParseStatus::Object`] had just been emitted
/// and that the token following it isn't something that can be
/// parsed.
///
/// If there is no parent context to handle the token,
/// [`Parser`] must yield an error.
Dead(S::Token),
}
impl<S: ParseState<Object = T>, T: Object> From<T> for ParseStatus<S> {
fn from(obj: T) -> Self {
Self::Object(obj)
}
}
/// Result of a parsing operation.
///
/// Whereas [`ParseStatus`] is used by [`ParseState`] to influence parser
/// operation,
/// this type is public-facing and used by [`Parser`].
#[derive(Debug, PartialEq, Eq)]
pub enum Parsed<O> {
/// Additional tokens are needed to complete parsing of the next object.
Incomplete,
/// Parsing of an object is complete.
///
/// This does not indicate that the parser is complete,
/// as more objects may be able to be emitted.
Object(O),
}
impl<S: ParseState> From<ParseStatus<S>> for Parsed<S::Object> {
fn from(status: ParseStatus<S>) -> Self {
match status {
ParseStatus::Incomplete => Parsed::Incomplete,
ParseStatus::Object(x) => Parsed::Object(x),
ParseStatus::Dead(_) => {
unreachable!("Dead status must be filtered by Parser")
}
}
}
}
#[cfg(test)]
pub mod test {
use std::{assert_matches::assert_matches, iter::once};
use super::*;
use crate::{span::DUMMY_SPAN as DS, sym::GlobalSymbolIntern};
#[derive(Debug, PartialEq, Eq, Clone)]
enum TestToken {
Close(Span),
MarkDone(Span),
Text(Span),
SetCtxVal(u8),
}
impl Display for TestToken {
fn fmt(&self, _f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
unimplemented!("fmt::Display")
}
}
impl Token for TestToken {
fn span(&self) -> Span {
use TestToken::*;
match self {
Close(span) | MarkDone(span) | Text(span) => *span,
_ => UNKNOWN_SPAN,
}
}
}
impl Object for TestToken {}
#[derive(Debug, PartialEq, Eq)]
enum EchoState {
Empty,
Done,
}
impl Default for EchoState {
fn default() -> Self {
Self::Empty
}
}
#[derive(Debug, PartialEq, Default)]
struct StubContext {
val: u8,
}
impl ParseState for EchoState {
type Token = TestToken;
type Object = TestToken;
type Error = EchoStateError;
type Context = StubContext;
fn parse_token(
self,
tok: TestToken,
ctx: &mut StubContext,
) -> TransitionResult<Self> {
match tok {
TestToken::MarkDone(..) => Transition(Self::Done).ok(tok),
TestToken::Close(..) => {
Transition(self).err(EchoStateError::InnerError(tok))
}
TestToken::Text(..) => Transition(self).dead(tok),
TestToken::SetCtxVal(val) => {
ctx.val = val;
Transition(Self::Done).incomplete()
}
}
}
fn is_accepting(&self) -> bool {
*self == Self::Done
}
}
#[derive(Debug, PartialEq, Eq)]
enum EchoStateError {
InnerError(TestToken),
}
impl Display for EchoStateError {
fn fmt(&self, _: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
unimplemented!()
}
}
impl Error for EchoStateError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
None
}
}
impl Diagnostic for EchoStateError {
fn describe(&self) -> Vec<AnnotatedSpan> {
unimplemented!()
}
}
type Sut<I> = Parser<EchoState, I>;
#[test]
fn successful_parse_in_accepting_state_with_spans() {
// EchoState is placed into a Done state given Comment.
let tok = TestToken::MarkDone(DS);
let mut toks = once(tok.clone());
let mut sut = Sut::from(&mut toks);
// The first token should be processed normally.
// EchoState proxies the token back.
assert_eq!(Some(Ok(Parsed::Object(tok))), sut.next());
// This is now the end of the token stream,
// which should be okay provided that the first token put us into
// a proper accepting state.
assert_eq!(None, sut.next());
// Further, finalizing should work in this state.
assert!(sut.finalize().is_ok());
}
#[test]
fn fails_on_end_of_stream_when_not_in_accepting_state() {
let span = Span::new(10, 20, "ctx".intern());
let mut toks = [TestToken::Close(span)].into_iter();
let mut sut = Sut::from(&mut toks);
// The first token is fine,
// and allows us to acquire our most recent span.
sut.next();
// Given that we have no tokens,
// and that EchoState::default does not start in an accepting
// state,
// we must fail when we encounter the end of the stream.
assert_eq!(
Some(Err(ParseError::UnexpectedEof(span.endpoints().1.unwrap()))),
sut.next()
);
}
#[test]
fn returns_state_specific_error() {
// TestToken::Close causes EchoState to produce an error.
let errtok = TestToken::Close(DS);
let mut toks = [errtok.clone()].into_iter();
let mut sut = Sut::from(&mut toks);
assert_eq!(
Some(Err(ParseError::StateError(EchoStateError::InnerError(
errtok
)))),
sut.next()
);
// The token must have been consumed.
// It is up to a recovery process to either bail out or provide
// recovery tokens;
// continuing without recovery is unlikely to make sense.
assert_eq!(0, toks.len());
}
#[test]
fn fails_when_parser_is_finalized_in_non_accepting_state() {
let span = Span::new(10, 10, "ctx".intern());
// Set up so that we have a single token that we can use for
// recovery as part of the same iterator.
let recovery = TestToken::MarkDone(DS);
let mut toks = [
// Used purely to populate a Span.
TestToken::Close(span),
// Recovery token here:
recovery.clone(),
]
.into_iter();
let mut sut = Sut::from(&mut toks);
// Populate our most recently seen token's span.
sut.next();
// Attempting to finalize now in a non-accepting state should fail
// in the same way that encountering an end-of-stream does,
// since we're effectively saying "we're done with the stream"
// and the parser will have no further opportunity to reach an
// accepting state.
let result = sut.finalize();
assert_matches!(
result,
Err((_, ParseError::UnexpectedEof(s))) if s == span.endpoints().1.unwrap()
);
// The sut should have been re-returned,
// allowing for attempted error recovery if the caller can manage
// to produce a sequence of tokens that will be considered valid.
// `toks` above is set up already for this,
// which allows us to assert that we received back the same `sut`.
let mut sut = result.unwrap_err().0;
assert_eq!(Some(Ok(Parsed::Object(recovery))), sut.next());
// And so we should now be in an accepting state,
// able to finalize.
assert!(sut.finalize().is_ok());
}
#[test]
fn unhandled_dead_state_results_in_error() {
// A Text will cause our parser to return Dead.
let tok = TestToken::Text(DS);
let mut toks = once(tok.clone());
let mut sut = Sut::from(&mut toks);
// Our parser returns a Dead status,
// which is unhandled by any parent context
// (since we're not composing parsers),
// which causes an error due to an unhandled Dead state.
assert_eq!(sut.next(), Some(Err(ParseError::UnexpectedToken(tok))),);
}
// A context can be both retrieved from a finished parser and provided
// to a new one.
#[test]
fn provide_and_retrieve_context() {
// First, verify that it's initialized to a default context.
let mut toks = vec![TestToken::MarkDone(DS)].into_iter();
let mut sut = Sut::from(&mut toks);
sut.next().unwrap().unwrap();
let ctx = sut.finalize().unwrap();
assert_eq!(ctx, Default::default());
// Next, verify that the context that is manipulated is the context
// that is returned to us.
let val = 5;
let mut toks = vec![TestToken::SetCtxVal(5)].into_iter();
let mut sut = Sut::from(&mut toks);
sut.next().unwrap().unwrap();
let ctx = sut.finalize().unwrap();
assert_eq!(ctx, StubContext { val });
// Finally, verify that the context provided is the context that is
// used.
let val = 10;
let given_ctx = StubContext { val };
let mut toks = vec![TestToken::MarkDone(DS)].into_iter();
let mut sut = EchoState::parse_with_context(&mut toks, given_ctx);
sut.next().unwrap().unwrap();
let ctx = sut.finalize().unwrap();
assert_eq!(ctx, StubContext { val });
}
}