// Basic streaming parsing framework // // Copyright (C) 2014-2021 Ryan Specialty Group, LLC. // // This file is part of TAME. // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see . //! Basic streaming parser framework for lowering operations. //! //! _TODO: Some proper docs and examples!_ use crate::span::Span; use std::fmt::Debug; use std::mem::take; use std::{error::Error, fmt::Display}; /// Result of applying a [`Token`] to a [`ParseState`], /// with any error having been wrapped in a [`ParseError`]. pub type ParsedResult = ParseResult::Object>>; /// Result of some non-parsing operation on a [`Parser`], /// with any error having been wrapped in a [`ParseError`]. pub type ParseResult = Result::Token, ::Error>>; /// A single datum from a streaming IR with an associated [`Span`]. /// /// A token may be a lexeme with associated data, /// or a more structured object having been lowered from other IRs. pub trait Token: Display + Debug + PartialEq + Eq { /// Retrieve the [`Span`] representing the source location of the token. fn span(&self) -> Span; } impl From for Span { fn from(tok: T) -> Self { tok.span() } } /// An infallible [`Token`] stream. /// /// If the token stream originates from an operation that could potentially /// fail and ought to be propagated, /// use [`TokenResultStream`]. /// /// The name "stream" in place of "iterator" is intended to convey that this /// type is expected to be processed in real-time as a stream, /// not read into memory. pub trait TokenStream = Iterator; /// A [`Token`] stream that may encounter errors during parsing. /// /// If the stream cannot fail, /// consider using [`TokenStream`]. pub trait TokenResultStream = Iterator>; /// A deterministic parsing automaton. /// /// These states are utilized by a [`Parser`]. /// /// A [`ParseState`] is also responsible for storing data about the /// accepted input, /// and handling appropriate type conversions into the final type. /// That is---an /// automaton may store metadata that is subsequently emitted once an /// accepting state has been reached. /// Whatever the underlying automaton, /// a `(state, token)` pair must uniquely determine the next parser /// action. /// /// Intuitively, /// since only one [`Parser`] may hold a mutable reference to /// an underlying [`TokenStream`] at any given point, /// this does in fact represent the current state of the entire /// [`TokenStream`] at the current position for a given parser /// composition. pub trait ParseState: Default + PartialEq + Eq + Debug { /// Input tokens to the parser. type Token: Token; /// Objects produced by a parser utilizing these states. type Object; /// Errors specific to this set of states. type Error: Error + PartialEq + Eq; /// Construct a parser. /// /// Whether this method is helpful or provides any clarity depends on /// the context and the types that are able to be inferred. fn parse>(toks: I) -> Parser { Parser::from(toks) } /// Parse a single [`Token`] and optionally perform a state transition. /// /// The current state is represented by `self`. /// The result of a parsing operation is a state transition with /// associated [`ParseStatus`] data. /// /// Note that `self` is owned, /// for a couple primary reasons: /// /// 1. This forces the parser to explicitly consider and document all /// state transitions, /// rather than potentially missing unintended behavior through /// implicit behavior; and /// 2. It allows for more natural functional composition of state, /// which in turn makes it easier to compose parsers /// (which conceptually involves stitching together state /// machines). fn parse_token(self, tok: Self::Token) -> TransitionResult; /// Whether the current state represents an accepting state. /// /// An accepting state represents a valid state to stop parsing. /// If parsing stops at a state that is _not_ accepting, /// then the [`TokenStream`] has ended unexpectedly and should produce /// a [`ParseError::UnexpectedEof`]. /// /// It makes sense for there to be exist multiple accepting states for a /// parser. /// For example: /// A parser that parses a list of attributes may be used to parse one /// or more attributes, /// or the entire list of attributes. /// It is acceptable to attempt to parse just one of those attributes, /// or it is acceptable to parse all the way until the end. fn is_accepting(&self) -> bool; } /// Result of applying a [`Token`] to a [`ParseState`]. /// /// This is used by [`ParseState::parse_token`]; /// see that function for rationale. pub type ParseStateResult = Result< ParseStatus<::Token, ::Object>, ::Error, >; /// Denotes a state transition. /// /// This newtype was created to produce clear, self-documenting code; /// parsers can get confusing to read with all of the types involved, /// so this provides a mental synchronization point. /// /// This also provides some convenience methods to help remote boilerplate /// and further improve code clarity. #[derive(Debug, PartialEq, Eq)] pub struct Transition(pub S); impl Transition { /// A state transition with corresponding data. /// /// This allows [`ParseState::parse_token`] to emit a parsed object and /// corresponds to [`ParseStatus::Object`]. pub fn with(self, obj: S::Object) -> (Self, ParseStateResult) { (self, Ok(ParseStatus::Object(obj))) } /// A state transition indicating that more data is needed before an /// object can be emitted. /// /// This corresponds to [`ParseStatus::Incomplete`]. pub fn incomplete(self) -> (Self, ParseStateResult) { (self, Ok(ParseStatus::Incomplete)) } /// A dead state transition. /// /// This corresponds to [`ParseStatus::Dead`], /// and a calling parser should use the provided [`Token`] as /// lookahead. pub fn dead(self, tok: S::Token) -> (Self, ParseStateResult) { (self, Ok(ParseStatus::Dead(tok))) } /// A transition with corresponding error. /// /// This indicates a parsing failure. /// The state ought to be suitable for error recovery. pub fn err>(self, err: E) -> (Self, ParseStateResult) { (self, Err(err.into())) } } /// A state transition with associated data. /// /// Conceptually, /// imagine the act of a state transition producing data. /// See [`Transition`] for convenience methods for producing this tuple. pub type TransitionResult = (Transition, ParseStateResult); /// A streaming parser defined by a [`ParseState`] with exclusive /// mutable access to an underlying [`TokenStream`]. /// /// This parser handles operations that are common among all types of /// parsers, /// such that specialized parsers need only implement logic that is /// unique to their operation. /// This also simplifies combinators, /// since there is more uniformity among distinct parser types. /// /// After you have finished with a parser, /// if you have not consumed the entire iterator, /// call [`finalize`](Parser::finalize) to ensure that parsing has /// completed in an accepting state. #[derive(Debug, PartialEq, Eq)] pub struct Parser> { toks: I, state: S, last_span: Option, } impl> Parser { /// Indicate that no further parsing will take place using this parser, /// and [`drop`] it. /// /// Invoking the method is equivalent to stating that the stream has /// ended, /// since the parser will have no later opportunity to continue /// parsing. /// Consequently, /// the caller should expect [`ParseError::UnexpectedEof`] if the /// parser is not in an accepting state. pub fn finalize( self, ) -> Result<(), (Self, ParseError)> { if self.state.is_accepting() { Ok(()) } else { let span = self.last_span.and_then(|s| s.endpoints().1); Err((self, ParseError::UnexpectedEof(span))) } } } impl> Iterator for Parser { type Item = ParsedResult; /// Parse a single [`Token`] according to the current /// [`ParseState`], /// if available. /// /// If the underlying [`TokenStream`] yields [`None`], /// then the [`ParseState`] must be in an accepting state; /// otherwise, [`ParseError::UnexpectedEof`] will occur. /// /// This is intended to be invoked by [`Iterator::next`]. /// Accepting a token rather than the [`TokenStream`] allows the caller /// to inspect the token first /// (e.g. to store a copy of the [`Span`][crate::span::Span]). #[inline] fn next(&mut self) -> Option { let otok = self.toks.next(); match otok { None if self.state.is_accepting() => None, // The EOF occurred at the end of the last encountered span, // if any. None => Some(Err(ParseError::UnexpectedEof( self.last_span.and_then(|s| s.endpoints().1), ))), Some(tok) => { // Store the most recently encountered Span for error // reporting in case we encounter an EOF. self.last_span = Some(tok.span()); let result; (Transition(self.state), result) = take(&mut self.state).parse_token(tok); use ParseStatus::*; match result { // Nothing handled this dead state, // and we cannot discard a lookahead token, // so we have no choice but to produce an error. Ok(Dead(invalid)) => { Some(Err(ParseError::UnexpectedToken(invalid))) } Ok(parsed @ (Incomplete | Object(..))) => { Some(Ok(parsed.into())) } Err(e) => Some(Err(e.into())), } } } } } /// Common parsing errors produced by [`Parser`]. /// /// These errors are common enough that they are handled in a common way, /// such that individual parsers needn't check for these situations /// themselves. /// /// Having a common type also allows combinators to handle error types in a /// consistent way when composing parsers. /// /// Parsers may return their own unique errors via the /// [`StateError`][ParseError::StateError] variant. #[derive(Debug, PartialEq, Eq)] pub enum ParseError { /// Token stream ended unexpectedly. /// /// This error means that the parser was expecting more input before /// reaching an accepting state. /// This could represent a truncated file, /// a malformed stream, /// or maybe just a user that's not done typing yet /// (e.g. in the case of an LSP implementation). /// /// If no span is available, /// then parsing has not even had the chance to begin. /// If this parser follows another, /// then the combinator ought to substitute a missing span with /// whatever span preceded this invocation. UnexpectedEof(Option), /// The parser reached an unhandled dead state. /// /// Once a parser returns [`ParseStatus::Dead`], /// a parent context must use that provided token as a lookahead. /// If that does not occur, /// [`Parser`] produces this error. /// /// In the future, /// it may be desirable to be able to query [`ParseState`] for what /// tokens are acceptable at this point, /// to provide better error messages. UnexpectedToken(T), /// A parser-specific error associated with an inner /// [`ParseState`]. StateError(E), } impl ParseError { pub fn inner_into(self) -> ParseError where EA: Into, { use ParseError::*; match self { UnexpectedEof(x) => UnexpectedEof(x), UnexpectedToken(x) => UnexpectedToken(x), StateError(e) => StateError(e.into()), } } } impl From for ParseError { fn from(e: E) -> Self { Self::StateError(e) } } impl Display for ParseError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::UnexpectedEof(ospan) => { write!(f, "unexpected end of input at ")?; match ospan { None => write!(f, ""), Some(span) => write!(f, "{}", span), } } Self::UnexpectedToken(tok) => { write!(f, "unexpected {}", tok) } Self::StateError(e) => Display::fmt(e, f), } } } impl Error for ParseError { fn source(&self) -> Option<&(dyn Error + 'static)> { match self { Self::StateError(e) => Some(e), _ => None, } } } impl> From for Parser { fn from(toks: I) -> Self { Self { toks, state: Default::default(), last_span: None, } } } /// Result of a parsing operation. #[derive(Debug, PartialEq, Eq)] pub enum ParseStatus { /// Additional tokens are needed to complete parsing of the next object. Incomplete, /// Parsing of an object is complete. /// /// This does not indicate that the parser is complete, /// as more objects may be able to be emitted. Object(O), /// Parser encountered a dead state relative to the given token. /// /// A dead state is an empty accepting state that has no state /// transition for the given token. /// A state is empty if a [`ParseStatus::Object`] will not be lost if /// parsing ends at this point /// (that is---there is no partially-built object). /// This could simply mean that the parser has completed its job and /// that control must be returned to a parent context. /// /// If a parser is _not_ in an accepting state, /// then an error ought to occur rather than a dead state; /// the difference between the two is that the token associated with /// a dead state can be used as a lookahead token in order to /// produce a state transition at a higher level, /// whereas an error indicates that parsing has failed. /// Intuitively, /// this means that a [`ParseStatus::Object`] had just been emitted /// and that the token following it isn't something that can be /// parsed. /// /// If there is no parent context to handle the token, /// [`Parser`] must yield an error. Dead(T), } /// Result of a parsing operation. /// /// Whereas [`ParseStatus`] is used by [`ParseState`] to influence parser /// operation, /// this type is public-facing and used by [`Parser`]. #[derive(Debug, PartialEq, Eq)] pub enum Parsed { /// Additional tokens are needed to complete parsing of the next object. Incomplete, /// Parsing of an object is complete. /// /// This does not indicate that the parser is complete, /// as more objects may be able to be emitted. Object(O), } impl From> for Parsed { fn from(status: ParseStatus) -> Self { match status { ParseStatus::Incomplete => Parsed::Incomplete, ParseStatus::Object(x) => Parsed::Object(x), ParseStatus::Dead(_) => { unreachable!("Dead status must be filtered by Parser") } } } } #[cfg(test)] pub mod test { use std::{assert_matches::assert_matches, iter::once}; use super::*; use crate::{span::DUMMY_SPAN as DS, sym::GlobalSymbolIntern}; #[derive(Debug, PartialEq, Eq, Clone)] enum TestToken { Close(Span), Comment(Span), Text(Span), } impl Display for TestToken { fn fmt(&self, _f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { unimplemented!("fmt::Display") } } impl Token for TestToken { fn span(&self) -> Span { use TestToken::*; match self { Close(span) | Comment(span) | Text(span) => *span, } } } #[derive(Debug, PartialEq, Eq)] enum EchoState { Empty, Done, } impl Default for EchoState { fn default() -> Self { Self::Empty } } impl ParseState for EchoState { type Token = TestToken; type Object = TestToken; type Error = EchoStateError; fn parse_token(self, tok: TestToken) -> TransitionResult { match tok { TestToken::Comment(..) => Transition(Self::Done).with(tok), TestToken::Close(..) => { Transition(self).err(EchoStateError::InnerError(tok)) } TestToken::Text(..) => Transition(self).dead(tok), } } fn is_accepting(&self) -> bool { *self == Self::Done } } #[derive(Debug, PartialEq, Eq)] enum EchoStateError { InnerError(TestToken), } impl Display for EchoStateError { fn fmt(&self, _: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { unimplemented!() } } impl Error for EchoStateError { fn source(&self) -> Option<&(dyn Error + 'static)> { None } } type Sut = Parser; #[test] fn successful_parse_in_accepting_state_with_spans() { // EchoState is placed into a Done state given Comment. let tok = TestToken::Comment(DS); let mut toks = once(tok.clone()); let mut sut = Sut::from(&mut toks); // The first token should be processed normally. // EchoState proxies the token back. assert_eq!(Some(Ok(Parsed::Object(tok))), sut.next()); // This is now the end of the token stream, // which should be okay provided that the first token put us into // a proper accepting state. assert_eq!(None, sut.next()); // Further, finalizing should work in this state. assert!(sut.finalize().is_ok()); } #[test] fn fails_on_end_of_stream_when_not_in_accepting_state() { let span = Span::new(10, 20, "ctx".intern()); let mut toks = [TestToken::Close(span)].into_iter(); let mut sut = Sut::from(&mut toks); // The first token is fine, // and allows us to acquire our most recent span. sut.next(); // Given that we have no tokens, // and that EchoState::default does not start in an accepting // state, // we must fail when we encounter the end of the stream. assert_eq!( Some(Err(ParseError::UnexpectedEof(span.endpoints().1))), sut.next() ); } #[test] fn returns_state_specific_error() { // TestToken::Close causes EchoState to produce an error. let errtok = TestToken::Close(DS); let mut toks = [errtok.clone()].into_iter(); let mut sut = Sut::from(&mut toks); assert_eq!( Some(Err(ParseError::StateError(EchoStateError::InnerError( errtok )))), sut.next() ); // The token must have been consumed. // It is up to a recovery process to either bail out or provide // recovery tokens; // continuing without recovery is unlikely to make sense. assert_eq!(0, toks.len()); } #[test] fn fails_when_parser_is_finalized_in_non_accepting_state() { let span = Span::new(10, 10, "ctx".intern()); // Set up so that we have a single token that we can use for // recovery as part of the same iterator. let recovery = TestToken::Comment(DS); let mut toks = [ // Used purely to populate a Span. TestToken::Close(span), // Recovery token here: recovery.clone(), ] .into_iter(); let mut sut = Sut::from(&mut toks); // Populate our most recently seen token's span. sut.next(); // Attempting to finalize now in a non-accepting state should fail // in the same way that encountering an end-of-stream does, // since we're effectively saying "we're done with the stream" // and the parser will have no further opportunity to reach an // accepting state. let result = sut.finalize(); assert_matches!( result, Err((_, ParseError::UnexpectedEof(s))) if s == span.endpoints().1 ); // The sut should have been re-returned, // allowing for attempted error recovery if the caller can manage // to produce a sequence of tokens that will be considered valid. // `toks` above is set up already for this, // which allows us to assert that we received back the same `sut`. let mut sut = result.unwrap_err().0; assert_eq!(Some(Ok(Parsed::Object(recovery))), sut.next()); // And so we should now be in an accepting state, // able to finalize. assert!(sut.finalize().is_ok()); } #[test] fn unhandled_dead_state_results_in_error() { // A Text will cause our parser to return Dead. let tok = TestToken::Text(DS); let mut toks = once(tok.clone()); let mut sut = Sut::from(&mut toks); // Our parser returns a Dead status, // which is unhandled by any parent context // (since we're not composing parsers), // which causes an error due to an unhandled Dead state. assert_eq!(sut.next(), Some(Err(ParseError::UnexpectedToken(tok))),); } }