tamer: parse: Introduce lookahead token in Parser

*NB: This is the initial change to introduce the token of lookahead, but this
does not fully integrate it.  In particular, this is missing from the
stitching/delegation layer.*

This has been a long time coming, I suppose, though I had tried to avoid it
with `Parser::delegate_lookahead`.  But the problem with doing that is that
it forced the ParseState to recurse, which both violates my goal of having
no looping constructs except at the toplevel and performs additional stack
allocation, since the recursive call is not in tail position.

The final straw was having to both return an error _and_ an aggregate object
for the attribute parser when an unexpected element is encountered (this
code is not yet committed).  One option was to add a recovery object to the
error object, and formalize that, but then we have other concerns; for
example, what if that recovery object triggered an error?  We'd have to mask
either the old or the new error.  But we wouldn't want to mask either,
because the object causing the error would be the aggregate attributes,
which is _not_ a recovery object, but actual data we want to emit.  And so
it's a kluge right off of the bat.

The use of a token of lookahead is a more traditional approach and has uses
outside of just this one scenario.  It'll also allow for the removal of
recursion from the existing ParseStates, and possibly the elimination of
dead-state-associated data, though I may end up leaving that; more to come.

Rust will also optimize away lookahead storage and processing in Parsers
that do not utilize it.
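
As a sketch of the resulting usage pattern (the lines below are lifted
from the stub test fixtures and `Parser::next` in the diff that follows;
only their arrangement here is illustrative), a ParseState attaches the
unconsumed token to its transition and the Parser replays it in place of
the next token from the input stream:

  // Within a ParseState's `parse_token`:
  StubToken::YieldWithLookahead(val) => Transition(self)
      .ok(StubObject::FromYield(val))
      .with_lookahead(StubToken::Lookahead(val)),

  // Within `Parser::next`, the token of lookahead takes precedence
  //   over the input stream:
  let otok = self.take_lookahead_tok().or_else(|| self.toks.next());

Attempting to finalize a Parser that still holds a token of lookahead
fails with `ParseError::Lookahead` rather than silently discarding the
token.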

DEV-7145
main
Mike Gerwitz 2022-07-06 14:55:43 -04:00
parent 6385270fe6
commit e54f93b30f
4 changed files with 512 additions and 17 deletions


@ -76,6 +76,20 @@ pub enum ParseError<T: Token, E: Diagnostic + PartialEq> {
/// needn't be stored.
UnexpectedToken(T, String),
/// The parser contains an outstanding token of lookahead that is no
/// longer
/// (or possibly never was)
/// part of the token stream,
/// and would therefore be lost if the parser is finalized.
///
/// The parser must consume the next token,
/// which will be the token of lookahead,
/// after which it may finalize provided that it is in an accepting
/// state.
///
/// See [`Parser::take_lookahead_tok`] for more information.
Lookahead(Span, String),
/// A parser-specific error associated with an inner
/// [`ParseState`].
StateError(E),
@ -92,6 +106,7 @@ impl<T: Token, EA: Diagnostic + PartialEq> ParseError<T, EA> {
match self {
UnexpectedEof(span, desc) => UnexpectedEof(span, desc),
UnexpectedToken(x, desc) => UnexpectedToken(x, desc),
Lookahead(span, desc) => Lookahead(span, desc),
StateError(e) => StateError(e.into()),
}
}
@ -112,6 +127,20 @@ impl<T: Token, E: Diagnostic + PartialEq> Display for ParseError<T, E> {
Self::UnexpectedToken(_, desc) => {
write!(f, "unexpected input while {desc}")
}
// This is not really something the user should have to deal
// with,
// but maybe this will provide enough information that the
// user can alter the input in such a way as to avoid this
// condition.
// It likely represents a bug in the parser,
// or at the very least poor handling of unexpected input.
Self::Lookahead(_, desc) => {
write!(
f,
"internal error: attempt to finalize parsing with \
outstanding token of lookahead while {desc}"
)
}
Self::StateError(e) => Display::fmt(e, f),
}
}
@ -134,9 +163,8 @@ impl<T: Token, E: Diagnostic + PartialEq + 'static> Diagnostic
match self {
UnexpectedEof(span, desc) => span.error(desc).into(),
UnexpectedToken(tok, desc) => tok.span().error(desc).into(),
Lookahead(span, desc) => span.error(desc).into(),
// TODO: Is there any additional useful context we can augment
// this with?
StateError(e) => e.describe(),


@ -185,7 +185,13 @@ where
/// and yield the resulting [`ParsedResult`].
#[inline]
fn next(&mut self) -> Option<Self::Item> {
match self.toks.next() {
let tok = self
.lower
.take_lookahead_tok()
.map(Parsed::Object)
.or_else(|| self.toks.next());
match tok {
None => None,
Some(Parsed::Incomplete) => Some(Ok(Parsed::Incomplete)),
Some(Parsed::Object(obj)) => Some(self.lower.feed_tok(obj)),
@ -245,3 +251,81 @@ impl<O: Object, E: Diagnostic + PartialEq> ParseState for ParsedObject<O, E> {
unreachable!("ParsedObject must be used for type information only")
}
}
// See `super::test` for more information on why there are so few tests
// here.
#[cfg(test)]
mod test {
use super::super::{
parser::test::{StubError, StubObject, StubParseState, StubToken},
Transition,
};
use super::*;
#[derive(Debug, PartialEq, Eq, Default)]
struct StubEchoParseState {}
impl Display for StubEchoParseState {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "StubEchoParseState")
}
}
impl ParseState for StubEchoParseState {
type Token = StubToken;
type Object = StubToken;
type Error = StubError;
fn parse_token(
self,
tok: Self::Token,
_: &mut Self::Context,
) -> TransitionResult<Self> {
Transition(self).ok(tok)
}
fn is_accepting(&self) -> bool {
true
}
}
// Similar to tests in parse::parser::test.
#[test]
fn can_emit_object_with_lookahead_for_lower_iter() {
let given = 27; // some value
let toks = vec![StubToken::YieldWithLookahead(given)];
Lower::<StubEchoParseState, StubParseState>::lower(
&mut StubEchoParseState::parse(toks.into_iter()),
|sut| {
// We have a single token,
// and this consumes it,
// but it should introduce a lookahead token.
assert_eq!(
sut.next(),
Some(Ok(Parsed::Object(StubObject::FromYield(given))))
);
// The token of lookahead should still be available to the parser,
// and this should consume it.
assert_eq!(
sut.next(),
Some(Ok(Parsed::Object(StubObject::FromLookahead(given)))),
"lookahead token did not take effect"
);
// And now this should be the end,
// provided that the lookahead token was actually consumed and not
// copied and retained.
assert_eq!(
sut.next(),
None,
"expected end of both input stream and lookahead"
);
Ok::<(), StubError>(())
},
)
.unwrap();
}
}


@ -80,6 +80,13 @@ pub struct Parser<S: ParseState, I: TokenStream<S::Token>> {
/// Input token stream to be parsed by the [`ParseState`] `S`.
toks: I,
/// Token of lookahead to serve as the next input to the parser in place
/// of the next token of `toks`.
///
/// See [`take_lookahead_tok`](Parser::take_lookahead_tok) for more
/// information.
lookahead: Option<S::Token>,
/// Parsing automaton.
///
/// This [`ParseState`] is stored within an [`Option`] to allow for
@ -133,6 +140,7 @@ impl<S: ParseState, I: TokenStream<S::Token>> Parser<S, I> {
{
Self {
toks,
lookahead: None,
state: Some(state),
last_span: UNKNOWN_SPAN,
ctx: Default::default(),
@ -165,20 +173,25 @@ impl<S: ParseState, I: TokenStream<S::Token>> Parser<S, I> {
}
}
/// Return [`Ok`] if the parser is in an accepting state,
/// otherwise [`Err`] with [`ParseError::UnexpectedEof`].
/// Return [`Ok`] if the parser both has no outstanding lookahead token
/// and is in an accepting state,
/// otherwise [`Err`] with [`ParseError::Lookahead`] or
/// [`ParseError::UnexpectedEof`] respectively.
///
/// See [`finalize`](Self::finalize) for the public-facing method.
fn assert_accepting(
&self,
) -> Result<(), ParseError<S::DeadToken, S::Error>> {
if self.state.as_ref().unwrap().is_accepting() {
let st = self.state.as_ref().unwrap();
if let Some(lookahead) = &self.lookahead {
Err(ParseError::Lookahead(lookahead.span(), st.to_string()))
} else if st.is_accepting() {
Ok(())
} else {
let endpoints = self.last_span.endpoints();
Err(ParseError::UnexpectedEof(
endpoints.1.unwrap_or(endpoints.0),
self.state.as_ref().unwrap().to_string(),
st.to_string(),
))
}
}
@ -195,11 +208,28 @@ impl<S: ParseState, I: TokenStream<S::Token>> Parser<S, I> {
/// since push parsers are currently supported only internally.
/// The only thing preventing this being public is formalization and a
/// commitment to maintain it.
///
/// Panics
/// ------
/// This uses a debug assertion to enforce the invariant
/// `self.lookahead.is_none()`.
/// Failure to consume the lookahead token using
/// [`take_lookahead_tok`](Parser::take_lookahead_tok) and provide it to
/// this method would result in the loss of data.
/// This is something that can only be done by the caller.
pub(super) fn feed_tok(&mut self, tok: S::Token) -> ParsedResult<S> {
// Store the most recently encountered Span for error
// reporting in case we encounter an EOF.
self.last_span = tok.span();
// Lookahead tokens must be consumed before invoking this method,
// otherwise they will be overwritten and lost.
// See doc block above for more information.
debug_assert!(
self.lookahead.is_none(),
"lookahead token is available but was not consumed",
);
// Parse a single token and perform the requested state transition.
//
// This is where the functional `ParseState` is married with a
@ -211,9 +241,10 @@ impl<S: ParseState, I: TokenStream<S::Token>> Parser<S, I> {
//
// Note that this used to use `mem::take`,
// and the generated assembly was identical in both cases.
let TransitionResult(Transition(state), result) =
let TransitionResult(Transition(state), result, lookahead) =
self.state.take().unwrap().parse_token(tok, &mut self.ctx);
self.state.replace(state);
self.lookahead = lookahead;
use ParseStatus::*;
match result {
@ -229,6 +260,54 @@ impl<S: ParseState, I: TokenStream<S::Token>> Parser<S, I> {
Err(e) => Err(e.into()),
}
}
/// Retrieve a single token of lookahead,
/// if any,
/// to be used in place of the next token from the input stream.
///
/// The term "lookahead" here means that a token has been read from an
/// input stream,
/// has been used to make a state determination,
/// but parsing the token is incomplete.
/// For example,
/// the token may have been used to recognize that an aggregating
/// parser has finished its work,
/// but now the token must be provided to the parent parser;
/// in this sense,
/// the token was used for context but not substance.
/// Such an operation is a byproduct of parser composition,
/// where inner parsers determine when they are complete and,
/// in doing so,
/// have an extra token of input that they have not consumed.
/// This is like a "rest" argument to parser combinators,
/// except "rest" here is the single token of lookahead + the
/// remainder of the token stream that has not yet been read.
///
/// Not all parsers produce lookahead tokens,
/// but all parsers have that option;
/// Rust will optimize away unneeded functionality appropriately.
/// It is essential that the parser use this token if it is available,
/// since it will be replaced
/// (possibly with [`None`])
/// after the next parsing step.
/// [`feed_tok`](Parser::feed_tok) will panic if a lookahead token is
/// available and is not consumed by this method.
/// This calling order could be enforced via Rust's type system,
/// but doing so would require a bit of work that may not be worth it
/// for how few systems have to worry about this.
///
/// Note that there is no protection against infinitely looping on the
/// same token---it
/// is the responsibility of the parser to ensure that such a thing
/// does not occur by ensuring that the token is properly consumed
/// and not re-emitted as a lookahead without also changing state.
/// Parser generator macros may have their own checks in place that
/// prevent this type of thing from happening,
/// but proving correctness is better left to proof systems than
/// Rust's type system.
pub(super) fn take_lookahead_tok(&mut self) -> Option<S::Token> {
self.lookahead.take()
}
}
impl<S: ParseState, I: TokenStream<S::Token>> Iterator for Parser<S, I> {
@ -248,7 +327,7 @@ impl<S: ParseState, I: TokenStream<S::Token>> Iterator for Parser<S, I> {
/// (e.g. to store a copy of the [`Span`][crate::span::Span]).
#[inline]
fn next(&mut self) -> Option<Self::Item> {
let otok = self.toks.next();
let otok = self.take_lookahead_tok().or_else(|| self.toks.next());
match otok {
None => match self.assert_accepting() {
@ -281,6 +360,7 @@ where
fn from(toks: I) -> Self {
Self {
toks,
lookahead: None,
state: Some(Default::default()),
last_span: UNKNOWN_SPAN,
ctx: Default::default(),
@ -302,9 +382,267 @@ where
fn from((toks, ctx): (I, C)) -> Self {
Self {
toks,
lookahead: None,
state: Some(Default::default()),
last_span: UNKNOWN_SPAN,
ctx,
}
}
}
#[cfg(test)]
pub mod test {
use super::*;
use crate::{
diagnose::Diagnostic,
parse::{Object, Token},
span::DUMMY_SPAN,
};
use std::{assert_matches::assert_matches, error::Error, fmt::Display};
///! [`Parser`] unit tests.
///!
///! Note that this system is comprehensively tested via integration
///! tests by concrete parser implementations.
///! The reason for this is historical:
///! the implementation was quite volatile and the refactoring cost
///! was too high with unit tests when the system had complete test
///! coverage via integration tests.
///! The upside of this approach is that this system is complex to test
///! in isolation and so integration tests save a lot of development
///! time and frustration,
///! but the downside is that any regressions will potentially
///! manifest as errors in every parser that uses this framework.
///! Eventually,
///! parser generator macros may be derived (as an abstraction) that
///! will make testing this framework significantly less effort,
///! and perhaps then it will be worth implementing more unit
///! tests now that the implementation has settled.
///!
///! Generally speaking,
///! since this framework is written exclusively for TAMER,
///! features that are no longer used ought to be pruned,
///! and so there should be no features that are not tested by
///! parsers' tests.
///! Consequently,
///! features ought to be developed alongside the parsers that require
///! those features.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum StubToken {
YieldWithLookahead(usize),
Lookahead(usize),
Foo,
}
impl Token for StubToken {
fn span(&self) -> Span {
DUMMY_SPAN
}
}
impl Object for StubToken {}
impl Display for StubToken {
fn fmt(&self, _f: &mut std::fmt::Formatter) -> std::fmt::Result {
unimplemented!()
}
}
#[derive(Debug, PartialEq)]
pub enum StubError {}
impl Error for StubError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
None
}
}
impl Display for StubError {
fn fmt(&self, _f: &mut std::fmt::Formatter) -> std::fmt::Result {
unimplemented!()
}
}
impl Diagnostic for StubError {
fn describe(&self) -> Vec<crate::diagnose::AnnotatedSpan> {
unimplemented!()
}
}
impl From<ParseError<StubToken, StubError>> for StubError {
fn from(_: ParseError<StubToken, StubError>) -> Self {
unimplemented!()
}
}
#[derive(Debug, PartialEq, Eq)]
pub enum StubObject {
FromYield(usize),
FromLookahead(usize),
}
impl Object for StubObject {}
#[derive(Debug, PartialEq, Eq, Default)]
pub struct StubParseState {}
impl Display for StubParseState {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "StubParseState")
}
}
impl ParseState for StubParseState {
type Token = StubToken;
type Object = StubObject;
type Error = StubError;
fn parse_token(
self,
tok: Self::Token,
_: &mut Self::Context,
) -> TransitionResult<Self> {
match tok {
StubToken::YieldWithLookahead(val) => Transition(self)
.ok(StubObject::FromYield(val))
.with_lookahead(StubToken::Lookahead(val)),
StubToken::Lookahead(val) => {
Transition(self).ok(StubObject::FromLookahead(val))
}
_ => Transition(self).incomplete(),
}
}
fn is_accepting(&self) -> bool {
true
}
}
// This technically only fails when debug assertions are enabled,
// which happens to be the case for tests.
//
// The intent is to ensure that implementations working with Parser
// actually consider lookahead tokens.
#[test]
#[should_panic]
fn fails_if_lookahead_not_consumed() {
let given = 64; // a fine value, but not an important one
let mut sut = StubParseState::parse(std::iter::empty());
// Feed a token that will force a lookahead token to be stored.
assert_eq!(
sut.feed_tok(StubToken::YieldWithLookahead(given)),
Ok(Parsed::Object(StubObject::FromYield(given)))
);
// If we now attempt to feed another token without consuming
// the lookahead,
// it should panic.
let _ = sut.feed_tok(StubToken::Foo);
}
// This is the API that implementations working with Parser will use
// to consume lookahead tokens.
#[test]
fn succeeds_if_lookahead_is_consumed() {
let given = 63; // one less than a fine value, but not important
let mut sut = StubParseState::parse(std::iter::empty());
// Given that this is a fresh parser,
// we shouldn't have any lookahead,
// but let's make sure that we don't cause any problems by
// checking.
assert!(sut.take_lookahead_tok().is_none());
// Feed a token that will force a lookahead token to be stored.
assert_eq!(
sut.feed_tok(StubToken::YieldWithLookahead(given)),
Ok(Parsed::Object(StubObject::FromYield(given)))
);
// Consume the lookahead token that was generated above.
let la_tok = sut.take_lookahead_tok();
assert_eq!(la_tok, Some(StubToken::Lookahead(given)));
// The _proper_ thing to do here would be to push the lookahead
// token we retrieved above.
// However,
// all we care about for this test is that we're able to push some
// sort of token without an error occurring.
let _ = sut.feed_tok(StubToken::Foo);
}
#[test]
fn cannot_finalize_with_outstanding_lookahead() {
let given = 32; // half of a fine number that is pretty fine itself
let mut sut = StubParseState::parse(std::iter::empty());
// Feed a token that will force a lookahead token to be stored.
assert_eq!(
sut.feed_tok(StubToken::YieldWithLookahead(given)),
Ok(Parsed::Object(StubObject::FromYield(given)))
);
// Even though our token stream is empty,
// and even though we are in an accepting state,
// we should _not_ be able to finalize,
// since doing so would discard a token.
// The parser is forced to consume it and,
// if that results in an error,
// that's fine---at
// least it was not ignored.
let (_, err) = sut
.finalize()
.expect_err("must not finalize with token of lookahead");
assert_matches!(err, ParseError::Lookahead(span, _) if span == DUMMY_SPAN);
}
// Tests the above,
// but using the Iterator API.
#[test]
fn can_emit_object_with_lookahead_for_iter_parser() {
let given = 27; // some value
let toks = vec![StubToken::YieldWithLookahead(given)];
let mut sut = StubParseState::parse(toks.into_iter());
// We have a single token,
// and this consumes it,
// but it should introduce a lookahead token.
assert_eq!(
sut.next(),
Some(Ok(Parsed::Object(StubObject::FromYield(given))))
);
// Normally this would be the end of the token stream,
// but we should have a token of lookahead.
let (mut sut, err) = sut
.finalize()
.expect_err("must not finalize with token of lookahead");
assert_matches!(err, ParseError::Lookahead(span, _) if span == DUMMY_SPAN);
// The token of lookahead should still be available to the parser,
// and this should consume it.
assert_eq!(
sut.next(),
Some(Ok(Parsed::Object(StubObject::FromLookahead(given)))),
"lookahead token did not take effect"
);
// And now this should be the end,
// provided that the lookahead token was actually consumed and not
// copied and retained.
assert_eq!(
sut.next(),
None,
"expected end of both input stream and lookahead"
);
}
}


@ -379,19 +379,60 @@ mod transition {
/// Conceptually,
/// imagine the act of a state transition producing data.
/// See [`Transition`] for convenience methods for producing this tuple.
///
/// Sometimes a parser is not able to complete the operation requested
/// based on the provided input token.
/// Since TAMER uses a streaming parsing framework that places strict
/// limits on control flow,
/// a single token can be returned as lookahead to indicate that the
/// token could not be parsed yet and should be provided once again
/// in place of the next token from the input stream.
/// This allows,
/// for example,
/// for multiple data to be emitted in response to a single token.
///
/// This struct is opaque to ensure that critical invariants involving
/// transitions and lookahead are properly upheld;
/// callers must use the appropriate parsing APIs.
#[derive(Debug, PartialEq)]
pub struct TransitionResult<S: ParseState>(
/// New parser state.
pub(in super::super) Transition<S>,
/// Result of the parsing operation.
pub(in super::super) ParseStateResult<S>,
/// Optional unused token to use as a lookahead token in place of
/// the next token from the input stream.
pub(in super::super) Option<S::Token>,
);
impl<S: ParseState> TransitionResult<S> {
/// Indicate that this transition includes a single token of lookahead,
/// which should be provided back to the parser in place of the
/// next token from the input stream.
pub fn with_lookahead(self, lookahead: S::Token) -> Self {
match self {
Self(transition, result, None) => {
Self(transition, result, Some(lookahead))
}
// This represents a problem with the parser;
// we should never specify a lookahead token more than once.
// This could be enforced statically with the type system if
// ever such a thing is deemed to be worth doing.
Self(.., Some(prev)) => panic!(
"internal error: lookahead token overwrite: {prev:?}"
),
}
}
}
/// Denotes a state transition.
///
/// This newtype was created to produce clear, self-documenting code;
/// parsers can get confusing to read with all of the types involved,
/// so this provides a mental synchronization point.
///
/// This also provides some convenience methods to help remote boilerplate
/// This also provides some convenience methods to help remove boilerplate
/// and further improve code clarity.
#[derive(Debug, PartialEq, Eq)]
pub struct Transition<S: ParseState>(pub S);
@ -405,7 +446,7 @@ mod transition {
where
T: Into<ParseStatus<S>>,
{
TransitionResult(self, Ok(obj.into()))
TransitionResult(self, Ok(obj.into()), None)
}
/// A transition with corresponding error.
@ -413,7 +454,7 @@ mod transition {
/// This indicates a parsing failure.
/// The state ought to be suitable for error recovery.
pub fn err<E: Into<S::Error>>(self, err: E) -> TransitionResult<S> {
TransitionResult(self, Err(err.into()))
TransitionResult(self, Err(err.into()), None)
}
/// A state transition with corresponding [`Result`].
@ -425,7 +466,11 @@ mod transition {
T: Into<ParseStatus<S>>,
E: Into<S::Error>,
{
TransitionResult(self, result.map(Into::into).map_err(Into::into))
TransitionResult(
self,
result.map(Into::into).map_err(Into::into),
None,
)
}
/// A state transition indicating that more data is needed before an
@ -433,7 +478,7 @@ mod transition {
///
/// This corresponds to [`ParseStatus::Incomplete`].
pub fn incomplete(self) -> TransitionResult<S> {
TransitionResult(self, Ok(ParseStatus::Incomplete))
TransitionResult(self, Ok(ParseStatus::Incomplete), None)
}
/// A dead state transition.
@ -442,7 +487,7 @@ mod transition {
/// and a calling parser should use the provided [`Token`] as
/// lookahead.
pub fn dead(self, tok: S::DeadToken) -> TransitionResult<S> {
TransitionResult(self, Ok(ParseStatus::Dead(tok)))
TransitionResult(self, Ok(ParseStatus::Dead(tok)), None)
}
}
@ -460,7 +505,7 @@ mod transition {
fn from_output(output: Self::Output) -> Self {
match output {
(st, result) => Self(st, result),
(st, result) => Self(st, result, None),
}
}
@ -479,7 +524,7 @@ mod transition {
residual: (Transition<S>, ParseStateResult<S>),
) -> Self {
match residual {
(st, result) => Self(st, result),
(st, result) => Self(st, result, None),
}
}
}