tamer: xir::parse::Transition: Generalize flat::Transition

XIRF introduced the concept of `Transition` to help document code and
provide mental synchronization points that make it easier to reason about
the system.  I decided to hoist this into XIR's parser itself, and have
`parse_token` accept an owned state and require a new state to be returned,
utilizing `Transition`.

Together with the convenience methods introduced on `Transition` itself,
this produces much clearer code, as is evidenced by tree::Stack (XIRT's
parser).  Passing an owned state is something that I had wanted to do
originally, but I thought it'd lead to more concise code to use a mutable
reference.  Unfortunately, that concision led to code that was much more
difficult than necessary to understand, and ended up being a net negative
by leading to some more boilerplate for the nested types (granted,
that could have been alleviated in other ways).

This also opens up the possibility to do something that I wasn't able to
before, which is to continue abstracting away parser composition by stitching
their state machines together.  I don't know if this'll be done immediately,
but because the actual parsing operations are now able to compose
functionally without mutability getting in the way, the previous state
coupling issues with the parent parser go away.

DEV-10863
main
Mike Gerwitz 2022-03-17 15:50:35 -04:00
parent 899fa79e59
commit 7b6d68af85
5 changed files with 234 additions and 189 deletions

View File

@ -22,11 +22,11 @@
use crate::{
span::Span,
xir::{
parse::{ParseState, ParseStateResult, ParseStatus},
parse::{ParseState, Transition, TransitionResult},
QName, Token,
},
};
use std::{error::Error, fmt::Display, mem::take};
use std::{error::Error, fmt::Display};
use super::Attr;
@ -48,25 +48,25 @@ impl ParseState for AttrParseState {
type Object = Attr;
type Error = AttrParseError;
fn parse_token(&mut self, tok: Token) -> ParseStateResult<Self> {
use AttrParseState::*;
fn parse_token(self, tok: Token) -> TransitionResult<Self> {
use AttrParseState::{Empty, Name};
match (take(self), tok) {
match (self, tok) {
(Empty, Token::AttrName(name, span)) => {
*self = Name(name, span);
Ok(ParseStatus::Incomplete)
Transition(Name(name, span)).incomplete()
}
(Empty, invalid) => return Ok(ParseStatus::Dead(invalid)),
(Empty, invalid) => Transition(Empty).dead(invalid),
(Name(name, nspan), Token::AttrValue(value, vspan)) => {
Ok(ParseStatus::Object(Attr::new(name, value, (nspan, vspan))))
Transition(Empty).with(Attr::new(name, value, (nspan, vspan)))
}
(Name(name, nspan), invalid) => {
// Restore state for error recovery.
*self = Name(name, nspan);
Err(AttrParseError::AttrValueExpected(name, nspan, invalid))
Transition(Name(name, nspan)).err(
AttrParseError::AttrValueExpected(name, nspan, invalid),
)
}
}
}
@ -120,31 +120,32 @@ impl Error for AttrParseError {
#[cfg(test)]
mod test {
use super::*;
use crate::{convert::ExpectInto, sym::GlobalSymbolIntern};
use crate::{
convert::ExpectInto,
sym::GlobalSymbolIntern,
xir::parse::{ParseStatus, Parsed},
};
// TODO: Just make these const
lazy_static! {
static ref S: Span =
Span::from_byte_interval((0, 0), "test case, 1".intern());
static ref S2: Span =
Span::from_byte_interval((0, 0), "test case, 2".intern());
static ref S3: Span =
Span::from_byte_interval((0, 0), "test case, 3".intern());
}
const S: Span = crate::span::DUMMY_SPAN;
const S2: Span = S.offset_add(1).unwrap();
#[test]
fn dead_if_first_token_is_non_attr() {
let tok = Token::Open("foo".unwrap_into(), *S);
let tok = Token::Open("foo".unwrap_into(), S);
let mut sut = AttrParseState::default();
let sut = AttrParseState::default();
// There is no state that we can transition to,
// and we're in an empty accepting state.
assert_eq!(Ok(ParseStatus::Dead(tok.clone())), sut.parse_token(tok));
// Let's just make sure we're in the same state we started in so
// that we know we can accommodate recovery token(s).
assert_eq!(sut, AttrParseState::default());
assert_eq!(
(
// Make sure we're in the same state we started in so that
// we know we can accommodate recovery token(s).
Transition(AttrParseState::default()),
Ok(ParseStatus::Dead(tok.clone()))
),
sut.parse_token(tok)
);
}
#[test]
@ -152,21 +153,17 @@ mod test {
let attr = "attr".unwrap_into();
let val = "val".intern();
let mut sut = AttrParseState::default();
let expected = Attr::new(attr, val, (*S, *S2));
let toks =
[Token::AttrName(attr, S), Token::AttrValue(val, S2)].into_iter();
// First token represents the name,
// and so we are awaiting a value.
assert_eq!(
sut.parse_token(Token::AttrName(attr, *S)),
Ok(ParseStatus::Incomplete)
);
let sut = AttrParseState::parse(toks);
// Once we have a value,
// an Attr can be emitted.
assert_eq!(
sut.parse_token(Token::AttrValue(val, *S2)),
Ok(ParseStatus::Object(expected))
Ok(vec![
Parsed::Incomplete,
Parsed::Object(Attr::new(attr, val, (S, S2))),
]),
sut.collect()
);
}
@ -174,22 +171,22 @@ mod test {
fn parse_fails_when_attribute_value_missing_but_can_recover() {
let attr = "bad".unwrap_into();
let mut sut = AttrParseState::default();
let sut = AttrParseState::default();
// This token indicates that we're expecting a value to come next in
// the token stream.
assert_eq!(
sut.parse_token(Token::AttrName(attr, *S)),
Ok(ParseStatus::Incomplete)
);
let (Transition(sut), result) =
sut.parse_token(Token::AttrName(attr, S));
assert_eq!(result, Ok(ParseStatus::Incomplete));
// But we provide something else unexpected.
let (Transition(sut), result) = sut.parse_token(Token::Close(None, S2));
assert_eq!(
sut.parse_token(Token::Close(None, *S2)),
result,
Err(AttrParseError::AttrValueExpected(
attr,
*S,
Token::Close(None, *S2)
S,
Token::Close(None, S2)
))
);
@ -203,10 +200,11 @@ mod test {
// Rather than checking for that state,
// let's actually attempt a recovery.
let recover = "value".intern();
let expected = Attr::new(attr, recover, (*S, *S2));
let (Transition(sut), result) =
sut.parse_token(Token::AttrValue(recover, S2));
assert_eq!(
sut.parse_token(Token::AttrValue(recover, *S2)),
Ok(ParseStatus::Object(expected))
result,
Ok(ParseStatus::Object(Attr::new(attr, recover, (S, S2)))),
);
// Finally, we should now be in an accepting state.

View File

@ -37,16 +37,19 @@
//! of the caller.
use super::{
parse::{ParseState, ParseStateResult, ParseStatus, ParsedResult},
parse::{
ParseState, ParseStateResult, ParseStatus, ParsedResult,
TransitionResult,
},
tree::{
attr::{AttrParseError, AttrParseState},
Attr,
},
QName, Token, TokenStream, Whitespace,
};
use crate::{span::Span, sym::SymbolId};
use crate::{span::Span, sym::SymbolId, xir::parse::Transition};
use arrayvec::ArrayVec;
use std::{error::Error, fmt::Display, mem::replace};
use std::{error::Error, fmt::Display};
/// Tag nesting depth
/// (`0` represents the root).
@ -138,10 +141,6 @@ where
/// Delegating to attribute parser.
AttrExpected(ElementStack<MAX_DEPTH>, SA),
/// Temporary state used to catch missing explicit state transitions in
/// `parse_token`.
Invalid,
}
impl<const MD: usize, SA: FlatAttrParseState> Default for State<MD, SA> {
@ -150,11 +149,6 @@ impl<const MD: usize, SA: FlatAttrParseState> Default for State<MD, SA> {
}
}
/// Denotes a state transition.
///
/// This newtype was created to produce clear, self-documenting code.
struct Transition<T>(T);
impl<const MAX_DEPTH: usize, SA> ParseState for State<MAX_DEPTH, SA>
where
SA: FlatAttrParseState,
@ -162,39 +156,31 @@ where
type Object = Object;
type Error = StateError;
fn parse_token(&mut self, tok: Token) -> ParseStateResult<Self> {
fn parse_token(self, tok: Token) -> TransitionResult<Self> {
use ParseStatus::{Dead, Incomplete, Object as Obj};
use State::{AttrExpected, Invalid, NodeExpected};
let result;
use State::{AttrExpected, NodeExpected};
// This awkward-looking take-reassign forces us to be explicit
// about state transitions in every case,
// ensuring that we always have documented proof of what state
// the system winds up in.
// The `Invalid` state prevents using `return`.
(Transition(*self), result) = match (replace(self, Invalid), tok) {
match (self, tok) {
(NodeExpected(stack), tok) => Self::parse_node(stack, tok),
(AttrExpected(stack, mut sa), tok) => match sa.parse_token(tok) {
Ok(Incomplete) => {
(Transition(AttrExpected(stack, sa)), Ok(Incomplete))
(AttrExpected(stack, sa), tok) => match sa.parse_token(tok) {
(Transition(sa), Ok(Incomplete)) => {
Transition(AttrExpected(stack, sa)).incomplete()
}
(Transition(sa), Ok(Obj(attr))) => {
Transition(AttrExpected(stack, sa)).with(Object::Attr(attr))
}
(_, Ok(Dead(lookahead))) => Self::parse_node(stack, lookahead),
(Transition(sa), Err(x)) => {
Transition(AttrExpected(stack, sa)).err(x)
}
Ok(Obj(attr)) => (
Transition(AttrExpected(stack, sa)),
Ok(Obj(Object::Attr(attr))),
),
Ok(Dead(lookahead)) => Self::parse_node(stack, lookahead),
Err(x) => (Transition(AttrExpected(stack, sa)), Err(x.into())),
},
// See comment at the top of this function.
(Invalid, _) => {
unreachable!("XIRF parser reached invalid state")
}
};
result
}
}
/// Whether all elements have been closed.
@ -220,81 +206,71 @@ where
mut stack: ElementStack<MAX_DEPTH>,
tok: Token,
) -> (Transition<Self>, ParseStateResult<Self>) {
use ParseStatus::Object as Obj;
use Object::*;
use State::{AttrExpected, NodeExpected};
match tok {
Token::Open(qname, span) if stack.len() == MAX_DEPTH => (
Transition(NodeExpected(stack)),
Err(StateError::MaxDepthExceeded {
open: (qname, span),
max: Depth(MAX_DEPTH),
}),
),
Token::Open(qname, span) if stack.len() == MAX_DEPTH => Transition(
NodeExpected(stack),
)
.err(StateError::MaxDepthExceeded {
open: (qname, span),
max: Depth(MAX_DEPTH),
}),
Token::Open(qname, span) => {
let depth = stack.len();
stack.push((qname, span));
// Delegate to the attribute parser until it is complete.
(
Transition(AttrExpected(stack, SA::default())),
Ok(Obj(Object::Open(qname, span, Depth(depth)))),
)
Transition(AttrExpected(stack, SA::default())).with(Open(
qname,
span,
Depth(depth),
))
}
Token::Close(close_oqname, close_span) => {
match (close_oqname, stack.pop()) {
(_, None) => (
Transition(NodeExpected(stack)),
Err(StateError::ExtraClosingTag(
close_oqname,
close_span,
)),
(_, None) => Transition(NodeExpected(stack)).err(
StateError::ExtraClosingTag(close_oqname, close_span),
),
(Some(qname), Some((open_qname, open_span)))
if qname != open_qname =>
{
(
Transition(NodeExpected(stack)),
Err(StateError::UnbalancedTag {
Transition(NodeExpected(stack)).err(
StateError::UnbalancedTag {
open: (open_qname, open_span),
close: (qname, close_span),
}),
},
)
}
(..) => {
let depth = stack.len();
(
Transition(NodeExpected(stack)),
Ok(Obj(Object::Close(
close_oqname,
close_span,
Depth(depth),
))),
)
Transition(NodeExpected(stack)).with(Close(
close_oqname,
close_span,
Depth(depth),
))
}
}
}
Token::Comment(sym, span) => (
Transition(NodeExpected(stack)),
Ok(Obj(Object::Comment(sym, span))),
),
Token::Text(sym, span) => (
Transition(NodeExpected(stack)),
Ok(Obj(Object::Text(sym, span))),
),
Token::CData(sym, span) => (
Transition(NodeExpected(stack)),
Ok(Obj(Object::CData(sym, span))),
),
Token::Whitespace(ws, span) => (
Transition(NodeExpected(stack)),
Ok(Obj(Object::Whitespace(ws, span))),
),
Token::Comment(sym, span) => {
Transition(NodeExpected(stack)).with(Comment(sym, span))
}
Token::Text(sym, span) => {
Transition(NodeExpected(stack)).with(Text(sym, span))
}
Token::CData(sym, span) => {
Transition(NodeExpected(stack)).with(CData(sym, span))
}
Token::Whitespace(ws, span) => {
Transition(NodeExpected(stack)).with(Whitespace(ws, span))
}
// We should transition to `State::AttrExpected` before encountering
// any of these tokens.

View File

@ -22,6 +22,7 @@
use super::{Token, TokenStream};
use crate::span::Span;
use std::fmt::Debug;
use std::mem::take;
use std::{error::Error, fmt::Display};
/// Result of applying a [`Token`] to a [`ParseState`],
@ -69,12 +70,22 @@ pub trait ParseState: Default + PartialEq + Eq + Debug {
/// Parse a single [`Token`] and optionally perform a state transition.
///
/// The current state is represented by `self`,
/// which is mutable to allow for a state transition.
/// The result of a parsing operation is either an object or an
/// indication that additional tokens of input are needed;
/// see [`Parsed`] for more information.
fn parse_token(&mut self, tok: Token) -> ParseStateResult<Self>;
/// The current state is represented by `self`.
/// The result of a parsing operation is a state transition with
/// associated [`ParseStatus`] data.
///
/// Note that `self` is owned,
/// for a couple primary reasons:
///
/// 1. This forces the parser to explicitly consider and document all
/// state transitions,
/// rather than potentially missing unintended behavior through
/// implicit behavior; and
/// 2. It allows for more natural functional composition of state,
/// which in turn makes it easier to compose parsers
/// (which conceptually involves stitching together state
/// machines).
fn parse_token(self, tok: Token) -> TransitionResult<Self>;
/// Whether the current state represents an accepting state.
///
@ -95,9 +106,65 @@ pub trait ParseState: Default + PartialEq + Eq + Debug {
}
/// Result of applying a [`Token`] to a [`ParseState`].
///
/// This is used by [`ParseState::parse_token`];
/// see that function for rationale.
pub type ParseStateResult<S> =
Result<ParseStatus<<S as ParseState>::Object>, <S as ParseState>::Error>;
/// Denotes a state transition.
///
/// This newtype was created to produce clear, self-documenting code;
/// parsers can get confusing to read with all of the types involved,
/// so this provides a mental synchronization point.
///
/// This also provides some convenience methods to help remove boilerplate
/// and further improve code clarity.
#[derive(Debug, PartialEq, Eq)]
pub struct Transition<S: ParseState>(pub S);
impl<S: ParseState> Transition<S> {
    /// Pair this transition with a parsed object.
    ///
    /// The object is wrapped in [`ParseStatus::Object`],
    ///   which is how [`ParseState::parse_token`] emits a completed
    ///   object alongside the state being transitioned to.
    pub fn with(self, obj: S::Object) -> (Self, ParseStateResult<S>) {
        let status = ParseStatus::Object(obj);
        (self, Ok(status))
    }

    /// Pair this transition with [`ParseStatus::Incomplete`].
    ///
    /// Use this when additional input is required before an object can
    ///   be emitted.
    pub fn incomplete(self) -> (Self, ParseStateResult<S>) {
        let status = ParseStatus::Incomplete;
        (self, Ok(status))
    }

    /// Pair this transition with [`ParseStatus::Dead`].
    ///
    /// The provided [`Token`] is handed back to the caller,
    ///   which should treat it as a token of lookahead.
    pub fn dead(self, tok: Token) -> (Self, ParseStateResult<S>) {
        let status = ParseStatus::Dead(tok);
        (self, Ok(status))
    }

    /// Pair this transition with a parsing error.
    ///
    /// The error is converted into `S::Error` via [`Into`].
    /// The state held by this transition ought to be suitable for error
    ///   recovery.
    pub fn err<E: Into<S::Error>>(self, err: E) -> (Self, ParseStateResult<S>) {
        let failure: S::Error = err.into();
        (self, Err(failure))
    }
}
/// A state transition with associated data.
///
/// Conceptually,
/// imagine the act of a state transition producing data.
/// See [`Transition`] for convenience methods for producing this tuple.
pub type TransitionResult<S> = (Transition<S>, ParseStateResult<S>);
/// A streaming parser defined by a [`ParseState`] with exclusive
/// mutable access to an underlying [`TokenStream`].
///
@ -167,8 +234,12 @@ impl<S: ParseState, I: TokenStream> Iterator for Parser<S, I> {
// reporting in case we encounter an EOF.
self.last_span = Some(tok.span());
let result;
(Transition(self.state), result) =
take(&mut self.state).parse_token(tok);
use ParseStatus::*;
match self.state.parse_token(tok) {
match result {
// Nothing handled this dead state,
// and we cannot discard a lookahead token,
// so we have no choice but to produce an error.
@ -382,19 +453,15 @@ pub mod test {
type Object = Token;
type Error = EchoStateError;
fn parse_token(&mut self, tok: Token) -> ParseStateResult<Self> {
fn parse_token(self, tok: Token) -> TransitionResult<Self> {
match tok {
Token::Comment(..) => {
*self = Self::Done;
}
Token::Comment(..) => Transition(Self::Done).with(tok),
Token::Close(..) => {
return Err(EchoStateError::InnerError(tok))
Transition(self).err(EchoStateError::InnerError(tok))
}
Token::Text(..) => return Ok(ParseStatus::Dead(tok)),
_ => {}
Token::Text(..) => Transition(self).dead(tok),
_ => Transition(self).with(tok),
}
Ok(ParseStatus::Object(tok))
}
fn is_accepting(&self) -> bool {

View File

@ -177,15 +177,16 @@ pub mod attr;
use self::{
super::parse::{
ParseError, ParseResult, ParseState, ParseStateResult, ParseStatus,
ParsedResult,
ParseError, ParseResult, ParseState, ParseStatus, ParsedResult,
},
attr::{AttrParseError, AttrParseState},
};
use super::{QName, Token, TokenResultStream, TokenStream};
use crate::{span::Span, sym::SymbolId};
use std::{error::Error, fmt::Display, mem::take, result};
use super::{
parse::TransitionResult, QName, Token, TokenResultStream, TokenStream,
};
use crate::{span::Span, sym::SymbolId, xir::parse::Transition};
use std::{error::Error, fmt::Display, result};
pub use attr::{Attr, AttrList};
@ -515,65 +516,82 @@ impl<SA: StackAttrParseState> ParseState for Stack<SA> {
type Object = Tree;
type Error = StackError;
fn parse_token(&mut self, tok: Token) -> ParseStateResult<Self> {
fn parse_token(self, tok: Token) -> TransitionResult<Self> {
use Stack::*;
match (take(self), tok) {
match (self, tok) {
// Open a root element (or lack of context).
(Empty, Token::Open(name, span)) => {
Ok(Self::begin_attrs(name, span, None))
Self::begin_attrs(name, span, None)
}
// Open a child element.
(BuddingElement(pstack), Token::Open(name, span)) => {
Ok(Self::begin_attrs(name, span, Some(pstack.store())))
Self::begin_attrs(name, span, Some(pstack.store()))
}
// Open a child element in attribute parsing context.
(BuddingAttrList(pstack, attr_list), Token::Open(name, span)) => {
Ok(Self::begin_attrs(
Self::begin_attrs(
name,
span,
Some(pstack.consume_attrs(attr_list).store()),
))
)
}
// Attribute parsing.
(AttrState(estack, attrs, mut sa), tok) => {
(AttrState(estack, attrs, sa), tok) => {
use ParseStatus::*;
match sa.parse_token(tok) {
Ok(Incomplete) => Ok(AttrState(estack, attrs, sa)),
Ok(Object(attr)) => {
Ok(AttrState(estack, attrs.push(attr), sa))
(Transition(sa), Ok(Incomplete)) => {
Transition(AttrState(estack, attrs, sa)).incomplete()
}
Ok(Dead(lookahead)) => {
*self = BuddingElement(estack.consume_attrs(attrs));
return self.parse_token(lookahead);
(Transition(sa), Ok(Object(attr))) => {
Transition(AttrState(estack, attrs.push(attr), sa))
.incomplete()
}
(_, Ok(Dead(lookahead))) => {
BuddingElement(estack.consume_attrs(attrs))
.parse_token(lookahead)
}
(Transition(sa), Err(x)) => {
Transition(AttrState(estack, attrs, sa)).err(x.into())
}
Err(x) => Err(x.into()),
}
}
(BuddingElement(stack), Token::Close(name, span)) => stack
.try_close(name, span)
.map(ElementStack::consume_child_or_complete),
.map(ElementStack::consume_child_or_complete)
.map(|new_stack| match new_stack {
Stack::ClosedElement(ele) => {
Transition(Empty).with(Tree::Element(ele))
}
_ => Transition(new_stack).incomplete(),
})
.unwrap_or_else(|err| Transition(Empty).err(err)),
(BuddingAttrList(stack, attr_list), Token::Close(name, span)) => {
stack
.consume_attrs(attr_list)
.try_close(name, span)
.map(ElementStack::consume_child_or_complete)
.map(|new_stack| match new_stack {
Stack::ClosedElement(ele) => {
Transition(Empty).with(Tree::Element(ele))
}
_ => Transition(new_stack).incomplete(),
})
.unwrap_or_else(|err| Transition(Empty).err(err))
}
(BuddingElement(mut ele), Token::Text(value, span)) => {
ele.element.children.push(Tree::Text(value, span));
Ok(Self::BuddingElement(ele))
Transition(BuddingElement(ele)).incomplete()
}
(_, tok) if self.is_accepting() => {
return Ok(ParseStatus::Dead(tok))
}
(st, tok) if st.is_accepting() => Transition(st).dead(tok),
(stack, tok) => {
todo!(
@ -585,7 +603,6 @@ impl<SA: StackAttrParseState> ParseState for Stack<SA> {
)
}
}
.map(|new_stack| self.store_or_emit(new_stack))
}
fn is_accepting(&self) -> bool {
@ -598,29 +615,16 @@ impl<SA: StackAttrParseState> Stack<SA> {
name: QName,
span: Span,
pstack: Option<Box<ElementStack>>,
) -> Self {
Self::AttrState(
) -> TransitionResult<Self> {
Transition(Self::AttrState(
ElementStack {
element: Element::open(name, span),
pstack,
},
Default::default(),
SA::default(),
)
}
/// Emit a completed object or store the current stack for further processing.
fn store_or_emit(&mut self, new_stack: Self) -> ParseStatus<Tree> {
match new_stack {
Stack::ClosedElement(ele) => {
ParseStatus::Object(Tree::Element(ele))
}
_ => {
*self = new_stack;
ParseStatus::Incomplete
}
}
))
.incomplete()
}
}