tamer: xir::tree::attr_parser_from: Integrate AttrParser
This begins to integrate the isolated AttrParser. The next step will be integrating it into the larger XIRT parser. There's been considerable delay in getting this committed, because I went through quite the struggle with myself trying to determine what balance I want to strike between Rust's type system; convenience with parser combinators; iterators; and various other abstractions. I ended up being confounded by trying to maintain the current XmloReader abstraction, which is fundamentally incompatible with the way the new parsing system works (streaming iterators that do not collect or perform heap allocations). There'll be more information on this to come, but there are certain things that will be changing. There are a few problems highlighted by this commit (not in code, but conceptually): 1. Introducing Option here for the TokenStreamState doesn't feel right, in the sense that the abstraction is inappropriate. We should perhaps introduce a new variant Parsed::Done or something to indicate intent, rather than leaving the reader to have to read about what None actually means. 2. This turns Parsed into more of a statement influencing control flow/logic, and so should be encapsulated, with an external equivalent of Parsed that omits variants that ought to remain encapsulated. 3. The name TokenStreamState is accurate, but these really are the actual parsers; TokenStreamParser is more of a coordinator, and helps to abstract away some of the common logic so lower-level parsers do not have to worry about it. But calling it TokenStreamState is both a bit confusing and is an understatement---it _does_ hold the state, but it also holds the current parsing stack in its variants. Another thing that is not yet entirely clear is whether this AttrParser ought to care about detection of duplicate attributes, or if that should be done in a separate parser, perhaps even at the XIR level. The same can be said for checking for balanced tags. 
By pushing it to TokenStream in XIR, we would get a guaranteed check regardless of what parsers are used, which is attractive because it reduces the (almost certain-to-otherwise-occur) risk that individual parsers will not sufficiently check for semantically valid XML. But it does _potentially_ make error recovery more complicated. But at the same time, perhaps more specific parsers ought not care about recovery at that level. Anyway, point being, more to come, but I am disappointed how much time I'm spending considering parsing, given that there are so many things I need to move on to. I just want this done right and in a way that feels like it's working well with Rust while it's all in working memory, otherwise it's going to be a significant effort to get back into. DEV-11268
main
parent
0e08cf3efe
commit
bfe46be5bb
|
@ -174,9 +174,11 @@
|
|||
mod attr;
|
||||
mod parse;
|
||||
|
||||
use self::attr::AttrParserState;
|
||||
|
||||
use super::{QName, Token, TokenResultStream, TokenStream};
|
||||
use crate::{span::Span, sym::SymbolId};
|
||||
use std::{error::Error, fmt::Display, iter, mem::take};
|
||||
use crate::{span::Span, sym::SymbolId, xir::tree::parse::DefaultParser};
|
||||
use std::{error::Error, fmt::Display, mem::take};
|
||||
|
||||
pub use attr::{Attr, AttrList};
|
||||
|
||||
|
@ -1054,35 +1056,40 @@ pub fn parse_attrs<'a>(
|
|||
pub fn attr_parser_from<'a>(
|
||||
toks: &'a mut impl TokenStream,
|
||||
) -> impl Iterator<Item = Result<Attr>> + 'a {
|
||||
let mut state = ParserState::with(Stack::IsolatedAttrEmpty);
|
||||
use parse::Parsed;
|
||||
|
||||
iter::from_fn(move || {
|
||||
loop {
|
||||
match toks.next().and_then(|tok| parse(&mut state, tok)) {
|
||||
None => return None,
|
||||
Some(Err(err)) => return Some(Err(err)),
|
||||
Some(Ok(Parsed::Attr(attr))) => return Some(Ok(attr)),
|
||||
Some(Ok(Parsed::Incomplete)) => continue,
|
||||
|
||||
// AttrEnd must have been encountered.
|
||||
Some(Ok(Parsed::Done)) => return None,
|
||||
|
||||
// These make no sense in this context and should never occur.
|
||||
Some(Ok(x @ (Parsed::Tree(_) | Parsed::AttrList(_)))) => {
|
||||
unreachable!(
|
||||
"unexpected yield by XIRT (Attr expected): {:?}",
|
||||
x
|
||||
)
|
||||
}
|
||||
}
|
||||
DefaultParser::<AttrParserState, _>::from(toks).filter_map(|parsed| {
|
||||
match parsed {
|
||||
Ok(Parsed::Object(attr)) => Some(Ok(attr)),
|
||||
Ok(Parsed::Incomplete) => None,
|
||||
Err(x) => Some(Err(x.into())),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Transitional; this will go away, or at least be refined.
|
||||
impl From<parse::ParseError<attr::AttrParseError>> for ParseError {
|
||||
fn from(e: parse::ParseError<attr::AttrParseError>) -> Self {
|
||||
match e {
|
||||
parse::ParseError::UnexpectedEof(_) => Self::UnexpectedAttrEof,
|
||||
|
||||
parse::ParseError::StateError(
|
||||
attr::AttrParseError::AttrNameExpected(tok),
|
||||
) => Self::AttrNameExpected(tok),
|
||||
|
||||
parse::ParseError::StateError(
|
||||
attr::AttrParseError::AttrValueExpected(..),
|
||||
) => Self::UnexpectedAttrEof,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn merge_attr_fragments<'a>(
|
||||
toks: &'a mut impl TokenStream,
|
||||
) -> impl TokenStream + 'a {
|
||||
use std::iter;
|
||||
|
||||
use crate::sym::{GlobalSymbolIntern, GlobalSymbolResolve};
|
||||
|
||||
let mut stack = Vec::with_capacity(4);
|
||||
|
|
|
@ -23,11 +23,13 @@
|
|||
//!
|
||||
//! See [parent module](super) for additional documentation.
|
||||
|
||||
mod parse;
|
||||
|
||||
use super::QName;
|
||||
use crate::{span::Span, sym::SymbolId};
|
||||
use std::fmt::Display;
|
||||
|
||||
mod parse;
|
||||
pub use parse::{AttrParseError, AttrParserState};
|
||||
|
||||
/// Element attribute.
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
|
@ -117,6 +119,12 @@ impl From<Vec<Attr>> for AttrList {
|
|||
}
|
||||
}
|
||||
|
||||
impl FromIterator<Attr> for AttrList {
|
||||
fn from_iter<T: IntoIterator<Item = Attr>>(iter: T) -> Self {
|
||||
iter.into_iter().collect::<Vec<Attr>>().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> From<[Attr; N]> for AttrList {
|
||||
fn from(attrs: [Attr; N]) -> Self {
|
||||
AttrList {
|
||||
|
|
|
@ -51,32 +51,26 @@ impl TokenStreamState for AttrParserState {
|
|||
fn parse_token(&mut self, tok: Token) -> TokenStreamStateResult<Self> {
|
||||
use AttrParserState::*;
|
||||
|
||||
*self = match (take(self), tok) {
|
||||
(Empty, Token::AttrName(name, span)) => Name(name, span),
|
||||
Some(match (take(self), tok) {
|
||||
(Empty, Token::AttrEnd(_)) => return None,
|
||||
|
||||
(Empty, invalid) => {
|
||||
return Err(AttrParseError::AttrNameExpected(invalid))
|
||||
(Empty, Token::AttrName(name, span)) => {
|
||||
*self = Name(name, span);
|
||||
Ok(Parsed::Incomplete)
|
||||
}
|
||||
|
||||
(Empty, invalid) => Err(AttrParseError::AttrNameExpected(invalid)),
|
||||
|
||||
(Name(name, nspan), Token::AttrValue(value, vspan)) => {
|
||||
return Ok(Parsed::Object(Attr::new(
|
||||
name,
|
||||
value,
|
||||
(nspan, vspan),
|
||||
)))
|
||||
Ok(Parsed::Object(Attr::new(name, value, (nspan, vspan))))
|
||||
}
|
||||
|
||||
(Name(name, nspan), invalid) => {
|
||||
// Restore state for error recovery.
|
||||
*self = Name(name, nspan);
|
||||
|
||||
return Err(AttrParseError::AttrValueExpected(
|
||||
name, nspan, invalid,
|
||||
));
|
||||
Err(AttrParseError::AttrValueExpected(name, nspan, invalid))
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Parsed::Incomplete)
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
@ -148,7 +142,7 @@ mod test {
|
|||
|
||||
// Fail immediately.
|
||||
assert_eq!(
|
||||
Err(AttrParseError::AttrNameExpected(tok.clone())),
|
||||
Some(Err(AttrParseError::AttrNameExpected(tok.clone()))),
|
||||
sut.parse_token(tok)
|
||||
);
|
||||
|
||||
|
@ -169,14 +163,14 @@ mod test {
|
|||
// and so we are awaiting a value.
|
||||
assert_eq!(
|
||||
sut.parse_token(Token::AttrName(attr, *S)),
|
||||
Ok(Parsed::Incomplete)
|
||||
Some(Ok(Parsed::Incomplete))
|
||||
);
|
||||
|
||||
// Once we have a value,
|
||||
// an Attr can be emitted.
|
||||
assert_eq!(
|
||||
sut.parse_token(Token::AttrValue(val, *S2)),
|
||||
Ok(Parsed::Object(expected))
|
||||
Some(Ok(Parsed::Object(expected)))
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -190,17 +184,17 @@ mod test {
|
|||
// the token stream.
|
||||
assert_eq!(
|
||||
sut.parse_token(Token::AttrName(attr, *S)),
|
||||
Ok(Parsed::Incomplete)
|
||||
Some(Ok(Parsed::Incomplete))
|
||||
);
|
||||
|
||||
// But we provide something else unexpected.
|
||||
assert_eq!(
|
||||
sut.parse_token(Token::AttrEnd(*S2)),
|
||||
Err(AttrParseError::AttrValueExpected(
|
||||
Some(Err(AttrParseError::AttrValueExpected(
|
||||
attr,
|
||||
*S,
|
||||
Token::AttrEnd(*S2)
|
||||
))
|
||||
)))
|
||||
);
|
||||
|
||||
// We should not be in an accepting state,
|
||||
|
@ -216,10 +210,18 @@ mod test {
|
|||
let expected = Attr::new(attr, recover, (*S, *S2));
|
||||
assert_eq!(
|
||||
sut.parse_token(Token::AttrValue(recover, *S2)),
|
||||
Ok(Parsed::Object(expected))
|
||||
Some(Ok(Parsed::Object(expected)))
|
||||
);
|
||||
|
||||
// Finally, we should now be in an accepting state.
|
||||
assert!(sut.is_accepting());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn yields_none_on_attr_end() {
|
||||
let mut sut = AttrParserState::default();
|
||||
|
||||
assert_eq!(sut.parse_token(Token::AttrEnd(*S)), None);
|
||||
assert!(sut.is_accepting());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,6 +23,9 @@ use super::super::{Token, TokenStream};
|
|||
use crate::span::Span;
|
||||
use std::{error::Error, fmt::Display};
|
||||
|
||||
/// Preferred [`TokenStreamParser`].
|
||||
pub type DefaultParser<'a, S, I> = Parser<'a, S, I>;
|
||||
|
||||
/// Lower a [`TokenStream`] into XIRT.
|
||||
///
|
||||
/// Parsers are wrappers around a ([`TokenStreamState`], [`TokenStream`])
|
||||
|
@ -146,9 +149,11 @@ pub trait TokenStreamState: Default {
|
|||
}
|
||||
|
||||
/// Result of applying a [`Token`] to a [`TokenStreamState`].
|
||||
pub type TokenStreamStateResult<S> = Result<
|
||||
Parsed<<S as TokenStreamState>::Object>,
|
||||
<S as TokenStreamState>::Error,
|
||||
pub type TokenStreamStateResult<S> = Option<
|
||||
Result<
|
||||
Parsed<<S as TokenStreamState>::Object>,
|
||||
<S as TokenStreamState>::Error,
|
||||
>,
|
||||
>;
|
||||
|
||||
/// A streaming parser defined by a [`TokenStreamState`] with exclusive
|
||||
|
@ -216,7 +221,9 @@ impl<'a, S: TokenStreamState, I: TokenStream> Iterator for Parser<'a, S, I> {
|
|||
// reporting in case we encounter an EOF.
|
||||
self.last_span = Some(tok.span());
|
||||
|
||||
Some(self.state.parse_token(tok).map_err(ParseError::from))
|
||||
self.state
|
||||
.parse_token(tok)
|
||||
.map(|parsed| parsed.map_err(ParseError::from))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -312,11 +319,6 @@ pub mod test {
|
|||
use super::*;
|
||||
use crate::span::DUMMY_SPAN as DS;
|
||||
|
||||
/// Preferred [`TokenStreamParser`].
|
||||
///
|
||||
/// TODO: Move into parent module once used outside of tests.
|
||||
pub type DefaultParser<'a, S, I> = Parser<'a, S, I>;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
enum EchoState {
|
||||
Empty,
|
||||
|
@ -339,12 +341,12 @@ pub mod test {
|
|||
*self = Self::Done;
|
||||
}
|
||||
Token::Close(..) => {
|
||||
return Err(EchoStateError::InnerError(tok))
|
||||
return Some(Err(EchoStateError::InnerError(tok)))
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
Ok(Parsed::Object(tok))
|
||||
Some(Ok(Parsed::Object(tok)))
|
||||
}
|
||||
|
||||
fn is_accepting(&self) -> bool {
|
||||
|
|
Loading…
Reference in New Issue