tamer: xir::flat: Improve parser validation
This does a couple of things: it ensures that documents contain one and only one root node, and it properly handles dead transitions once parsing is complete (allowing it to be composed). This should make XIRF feature-complete for the time being. It does rely on the assumption that the reader is stripping out any trailing whitespace, so I guess we'll see if that's true as we proceed. DEV-10863 main
parent
f04d845452
commit
150b3b9aa4
|
@ -51,6 +51,9 @@
|
|||
// We _could_ do without,
|
||||
// but this provides a nicer API.
|
||||
#![feature(explicit_generic_args_with_impl_trait)]
|
||||
// This simply removes a boilerplate `Default` impl;
|
||||
// we can do without if this does not get finalized.
|
||||
#![feature(derive_default_enum)]
|
||||
// We build docs for private items.
|
||||
#![allow(rustdoc::private_intra_doc_links)]
|
||||
|
||||
|
|
|
@ -28,8 +28,10 @@
|
|||
//! 1. All closing tags must correspond to a matching opening tag at the
|
||||
//! same depth;
|
||||
//! 2. [`Object`] exposes the [`Depth`] of each opening/closing tag;
|
||||
//! 3. Attribute tokens are parsed into [`Attr`] objects; and
|
||||
//! 4. Parsing will fail if input ends before all elements have been
|
||||
//! 3. Attribute tokens are parsed into [`Attr`] objects;
|
||||
//! 4. Documents must begin with an element and end with the closing of
|
||||
//! that element;
|
||||
//! 5. Parsing will fail if input ends before all elements have been
|
||||
//! closed.
|
||||
//!
|
||||
//! XIRF lowering does not perform any dynamic memory allocation;
|
||||
|
@ -118,29 +120,26 @@ where
|
|||
/// allowing XIRF's parser to avoid memory allocation entirely.
|
||||
type ElementStack<const MAX_DEPTH: usize> = ArrayVec<(QName, Span), MAX_DEPTH>;
|
||||
|
||||
/// XIRF parser state.
|
||||
/// XIRF document parser state.
|
||||
///
|
||||
/// This parser is a pushdown automaton.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
/// This parser is a pushdown automaton that parses a single XML document.
|
||||
#[derive(Debug, Default, PartialEq, Eq)]
|
||||
pub enum State<const MAX_DEPTH: usize, SA = AttrParseState>
|
||||
where
|
||||
SA: FlatAttrParseState,
|
||||
{
|
||||
// TODO: Ensure that non-comment nodes are not encountered before the
|
||||
// root,
|
||||
// and that we do not encounter any non-comment nodes after the
|
||||
// root.
|
||||
/// Document parsing has not yet begun.
|
||||
#[default]
|
||||
PreRoot,
|
||||
|
||||
/// Parsing nodes.
|
||||
NodeExpected(ElementStack<MAX_DEPTH>),
|
||||
|
||||
/// Delegating to attribute parser.
|
||||
AttrExpected(ElementStack<MAX_DEPTH>, SA),
|
||||
}
|
||||
|
||||
impl<const MD: usize, SA: FlatAttrParseState> Default for State<MD, SA> {
|
||||
fn default() -> Self {
|
||||
Self::NodeExpected(Default::default())
|
||||
}
|
||||
/// End of document has been reached.
|
||||
Done,
|
||||
}
|
||||
|
||||
impl<const MAX_DEPTH: usize, SA> ParseState for State<MAX_DEPTH, SA>
|
||||
|
@ -152,9 +151,22 @@ where
|
|||
|
||||
fn parse_token(self, tok: Token) -> TransitionResult<Self> {
|
||||
use ParseStatus::{Dead, Incomplete, Object as Obj};
|
||||
use State::{AttrExpected, NodeExpected};
|
||||
use State::{AttrExpected, Done, NodeExpected, PreRoot};
|
||||
|
||||
match (self, tok) {
|
||||
// Comments are permitted before and after the first root element.
|
||||
(st @ (PreRoot | Done), Token::Comment(sym, span)) => {
|
||||
Transition(st).with(Object::Comment(sym, span))
|
||||
}
|
||||
|
||||
(PreRoot, tok @ Token::Open(..)) => {
|
||||
Self::parse_node(Default::default(), tok)
|
||||
}
|
||||
|
||||
(PreRoot, tok) => {
|
||||
Transition(PreRoot).err(StateError::RootOpenExpected(tok))
|
||||
}
|
||||
|
||||
(NodeExpected(stack), tok) => Self::parse_node(stack, tok),
|
||||
|
||||
(AttrExpected(stack, sa), tok) => match sa.parse_token(tok) {
|
||||
|
@ -169,6 +181,8 @@ where
|
|||
Transition(AttrExpected(stack, sa)).err(x)
|
||||
}
|
||||
},
|
||||
|
||||
(Done, tok) => Transition(Done).dead(tok),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -182,7 +196,7 @@ where
|
|||
// TODO: It'd be nice if we could also return additional context to
|
||||
// aid the user in diagnosing the problem,
|
||||
// e.g. what element(s) still need closing.
|
||||
matches!(self, Self::NodeExpected(stack) if stack.len() == 0)
|
||||
*self == State::Done
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -196,7 +210,7 @@ where
|
|||
tok: Token,
|
||||
) -> TransitionResult<Self> {
|
||||
use Object::*;
|
||||
use State::{AttrExpected, NodeExpected};
|
||||
use State::{AttrExpected, Done, NodeExpected};
|
||||
|
||||
match tok {
|
||||
Token::Open(qname, span) if stack.len() == MAX_DEPTH => Transition(
|
||||
|
@ -221,9 +235,7 @@ where
|
|||
|
||||
Token::Close(close_oqname, close_span) => {
|
||||
match (close_oqname, stack.pop()) {
|
||||
(_, None) => Transition(NodeExpected(stack)).err(
|
||||
StateError::ExtraClosingTag(close_oqname, close_span),
|
||||
),
|
||||
(_, None) => unreachable!("parser should be in Done state"),
|
||||
|
||||
(Some(qname), Some((open_qname, open_span)))
|
||||
if qname != open_qname =>
|
||||
|
@ -236,6 +248,13 @@ where
|
|||
)
|
||||
}
|
||||
|
||||
// Final closing tag (for root node) completes the document.
|
||||
(..) if stack.len() == 0 => Transition(Done).with(Close(
|
||||
close_oqname,
|
||||
close_span,
|
||||
Depth(0),
|
||||
)),
|
||||
|
||||
(..) => {
|
||||
let depth = stack.len();
|
||||
|
||||
|
@ -283,6 +302,9 @@ pub fn parse<const MAX_DEPTH: usize>(
|
|||
/// Parsing error from [`State`].
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub enum StateError {
|
||||
/// Opening root element tag was expected.
|
||||
RootOpenExpected(Token),
|
||||
|
||||
/// Opening tag exceeds the maximum nesting depth for this parser.
|
||||
MaxDepthExceeded { open: (QName, Span), max: Depth },
|
||||
|
||||
|
@ -293,10 +315,6 @@ pub enum StateError {
|
|||
close: (QName, Span),
|
||||
},
|
||||
|
||||
/// Attempt to close a tag with no corresponding opening tag
|
||||
/// (which would result in a negative depth).
|
||||
ExtraClosingTag(Option<QName>, Span),
|
||||
|
||||
/// Error from the attribute parser.
|
||||
AttrError(AttrParseError),
|
||||
}
|
||||
|
@ -306,6 +324,14 @@ impl Display for StateError {
|
|||
use StateError::*;
|
||||
|
||||
match self {
|
||||
RootOpenExpected(tok) => {
|
||||
write!(
|
||||
f,
|
||||
"opening root element tag expected, \
|
||||
but found {tok}"
|
||||
)
|
||||
}
|
||||
|
||||
MaxDepthExceeded {
|
||||
open: (name, span),
|
||||
max,
|
||||
|
@ -329,18 +355,6 @@ impl Display for StateError {
|
|||
)
|
||||
}
|
||||
|
||||
ExtraClosingTag(Some(name), span) => {
|
||||
write!(f, "closing tag `{name}` at {span} has no opening tag",)
|
||||
}
|
||||
|
||||
// If this occurs, it's likely that something generated invalid
|
||||
// XIR;
|
||||
// it should be a parsing error on read and no generator
|
||||
// should ever produce this.
|
||||
ExtraClosingTag(None, span) => {
|
||||
write!(f, "self-closing tag at {span} has no opening tag")
|
||||
}
|
||||
|
||||
AttrError(e) => Display::fmt(e, f),
|
||||
}
|
||||
}
|
||||
|
|
|
@ -71,19 +71,25 @@ fn empty_element_balanced_close() {
|
|||
}
|
||||
|
||||
// More closing tags than opening.
|
||||
//
|
||||
// We cannot keep the token and throw our own error because this tag may be
|
||||
// part of a parent context.
|
||||
#[test]
|
||||
fn extra_closing_tag() {
|
||||
let name = ("ns", "openclose").unwrap_into();
|
||||
let toks = [Token::Close(Some(name), S)].into_iter();
|
||||
let toks = [
|
||||
// We need an opening tag to actually begin document parsing.
|
||||
Token::Open(name, S),
|
||||
Token::Close(Some(name), S2),
|
||||
Token::Close(Some(name), S3),
|
||||
]
|
||||
.into_iter();
|
||||
|
||||
let mut sut = parse::<1>(toks);
|
||||
let sut = parse::<1>(toks);
|
||||
|
||||
assert_eq!(
|
||||
sut.next(),
|
||||
Some(Err(ParseError::StateError(StateError::ExtraClosingTag(
|
||||
Some(name),
|
||||
S,
|
||||
))))
|
||||
Err(ParseError::UnexpectedToken(Token::Close(Some(name), S3),)),
|
||||
sut.collect::<Result<Vec<Parsed<Object>>, _>>()
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -92,15 +98,20 @@ fn extra_closing_tag() {
|
|||
// gotten to XIRF).
|
||||
#[test]
|
||||
fn extra_self_closing_tag() {
|
||||
let toks = [Token::Close(None, S)].into_iter();
|
||||
let name = ("ns", "openclose").unwrap_into();
|
||||
let toks = [
|
||||
// We need an opening tag to actually begin document parsing.
|
||||
Token::Open(name, S),
|
||||
Token::Close(None, S2),
|
||||
Token::Close(None, S3),
|
||||
]
|
||||
.into_iter();
|
||||
|
||||
let mut sut = parse::<1>(toks);
|
||||
let sut = parse::<1>(toks);
|
||||
|
||||
assert_eq!(
|
||||
sut.next(),
|
||||
Some(Err(ParseError::StateError(StateError::ExtraClosingTag(
|
||||
None, S,
|
||||
))))
|
||||
Err(ParseError::UnexpectedToken(Token::Close(None, S3),)),
|
||||
sut.collect::<Result<Vec<Parsed<Object>>, _>>()
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -355,3 +366,78 @@ fn not_accepting_state_if_element_open() {
|
|||
// Element was not closed.
|
||||
assert_eq!(Some(Err(ParseError::UnexpectedEof(Some(S)))), sut.next());
|
||||
}
|
||||
|
||||
// XML permits comment nodes before and after the document root element.
|
||||
#[test]
|
||||
fn comment_before_or_after_root_ok() {
|
||||
let name = "root".unwrap_into();
|
||||
let cstart = "start comment".intern();
|
||||
let cend = "end comment".intern();
|
||||
|
||||
let toks = [
|
||||
Token::Comment(cstart, S),
|
||||
Token::Open(name, S2),
|
||||
Token::Close(None, S3),
|
||||
Token::Comment(cend, S4),
|
||||
]
|
||||
.into_iter();
|
||||
|
||||
let sut = parse::<1>(toks);
|
||||
|
||||
assert_eq!(
|
||||
Ok(vec![
|
||||
Parsed::Object(Object::Comment(cstart, S)),
|
||||
Parsed::Object(Object::Open(name, S2, Depth(0))),
|
||||
Parsed::Object(Object::Close(None, S3, Depth(0))),
|
||||
Parsed::Object(Object::Comment(cend, S4)),
|
||||
]),
|
||||
sut.collect(),
|
||||
);
|
||||
}
|
||||
|
||||
// But there must be no content at the end of the document after the closing
|
||||
// root node.
|
||||
// This does not test every applicable token;
|
||||
// you can easily verify the actual implementation at a glance.
|
||||
//
|
||||
// This is just a dead parser state,
|
||||
// since it's possible for XIRF to be composed and we want to return to
|
||||
// the parent parser.
|
||||
#[test]
|
||||
fn content_after_root_close_error() {
|
||||
let name = "root".unwrap_into();
|
||||
|
||||
let toks = [
|
||||
Token::Open(name, S),
|
||||
Token::Close(None, S2),
|
||||
// Document ends here
|
||||
Token::Open(name, S3),
|
||||
]
|
||||
.into_iter();
|
||||
|
||||
let sut = parse::<1>(toks);
|
||||
|
||||
assert_eq!(
|
||||
Result::<Vec<Parsed<Object>>, _>::Err(ParseError::UnexpectedToken(
|
||||
Token::Open(name, S3)
|
||||
)),
|
||||
sut.collect()
|
||||
);
|
||||
}
|
||||
|
||||
// Non-comment nodes cannot appear before the opening root tag.
|
||||
#[test]
|
||||
fn content_before_root_open_error() {
|
||||
let text = "foo".intern();
|
||||
|
||||
let toks = [Token::Text(text, S)].into_iter();
|
||||
|
||||
let sut = parse::<1>(toks);
|
||||
|
||||
assert_eq!(
|
||||
Result::<Vec<Parsed<Object>>, _>::Err(ParseError::StateError(
|
||||
StateError::RootOpenExpected(Token::Text(text, S))
|
||||
)),
|
||||
sut.collect()
|
||||
);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue