tamer: xir::flat: Improve parser validation

This does a couple of things: it ensures that documents contain one and only
one root node, and it properly handles dead transitions once parsing is
complete (allowing it to be composed).

This should make XIRF feature-complete for the time being.  It does rely on
the assumption that the reader is stripping out any trailing whitespace, so
I guess we'll see if that's true as we proceed.

DEV-10863
main
Mike Gerwitz 2022-03-17 23:22:38 -04:00
parent f04d845452
commit 150b3b9aa4
3 changed files with 152 additions and 49 deletions

View File

@ -51,6 +51,9 @@
// We _could_ do without,
// but this provides a nicer API.
#![feature(explicit_generic_args_with_impl_trait)]
// This simply removes a boilerplate `Default` impl;
// we can do without if this does not get finalized.
#![feature(derive_default_enum)]
// We build docs for private items.
#![allow(rustdoc::private_intra_doc_links)]

View File

@ -28,8 +28,10 @@
//! 1. All closing tags must correspond to a matching opening tag at the
//! same depth;
//! 2. [`Object`] exposes the [`Depth`] of each opening/closing tag;
//! 3. Attribute tokens are parsed into [`Attr`] objects; and
//! 4. Parsing will fail if input ends before all elements have been
//! 3. Attribute tokens are parsed into [`Attr`] objects;
//! 4. Documents must begin with an element and end with the closing of
//! that element;
//! 5. Parsing will fail if input ends before all elements have been
//! closed.
//!
//! XIRF lowering does not perform any dynamic memory allocation;
@ -118,29 +120,26 @@ where
/// allowing XIRF's parser to avoid memory allocation entirely.
type ElementStack<const MAX_DEPTH: usize> = ArrayVec<(QName, Span), MAX_DEPTH>;
/// XIRF parser state.
/// XIRF document parser state.
///
/// This parser is a pushdown automaton.
#[derive(Debug, PartialEq, Eq)]
/// This parser is a pushdown automaton that parses a single XML document.
#[derive(Debug, Default, PartialEq, Eq)]
pub enum State<const MAX_DEPTH: usize, SA = AttrParseState>
where
SA: FlatAttrParseState,
{
// TODO: Ensure that non-comment nodes are not encountered before the
// root,
// and that we do not encounter any non-comment nodes after the
// root.
/// Document parsing has not yet begun.
#[default]
PreRoot,
/// Parsing nodes.
NodeExpected(ElementStack<MAX_DEPTH>),
/// Delegating to attribute parser.
AttrExpected(ElementStack<MAX_DEPTH>, SA),
}
impl<const MD: usize, SA: FlatAttrParseState> Default for State<MD, SA> {
fn default() -> Self {
Self::NodeExpected(Default::default())
}
/// End of document has been reached.
Done,
}
impl<const MAX_DEPTH: usize, SA> ParseState for State<MAX_DEPTH, SA>
@ -152,9 +151,22 @@ where
fn parse_token(self, tok: Token) -> TransitionResult<Self> {
use ParseStatus::{Dead, Incomplete, Object as Obj};
use State::{AttrExpected, NodeExpected};
use State::{AttrExpected, Done, NodeExpected, PreRoot};
match (self, tok) {
// Comments are permitted before and after the first root element.
(st @ (PreRoot | Done), Token::Comment(sym, span)) => {
Transition(st).with(Object::Comment(sym, span))
}
(PreRoot, tok @ Token::Open(..)) => {
Self::parse_node(Default::default(), tok)
}
(PreRoot, tok) => {
Transition(PreRoot).err(StateError::RootOpenExpected(tok))
}
(NodeExpected(stack), tok) => Self::parse_node(stack, tok),
(AttrExpected(stack, sa), tok) => match sa.parse_token(tok) {
@ -169,6 +181,8 @@ where
Transition(AttrExpected(stack, sa)).err(x)
}
},
(Done, tok) => Transition(Done).dead(tok),
}
}
@ -182,7 +196,7 @@ where
// TODO: It'd be nice if we could also return additional context to
// aid the user in diagnosing the problem,
// e.g. what element(s) still need closing.
matches!(self, Self::NodeExpected(stack) if stack.len() == 0)
*self == State::Done
}
}
@ -196,7 +210,7 @@ where
tok: Token,
) -> TransitionResult<Self> {
use Object::*;
use State::{AttrExpected, NodeExpected};
use State::{AttrExpected, Done, NodeExpected};
match tok {
Token::Open(qname, span) if stack.len() == MAX_DEPTH => Transition(
@ -221,9 +235,7 @@ where
Token::Close(close_oqname, close_span) => {
match (close_oqname, stack.pop()) {
(_, None) => Transition(NodeExpected(stack)).err(
StateError::ExtraClosingTag(close_oqname, close_span),
),
(_, None) => unreachable!("parser should be in Done state"),
(Some(qname), Some((open_qname, open_span)))
if qname != open_qname =>
@ -236,6 +248,13 @@ where
)
}
// Final closing tag (for root node) completes the document.
(..) if stack.len() == 0 => Transition(Done).with(Close(
close_oqname,
close_span,
Depth(0),
)),
(..) => {
let depth = stack.len();
@ -283,6 +302,9 @@ pub fn parse<const MAX_DEPTH: usize>(
/// Parsing error from [`State`].
#[derive(Debug, Eq, PartialEq)]
pub enum StateError {
/// Opening root element tag was expected.
RootOpenExpected(Token),
/// Opening tag exceeds the maximum nesting depth for this parser.
MaxDepthExceeded { open: (QName, Span), max: Depth },
@ -293,10 +315,6 @@ pub enum StateError {
close: (QName, Span),
},
/// Attempt to close a tag with no corresponding opening tag
/// (which would result in a negative depth).
ExtraClosingTag(Option<QName>, Span),
/// Error from the attribute parser.
AttrError(AttrParseError),
}
@ -306,6 +324,14 @@ impl Display for StateError {
use StateError::*;
match self {
RootOpenExpected(tok) => {
write!(
f,
"opening root element tag expected, \
but found {tok}"
)
}
MaxDepthExceeded {
open: (name, span),
max,
@ -329,18 +355,6 @@ impl Display for StateError {
)
}
ExtraClosingTag(Some(name), span) => {
write!(f, "closing tag `{name}` at {span} has no opening tag",)
}
// If this occurs, it's likely that something generated invalid
// XIR;
// it should be a parsing error on read and no generator
// should ever produce this.
ExtraClosingTag(None, span) => {
write!(f, "self-closing tag at {span} has no opening tag")
}
AttrError(e) => Display::fmt(e, f),
}
}

View File

@ -71,19 +71,25 @@ fn empty_element_balanced_close() {
}
// More closing tags than opening.
//
// We cannot keep the token and throw our own error because this tag may be
// part of a parent context.
#[test]
fn extra_closing_tag() {
let name = ("ns", "openclose").unwrap_into();
let toks = [Token::Close(Some(name), S)].into_iter();
let toks = [
// We need an opening tag to actually begin document parsing.
Token::Open(name, S),
Token::Close(Some(name), S2),
Token::Close(Some(name), S3),
]
.into_iter();
let mut sut = parse::<1>(toks);
let sut = parse::<1>(toks);
assert_eq!(
sut.next(),
Some(Err(ParseError::StateError(StateError::ExtraClosingTag(
Some(name),
S,
))))
Err(ParseError::UnexpectedToken(Token::Close(Some(name), S3),)),
sut.collect::<Result<Vec<Parsed<Object>>, _>>()
);
}
@ -92,15 +98,20 @@ fn extra_closing_tag() {
// gotten to XIRF).
#[test]
fn extra_self_closing_tag() {
let toks = [Token::Close(None, S)].into_iter();
let name = ("ns", "openclose").unwrap_into();
let toks = [
// We need an opening tag to actually begin document parsing.
Token::Open(name, S),
Token::Close(None, S2),
Token::Close(None, S3),
]
.into_iter();
let mut sut = parse::<1>(toks);
let sut = parse::<1>(toks);
assert_eq!(
sut.next(),
Some(Err(ParseError::StateError(StateError::ExtraClosingTag(
None, S,
))))
Err(ParseError::UnexpectedToken(Token::Close(None, S3),)),
sut.collect::<Result<Vec<Parsed<Object>>, _>>()
);
}
@ -355,3 +366,78 @@ fn not_accepting_state_if_element_open() {
// Element was not closed.
assert_eq!(Some(Err(ParseError::UnexpectedEof(Some(S)))), sut.next());
}
// Comment nodes are legal XML both ahead of the root element's opening tag
// and following its closing tag,
//   so each comment token should come back out as an `Object::Comment`
//   without disturbing the root element's open/close at depth 0.
#[test]
fn comment_before_or_after_root_ok() {
    let root = "root".unwrap_into();
    let leading = "start comment".intern();
    let trailing = "end comment".intern();

    // Comment, then a self-closed root element, then another comment.
    let sut = parse::<1>(
        [
            Token::Comment(leading, S),
            Token::Open(root, S2),
            Token::Close(None, S3),
            Token::Comment(trailing, S4),
        ]
        .into_iter(),
    );

    let expected = vec![
        Parsed::Object(Object::Comment(leading, S)),
        Parsed::Object(Object::Open(root, S2, Depth(0))),
        Parsed::Object(Object::Close(None, S3, Depth(0))),
        Parsed::Object(Object::Comment(trailing, S4)),
    ];

    assert_eq!(Ok(expected), sut.collect());
}
// Once the root node has been closed,
//   no further content may follow it.
// Only a single offending token is exercised here rather than every
//   applicable one;
//     the implementation is simple enough to verify by inspection.
//
// The parser treats this as a dead state rather than a state error:
//   XIRF may be composed with another parser,
//     and the unexpected token needs to be handed back to that parent.
#[test]
fn content_after_root_close_error() {
    let root = "root".unwrap_into();

    let sut = parse::<1>(
        [
            Token::Open(root, S),
            Token::Close(None, S2),
            // The document is complete at this point,
            //   so this second opening tag has nowhere to go.
            Token::Open(root, S3),
        ]
        .into_iter(),
    );

    let result: Result<Vec<Parsed<Object>>, _> = sut.collect();

    assert_eq!(
        Err(ParseError::UnexpectedToken(Token::Open(root, S3))),
        result,
    );
}
// A non-comment token ahead of the root's opening tag is an error;
//   here a text node arrives first and should be reported via
//   `StateError::RootOpenExpected` carrying that offending token.
#[test]
fn content_before_root_open_error() {
    let stray = "foo".intern();

    let sut = parse::<1>([Token::Text(stray, S)].into_iter());
    let result: Result<Vec<Parsed<Object>>, _> = sut.collect();

    assert_eq!(
        Err(ParseError::StateError(StateError::RootOpenExpected(
            Token::Text(stray, S)
        ))),
        result,
    );
}