tamer: xir::parse::ele: Mixed content parsing

"Mixed content" is the XML term representing element nodes mixed with text
nodes.  For example, `foo <strong>bar</strong> baz` is mixed.

TAME supports text nodes as documentation, intended to be in a literate
style but never fully realized.  In any case, we need to permit them, and I
wanted to do more than just ignore the nodes.

This takes a different approach than typical parser delegation---it has the
parent parser _preempt_ the child by intercepting text before delegation
takes place, rather than having the child reject the token (or possibly
interpret it itself!) and have to handle an error or dead state.

And while this makes it more confusing in terms of state machine stitching,
it does make sense, in the sense that the parent parser is really what
"owns" the text node---the parser is delegating _element_ parsing only, and
so asserts authority when necessary to take back control where it shouldn't
be delegated.

DEV-7145
main
Mike Gerwitz 2022-08-01 13:37:16 -04:00
parent 8779abe2bb
commit 2d117a4864
3 changed files with 266 additions and 1 deletions

View File

@ -67,6 +67,18 @@ use std::{
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Depth(pub usize);

impl Depth {
    /// Depth at which the direct children of an element at this depth
    ///   are expected to appear.
    ///
    /// This is always exactly one level deeper than `self`;
    ///   `self` is left untouched and a new [`Depth`] is returned.
    pub fn child_depth(&self) -> Depth {
        // `Depth` is a single-field tuple struct,
        //   so this destructuring is irrefutable.
        let Depth(current) = *self;

        Depth(current + 1)
    }
}
impl Display for Depth {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
Display::fmt(&self.0, f)

View File

@ -98,6 +98,12 @@ macro_rules! ele_parse {
@ { $($attrbody:tt)* } => $attrmap:expr,
$(/$(($close_span:ident))? => $closemap:expr,)?
// Special forms (`[sp](args) => expr`).
$(
[$special:ident]$(($($special_arg:ident),*))?
=> $special_map:expr,
)?
// Nonterminal references are provided as a list.
// A configuration specifier can be provided,
// currently intended to support the Kleene star.
@ -110,6 +116,8 @@ macro_rules! ele_parse {
@ { $($attrbody)* } => $attrmap,
/$($($close_span)?)? => ele_parse!(@!ele_close $($closemap)?),
$([$special]$(($($special_arg),*))? => $special_map,)?
<> {
$(
$ntref [$($ntref_cfg)?],
@ -168,6 +176,11 @@ macro_rules! ele_parse {
// (defaulting to Incomplete via @!ele_expand_body).
/$($close_span:ident)? => $closemap:expr,
// Non-whitespace text nodes can be mapped into elements with the
// given QName as a preprocessing step,
// allowing them to reuse the existing element NT system.
$([text]($text:ident, $text_span:ident) => $text_map:expr,)?
// Nonterminal references.
<> {
$(
@ -252,6 +265,19 @@ macro_rules! ele_parse {
const fn qname() -> crate::xir::QName {
$qname
}
/// Yield the expected depth of child elements,
/// if known.
///
/// Only the states generated for `$ntfirst` and each `$ntnext` bind a
///   depth,
///     from which the child depth is derived by calling `child_depth`
///     on it;
///   every other state yields [`None`].
///
/// NOTE(review): the bound `depth` is presumably the [`Depth`] of the
///   element being parsed---confirm against the state definitions,
///     which are not visible from here.
#[allow(dead_code)] // used by text special form
fn child_depth(&self) -> Option<Depth> {
match self {
$ntfirst((_, depth), _) => Some(depth.child_depth()),
$(
$ntnext((_, depth), _) => Some(depth.child_depth()),
)*
// No depth is bound for any of the remaining states.
_ => None,
}
}
}
impl std::fmt::Display for $nt {
@ -514,6 +540,34 @@ macro_rules! ele_parse {
)
},
// Must come _after_ `Attrs_` above so that
// attributes are yielded before text that
// terminates attribute parsing.
$(
// Text nodes are handled differently because
// their presence implies mixed content;
// the text is "owned" by us,
// not by the parser we have chosen to
// delegate _elements_ to.
// But we must be sure to only preempt parsing
// of text nodes _at our child depth_,
// so as not to interfere with the text
// parsing of child elements.
// This also allows us to avoid implementing
// Text handling in sum parsers.
(
st,
XirfToken::Text(
RefinedText::Unrefined(
Text($text, $text_span)
),
depth
)
) if Some(depth) == st.child_depth() => {
Transition(st).ok($text_map)
}
)?
$(
($ntprev(depth, st_inner), tok) => {
st_inner.delegate(

View File

@ -32,7 +32,7 @@
//! the system,
//! simply force the test to panic at the end.
use std::{error::Error, fmt::Display};
use std::{assert_matches::assert_matches, error::Error, fmt::Display};
use crate::{
convert::ExpectInto,
@ -1363,6 +1363,205 @@ fn sum_repetition() {
);
}
// Text nodes may appear between elements if a `[text]` special form
// specifies a mapping.
// This is "mixed content" in XML.
//
// The `[text]` mapping applies to all text nodes at the child depth of the
// element,
// meaning it'll preempt sum parser delegation to provide the desired
// behavior.
//
// This test interleaves text at the root's child depth (`Depth(1)`) with
// text nested within `A` (`Depth(2)`),
// and expects each text node to be mapped by the `[text]` form of the
// element that directly contains it.
#[test]
fn mixed_content_text_nodes() {
#[derive(Debug, PartialEq, Eq)]
enum Foo {
Root,
A,
B,
TextRoot(SymbolId, Span),
TextA(SymbolId, Span),
}
impl crate::parse::Object for Foo {}
const QN_SUT: QName = QN_PACKAGE;
const QN_A: QName = QN_CLASSIFY;
const QN_B: QName = QN_EXPORT;
ele_parse! {
type Object = Foo;
Sut := QN_SUT {
@ {} => Foo::Root,
// The `[text]` special form here introduces a `Text` mapping
// for all non-whitespace text nodes at the same depth as our
// child elements.
[text](sym, span) => Foo::TextRoot(sym, span),
A,
AB[*],
}
A := QN_A {
@ {} => Foo::A,
// The child should not have its text processing preempted.
[text](sym, span) => Foo::TextA(sym, span),
// Text should be permitted even though we permit no children.
}
// Used only for `AB`.
B := QN_B {
@ {} => Foo::B,
}
// We need at least two NTs;
// we don't actually use `B`.
AB := (A | B);
}
// NOTE(review): this whitespace token follows the root open but carries
// `Depth(0)` rather than the child depth used by the text nodes below;
// presumably whitespace is handled without regard to depth---confirm.
let tok_ws = XirfToken::Text(
RefinedText::Whitespace(Whitespace(Text(" ".unwrap_into(), S1))),
Depth(0),
);
// These symbols are reused across several tokens and expectations below;
// only the spans distinguish the individual text nodes.
let text_root = "text root".into();
let text_a = "text a".into();
let toks = vec![
XirfToken::Open(QN_SUT, OpenSpan(S1, N), Depth(0)),
// Whitespace will not match the `[text]` special form.
tok_ws.clone(),
// Text node for the root (Sut).
XirfToken::Text(RefinedText::Unrefined(Text(text_root, S1)), Depth(1)),
// This child also expects text nodes,
// and should be able to yield its own parse.
XirfToken::Open(QN_A, OpenSpan(S2, N), Depth(1)),
// If this goes wrong,
// and Sut does not properly check its depth,
// then the parser would erroneously yield `Foo::TextRoot` for
// this token.
XirfToken::Text(RefinedText::Unrefined(Text(text_a, S2)), Depth(2)),
XirfToken::Close(None, CloseSpan::empty(S3), Depth(1)),
// Now we're about to parse with `AB`,
// which itself cannot handle text.
// But text should never reach that parser,
// having been preempted by Sut.
XirfToken::Text(RefinedText::Unrefined(Text(text_root, S3)), Depth(1)),
// Try to yield A again with text.
XirfToken::Open(QN_A, OpenSpan(S3, N), Depth(1)),
XirfToken::Text(RefinedText::Unrefined(Text(text_a, S4)), Depth(2)),
XirfToken::Close(None, CloseSpan::empty(S4), Depth(1)),
// Finally, some more text permitted at the close.
XirfToken::Text(RefinedText::Unrefined(Text(text_root, S5)), Depth(1)),
XirfToken::Close(Some(QN_SUT), CloseSpan(S6, N), Depth(0)),
];
use Parsed::*;
// The trailing annotations name the active parser in brackets
// (a trailing `@` presumably denoting its attribute parsing)
// followed by the token being processed.
// NOTE(review): `(>LA)` and `(<LA)` presumably mark a token being set
// aside as lookahead and later consumed,
// which would explain the list having more entries than
// tokens---confirm.
assert_eq!(
Ok(vec![
Incomplete, // [Sut] Root Open
Incomplete, // [Sut@] WS
Object(Foo::Root), // [Sut@] Text (>LA)
Object(Foo::TextRoot(text_root, S1)), // [Sut] Text (<LA)
Incomplete, // [A] A Open (<LA)
Object(Foo::A), // [A@] A Text (>LA)
Object(Foo::TextA(text_a, S2)), // [A] Text (<LA)
Incomplete, // [A] A Close
Object(Foo::TextRoot(text_root, S3)), // [Sut] Text
Incomplete, // [A] A Open
Object(Foo::A), // [A@] A Text (>LA)
Object(Foo::TextA(text_a, S4)), // [A] Text (<LA)
Incomplete, // [A] A Close
Object(Foo::TextRoot(text_root, S5)), // [Sut] Text
Incomplete, // [Sut] Root Close
]),
Sut::parse(toks.into_iter()).collect(),
);
}
/// Contrast this test with [`mixed_content_text_nodes`] above.
///
/// Here the root declares a `[text]` mapping but its child `A` does not,
///   and so a text node nested within `A` is expected to produce an error
///   rather than being mapped by the root.
#[test]
fn mixed_content_text_nodes_with_non_mixed_content_child() {
#[derive(Debug, PartialEq, Eq)]
enum Foo {
Root,
A,
TextRoot(SymbolId, Span),
}
impl crate::parse::Object for Foo {}
const QN_SUT: QName = QN_PACKAGE;
const QN_A: QName = QN_CLASSIFY;
ele_parse! {
type Object = Foo;
Sut := QN_SUT {
@ {} => Foo::Root,
// Mixed content permitted at root
// (but we won't be providing text for it in this test).
[text](sym, span) => Foo::TextRoot(sym, span),
// But this child will not permit text.
A,
}
A := QN_A {
@ {} => Foo::A,
// Missing `[text]`;
// no mixed content permitted.
}
}
let text_a = "text a".into();
let toks = vec![
XirfToken::Open(QN_SUT, OpenSpan(S1, N), Depth(0)),
XirfToken::Open(QN_A, OpenSpan(S2, N), Depth(1)),
// Even though the root permits text,
// the child `A` does not,
// and so this should result in an error.
// Note that this text is at `Depth(2)`,
// which is the child depth of `A` and not of the root.
XirfToken::Text(RefinedText::Unrefined(Text(text_a, S2)), Depth(2)),
XirfToken::Close(None, CloseSpan::empty(S3), Depth(1)),
XirfToken::Close(Some(QN_SUT), CloseSpan(S6, N), Depth(0)),
];
// The parser is stepped manually up to the error so that the error can
// be matched loosely while everything around it is compared exactly.
let mut sut = Sut::parse(toks.into_iter());
use Parsed::*;
// The first two tokens should parse successfully
// (four calls because of LA).
assert_eq!(sut.next(), Some(Ok(Incomplete))); // [Sut] Root Open
assert_eq!(sut.next(), Some(Ok(Object(Foo::Root)))); // [Sut@] A Open (>LA)
assert_eq!(sut.next(), Some(Ok(Incomplete))); // [A] A Open (<LA)
assert_eq!(sut.next(), Some(Ok(Object(Foo::A)))); // [A@] Text (>LA)
// The next token is `Text`,
// which is not expected by `A` and so should produce an error.
// The error that it produces at the time of writing is different than
// the error that it will eventually produce,
// so let's just expect some sort of error.
assert_matches!(sut.next(), Some(Err(_))); // [A] Text (<LA)
// A then enters recovery,
// completes recovery,
// and parsing finishes.
assert_eq!(
Ok(vec![
Incomplete, // [A] A Close
Incomplete, // [Sut] Root Close
]),
sut.collect()
);
}
// Ensure that we can actually export the generated identifiers
// (add visibility to them).
// We don't want to always make them public by default because then Rust