tamer: Xirf::Text refinement

This teaches XIRF to optionally refine Text into RefinedText, which
determines whether the given SymbolId represents entirely whitespace.

This is something I've been putting off for some time, but now that I'm
parsing source language for NIR, it is necessary, in that we can only permit
whitespace Text nodes in certain contexts.

The idea is to capture the most common whitespace as preinterned
symbols.  Note that this heuristic ought to be determined from scanning a
codebase, which I haven't done yet; this is just an initial list.

The fallback is to look up the string associated with the SymbolId and
perform a linear scan, aborting on the first non-whitespace character.  This
combination of checks should be sufficiently performant for now considering
that this is only being run on source files, which really are not all that
large.  (They become large when template-expanded.)  I'll optimize further
if I notice it show up during profiling.

This also frees XIR itself from being concerned with Whitespace.  Initially I
had used quick-xml's whitespace trimming, but it messed up my span
calculations, and those were a pain in the ass to implement to begin with,
since I had to resort to pointer arithmetic.  I'd rather avoid tweaking it.

tameld will not check for whitespace, since it's not important---xmlo files,
if malformed, are the fault of the compiler; we can ignore text nodes except
in the context of code fragments, where they are never whitespace (unless
that's also a compiler bug).

Onward and yonward.

DEV-7145
main
Mike Gerwitz 2022-07-27 15:49:38 -04:00
parent b38c16fd08
commit 41b41e02c1
14 changed files with 456 additions and 240 deletions

View File

@ -117,20 +117,6 @@ mod name {
}
}
mod ws {
use super::*;
use tamer::xir::Whitespace;
#[bench]
fn whitespace_1000(bench: &mut Bencher) {
bench.iter(|| {
(0..1000)
.map(|_| Whitespace::try_from(" \t "))
.for_each(drop);
});
}
}
mod writer {
use super::*;
use quick_xml::{

View File

@ -43,7 +43,7 @@ use crate::{
parse::{Lower, ParseError, Parsed, ParsedObject, UnknownToken},
sym::{GlobalSymbolResolve, SymbolId},
xir::{
flat::{XirToXirf, XirToXirfError, XirfToken},
flat::{Text, XirToXirf, XirToXirfError, XirfToken},
reader::XmlXirReader,
writer::{Error as XirWriterError, XmlWriter},
DefaultEscaper, Error as XirError, Escaper, Token as XirToken,
@ -192,11 +192,11 @@ fn load_xmlo<'a, P: AsRef<Path>, S: Escaper>(
// abstracted away.
let (mut asg, mut state) = Lower::<
ParsedObject<XirToken, XirError>,
XirToXirf<64>,
XirToXirf<64, Text>,
>::lower::<_, TameldError>(
&mut XmlXirReader::new(file, escaper, ctx),
|toks| {
Lower::<XirToXirf<64>, XmloReader>::lower(toks, |xmlo| {
Lower::<XirToXirf<64, Text>, XmloReader>::lower(toks, |xmlo| {
let mut iter = xmlo.scan(false, |st, rtok| match st {
true => None,
false => {
@ -283,7 +283,7 @@ pub enum TameldError {
SortError(SortError),
XirParseError(ParseError<UnknownToken, XirError>),
XirfParseError(ParseError<XirToken, XirToXirfError>),
XmloParseError(ParseError<XirfToken, XmloError>),
XmloParseError(ParseError<XirfToken<Text>, XmloError>),
XmloLowerError(ParseError<XmloToken, XmloAirError>),
AirLowerError(ParseError<AirToken, AsgError>),
XirWriterError(XirWriterError),
@ -309,8 +309,8 @@ impl From<ParseError<UnknownToken, XirError>> for TameldError {
}
}
impl From<ParseError<XirfToken, XmloError>> for TameldError {
fn from(e: ParseError<XirfToken, XmloError>) -> Self {
impl From<ParseError<XirfToken<Text>, XmloError>> for TameldError {
fn from(e: ParseError<XirfToken<Text>, XmloError>) -> Self {
Self::XmloParseError(e)
}
}

View File

@ -23,7 +23,7 @@ use crate::diagnose::{Annotate, AnnotatedSpan, Diagnostic};
use crate::parse::Token;
use crate::span::Span;
use crate::sym::SymbolId;
use crate::xir::flat::XirfToken;
use crate::xir::flat::{Text, XirfToken};
use std::fmt::Display;
/// Error during `xmlo` processing.
@ -38,7 +38,7 @@ use std::fmt::Display;
#[derive(Debug, PartialEq, Eq)]
pub enum XmloError {
/// The root node was not an `lv:package`.
UnexpectedRoot(XirfToken),
UnexpectedRoot(XirfToken<Text>),
/// A `preproc:sym` node was found, but is missing `@name`.
UnassociatedSym(Span),
/// The provided `preproc:sym/@type` is unknown or invalid.
@ -65,7 +65,7 @@ pub enum XmloError {
/// Ideally we would provide a better error depending on the context,
/// but this serves as a fallback if the input is completely
/// unexpected.
UnexpectedToken(XirfToken),
UnexpectedToken(XirfToken<Text>),
}
impl Display for XmloError {

View File

@ -31,7 +31,7 @@ use crate::{
sym::{st::raw, SymbolId},
xir::{
attr::{Attr, AttrSpan},
flat::XirfToken as Xirf,
flat::{Text, XirfToken as Xirf},
st::qname::*,
EleSpan, QName,
},
@ -140,7 +140,7 @@ impl Display for XmloToken {
}
/// A parser capable of being composed with [`XmloReader`].
pub trait XmloState = ParseState<Token = Xirf, Context = EmptyContext>
pub trait XmloState = ParseState<Token = Xirf<Text>, Context = EmptyContext>
where
Self: Default,
<Self as ParseState>::Error: Into<XmloError>,
@ -176,7 +176,7 @@ pub enum XmloReader<
impl<SS: XmloState, SD: XmloState, SF: XmloState> ParseState
for XmloReader<SS, SD, SF>
{
type Token = Xirf;
type Token = Xirf<Text>;
type Object = XmloToken;
type Error = XmloError;
@ -333,7 +333,7 @@ pub enum SymtableState {
impl parse::Object for (SymbolId, SymAttrs, Span) {}
impl ParseState for SymtableState {
type Token = Xirf;
type Token = Xirf<Text>;
type Object = (SymbolId, SymAttrs, Span);
type Error = XmloError;
@ -619,7 +619,7 @@ pub enum SymDepsState {
}
impl ParseState for SymDepsState {
type Token = Xirf;
type Token = Xirf<Text>;
type Object = XmloToken;
type Error = XmloError;
@ -730,7 +730,7 @@ pub enum FragmentsState {
}
impl ParseState for FragmentsState {
type Token = Xirf;
type Token = Xirf<Text>;
type Object = XmloToken;
type Error = XmloError;
@ -775,7 +775,7 @@ impl ParseState for FragmentsState {
(FragmentUnnamed(span), _) => Transition(FragmentUnnamed(span))
.err(XmloError::UnassociatedFragment(span)),
(Fragment(span, id), Xirf::Text(text, _)) => {
(Fragment(span, id), Xirf::Text(Text(text, _))) => {
Transition(FragmentDone(span, id))
.ok(XmloToken::Fragment(id, text, span))
}

View File

@ -537,12 +537,12 @@ fn sym_fragment_event() {
// first
open(QN_FRAGMENT, S1, Depth(0)),
Xirf::Attr(Attr(QN_ID, id1, AttrSpan(S2, S3))),
Xirf::Text(frag1, S4),
Xirf::Text(Text(frag1, S4)),
close(Some(QN_FRAGMENT), S5, Depth(0)),
// second
open(QN_FRAGMENT, S2, Depth(0)),
Xirf::Attr(Attr(QN_ID, id2, AttrSpan(S3, S4))),
Xirf::Text(frag2, S5),
Xirf::Text(Text(frag2, S5)),
close(Some(QN_FRAGMENT), S5, Depth(0)),
]
.into_iter();
@ -567,7 +567,7 @@ fn sym_fragment_missing_id() {
let toks = [
open(QN_FRAGMENT, S1, Depth(0)),
// missing @id
Xirf::Text("text".into(), S4),
Xirf::Text(Text("text".into(), S4)),
]
.into_iter();
@ -585,7 +585,7 @@ fn sym_fragment_empty_id() {
open(QN_FRAGMENT, S1, Depth(0)),
// empty @id
Xirf::Attr(Attr(QN_ID, "".into(), AttrSpan(S3, S4))),
Xirf::Text("text".into(), S4),
Xirf::Text(Text("text".into(), S4)),
]
.into_iter();
@ -655,13 +655,13 @@ fn xmlo_composite_parsers_header() {
// <preproc:fragment
open(QN_FRAGMENT, S4, Depth(2)),
Xirf::Attr(Attr(QN_ID, symfrag_id, AttrSpan(S2, S3))),
Xirf::Text(frag, S5),
Xirf::Text(Text(frag, S5)),
close(Some(QN_FRAGMENT), S4, Depth(2)),
// </preproc:fragment>
close(Some(QN_FRAGMENTS), S3, Depth(1)),
// </preproc:fragments>
// No closing root node:
// ensure that we can just end at the header without parsing further.
// ensure that we can just end at the header without parsing further.
]
.into_iter();

View File

@ -195,9 +195,9 @@ macro_rules! static_symbol_consts {
#[doc=concat!(
"Interned `",
stringify!($ty),
"` string `\"",
$str,
"\"`."
"` ",
static_symbol_consts!(@!str $ty $str),
"."
)]
#[doc=""]
#[doc=concat!(
@ -227,7 +227,16 @@ macro_rules! static_symbol_consts {
/// This can be used to help determine a base capacity for
/// collections holding [`SymbolId`]s.
pub const ST_COUNT: usize = $i - 1;
}
};
// Whitespace with newlines causes rustdoc parsing issues.
(@!str ws $str:expr) => {
"whitespace"
};
(@!str $ty:ident $str:expr) => {
concat!("string `\"", $str, "\"`")
};
}
/// Statically allocate [`SymbolId`]s for the provided symbols,
@ -267,9 +276,9 @@ macro_rules! static_symbols {
#[doc=concat!(
"Raw (untyped) interned `",
stringify!($ty),
"` string `\"",
$str,
"\"`."
"` ",
static_symbols!(@!str $ty $str),
"."
)]
#[doc=""]
#[doc=concat!(
@ -309,7 +318,16 @@ macro_rules! static_symbols {
interner
}
}
};
// Whitespace with newlines causes rustdoc parsing issues.
(@!str ws $str:expr) => {
"whitespace"
};
(@!str $ty:ident $str:expr) => {
concat!("string `\"", $str, "\"`")
};
}
static_symbol_newtypes! {
@ -354,6 +372,14 @@ static_symbol_newtypes! {
/// Any other generic string that does not fit into any particular type.
str: GenericStaticSymbolId<global::ProgSymSize>,
/// Common strings of whitespace
/// (where a character of whitespace is `[ \n]`).
///
/// There are certainly other whitespace characters,
/// but this is intended to be conservative to address only the most
/// common cases.
ws: WhitespaceStaticSymbolId<global::ProgSymSize>,
/// Static 16-bit [`Span`](crate::span::Span) context.
///
/// These contexts are intended for use in generated code where a better
@ -420,6 +446,41 @@ pub mod st {
}
}
/// Determine whether `sym` lies strictly between the marker symbols `a`
///   and `b`.
///
/// The comparison is performed on the numeric value of each symbol id,
///   and both bounds are exclusive,
///   so neither marker is itself considered to be part of the group.
/// This gives a _reasonably_ cheap membership test against a large group
///   of [`SymbolId`]s without having to enumerate them.
///
/// Faster approaches exist should this ever show up as a bottleneck;
///   until then the implementation is intentionally kept simple.
#[inline]
pub fn is_between_markers(
    a: MarkStaticSymbolId,
    b: MarkStaticSymbolId,
    sym: SymbolId,
) -> bool {
    let id = sym.as_usize();

    // Exclusive on both ends:
    //   the markers only delimit the group.
    a.as_usize() < id && id < b.as_usize()
}
/// Check `sym` against the preinterned list of common whitespace symbols.
///
/// A result of `true` is a quick confirmation that the provided
///   [`SymbolId`] consists only of whitespace.
/// A result of `false` is inconclusive:
///   the preinterned list is _not_ comprehensive and never will be,
///   so the symbol may or may not be whitespace and the caller should
///   fall back to some other means of checking.
#[inline]
pub fn is_common_whitespace(sym: SymbolId) -> bool {
    // The whitespace symbols are delimited by a pair of marker symbols.
    let (ws_open, ws_close) = (WS_SYM_START, WS_SYM_END);

    is_between_markers(ws_open, ws_close, sym)
}
static_symbols! {
<crate::global::ProgSymSize>;
@ -508,8 +569,57 @@ pub mod st {
URI_LV_PREPROC: uri "http://www.lovullo.com/rater/preproc",
URI_LV_LINKER: uri "http://www.lovullo.com/rater/linker",
// TODO: Whitespace type
WS_EMPTY: str "",
// Common whitespace.
//
// _This does not represent all forms of whitespace!_
// Clearly,
// but it is worth emphasizing.
//
// The intent of these whitespace symbols is to provide a means to
// determine whether that symbol represents a common form of
// whitespace,
// before falling back to a more expensive symbol dereference
// and (likely-)linear scan.
//
// This list is preliminary and ought to be measured by evaluating a
// real-world codebase;
// it ought not to bloat the symbol table,
// but ought to get the most common cases so as not to fall
// back to a more expensive dereferencing of a symbol and
// subsequent scanning.
//
// There are improvements that can be made here,
// such as aligning the symbol ids such that whitespace can be
// asserted with a bitmask.
WS_SYM_START: mark "{{ws start}}",
WS_EMPTY: ws "",
WS_SP1: ws " ",
WS_SP2: ws " ",
WS_SP3: ws " ",
WS_SP4: ws " ",
WS_SP5: ws " ",
WS_SP6: ws " ",
WS_SP7: ws " ",
WS_SP8: ws " ",
WS_LF1: ws "\n",
WS_LF2: ws "\n\n",
WS_LF1_SP1: ws "\n ",
WS_LF1_SP2: ws "\n ",
WS_LF1_SP3: ws "\n ",
WS_LF1_SP4: ws "\n ",
WS_LF1_SP5: ws "\n ",
WS_LF1_SP6: ws "\n ",
WS_LF1_SP7: ws "\n ",
WS_LF1_SP8: ws "\n ",
WS_LF2_SP1: ws "\n\n ",
WS_LF2_SP2: ws "\n\n ",
WS_LF2_SP3: ws "\n\n ",
WS_LF2_SP4: ws "\n\n ",
WS_LF2_SP5: ws "\n\n ",
WS_LF2_SP6: ws "\n\n ",
WS_LF2_SP7: ws "\n\n ",
WS_LF2_SP8: ws "\n\n ",
WS_SYM_END: mark "{{ws end}}",
// [Symbols will be added here as they are needed.]

View File

@ -217,49 +217,6 @@ impl Display for LocalPart {
}
}
/// A sequence of one or more whitespace characters.
///
/// Whitespace here is expected to consist of `[ \n\t\r]`
/// (where the first character in that class is a space).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Whitespace(SymbolId);
impl Deref for Whitespace {
type Target = SymbolId;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl TryFrom<&str> for Whitespace {
type Error = SpanlessError;
fn try_from(value: &str) -> Result<Self, Self::Error> {
// We do not expect this to ever be a large value based on how we
// use it.
// If it is, well, someone's doing something they ought not to be
// and we're not going to optimize for it.
if !value.as_bytes().iter().all(u8::is_ascii_whitespace) {
return Err(SpanlessError::NotWhitespace(value.into()));
}
Ok(Self(value.intern()))
}
}
impl From<Whitespace> for SymbolId {
fn from(ws: Whitespace) -> Self {
ws.0
}
}
impl Display for Whitespace {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
/// A qualified name (namespace prefix and local name).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct QName(Option<Prefix>, LocalPart);
@ -431,8 +388,7 @@ impl CloseSpan {
}
}
/// Number of bytes of whitespace following an element name in
/// [`EleSpan`].
/// Number of bytes representing the name of the element.
pub type EleNameLen = SpanLenSize;
/// Spans associated with an element opening or closing tag.
@ -632,11 +588,6 @@ pub enum Token {
/// already present,
/// not for producing new CData safely!
CData(SymbolId, Span),
/// Similar to `Text`,
/// but intended for use where only whitespace is allowed,
/// such as alignment of attributes.
Whitespace(Whitespace, Span),
}
impl Display for Token {
@ -665,7 +616,6 @@ impl Display for Token {
Self::Comment(..) => write!(f, "comment"),
Self::Text(..) => write!(f, "text"),
Self::CData(..) => write!(f, "CDATA"),
Self::Whitespace(..) => write!(f, "whitespace"),
}
}
}
@ -689,8 +639,7 @@ impl crate::parse::Token for Token {
| AttrValueFragment(_, span)
| Comment(_, span)
| Text(_, span)
| CData(_, span)
| Whitespace(_, span) => *span,
| CData(_, span) => *span,
}
}
}
@ -832,26 +781,6 @@ pub mod test {
}
}
#[test]
fn whitespace() -> TestResult {
assert_eq!(Whitespace::try_from(" ")?, " ".try_into()?);
assert_eq!(Whitespace::try_from(" \t ")?, " \t ".try_into()?);
assert_eq!(
Whitespace::try_from("not ws!"),
Err(SpanlessError::NotWhitespace("not ws!".into(),))
);
Ok(())
}
#[test]
fn whitespace_as_text() -> TestResult {
assert_eq!(" ".intern(), Whitespace::try_from(" ")?.into(),);
Ok(())
}
mod ele_span {
use super::*;
use crate::span::DUMMY_CONTEXT as DC;

View File

@ -33,6 +33,8 @@
//! that element;
//! 5. Parsing will fail if input ends before all elements have been
//! closed.
//! 6. Text nodes may optionally be parsed into [`RefinedText`] to
//! distinguish whitespace.
//!
//! XIRF lowering does not perform any dynamic memory allocation;
//! maximum element nesting depth is set statically depending on the needs
@ -40,7 +42,8 @@
use super::{
attr::{Attr, AttrParseError, AttrParseState},
CloseSpan, OpenSpan, QName, Token as XirToken, TokenStream, Whitespace,
reader::is_xml_whitespace_char,
CloseSpan, OpenSpan, QName, Token as XirToken, TokenStream,
};
use crate::{
diagnose::{Annotate, AnnotatedSpan, Diagnostic},
@ -49,11 +52,15 @@ use crate::{
TransitionResult,
},
span::Span,
sym::SymbolId,
sym::{st::is_common_whitespace, GlobalSymbolResolve, SymbolId},
xir::EleSpan,
};
use arrayvec::ArrayVec;
use std::{error::Error, fmt::Display};
use std::{
error::Error,
fmt::{Debug, Display},
marker::PhantomData,
};
/// Tag nesting depth
/// (`0` represents the root).
@ -74,7 +81,7 @@ impl Display for Depth {
/// but are still validated to ensure that they are well-formed and that
/// the XML is well-structured.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum XirfToken {
pub enum XirfToken<T: TextType> {
/// Opening tag of an element.
Open(QName, OpenSpan, Depth),
@ -99,7 +106,7 @@ pub enum XirfToken {
/// Character data as part of an element.
///
/// See also [`CData`](XirfToken::CData) variant.
Text(SymbolId, Span),
Text(T),
/// CData node (`<![CDATA[...]]>`).
///
@ -109,14 +116,9 @@ pub enum XirfToken {
/// already present,
/// not for producing new CData safely!
CData(SymbolId, Span),
/// Similar to `Text`,
/// but intended for use where only whitespace is allowed,
/// such as alignment of attributes.
Whitespace(Whitespace, Span),
}
impl Token for XirfToken {
impl<T: TextType> Token for XirfToken<T> {
fn ir_name() -> &'static str {
"XIRF"
}
@ -128,18 +130,17 @@ impl Token for XirfToken {
Open(_, OpenSpan(span, _), _)
| Close(_, CloseSpan(span, _), _)
| Comment(_, span)
| Text(_, span)
| CData(_, span)
| Whitespace(_, span) => *span,
| CData(_, span) => *span,
Text(text) => text.span(),
Attr(attr) => attr.span(),
}
}
}
impl Object for XirfToken {}
impl<T: TextType> Object for XirfToken<T> {}
impl Display for XirfToken {
impl<T: TextType> Display for XirfToken<T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use XirfToken::*;
@ -154,18 +155,112 @@ impl Display for XirfToken {
Comment(sym, span) => {
Display::fmt(&XirToken::Comment(*sym, *span), f)
}
Text(sym, span) => Display::fmt(&XirToken::Text(*sym, *span), f),
Text(text) => Display::fmt(text, f),
CData(sym, span) => Display::fmt(&XirToken::CData(*sym, *span), f),
Whitespace(ws, span) => {
Display::fmt(&XirToken::Whitespace(*ws, *span), f)
}
}
}
impl<T: TextType> From<Attr> for XirfToken<T> {
fn from(attr: Attr) -> Self {
Self::Attr(attr)
}
}
/// Token of an optionally refined [`Text`].
///
/// XIRF is configurable on the type of processing it performs on [`Text`],
/// including the detection of [`Whitespace`].
///
/// See also [`RefinedText`].
pub trait TextType = From<Text> + Token + Eq;
/// Text node of a XIRF stream before any refinement.
///
/// The first field is the [`SymbolId`] of the text and the second is the
///   [`Span`] reported by [`Token::span`].
///
/// This type may be used directly as a [`TextType`] when no refinement
///   of text is desired;
///     see [`RefinedText`] for the refined alternative.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Text(pub SymbolId, pub Span);
impl Token for Text {
    /// Name of this IR for use in diagnostic output.
    fn ir_name() -> &'static str {
        "XIRF Text"
    }

    /// Span associated with this text node
    ///   (the second field of the tuple struct).
    fn span(&self) -> Span {
        let Self(_, span) = self;
        *span
    }
}
impl Display for Text {
    /// Render a fixed placeholder rather than the text itself.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        // TODO: Rendering the actual text requires care so that it does
        //   not mess up formatted output.
        // Text may also be of arbitrary length,
        //   so it should probably be elided beyond some threshold.
        f.write_str("text")
    }
}
/// A sequence of zero or more whitespace characters.
///
/// Whitespace here is expected to consist of `[ \n\t\r]`
/// (where the first character in that class is a space).
///
/// Note that the empty string is considered to be whitespace
/// (see `is_whitespace`),
/// so the wrapped [`Text`] may be empty.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Whitespace(pub Text);
impl Display for Whitespace {
    /// Render a fixed placeholder rather than the whitespace itself.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        // TODO: Once output can be escaped as necessary,
        //   render the symbol string instead.
        // See also the TODO in `<Text as Display>::fmt`.
        f.write_str("whitespace")
    }
}
/// Text that has been refined to a more descriptive form.
///
/// This type may be used as a [`TextType`] to instruct XIRF to detect
/// [`Whitespace`].
///
/// Refinement is performed by the `From<Text>` implementation for this
/// type,
/// which classifies the text using `is_whitespace`.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum RefinedText {
/// Provided [`Text`] has been determined to be [`Whitespace`].
Whitespace(Whitespace),
/// Provided [`Text`] was not able to be refined into a more specific
/// type
/// (that is,
/// it was not determined to be entirely whitespace).
Unrefined(Text),
}
impl Token for RefinedText {
    /// Name of this IR for use in diagnostic output.
    fn ir_name() -> &'static str {
        "XIRF RefinedText"
    }

    /// Span of the underlying [`Text`],
    ///   regardless of how it was refined.
    fn span(&self) -> Span {
        let inner = match self {
            Self::Whitespace(Whitespace(text)) => text,
            Self::Unrefined(text) => text,
        };

        inner.span()
    }
}
impl From<Attr> for XirfToken {
fn from(attr: Attr) -> Self {
Self::Attr(attr)
impl Display for RefinedText {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Self::Whitespace(ws) => Display::fmt(ws, f),
Self::Unrefined(text) => Display::fmt(text, f),
}
}
}
impl From<Text> for RefinedText {
    /// Refine [`Text`],
    ///   yielding [`RefinedText::Whitespace`] when its symbol is
    ///   entirely whitespace and [`RefinedText::Unrefined`] otherwise.
    fn from(text: Text) -> Self {
        // Only the symbol is inspected;
        //   the span is carried along untouched.
        let sym = text.0;

        if is_whitespace(sym) {
            Self::Whitespace(Whitespace(text))
        } else {
            Self::Unrefined(text)
        }
    }
}
@ -187,14 +282,14 @@ type ElementStack<const MAX_DEPTH: usize> = ArrayVec<(QName, Span), MAX_DEPTH>;
/// XIRF document parser state.
///
/// This parser is a pushdown automaton that parses a single XML document.
#[derive(Debug, Default, PartialEq, Eq)]
pub enum XirToXirf<const MAX_DEPTH: usize, SA = AttrParseState>
#[derive(Debug, PartialEq, Eq)]
pub enum XirToXirf<const MAX_DEPTH: usize, T, SA = AttrParseState>
where
SA: FlatAttrParseState<MAX_DEPTH>,
T: TextType,
{
/// Document parsing has not yet begun.
#[default]
PreRoot,
PreRoot(PhantomData<T>),
/// Parsing nodes.
NodeExpected,
/// Delegating to attribute parser.
@ -203,15 +298,59 @@ where
Done,
}
// Implemented by hand because `#[default]` with `#[derive(Default)]` only
//   supports unit variants,
//     and `PreRoot` carries a `PhantomData<T>`.
impl<const MAX_DEPTH: usize, T, SA> Default for XirToXirf<MAX_DEPTH, T, SA>
where
    SA: FlatAttrParseState<MAX_DEPTH>,
    T: TextType,
{
    /// Initial parser state:
    ///   document parsing has not yet begun.
    fn default() -> Self {
        Self::PreRoot(PhantomData)
    }
}
pub type StateContext<const MAX_DEPTH: usize> =
Context<ElementStack<MAX_DEPTH>>;
impl<const MAX_DEPTH: usize, SA> ParseState for XirToXirf<MAX_DEPTH, SA>
/// Determine whether the string represented by `sym` consists entirely
///   of whitespace as defined by [`is_xml_whitespace_char`].
///
/// Two checks are performed,
///   cheapest first:
///
///   1. The pre-interned whitespace symbol list is consulted via
///        [`is_common_whitespace`],
///          which requires no symbol lookup.
///   2. Failing that,
///        the symbol is resolved to its string and scanned linearly,
///          stopping at the first non-whitespace character.
///
/// Note that the empty string is considered to be whitespace.
#[inline]
fn is_whitespace(sym: SymbolId) -> bool {
    // See `sym::prefill`;
    //   that list may need maintenance to keep it up-to-date with common
    //   whitespace symbols so that lookups are avoided.
    // This check is purely a performance optimization.
    if is_common_whitespace(sym) {
        return true;
    }

    // Slow path.
    // If this is called often and proves too expensive,
    //   caching metadata about symbols
    //     (either for XIRF or globally)
    //     may be worthwhile.
    // Resolving the `SymbolId` to its intern requires multiple
    //   dereferences,
    //     which may incur multiple (CPU) cache misses;
    //       that would have to be profiled,
    //         since a symbol that was just interned may still be cached.
    // The linear scan of the associated `str` then terminates as soon as
    //   a non-whitespace character is found.
    sym.lookup_str().chars().all(is_xml_whitespace_char)
}
impl<const MAX_DEPTH: usize, T, SA> ParseState for XirToXirf<MAX_DEPTH, T, SA>
where
SA: FlatAttrParseState<MAX_DEPTH>,
T: TextType,
{
type Token = XirToken;
type Object = XirfToken;
type Object = XirfToken<T>;
type Error = XirToXirfError;
type Context = StateContext<MAX_DEPTH>;
@ -224,14 +363,21 @@ where
match (self, tok) {
// Comments are permitted before and after the first root element.
(st @ (PreRoot | Done), XirToken::Comment(sym, span)) => {
(st @ (PreRoot(_) | Done), XirToken::Comment(sym, span)) => {
Transition(st).ok(XirfToken::Comment(sym, span))
}
(PreRoot, tok @ XirToken::Open(..)) => Self::parse_node(tok, stack),
(PreRoot(_), tok @ XirToken::Open(..)) => {
Self::parse_node(tok, stack)
}
(PreRoot, tok) => {
Transition(PreRoot).err(XirToXirfError::RootOpenExpected(tok))
// Ignore whitespace before root.
(st @ PreRoot(_), XirToken::Text(sym, _)) if is_whitespace(sym) => {
Transition(st).incomplete()
}
(st @ PreRoot(_), tok) => {
Transition(st).err(XirToXirfError::RootOpenExpected(tok))
}
(NodeExpected, tok) => Self::parse_node(tok, stack),
@ -261,15 +407,16 @@ where
}
}
impl<const MAX_DEPTH: usize, SA> Display for XirToXirf<MAX_DEPTH, SA>
impl<const MAX_DEPTH: usize, T, SA> Display for XirToXirf<MAX_DEPTH, T, SA>
where
SA: FlatAttrParseState<MAX_DEPTH>,
T: TextType,
{
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
use XirToXirf::*;
match self {
PreRoot => write!(f, "expecting document root"),
PreRoot(_) => write!(f, "expecting document root"),
NodeExpected => write!(f, "expecting a node"),
AttrExpected(sa) => Display::fmt(sa, f),
Done => write!(f, "done parsing"),
@ -277,9 +424,10 @@ where
}
}
impl<const MAX_DEPTH: usize, SA> XirToXirf<MAX_DEPTH, SA>
impl<const MAX_DEPTH: usize, T, SA> XirToXirf<MAX_DEPTH, T, SA>
where
SA: FlatAttrParseState<MAX_DEPTH>,
T: TextType,
{
/// Parse a token while in a state expecting a node.
fn parse_node(
@ -287,7 +435,6 @@ where
stack: &mut ElementStack<MAX_DEPTH>,
) -> TransitionResult<Self> {
use XirToXirf::{AttrExpected, Done, NodeExpected};
use XirfToken::*;
match tok {
XirToken::Open(qname, span) if stack.len() == MAX_DEPTH => {
@ -302,7 +449,7 @@ where
stack.push((qname, span.tag_span()));
// Delegate to the attribute parser until it is complete.
Transition(AttrExpected(SA::default())).ok(Open(
Transition(AttrExpected(SA::default())).ok(XirfToken::Open(
qname,
span,
Depth(depth),
@ -325,16 +472,14 @@ where
}
// Final closing tag (for root node) completes the document.
(..) if stack.len() == 0 => Transition(Done).ok(Close(
close_oqname,
close_span,
Depth(0),
)),
(..) if stack.len() == 0 => Transition(Done).ok(
XirfToken::Close(close_oqname, close_span, Depth(0)),
),
(..) => {
let depth = stack.len();
Transition(NodeExpected).ok(Close(
Transition(NodeExpected).ok(XirfToken::Close(
close_oqname,
close_span,
Depth(depth),
@ -344,16 +489,12 @@ where
}
XirToken::Comment(sym, span) => {
Transition(NodeExpected).ok(Comment(sym, span))
}
XirToken::Text(sym, span) => {
Transition(NodeExpected).ok(Text(sym, span))
Transition(NodeExpected).ok(XirfToken::Comment(sym, span))
}
XirToken::Text(sym, span) => Transition(NodeExpected)
.ok(XirfToken::Text(T::from(Text(sym, span)))),
XirToken::CData(sym, span) => {
Transition(NodeExpected).ok(CData(sym, span))
}
XirToken::Whitespace(ws, span) => {
Transition(NodeExpected).ok(Whitespace(ws, span))
Transition(NodeExpected).ok(XirfToken::CData(sym, span))
}
// We should transition to `State::Attr` before encountering any
@ -369,10 +510,10 @@ where
/// Produce a streaming parser lowering a XIR [`TokenStream`] into a XIRF
/// stream.
pub fn parse<const MAX_DEPTH: usize>(
pub fn parse<const MAX_DEPTH: usize, T: TextType>(
toks: impl TokenStream,
) -> impl Iterator<Item = ParsedResult<XirToXirf<MAX_DEPTH>>> {
XirToXirf::<MAX_DEPTH>::parse(toks)
) -> impl Iterator<Item = ParsedResult<XirToXirf<MAX_DEPTH, T>>> {
XirToXirf::<MAX_DEPTH, T>::parse(toks)
}
/// Parsing error from [`XirToXirf`].

View File

@ -38,11 +38,11 @@ use std::fmt::Debug;
///
/// This function is not suitable for production use as it does not produce
/// a complete [`OpenSpan`].
pub fn open<Q: TryInto<QName>, S: Into<OpenSpan>>(
pub fn open<Q: TryInto<QName>, S: Into<OpenSpan>, T: TextType>(
qname: Q,
span: S,
depth: Depth,
) -> XirfToken
) -> XirfToken<T>
where
<Q as TryInto<QName>>::Error: Debug,
{
@ -56,7 +56,10 @@ where
///
/// This function is not suitable for production use as it does not produce
/// a complete [`OpenSpan`].
pub fn close_empty<S: Into<CloseSpan>>(span: S, depth: Depth) -> XirfToken {
pub fn close_empty<S: Into<CloseSpan>, T: TextType>(
span: S,
depth: Depth,
) -> XirfToken<T> {
XirfToken::Close(None, span.into(), depth)
}
@ -66,11 +69,11 @@ pub fn close_empty<S: Into<CloseSpan>>(span: S, depth: Depth) -> XirfToken {
///
/// This function is not suitable for production use as it does not produce
/// a complete [`OpenSpan`].
pub fn close<Q: TryInto<QName>, S: Into<CloseSpan>>(
pub fn close<Q: TryInto<QName>, S: Into<CloseSpan>, T: TextType>(
qname: Option<Q>,
span: S,
depth: Depth,
) -> XirfToken
) -> XirfToken<T>
where
<Q as TryInto<QName>>::Error: Debug,
{
@ -88,7 +91,7 @@ fn empty_element_self_close() {
let toks = [xir_open(name, S), xir_close_empty(S2)].into_iter();
let sut = parse::<1>(toks);
let sut = parse::<1, Text>(toks);
assert_eq!(
Ok(vec![
@ -107,7 +110,7 @@ fn empty_element_balanced_close() {
let toks = [xir_open(name, S), xir_close(Some(name), S2)].into_iter();
let sut = parse::<1>(toks);
let sut = parse::<1, Text>(toks);
assert_eq!(
Ok(vec![
@ -133,10 +136,10 @@ fn extra_closing_tag() {
]
.into_iter();
let sut = parse::<1>(toks);
let sut = parse::<1, Text>(toks);
assert_matches!(
sut.collect::<Result<Vec<Parsed<XirfToken>>, _>>(),
sut.collect::<Result<Vec<Parsed<_>>, _>>(),
Err(ParseError::UnexpectedToken(
XirToken::Close(Some(given_name), given_span),
_
@ -158,10 +161,10 @@ fn extra_self_closing_tag() {
]
.into_iter();
let sut = parse::<1>(toks);
let sut = parse::<1, Text>(toks);
assert_matches!(
sut.collect::<Result<Vec<Parsed<XirfToken>>, _>>(),
sut.collect::<Result<Vec<Parsed<_>>, _>>(),
Err(ParseError::UnexpectedToken(XirToken::Close(None, given_span), _))
if given_span == S3.into(),
);
@ -177,7 +180,7 @@ fn empty_element_unbalanced_close() {
let toks =
[xir_open(open_name, S), xir_close(Some(close_name), S2)].into_iter();
let mut sut = parse::<1>(toks);
let mut sut = parse::<1, Text>(toks);
assert_eq!(
sut.next(),
@ -206,7 +209,7 @@ fn single_empty_child() {
]
.into_iter();
let sut = parse::<2>(toks);
let sut = parse::<2, Text>(toks);
assert_eq!(
Ok(vec![
@ -232,7 +235,7 @@ fn depth_exceeded() {
.into_iter();
// ...which is set here: MAX_DEPTH here is 1
let mut sut = parse::<1>(toks);
let mut sut = parse::<1, Text>(toks);
assert_eq!(
Some(Ok(Parsed::Object(open(name, S, Depth(0))))),
@ -267,7 +270,7 @@ fn empty_element_with_attrs() {
]
.into_iter();
let sut = parse::<2>(toks);
let sut = parse::<2, Text>(toks);
assert_eq!(
Ok(vec![
@ -299,7 +302,7 @@ fn child_element_after_attrs() {
]
.into_iter();
let sut = parse::<2>(toks);
let sut = parse::<2, Text>(toks);
assert_eq!(
Ok(vec![
@ -330,7 +333,7 @@ fn element_with_empty_sibling_children() {
]
.into_iter();
let sut = parse::<2>(toks);
let sut = parse::<2, Text>(toks);
assert_eq!(
Ok(vec![
@ -363,7 +366,7 @@ fn element_with_child_with_attributes() {
]
.into_iter();
let sut = parse::<2>(toks);
let sut = parse::<2, Text>(toks);
assert_eq!(
Ok(vec![
@ -390,12 +393,12 @@ fn element_with_text() {
]
.into_iter();
let sut = parse::<1>(toks);
let sut = parse::<1, Text>(toks);
assert_eq!(
Ok(vec![
Parsed::Object(open(parent, S, Depth(0))),
Parsed::Object(XirfToken::Text(text, S2)),
Parsed::Object(XirfToken::Text(Text(text, S2))),
Parsed::Object(close(Some(parent), S3, Depth(0))),
]),
sut.collect(),
@ -407,7 +410,7 @@ fn not_accepting_state_if_element_open() {
let name = "unclosed";
let toks = [xir_open(name, S)].into_iter();
let mut sut = parse::<1>(toks);
let mut sut = parse::<1, Text>(toks);
assert_eq!(
Some(Ok(Parsed::Object(open(name, S, Depth(0))))),
@ -433,7 +436,7 @@ fn comment_before_or_after_root_ok() {
]
.into_iter();
let sut = parse::<1>(toks);
let sut = parse::<1, Text>(toks);
assert_eq!(
Ok(vec![
@ -466,11 +469,11 @@ fn content_after_root_close_error() {
]
.into_iter();
let sut = parse::<1>(toks);
let sut = parse::<1, Text>(toks);
assert_matches!(
sut.collect(),
Result::<Vec<Parsed<XirfToken>>, _>::Err(ParseError::UnexpectedToken(
Result::<Vec<Parsed<_>>, _>::Err(ParseError::UnexpectedToken(
XirToken::Open(given_name, given_span),
_)) if given_name == name && given_span == S3.into()
);
@ -483,12 +486,56 @@ fn content_before_root_open_error() {
let toks = [XirToken::Text(text, S)].into_iter();
let sut = parse::<1>(toks);
let sut = parse::<1, Text>(toks);
assert_eq!(
Result::<Vec<Parsed<XirfToken>>, _>::Err(ParseError::StateError(
Result::<Vec<Parsed<_>>, _>::Err(ParseError::StateError(
XirToXirfError::RootOpenExpected(XirToken::Text(text, S))
)),
sut.collect()
);
}
/// Text nodes are refined into [`RefinedText::Whitespace`] only when they
///   consist entirely of whitespace
///     (including the empty string),
///   and are otherwise left as [`RefinedText::Unrefined`];
///     the symbol and span must be preserved in either case.
#[test]
fn whitespace_refinement() {
    // Nothing exhaustive;
    //   just check some notable examples.
    // Each case is `(text, whether it should refine into whitespace)`.
    let cases = [
        ("".into(), true),
        (" ".into(), true),
        ("\n".into(), true),
        ("\n\n\t  ".into(), true),
        ("   foo   ".into(), false),
        ("\n   .".into(), false),
        (".\n   ".into(), false),
    ];

    for (given, expected) in cases {
        let mut sut = parse::<1, RefinedText>(
            [xir_open("root", S), XirToken::Text(given, S)].into_iter(),
        );

        let _ = sut.next(); // discard root

        match sut.next().unwrap().unwrap() {
            Parsed::Object(XirfToken::Text(RefinedText::Whitespace(
                Whitespace(Text(ws, span)),
            ))) => {
                assert_eq!(ws, given);
                assert_eq!(span, S);
                assert!(
                    expected,
                    "text was refined into whitespace but should not have been"
                );
            }

            Parsed::Object(XirfToken::Text(RefinedText::Unrefined(Text(
                text,
                span,
            )))) => {
                assert_eq!(text, given);
                assert_eq!(span, S);
                assert!(
                    !expected,
                    "text should have been refined into whitespace"
                );
            }

            unexpected => panic!("unexpected token: {unexpected:?}"),
        }
    }
}

View File

@ -387,7 +387,9 @@ macro_rules! attr_parse {
}
impl crate::parse::ParseState for $state_name {
type Token = crate::xir::flat::XirfToken;
type Token = crate::xir::flat::XirfToken<
crate::xir::flat::RefinedText
>;
type Object = $struct_name;
type Error = crate::xir::parse::AttrParseError<Self>;

View File

@ -314,7 +314,10 @@ macro_rules! ele_parse {
/// element.
///
/// The span corresponds to the opening tag.
CloseExpected_(crate::span::Span, crate::xir::flat::XirfToken),
CloseExpected_(
crate::span::Span,
crate::xir::flat::XirfToken<crate::xir::flat::RefinedText>,
),
Attrs_(crate::xir::parse::AttrParseError<[<$nt AttrsState_>]>),
@ -411,7 +414,9 @@ macro_rules! ele_parse {
}
impl crate::parse::ParseState for $nt {
type Token = crate::xir::flat::XirfToken;
type Token = crate::xir::flat::XirfToken<
crate::xir::flat::RefinedText
>;
type Object = $objty;
type Error = [<$nt Error_>];
type Context = crate::parse::Context<crate::xir::parse::EleParseCfg>;
@ -698,7 +703,9 @@ macro_rules! ele_parse {
}
impl crate::parse::ParseState for $nt {
type Token = crate::xir::flat::XirfToken;
type Token = crate::xir::flat::XirfToken<
crate::xir::flat::RefinedText
>;
type Object = $objty;
type Error = [<$nt Error_>];
type Context = crate::parse::Context<crate::xir::parse::EleParseCfg>;

View File

@ -42,7 +42,7 @@ use crate::{
sym::SymbolId,
xir::{
attr::{Attr, AttrSpan},
flat::{Depth, XirfToken},
flat::{Depth, RefinedText, Text, XirfToken},
st::qname::*,
CloseSpan, EleNameLen, EleSpan, OpenSpan, QName,
},
@ -630,7 +630,10 @@ fn child_error_and_recovery_at_close() {
XirfToken::Close(None, CloseSpan::empty(S5), Depth(1)),
// Let's mix it up a bit with some text and make sure that is
// ignored too.
XirfToken::Text("unexpected text".unwrap_into(), S5),
XirfToken::Text(RefinedText::Unrefined(Text(
"unexpected text".unwrap_into(),
S5,
))),
// Having recovered from the above tokens,
// this will end parsing for `Sut` as expected.
XirfToken::Close(Some(QN_PACKAGE), CloseSpan(S6, N), Depth(0)),

View File

@ -368,7 +368,7 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
let has_attrs = ele
.attributes_raw()
.iter()
.find(|b| !Self::is_whitespace(**b))
.find(|b| !is_xml_whitespace_u8(**b))
.is_some();
// The tail is anything following the last byte of the QName
@ -426,20 +426,6 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
})
}
/// Determine whether `b` is an XML whitespace byte.
///
/// The accepted bytes are space, carriage return, line feed, and
///   horizontal tab.
/// This is quick-xml's whitespace predicate,
///   and corresponds to the
///   [nonterminal `S` in the XML specification][xmlspec-s].
///
/// [xmlspec-s]: https://www.w3.org/TR/xml/#NT-S
fn is_whitespace(b: u8) -> bool {
    // Membership in the fixed set of whitespace bytes.
    b" \r\n\t".contains(&b)
}
/// Parse attributes into a XIR [`Token`] stream.
///
/// The order of attributes will be maintained.
@ -552,6 +538,30 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
}
}
/// Determine whether `b` is an XML whitespace byte.
///
/// The accepted bytes are space, carriage return, line feed, and
///   horizontal tab.
/// This is quick-xml's whitespace predicate,
///   and corresponds to the
///   [nonterminal `S` in the XML specification][xmlspec-s].
///
/// [xmlspec-s]: https://www.w3.org/TR/xml/#NT-S
pub fn is_xml_whitespace_u8(b: u8) -> bool {
    matches!(b, b' ' | b'\r' | b'\n' | b'\t')
}
/// Determine whether `c` is an XML whitespace character.
///
/// The accepted characters are space, carriage return, line feed, and
///   horizontal tab;
///     this is the [`char`] counterpart to [`is_xml_whitespace_u8`].
pub fn is_xml_whitespace_char(c: char) -> bool {
    matches!(c, ' ' | '\r' | '\n' | '\t')
}
impl<'s, B, S> Iterator for XmlXirReader<'s, B, S>
where
B: BufRead,

View File

@ -279,12 +279,6 @@ impl<S: Escaper> XmlWriter<S> for &Token {
Ok(W::NodeExpected)
}
(Whitespace(ws, _), W::NodeOpen) => {
sink.write(ws.lookup_str().as_bytes())?;
Ok(W::NodeOpen)
}
// As-of-yet unsupported operations that weren't needed at the
// time of writing, but were planned for in the design of Xir.
(invalid @ AttrName(_, _), W::AttrNameAdjacent) => {
@ -429,18 +423,6 @@ mod test {
Ok(())
}
// Intended for alignment of attributes, primarily.
#[test]
fn whitespace_within_open_node() -> TestResult {
let result = Token::Whitespace(" \t ".unwrap_into(), S)
.write_new(WriterState::NodeOpen, &MockEscaper::default())?;
assert_eq!(result.0, b" \t ");
assert_eq!(result.1, WriterState::NodeOpen);
Ok(())
}
#[test]
fn writes_attr_name_to_open_node() -> TestResult {
let name_ns = ("some", "attr").unwrap_into();
@ -562,7 +544,6 @@ mod test {
Token::AttrValue("value".intern(), S),
Token::Text("text".intern(), S),
open(("c", "child"), S),
Token::Whitespace(" ".unwrap_into(), S),
close_empty(S),
close(Some(root), S),
]
@ -571,7 +552,7 @@ mod test {
assert_eq!(
result.0,
br#"<r:root an:attr="value:ESC">text:ESC<c:child /></r:root>"#
br#"<r:root an:attr="value:ESC">text:ESC<c:child/></r:root>"#
);
assert_eq!(result.1, WriterState::NodeExpected);