// XIR flat (XIRF)
//
//  Copyright (C) 2014-2023 Ryan Specialty, LLC.
//
//  This file is part of TAME.
//
//  This program is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program.  If not, see <http://www.gnu.org/licenses/>.

//! Lightly-parsed XIR as a flat stream (XIRF).
//!
//! XIRF lightly parses a raw XIR [`TokenStream`] into a stream of
//!   [`XirfToken`]s that are,
//!   like a [`TokenStream`],
//!   flat in structure.
//! It provides the following features over raw XIR:
//!
//! 1. All closing tags must correspond to a matching opening tag at the
//!    same depth;
//! 2. [`XirfToken`] exposes the [`Depth`] of each node-related token;
//! 3. Attribute tokens are parsed into [`Attr`] objects;
//! 4. Documents must begin with an element and end with the closing of
//!    that element;
//! 5. Parsing will fail if input ends before all elements have been
//!    closed;
//! 6. Text nodes may optionally be parsed into [`RefinedText`] to
//!    distinguish whitespace.
//!
//! XIRF lowering does not perform any dynamic memory allocation;
//!   maximum element nesting depth is set statically depending on the needs
//!   of the caller.
use super::{ attr::{Attr, AttrParseError, AttrParseState}, reader::is_xml_whitespace_char, CloseSpan, OpenSpan, QName, Token as XirToken, TokenStream, }; use crate::{ diagnose::{Annotate, AnnotatedSpan, Diagnostic}, f::Functor, parse::prelude::*, span::Span, sym::{st::is_common_whitespace, GlobalSymbolResolve, SymbolId}, xir::EleSpan, }; use arrayvec::ArrayVec; use std::{ convert::Infallible, error::Error, fmt::{Debug, Display}, marker::PhantomData, }; // Used for organization. pub use accept::*; /// Tag nesting depth /// (`0` represents the root). /// /// Note: the lack of a [`Default`] implementation is intentional so that /// this does not see lax initialization; /// you probably want [`Depth::root`] in that case. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd)] pub struct Depth(pub usize); impl Depth { /// Depth representing a root. pub fn root() -> Depth { Depth(0) } /// Yield a new [`Depth`] representing the expected depth of children of /// an element at the current depth. /// /// That description is probably more confusing than the method name. pub fn child_depth(&self) -> Depth { match self { Depth(depth) => Depth(depth + 1), } } } impl Display for Depth { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { Display::fmt(&self.0, f) } } /// A lightly-parsed XIRF object. /// /// Certain XIR [`Token`]s are formed into a single object, /// such as an [`Attr`]. /// Other objects retain the same format as their underlying token, /// but are still validated to ensure that they are well-formed and that /// the XML is well-structured. /// /// Each token representing a child node contains a numeric [`Depth`] /// indicating the nesting depth; /// this can be used by downstream parsers to avoid maintaining their /// own stack in certain cases. #[derive(Debug, Clone, PartialEq, Eq)] pub enum XirfToken { /// Opening tag of an element. Open(QName, OpenSpan, Depth), /// Closing tag of an element. 
/// /// If the name is [`None`], /// then the tag is self-closing. /// If the name is [`Some`], /// then the tag is guaranteed to be balanced /// (matching the depth of its opening tag). Close(Option, CloseSpan, Depth), /// An attribute and its value. /// /// The associated [`Span`]s can be found on the enclosed [`Attr`] /// object. Attr(Attr), /// Comment node. Comment(SymbolId, Span, Depth), /// Character data as part of an element. /// /// See also [`CData`](XirfToken::CData) variant. Text(T, Depth), /// CData node (``). /// /// _Warning: It is up to the caller to ensure that the string `]]>` is /// not present in the text!_ /// This is intended for reading existing XML data where CData is /// already present, /// not for producing new CData safely! CData(SymbolId, Span, Depth), } impl XirfToken { pub fn open( qname: impl Into, span: impl Into, depth: Depth, ) -> Self { Self::Open(qname.into(), span.into(), depth) } pub fn close( qname: Option>, span: impl Into, depth: Depth, ) -> Self { Self::Close(qname.map(Into::into), span.into(), depth) } pub fn attr( qname: impl Into, value: impl Into, span: (impl Into, impl Into), ) -> Self { Self::Attr(Attr::new( qname.into(), value.into(), (span.0.into(), span.1.into()), )) } pub fn comment( comment: impl Into, span: impl Into, depth: Depth, ) -> Self { Self::Comment(comment.into(), span.into(), depth) } pub fn text(text: impl Into, depth: Depth) -> Self { Self::Text(text.into(), depth) } } impl Token for XirfToken { fn ir_name() -> &'static str { "XIRF" } fn span(&self) -> Span { use XirfToken::*; match self { Open(_, OpenSpan(span, _), _) | Close(_, CloseSpan(span, _), _) | Comment(_, span, _) | CData(_, span, _) => *span, Text(text, _) => text.span(), Attr(attr) => attr.span(), } } } impl Object for XirfToken {} impl Display for XirfToken { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use XirfToken::*; match self { Open(qname, span, _) => { Display::fmt(&XirToken::Open(*qname, *span), f) } 
Close(oqname, span, _) => { Display::fmt(&XirToken::Close(*oqname, *span), f) } Attr(attr) => Display::fmt(&attr, f), Comment(sym, span, _) => { Display::fmt(&XirToken::Comment(*sym, *span), f) } Text(text, _) => Display::fmt(text, f), CData(sym, span, _) => { Display::fmt(&XirToken::CData(*sym, *span), f) } } } } impl XirfToken { pub fn depth(&self) -> Option { use XirfToken::*; match self { Open(_, _, depth) | Close(_, _, depth) | Comment(_, _, depth) | Text(_, depth) | CData(_, _, depth) => Some(*depth), Attr(_) => None, } } } impl From for XirfToken { fn from(attr: Attr) -> Self { Self::Attr(attr) } } impl Functor for XirfToken { fn map(self, f: impl FnOnce(Depth) -> Depth) -> Self::Target { use XirfToken::*; match self { Open(qn, span, depth) => Open(qn, span, f(depth)), Close(qn, span, depth) => Close(qn, span, f(depth)), Attr(_) => self, Comment(sym, span, depth) => Comment(sym, span, f(depth)), Text(text, depth) => Text(text, f(depth)), CData(cdata, span, depth) => CData(cdata, span, f(depth)), } } } /// Token of an optionally refined [`Text`]. /// /// XIRF is configurable on the type of processing it performs on [`Text`], /// including the detection of [`Whitespace`]. /// /// See also [`RefinedText`]. pub trait TextType = From + Into + Token + Eq; #[derive(Debug, PartialEq, Eq, Clone)] pub struct Text(pub SymbolId, pub Span); impl Token for Text { fn ir_name() -> &'static str { "XIRF Text" } fn span(&self) -> Span { match self { Self(_, span) => *span, } } } impl Display for Text { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { // TODO: We'll need care to output text so that it does not mess up // formatted output. // Further, // text can be any arbitrary length, // and so should probably be elided after a certain length. write!(f, "text") } } /// A sequence of one or more whitespace characters. /// /// Whitespace here is expected to consist of `[ \n\t\r]` /// (where the first character in that class is a space). 
#[derive(Debug, PartialEq, Eq, Clone)] pub struct Whitespace(pub Text); impl Display for Whitespace { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { // TODO: Escape output as necessary so that we can render the symbol // string. // See also `::fmt` TODO. write!(f, "whitespace") } } /// Text that has been refined to a more descriptive form. /// /// This type may be used as a [`TextType`] to instruct XIRF to detect /// [`Whitespace`]. #[derive(Debug, PartialEq, Eq, Clone)] pub enum RefinedText { /// Provided [`Text`] has been determined to be [`Whitespace`]. Whitespace(Whitespace), /// Provided [`Text`] was not able to be refined into a more specific /// type. Unrefined(Text), } impl Token for RefinedText { fn ir_name() -> &'static str { "XIRF RefinedText" } fn span(&self) -> Span { match self { Self::Whitespace(Whitespace(text)) | Self::Unrefined(text) => { text.span() } } } } impl Display for RefinedText { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { Self::Whitespace(ws) => Display::fmt(ws, f), Self::Unrefined(text) => Display::fmt(text, f), } } } impl From for RefinedText { fn from(text: Text) -> Self { match text { Text(sym, _) if is_whitespace(sym) => { Self::Whitespace(Whitespace(text)) } _ => Self::Unrefined(text), } } } impl From for Text { fn from(value: RefinedText) -> Self { match value { RefinedText::Whitespace(Whitespace(text)) | RefinedText::Unrefined(text) => text, } } } /// XIRF-compatible attribute parser. pub trait FlatAttrParseState = ClosedParseState where Self: Default, ::Error: Into, StateContext: AsMut<::Context>; /// Stack of element [`QName`] and [`Span`] pairs, /// representing the current level of nesting. /// /// This storage is statically allocated, /// allowing XIRF's parser to avoid memory allocation entirely. type ElementStack = ArrayVec<(QName, Span), MAX_DEPTH>; /// Lower [XIR](XirToken) into [XIRF](XirfToken), /// accepting only fully parsed XML documents. 
/// /// If parsing is expected to stop before reaching the end of the document, /// see [`PartialXirToXirf`]. /// For more information on accepting states, /// see [`XirfAcceptor`]. pub type FullXirToXirf = XirToXirf; /// Lower [XIR](XirToken) into [XIRF](XirfToken), /// accepting partially parsed XML documents at node boundaries. /// /// If the entire XML document ought to be parsed, /// see [`FullXirToXirf`] to provide a guarantee of an error in case the /// system stops parsing before completion. /// For more information on accepting states, /// see [`XirfAcceptor`]. pub type PartialXirToXirf = XirToXirf; /// Lower [XIR](XirToken) into [XIRF](XirfToken). /// /// This parser is a pushdown automaton that parses a single XML document. #[derive(Debug, PartialEq, Eq)] pub enum XirToXirf< const MAX_DEPTH: usize, T, SA = AttrParseState, A: XirfAcceptor = FullXirfAcceptor, > where SA: FlatAttrParseState, T: TextType, { /// Document parsing has not yet begun. PreRoot(PhantomData<(T, A)>), /// Parsing nodes. NodeExpected, /// Delegating to attribute parser. AttrExpected(SA), /// End of document has been reached. Done, } impl Default for XirToXirf where SA: FlatAttrParseState, T: TextType, { fn default() -> Self { Self::PreRoot(PhantomData::default()) } } pub type StateContext = Context>; /// Whether the given [`SymbolId`] is all whitespace according to /// [`is_xml_whitespace_char`]. /// /// This will first consult the pre-interned whitespace symbol list using /// [`is_common_whitespace`]. /// If that check fails, /// it will resort to looking up the symbol and performing a linear scan /// of the string, /// terminating early if a non-whitespace character is found. /// /// Note that the empty string is considered to be whitespace. #[inline] fn is_whitespace(sym: SymbolId) -> bool { // See `sym::prefill`; // this may require maintenance to keep the prefill list up-to-date // with common whitespace symbols to avoid symbol lookups. 
// This common check is purely a performance optimization. is_common_whitespace(sym) || { // If this is called often and is too expensive, // it may be worth caching metadata about symbols, // either for XIRF or globally. // This requires multiple dereferences // (for looking up the intern for the `SymbolId`, // which may result in multiple (CPU) cache misses, // but that would have to be profiled since the symbol may // have just been interned and may be cached still) // and then a linear scan of the associated `str`, // though it will terminate as soon as it finds a non-whitespace // character. sym.lookup_str().chars().all(is_xml_whitespace_char) } } impl ParseState for XirToXirf where SA: FlatAttrParseState, T: TextType, { type Token = XirToken; type Object = XirfToken; type Error = XirToXirfError; type Context = StateContext; fn parse_token( self, tok: Self::Token, stack: &mut Self::Context, ) -> TransitionResult { use XirToXirf::{AttrExpected, Done, NodeExpected, PreRoot}; match (self, tok) { // Comments are permitted before and after the first root element. (st @ (PreRoot(_) | Done), XirToken::Comment(sym, span)) => { let depth = Depth(stack.len()); Transition(st).ok(XirfToken::Comment(sym, span, depth)) } // Ignore whitespace before or after root. (st @ (PreRoot(_) | Done), XirToken::Text(sym, _)) if is_whitespace(sym) => { Transition(st).incomplete() } (PreRoot(_), tok @ XirToken::Open(..)) => { Self::parse_node(tok, stack) } (st @ PreRoot(_), tok) => { Transition(st).err(XirToXirfError::RootOpenExpected(tok)) } (NodeExpected, tok) => Self::parse_node(tok, stack), (AttrExpected(sa), tok) => sa.delegate( tok, stack, |sa| Transition(AttrExpected(sa)), || Transition(NodeExpected), ), (Done, tok) => Transition(Done).dead(tok), } } /// Whether all elements have been closed. /// /// Parsing will fail if there are any open elements. /// Intuitively, /// this means that the parser must have encountered the closing tag /// for the root element. 
fn is_accepting(&self, _: &Self::Context) -> bool { // TODO: It'd be nice if we could also return additional context to // aid the user in diagnosing the problem, // e.g. what element(s) still need closing. A::is_accepting(self) } } /// Configurable accepting states for [`XirToXirf`]. /// /// See this module's [`XirfAcceptor`] for more information. mod accept { use super::*; /// Acceptor for [`XirToXirf`]. /// /// This is responsible for determining whether [`XirToXirf`] is in an /// accepting state. /// There are two acceptors: /// /// 1. [`FullXirfAcceptor`] expects that the _entire_ XML document be /// completely parsed up to and including the closing root node; /// and /// 2. [`PartialXirfAcceptor`] allows parsing to halt part-way through /// an XML document, /// provided that parsing ends at a node boundary. /// /// See each respective acceptor for more information. pub trait XirfAcceptor: Debug + PartialEq + Eq + Display + Default { fn is_accepting( st: &XirToXirf, ) -> bool where SA: FlatAttrParseState, T: TextType; } /// Acceptor for fully parsed XML documents for [`XirToXirf`]. /// /// This acceptor should be used when the intent of the lowering /// pipeline is to fully parse the [XIR](XirToken) stream. /// In other words: /// this should be used when the XML document being read ought to be /// read _fully_, /// where halting parsing before the root node would indicate a /// defect in the system. /// /// For example, /// when reading a file with TAME sources in `tamec`, /// the compiler ought to ensure that the entire file is read to /// completion. /// If the lowering pipeline stops requesting tokens before the XIR /// stream has ended, /// then that means that compilation has halted before the system /// has had a chance to consider the rest of the file. /// Because the lowering pipeline is intended to parse and present /// errors on the entire file each run, /// this would represent a bug in the system, /// and so we ought to fail. 
/// /// Downstream parsers ought to fail for their own reasons as well, /// but this provides an extra layer of protection for _anything_ that /// happens to read XML files. /// /// For an example of a situation where we may not wish to fail, /// see [`PartialXirfAcceptor`]. #[derive(Debug, PartialEq, Eq, Default)] pub struct FullXirfAcceptor; impl XirfAcceptor for FullXirfAcceptor { fn is_accepting( st: &XirToXirf, ) -> bool where SA: FlatAttrParseState, T: TextType, { matches!(st, XirToXirf::Done) } } impl Display for FullXirfAcceptor { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "accepting only full documents") } } /// Acceptor for either full _or_ partially parsed XML documents for /// [`XirToXirf`]. /// /// This acceptor is intended to be used when parsing the entirety of a /// [XIR](XirToken) stream is not desirable; /// it allows parsing to be completed at a node boundary /// (when a node is expected). /// This acceptor builds on the behavior of [`FullXirfAcceptor`], /// and so will also accept all fully parsed documents. /// /// For example, /// when reading object files in `tameld`, /// the linker is concerned only with header information; /// the remainder of the XML document does not contain useful /// information and would be wasteful to parse. /// In that case, /// we rely on downstream parsers to determine whether the document /// has been sufficiently parsed. /// /// This acceptor provides one weaker guarantee: /// that parsing has _at least_ completed parsing a node, /// such as an element. /// Parsing must complete at a node boundary, /// and so cannot halt in the middle of attribute parsing for an /// element, /// for example. 
#[derive(Debug, PartialEq, Eq, Default)] pub struct PartialXirfAcceptor; impl XirfAcceptor for PartialXirfAcceptor { fn is_accepting( st: &XirToXirf, ) -> bool where SA: FlatAttrParseState, T: TextType, { FullXirfAcceptor::is_accepting(st) || matches!(st, XirToXirf::NodeExpected) } } impl Display for PartialXirfAcceptor { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "accepting partial documents at node boundaries") } } } impl Display for XirToXirf where SA: FlatAttrParseState, T: TextType, { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { use XirToXirf::*; match self { PreRoot(_) => write!(f, "expecting document root"), NodeExpected => write!(f, "expecting a node"), AttrExpected(sa) => Display::fmt(sa, f), Done => write!(f, "done parsing document root"), }?; // e.g. ", accepting ..." write!(f, ", ")?; Display::fmt(&A::default(), f) } } impl XirToXirf where SA: FlatAttrParseState, T: TextType, { /// Parse a token while in a state expecting a node. fn parse_node( tok: ::Token, stack: &mut ElementStack, ) -> TransitionResult { use XirToXirf::{AttrExpected, Done, NodeExpected}; let depth = Depth(stack.len()); match tok { XirToken::Open(qname, span) if stack.len() == MAX_DEPTH => { Transition(NodeExpected).err(XirToXirfError::MaxDepthExceeded { open: (qname, span.tag_span()), max: Depth(MAX_DEPTH), }) } XirToken::Open(qname, span) => { stack.push((qname, span.tag_span())); // Delegate to the attribute parser until it is complete. Transition(AttrExpected(SA::default())) .ok(XirfToken::Open(qname, span, depth)) } XirToken::Close(close_oqname, close_span) => { match (close_oqname, stack.pop()) { (_, None) => unreachable!("parser should be in Done state"), (Some(qname), Some((open_qname, open_span))) if qname != open_qname => { Transition(NodeExpected).err( XirToXirfError::UnbalancedTag { open: (open_qname, open_span), close: (qname, close_span.tag_span()), }, ) } // Final closing tag (for root node) completes the document. 
(..) if stack.is_empty() => Transition(Done).ok( XirfToken::Close(close_oqname, close_span, Depth(0)), ), (..) => { let depth = stack.len(); Transition(NodeExpected).ok(XirfToken::Close( close_oqname, close_span, Depth(depth), )) } } } XirToken::Comment(sym, span) => Transition(NodeExpected) .ok(XirfToken::Comment(sym, span, depth)), XirToken::Text(sym, span) => Transition(NodeExpected) .ok(XirfToken::Text(T::from(Text(sym, span)), depth)), XirToken::CData(sym, span) => { Transition(NodeExpected).ok(XirfToken::CData(sym, span, depth)) } // We should transition to `State::Attr` before encountering any // of these tokens. XirToken::AttrName(..) | XirToken::AttrValue(..) | XirToken::AttrValueFragment(..) => { unreachable!("attribute token in NodeExpected state: {tok:?}") } } } } /// Produce a streaming parser lowering a XIR [`TokenStream`] into a XIRF /// stream. pub fn parse( toks: impl TokenStream, ) -> impl Iterator>> { XirToXirf::::parse(toks) } /// Parsing error from [`XirToXirf`]. #[derive(Debug, Eq, PartialEq)] pub enum XirToXirfError { /// Opening root element tag was expected. RootOpenExpected(XirToken), /// Opening tag exceeds the maximum nesting depth for this parser. MaxDepthExceeded { open: (QName, Span), max: Depth }, /// The closing tag does not match the opening tag at the same level of /// nesting. UnbalancedTag { open: (QName, Span), close: (QName, Span), }, /// Error from the attribute parser. 
AttrError(AttrParseError), } impl Display for XirToXirfError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use XirToXirfError::*; match self { RootOpenExpected(_tok) => { write!(f, "missing opening root element",) } MaxDepthExceeded { open: (_name, _), max, } => { write!( f, "maximum XML element nesting depth of `{max}` exceeded" ) } UnbalancedTag { open: (open_name, _), close: (_close_name, _), } => { write!(f, "expected closing tag for `{open_name}`") } AttrError(e) => Display::fmt(e, f), } } } impl Error for XirToXirfError { fn source(&self) -> Option<&(dyn Error + 'static)> { match self { Self::AttrError(e) => Some(e), _ => None, } } } impl Diagnostic for XirToXirfError { fn describe(&self) -> Vec { use XirToXirfError::*; match self { RootOpenExpected(tok) => { // TODO: Should the span be the first byte, // or should we delegate that question to an e.g. `SpanLike`? tok.span() .error("an opening root node was expected here") .into() } MaxDepthExceeded { open: (_, span), max, } => span .error(format!( "this opening tag increases the level of nesting \ past the limit of {max}" )) .into(), UnbalancedTag { open: (open_name, open_span), close: (_close_name, close_span), } => { // TODO: hint saying that the nesting could be wrong, etc; // we can't just suggest a replacement, // since that's not necessarily the problem vec![ open_span .note(format!("element `{open_name}` is opened here")), // No need to state the close name since the source line // will be highlighted by the diagnostic message. close_span.error(format!("expected ``")), ] } AttrError(e) => e.describe(), } } } impl From for XirToXirfError { fn from(e: AttrParseError) -> Self { Self::AttrError(e) } } /// Lower a [`XirfToken`] stream into a [`XirToken`] stream. /// /// This is the dual of [`XirToXirf`], /// and is intended to be used when the system _generates_ XML. 
/// If you do not need any features of XIRF, /// and aren't using any operation that produces it, /// then you may also skip a step and just emit XIR to avoid having to /// perform this lowering operation. #[derive(Debug, PartialEq, Eq)] pub enum XirfToXir { Ready(PhantomData), AttrVal(PhantomData), } impl Default for XirfToXir { fn default() -> Self { Self::Ready(Default::default()) } } impl Display for XirfToXir { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "translating XIRF to XIR") } } impl ParseState for XirfToXir { type Token = XirfToken; type Object = XirToken; type Error = Infallible; fn parse_token( self, tok: Self::Token, _: NoContext, ) -> TransitionResult { use XirToken as Xir; use XirfToXir::*; use XirfToken as Xirf; macro_rules! to { ($tok:expr) => { Transition(self).ok($tok) }; } match tok { Xirf::Open(qname, ospan, _) => to!(Xir::Open(qname, ospan)), Xirf::Close(qname, cspan, _) => to!(Xir::Close(qname, cspan)), Xirf::Attr(attr) => match self { Self::Ready(p) => Transition(AttrVal(p)) .ok(Xir::AttrName(attr.name(), attr.attr_span().key_span())) .with_lookahead(Xirf::Attr(attr)), Self::AttrVal(p) => Transition(Ready(p)).ok(Xir::AttrValue( attr.value(), attr.attr_span().value_span(), )), }, Xirf::Comment(sym, span, _) => to!(Xir::Comment(sym, span)), Xirf::Text(x, _) => match x.into() { Text(sym, span) => to!(Xir::Text(sym, span)), }, Xirf::CData(sym, span, _) => to!(Xir::CData(sym, span)), } } fn is_accepting(&self, _: &Self::Context) -> bool { matches!(self, Self::Ready(_)) } } #[cfg(test)] pub mod test;