// XML IR (XIR) // // Copyright (C) 2014-2023 Ryan Specialty, LLC. // // This file is part of TAME. // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see . //! Intermediate representation (IR) of an XML document. //! //! XIR serves not only as a TAMER-specific IR, //! but also as an abstraction layer atop of whatever XML library is //! used (e.g. `quick_xml`). //! XIR is _not_ intended to be comprehensive, //! or even general-purpose---it //! exists to solve concerns specific to TAMER's construction. //! //! To parse an entire XML document, //! see [`reader`]. //! //! _Note:_ XIR refers to "opening" and "closing" tags, //! as opposed to "start" and "end" as used in the XML specification. //! TAMER uses a uniform terminology for all delimited data. use crate::fmt::DisplayWrapper; use crate::span::{Span, SpanLenSize}; use crate::sym::{ st_as_sym, GlobalSymbolIntern, GlobalSymbolInternBytes, SymbolId, }; use memchr::memchr; use std::convert::{TryFrom, TryInto}; use std::fmt::Display; use std::ops::Deref; mod error; pub use error::Error; mod escape; pub use escape::{DefaultEscaper, Escaper}; use error::SpanlessError; use st::qname::QNameCompatibleStaticSymbolId; use self::fmt::{CloseXmlEle, OpenXmlEle, XmlAttr, XmlAttrValueQuote}; pub mod attr; pub mod autoclose; pub mod flat; pub mod fmt; pub mod iter; pub mod pred; pub mod reader; pub mod st; pub mod tree; pub mod writer; #[macro_use] pub mod parse; /// An infallible [`Token`] stream. /// /// If the token stream originates from an operation that could potentially /// fail and ought to be propagated, /// use [`TokenResultStream`]. /// /// The name "stream" in place of "iterator" is intended to convey that this /// type is expected to be processed in real-time as a stream, /// not read into memory. pub trait TokenStream = Iterator; /// A [`Token`] stream that may encounter errors during parsing. /// /// If the stream cannot fail, /// consider using [`TokenStream`]. pub trait TokenResultStream = Iterator>; /// XML Name minus `":"`. /// /// The intent is to check a string for validity _before_ interning; /// otherwise, /// the string would have to be first retrieved from the intern pool /// for comparison, /// which is not an operation we want to do implicitly. /// Those methods will be created as they are needed. /// /// See . #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct NCName(SymbolId); impl NCName { /// Create a new NCName from a symbol without validating that the symbol /// is a valid NCName. /// /// Safety /// ====== /// This is not unsafe in the traditional sense; /// it's unsafe in a sense similar to non-UTF-8 `str` slices, /// in that it is expected that an `NCName` means that you do not /// have to worry about whether it's syntatically valid as XML. pub unsafe fn new_unchecked(value: SymbolId) -> Self { Self(value) } } impl TryFrom<&[u8]> for NCName { type Error = SpanlessError; /// Attempt to parse a byte slice into an [`NCName`]. /// /// If the slice contains `b':'`, /// an error will be produced. /// No other checks are performed beyond checking that the byte sequence /// represents a valid UTF-8 string. /// The string will be interned for you. fn try_from(value: &[u8]) -> Result { match value.contains(&b':') { true => Err(SpanlessError::NCColon(value.intern_utf8()?)), false => Ok(NCName(value.intern_utf8()?)), } } } impl Deref for NCName { type Target = SymbolId; fn deref(&self) -> &Self::Target { &self.0 } } impl PartialEq for NCName { fn eq(&self, other: &SymbolId) -> bool { self.0 == *other } } impl TryFrom<&str> for NCName { type Error = SpanlessError; fn try_from(value: &str) -> Result { if value.contains(':') { return Err(SpanlessError::NCColon(value.into())); } Ok(Self(value.intern())) } } /// Namespace prefix of a [`QName`]. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct Prefix(NCName); impl Prefix { /// Construct a constant [`Prefix`] from a static C-style symbol. pub const fn st_cid( prefix_sym: &T, ) -> Self { Self(NCName(st_as_sym(prefix_sym))) } } /// Local name portion of a [`QName`]. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct LocalPart(NCName); impl Deref for Prefix { type Target = SymbolId; fn deref(&self) -> &Self::Target { self.0.deref() } } impl Deref for LocalPart { type Target = SymbolId; fn deref(&self) -> &Self::Target { self.0.deref() } } impl From for Prefix { fn from(name: NCName) -> Self { Self(name) } } impl From for LocalPart { fn from(name: NCName) -> Self { Self(name) } } impl TryFrom<&str> for Prefix { type Error = SpanlessError; fn try_from(value: &str) -> Result { Ok(Self(value.try_into()?)) } } impl TryFrom<&str> for LocalPart { type Error = SpanlessError; fn try_from(value: &str) -> Result { Ok(Self(value.try_into()?)) } } impl Display for Prefix { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) } } impl Display for LocalPart { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) } } /// A qualified name (namespace prefix and local name). #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct QName(Option, LocalPart); // Since we implement Copy, ensure size matches our expectations: const_assert!(std::mem::size_of::() <= std::mem::size_of::()); impl QName { /// Create a new fully-qualified name (including both a namespace URI /// and local name). pub fn new(prefix: Option, local_name: LocalPart) -> Self { Self(prefix, local_name) } /// Create a new name from a local name only. /// /// This should only be used for attributes in TAMER, /// since all elements should have an associated namespace. /// /// _(If this is ever not true (e.g. due to new targets), /// please update this comment.)_ pub fn new_local(local_name: LocalPart) -> Self { Self(None, local_name) } /// Fully qualified namespace associated with a name. pub fn prefix(&self) -> Option { self.0 } /// Local part of a name (name without namespace). pub fn local_name(&self) -> LocalPart { self.1 } /// Construct a constant QName from static C-style symbols. pub const fn st_cid(prefix_sym: &T, local_sym: &U) -> Self where T: QNameCompatibleStaticSymbolId, U: QNameCompatibleStaticSymbolId, { Self( Some(Prefix(NCName(st_as_sym(prefix_sym)))), LocalPart(NCName(st_as_sym(local_sym))), ) } /// Construct a constant QName with a local name only from a static /// C-style symbol. pub const fn st_cid_local( local_sym: &T, ) -> Self { Self(None, LocalPart(NCName(st_as_sym(local_sym)))) } } impl TryFrom<(P, L)> for QName where P: TryInto, L: TryInto, { type Error = P::Error; fn try_from(value: (P, L)) -> Result { Ok(Self(Some(value.0.try_into()?), value.1.try_into()?)) } } impl TryFrom<(Option

, L)> for QName where P: TryInto, L: TryInto, { type Error = P::Error; fn try_from(value: (Option

, L)) -> Result { let ns = match value.0 { None => None, Some(ns) => Some(ns.try_into()?), }; Ok(Self(ns, value.1.try_into()?)) } } impl TryFrom<&str> for QName { type Error = SpanlessError; fn try_from(value: &str) -> Result { Ok(QName(None, value.try_into()?)) } } impl TryFrom<&[u8]> for QName { type Error = SpanlessError; /// Attempt to parse a byte slice into a [`QName`]. /// /// The byte slice must represent a valid QName in UTF-8. /// If a colon is present, /// it delimits the namespace [`Prefix`] and [`LocalPart`], /// and therefore must not be in the first or last byte position. fn try_from(name: &[u8]) -> Result { match memchr(b':', name) { // Leading colon means we're missing a prefix, trailing means // that we have no local part. Some(pos) if pos == 0 || pos == name.len() - 1 => { Err(SpanlessError::InvalidQName(name.intern_utf8()?)) } // There is _at least_ one colon in the string. Some(pos) => { // The prefix is before the first colon, // and so itself must not contain a colon and is therefore // a valid NCName. let prefix = NCName(name[..pos].intern_utf8()?); // But there could be a _second_ colon, // so the local part requires validation. let local = NCName::try_from(&name[(pos + 1)..])?; Ok(Self::new(Some(prefix.into()), local.into())) } // There are no colons in the string, so the entire string is // both a local part and a valid NCName. None => Ok(Self::new(None, NCName(name.intern_utf8()?).into())), } } } impl Display for QName { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { QName(Some(local), suffix) => write!(f, "{}:{}", local, suffix), QName(None, suffix) => suffix.fmt(f), } } } /// A span representing an opening (starting) element tag. /// /// See [`EleSpan`] for more information. #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct OpenSpan(Span, EleNameLen); impl OpenSpan { pub fn without_name_span(span: Span) -> Self { Self(span, 0) } } /// A span representing a closing (ending) element tag. /// /// See [`EleSpan`] for more information. #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct CloseSpan(Span, EleNameLen); impl CloseSpan { /// A [`CloseSpan`] representing the closing of an empty tag. /// /// This type of span has no element name. pub fn empty(span: Span) -> Self { Self::without_name_span(span) } pub fn without_name_span(span: Span) -> Self { Self(span, 0) } } /// Number of bytes representing the name of the element. pub type EleNameLen = SpanLenSize; /// Spans associated with an element opening or closing tag. /// /// The diagram below illustrates the behavior of [`EleSpan`]. /// Spans are represented by `[---]` intervals, /// with the byte offset at each end, /// and the single-letter span name centered below the interval. /// /// ```text /// /// |[--] | |[--] | [---] | |[---] ' [] /// |1 4 | |1 4 | 2 6 | |1 5 ' 9`10 /// | N | | N | | N | | N | ' T /// | | | | | | | | ' /// [------] [---] [--------] [----] ' /// 0 7 0 4 0 9 0 5 ' /// T T T T ' /// ``` /// /// Above we have /// /// - `T` = [`EleSpan::span`]; and /// - `N` = [`EleSpan::name_span`]. /// /// The purpose of the `T` span is to represent the entire token that has /// been emitted by XIR. /// If an opening tag does not contain any attributes, /// then `T` represents the entire opening tag with both the opening and /// closing angle brackets. /// If an opening tag is expected to contain attributes, /// then only the opening angle bracket is included. /// A closing tag is entirely contained by `T`. /// /// The empty tag is separated into two tokens in XIR---a /// [`Token::Open`] and a [`Token::Close`] with a [`None`] for the name. /// Unlike a typical closing tag, /// there is no `N` span available for the closing token, /// and so requesting one via [`EleSpan::name_span`] will simply /// return the `T` span, /// rather than complicating the API with an [`Option`]. /// It is generally assumed that reporting on element names will occur /// within the context of the _opening_ tag. /// /// The tag may contain whitespace following the element name, /// as permitted by `STag` and `ETag` in the /// [XML specification][xmlspec-tag]. /// /// [xmlspec-tag]: https://www.w3.org/TR/xml/#dt-stag pub trait EleSpan { /// A [`Span`] encompassing the entire opening element token. /// /// Note that what exactly this token represents varies. fn span(&self) -> Span; /// Span representing the relevant portion of the element tag. /// /// This is a more descriptive alias of [`EleSpan::span`] that may be /// appropriate in certain contexts. fn tag_span(&self) -> Span { self.span() } /// A [`Span`] representing only the element name, /// if available. /// /// An element name is _not_ available for empty tags. /// Rather than complicating the API with [`Option`], /// [`EleSpan::span`] is returned instead. fn name_span(&self) -> Span; } impl EleSpan for OpenSpan { fn span(&self) -> Span { match self { Self(t, _) => *t, } } fn name_span(&self) -> Span { match self { // // ^^^^ offset '<' and length of name // // If the length is 0, // then this will result in a 0-length span at the location // that the element name ought to be, // and so the resulting span will still be useful. // This should not happen for tokens read using XIR, // but may happen for system-generated tokens. Self(t, name_len) => { t.context().span(t.offset().saturating_add(1), *name_len) } } } } impl EleSpan for CloseSpan { fn span(&self) -> Span { match self { Self(t, _) => *t, } } fn name_span(&self) -> Span { match self { // If the length of the element name is 0, // then this must be an empty tag, // which contains no independent element name. // // // ' ^^ Self(_t, 0) => self.span(), // // ^^^^^ offset ' { t.context().span(t.offset().saturating_add(2), *name_len) } } } } impl From for Span { fn from(value: OpenSpan) -> Self { value.span() } } impl From for Span { fn from(value: CloseSpan) -> Self { value.span() } } /// Lightly-structured XML tokens with associated [`Span`]s. /// /// This is a streamable IR for XML. /// A writer requires knowledge only of a previous state, /// such as whether a node is open, /// and so this IR can be processed by a simple state machine /// (see [`writer::WriterState`]). #[derive(Debug, Clone, PartialEq, Eq)] pub enum Token { /// Opening tag of an element. Open(QName, OpenSpan), /// Closing tag of an element. /// /// If the name is [`None`], /// then the tag is self-closing. /// This is intended primarily as a safety measure: /// It allows writers to act as simple state machines without having /// to ensure balancing by indicating that a node was intended to /// self-close. /// Otherwise, /// we wouldn't know whether to self-close or to close and then /// create a new closing tag; /// if we blindly did the former, /// we risk losing a closing tag when it wasn't intended. /// Instead of losing tags, /// writers can error, /// indicating a bug in the stream. /// /// The reason for using an option here rather than a variant is to /// simplify pattern matching, /// given especially that bindings after `@` in patterns have not /// yet been stabalized at the time of writing (but are very /// close!). Close(Option, CloseSpan), /// Element attribute name. AttrName(QName, Span), /// Element attribute value. AttrValue(SymbolId, Span), /// A portion of an element attribute value. /// /// This allows for concatenating values into an attribute value without /// having to copy values. /// The last fragment must be a [`Token::AttrValue`]. /// /// Since each fragment contains a span, /// this also potentially gives higher resolution for the origin of /// components of generated attribute values. /// /// _This should be used only for writing._ /// These will never be encountered during reading, /// and so to keep the parsers and IRs simple, /// there is no support for fragments beyond XIR. /// (There was in the past, /// but it was removed.) AttrValueFragment(SymbolId, Span), /// Comment node. Comment(SymbolId, Span), /// Character data as part of an element. /// /// See also [`CData`](Token::CData) variant. Text(SymbolId, Span), /// CData node (``). /// /// _Warning: It is up to the caller to ensure that the string `]]>` is /// not present in the text!_ /// This is intended for reading existing XML data where CData is /// already present, /// not for producing new CData safely! CData(SymbolId, Span), } impl Display for Token { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // _Do not_ render large amounts of text here; // this is not only a risk depending on what is output, // but the diagnostic system also quote source lines to provide // the necessary context. match self { Self::Open(qname, _) => OpenXmlEle::fmt(qname, f), Self::Close(Some(qname), _) => CloseXmlEle::fmt(qname, f), // Its context is contained within the Open, // and hopefully any user-visible errors will display that instead. Self::Close(None, _) => { write!(f, "/>") } Self::AttrName(qname, _) => XmlAttr::fmt(qname, f), Self::AttrValue(attr_val, _) => XmlAttrValueQuote::fmt(attr_val, f), Self::AttrValueFragment(attr_val, _) => { write!( f, "value fragment {}", XmlAttrValueQuote::wrap(attr_val) ) } Self::Comment(..) => write!(f, "comment"), Self::Text(..) => write!(f, "text"), Self::CData(..) => write!(f, "CDATA"), } } } impl crate::parse::Token for Token { fn ir_name() -> &'static str { "XIR" } /// Retrieve the [`Span`] associated with a given [`Token`]. /// /// Every token has an associated span. fn span(&self) -> Span { use Token::*; match self { Open(_, OpenSpan(span, _)) | Close(_, CloseSpan(span, _)) | AttrName(_, span) | AttrValue(_, span) | AttrValueFragment(_, span) | Comment(_, span) | Text(_, span) | CData(_, span) => *span, } } } impl crate::parse::Object for Token {} #[cfg(test)] pub mod test { use super::*; use crate::convert::ExpectInto; use crate::sym::GlobalSymbolIntern; use std::convert::TryInto; use std::fmt::Debug; type TestResult = Result<(), Box>; // Prefer [`open`] below when possible. impl From for OpenSpan { fn from(span: Span) -> Self { Self::without_name_span(span) } } // Prefer [`close`] below when possible. impl From for CloseSpan { fn from(span: Span) -> Self { Self::without_name_span(span) } } /// Hastily and lazily produce a [`XirfToken::Open`]. /// /// This function is not suitable for production use as it does not /// produce a complete [`OpenSpan`]. pub fn open, S: Into>( qname: Q, span: S, ) -> Token where >::Error: Debug, { Token::Open(qname.unwrap_into(), span.into()) } /// Hastily and lazily produce a [`XirfToken::Close`] for an empty tag. /// /// This is [`close`] with the omission of the `qname` argument; the /// type parameter `Q` cannot be inferred if the value is [`None`]. /// /// This function is not suitable for production use as it does not /// produce a complete [`OpenSpan`]. pub fn close_empty>(span: S) -> Token { Token::Close(None, span.into()) } /// Hastily and lazily produce a [`XirfToken::Close`]. /// /// See also [`close_empty`] if `Q` cannot be inferred. /// /// This function is not suitable for production use as it does not /// produce a complete [`OpenSpan`]. pub fn close, S: Into>( qname: Option, span: S, ) -> Token where >::Error: Debug, { Token::Close(qname.map(ExpectInto::unwrap_into), span.into()) } mod name { use super::*; #[test] fn ncname_comparable_to_sym() { let foo = "foo".intern(); assert_eq!(NCName(foo), foo); } #[test] fn ncname_try_into_from_str_no_colon() -> TestResult { let name: NCName = "no-colon".try_into()?; assert_eq!(name, "no-colon".intern()); Ok(()) } #[test] fn ncname_try_into_from_str_fails_with_colon() { assert_eq!( NCName::try_from("look:a-colon"), Err(SpanlessError::NCColon("look:a-colon".into())) ); } #[test] fn ncname_from_byte_slice() -> TestResult { let name: NCName = (b"no-colon" as &[u8]).try_into()?; assert_eq!(name, "no-colon".intern()); Ok(()) } #[test] fn ncname_from_byte_slice_fails_with_colon() { assert_eq!( NCName::try_from(b"a:colon" as &[u8]), Err(SpanlessError::NCColon("a:colon".into())) ); } #[test] fn local_name_from_local_part_only() -> TestResult { let name = QName::new_local("foo".try_into()?); assert_eq!(name.local_name(), "foo".try_into()?); assert_eq!(None, name.prefix()); Ok(()) } #[test] fn local_name_from_option_tuple() -> TestResult { let name: QName = (Option::<&str>::None, "foo").try_into()?; assert_eq!(name.local_name(), "foo".try_into()?); assert_eq!(None, name.prefix()); Ok(()) } #[test] fn fully_qualified_name() -> TestResult { let name: QName = ("foons", "foo").try_into()?; assert_eq!(name.prefix(), Some("foons".try_into()?)); assert_eq!(name.local_name(), "foo".try_into()?); Ok(()) } } mod ele_span { use super::*; use crate::span::dummy::DUMMY_CONTEXT as DC; #[test] fn open_without_attrs() { // See docblock for [`EleSpan`]. const T: Span = DC.span(0, 8); // Relevant portion of tag const N: Span = DC.span(1, 4); // Element name let sut = OpenSpan(T, N.len()); assert_eq!(sut.span(), T); assert_eq!(sut.name_span(), N); } #[test] fn open_with_attrs() { // See docblock for [`EleSpan`]. const T: Span = DC.span(0, 5); // Relevant portion of tag const N: Span = DC.span(1, 4); // Element name let sut = OpenSpan(T, N.len()); assert_eq!(sut.span(), T); assert_eq!(sut.name_span(), N); } #[test] fn close() { // See docblock for [`EleSpan`]. const T: Span = DC.span(0, 10); // Relevant portion of tag const N: Span = DC.span(2, 5); // Element name let sut = CloseSpan(T, N.len()); assert_eq!(sut.span(), T); assert_eq!(sut.name_span(), N); } #[test] fn close_empty() { // See docblock for [`EleSpan`]. const T: Span = DC.span(9, 2); // Relevant portion of tag let sut = CloseSpan(T, 0); assert_eq!(sut.span(), T); // There is no name, // only Zuul. assert_eq!(sut.name_span(), T); } } }