// XML IR (XIR) // // Copyright (C) 2014-2021 Ryan Specialty Group, LLC. // // This file is part of TAME. // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see . //! Intermediate representation (IR) of an XML document. //! //! XIR serves not only as a TAMER-specific IR, //! but also as an abstraction layer atop of whatever XML library is //! used (e.g. `quick_xml`). //! XIR is _not_ intended to be comprehensive, //! or even general-purpose---it //! exists to solve concerns specific to TAMER's construction. //! //! Parsing and Safety //! ================== //! Many XIR elements know how to safely parse into themselves, //! exposing [`TryFrom`] traits that will largely do the right thing for //! you. //! For example, //! [`QName`] is able to construct itself from a byte slice and from a //! string tuple, //! among other things. //! //! ``` //! use tamer::xir::QName; //! use tamer::sym::GlobalSymbolIntern; //! //!# fn main() -> Result<(), tamer::xir::Error> { //! let src = "foo:bar".as_bytes(); //! let qname = QName::try_from(src)?; //! //! assert_eq!(qname, ("foo", "bar").try_into()?); //! //!# Ok(()) //!# } //! ``` //! //! However, //! certain elements cannot fully parse on their own because require //! important contextual information, //! such as [`AttrValue`], //! which requires knowing whether the provided value is escaped. //! It is important that the caller is diligent in making the proper //! determination in these cases, //! otherwise it could result in situations ranging from invalid //! compiler output to security vulnerabilities //! (via XML injection). //! //! To parse an entire XML document, //! see [`reader`]. use crate::span::Span; use crate::sym::{ st_as_sym, CIdentStaticSymbolId, GlobalSymbolIntern, GlobalSymbolInternBytes, StaticSymbolId, SymbolId, TameIdentStaticSymbolId, UriStaticSymbolId, }; use memchr::memchr; use std::convert::{TryFrom, TryInto}; use std::fmt::Display; use std::ops::Deref; mod error; pub use error::Error; pub mod iter; pub mod pred; pub mod reader; pub mod tree; pub mod writer; /// An infallible [`Token`] stream. /// /// If the token stream originates from an operation that could potentially /// fail and ought to be propagated, /// use [`TokenResultStream`]. /// /// The name "stream" in place of "iterator" is intended to convey that this /// type is expected to be processed in real-time as a stream, /// not read into memory. pub trait TokenStream = Iterator; /// A [`Token`] stream that may encounter errors during parsing. /// /// If the stream cannot fail, /// consider using [`TokenStream`]. pub trait TokenResultStream = Iterator>; /// A static symbol that can be safely converted into a [`QName`] without /// any checks. /// /// This must only be implemented on static symbol types that are known to /// be valid QNames. pub trait QNameCompatibleStaticSymbolId: StaticSymbolId {} impl QNameCompatibleStaticSymbolId for CIdentStaticSymbolId {} impl QNameCompatibleStaticSymbolId for TameIdentStaticSymbolId {} #[doc(hidden)] macro_rules! qname_const_inner { ($name:ident = :$local:ident) => { const $name: crate::xir::QName = crate::xir::QName::st_cid_local(&$local); }; ($name:ident = $prefix:ident:$local:ident) => { const $name: crate::xir::QName = crate::xir::QName::st_cid(&$prefix, &$local); }; } /// Construct a series of [`QName`] constants. /// /// The syntax for each constant is `NAME: [PREFIX]:LOCAL`, /// where `PREFIX` is optional. /// /// See [`crate::sym::st`] for usable symbol constants. #[macro_export] macro_rules! qname_const { ($($name:ident: $($prefix:ident)? : $local:ident,)*) => { $( qname_const_inner!($name = $($prefix)?:$local); )* } } /// XML Name minus `":"`. /// /// The intent is to check a string for validity _before_ interning; /// otherwise, /// the string would have to be first retrieved from the intern pool /// for comparison, /// which is not an operation we want to do implicitly. /// Those methods will be created as they are needed. /// /// See . #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct NCName(SymbolId); impl NCName { /// Create a new NCName from a symbol without validating that the symbol /// is a valid NCName. /// /// Safety /// ====== /// This is not unsafe in the traditional sense; /// it's unsafe in a sense similar to non-UTF-8 `str` slices, /// in that it is expected that an `NCName` means that you do not /// have to worry about whether it's syntatically valid as XML. pub unsafe fn new_unchecked(value: SymbolId) -> Self { Self(value) } } impl TryFrom<&[u8]> for NCName { type Error = Error; /// Attempt to parse a byte slice into an [`NCName`]. /// /// If the slice contains `b':'`, /// an error will be produced. /// No other checks are performed beyond checking that the byte sequence /// represents a valid UTF-8 string. /// The string will be interned for you. fn try_from(value: &[u8]) -> Result { match value.contains(&b':') { true => Err(Error::NCColon(value.to_owned())), false => Ok(NCName(value.intern_utf8()?)), } } } impl Deref for NCName { type Target = SymbolId; fn deref(&self) -> &Self::Target { &self.0 } } impl PartialEq for NCName { fn eq(&self, other: &SymbolId) -> bool { self.0 == *other } } impl TryFrom<&str> for NCName { type Error = Error; fn try_from(value: &str) -> Result { if value.contains(':') { return Err(Error::NCColon(value.into())); } Ok(Self(value.intern())) } } /// Namespace prefix of a [`QName`]. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct Prefix(NCName); /// Local name portion of a [`QName`]. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct LocalPart(NCName); impl Deref for Prefix { type Target = SymbolId; fn deref(&self) -> &Self::Target { self.0.deref() } } impl Deref for LocalPart { type Target = SymbolId; fn deref(&self) -> &Self::Target { self.0.deref() } } impl From for Prefix { fn from(name: NCName) -> Self { Self(name) } } impl From for LocalPart { fn from(name: NCName) -> Self { Self(name) } } impl TryFrom<&str> for Prefix { type Error = Error; fn try_from(value: &str) -> Result { Ok(Self(value.try_into()?)) } } impl TryFrom<&str> for LocalPart { type Error = Error; fn try_from(value: &str) -> Result { Ok(Self(value.try_into()?)) } } impl Display for Prefix { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) } } impl Display for LocalPart { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) } } /// A sequence of one or more whitespace characters. /// /// Whitespace here is expected to consist of `[ \n\t\r]` /// (where the first character in that class is a space). #[derive(Debug, Clone, PartialEq, Eq)] pub struct Whitespace(SymbolId); impl Deref for Whitespace { type Target = SymbolId; fn deref(&self) -> &Self::Target { &self.0 } } impl TryFrom<&str> for Whitespace { type Error = Error; fn try_from(value: &str) -> Result { // We do not expect this to ever be a large value based on how we // use it. // If it is, well, someone's doing something they ought not to be // and we're not going to optimize for it. if !value.as_bytes().iter().all(u8::is_ascii_whitespace) { return Err(Error::NotWhitespace(value.into())); } Ok(Self(value.intern())) } } impl From for Text { fn from(ws: Whitespace) -> Self { // Whitespace needs no escaping Self::Escaped(ws.0) } } impl Display for Whitespace { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) } } /// A qualified name (namespace prefix and local name). #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct QName(Option, LocalPart); // Since we implement Copy, ensure size matches our expectations: const_assert!(std::mem::size_of::() <= std::mem::size_of::()); impl QName { /// Create a new fully-qualified name (including both a namespace URI /// and local name). pub fn new(prefix: Option, local_name: LocalPart) -> Self { Self(prefix, local_name) } /// Create a new name from a local name only. /// /// This should only be used for attributes in TAMER, /// since all elements should have an associated namespace. /// /// _(If this is ever not true (e.g. due to new targets), /// please update this comment.)_ pub fn new_local(local_name: LocalPart) -> Self { Self(None, local_name) } /// Fully qualified namespace associated with a name. pub fn prefix(&self) -> Option { self.0 } /// Local part of a name (name without namespace). pub fn local_name(&self) -> LocalPart { self.1 } /// Construct a constant QName from static C-style symbols. pub const fn st_cid(prefix_sym: &T, local_sym: &U) -> Self where T: QNameCompatibleStaticSymbolId, U: QNameCompatibleStaticSymbolId, { Self( Some(Prefix(NCName(st_as_sym(prefix_sym)))), LocalPart(NCName(st_as_sym(local_sym))), ) } /// Construct a constant QName with a local name only from a static /// C-style symbol. pub const fn st_cid_local( local_sym: &T, ) -> Self { Self(None, LocalPart(NCName(st_as_sym(local_sym)))) } } impl TryFrom<(P, L)> for QName where P: TryInto, L: TryInto, { type Error = P::Error; fn try_from(value: (P, L)) -> Result { Ok(Self(Some(value.0.try_into()?), value.1.try_into()?)) } } impl TryFrom<(Option

, L)> for QName where P: TryInto, L: TryInto, { type Error = P::Error; fn try_from(value: (Option

, L)) -> Result { let ns = match value.0 { None => None, Some(ns) => Some(ns.try_into()?), }; Ok(Self(ns, value.1.try_into()?)) } } impl TryFrom<&str> for QName { type Error = Error; fn try_from(value: &str) -> Result { Ok(QName(None, value.try_into()?)) } } impl TryFrom<&[u8]> for QName { type Error = Error; /// Attempt to parse a byte slice into a [`QName`]. /// /// The byte slice must represent a valid QName in UTF-8. /// If a colon is present, /// it delimits the namespace [`Prefix`] and [`LocalPart`], /// and therefore must not be in the first or last byte position. fn try_from(name: &[u8]) -> Result { match memchr(b':', name) { // Leading colon means we're missing a prefix, trailing means // that we have no local part. Some(pos) if pos == 0 || pos == name.len() - 1 => { Err(Error::InvalidQName(name.to_owned())) } // There is _at least_ one colon in the string. Some(pos) => { // The prefix is before the first colon, // and so itself must not contain a colon and is therefore // a valid NCName. let prefix = NCName(name[..pos].intern_utf8()?); // But there could be a _second_ colon, // so the local part requires validation. let local = NCName::try_from(&name[(pos + 1)..])?; Ok(Self::new(Some(prefix.into()), local.into())) } // There are no colons in the string, so the entire string is // both a local part and a valid NCName. None => Ok(Self::new(None, NCName(name.intern_utf8()?).into())), } } } impl Display for QName { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { QName(Some(local), suffix) => write!(f, "{}:{}", local, suffix), QName(None, suffix) => suffix.fmt(f), } } } /// Represents text and its escaped state. /// /// Being explicit about the state of escaping allows us to skip checks when /// we know that the generated text could not possibly require escaping. /// This does, however, put the onus on the caller to ensure that they got /// the escaping status correct. /// (TODO: More information on why this burden isn"t all that bad, /// despite the risk.) #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Text { /// Text node that requires escaping. /// /// Unescaped text requires further processing before writing. /// /// Note that, /// since the unescaped text is interned, /// it may be wasteful to intern a large text node with the intent of /// escaping and re-interning later. /// Instead, /// if escaping is only needed for writing, /// it is likely better to leave it to the writer to escape, /// which does _not_ require interning of the resulting string. Unescaped(SymbolId), /// Text node that either has already been escaped or is known not to /// require escaping. /// /// Escaped text can be written as-is without any further processing. Escaped(SymbolId), } /// Represents an attribute value and its escaped contents. /// /// Being explicit about the state of escaping allows us to skip checks when /// we know that the generated text could not possibly require escaping. /// This does, however, put the onus on the caller to ensure that they got /// the escaping status correct. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum AttrValue { /// Value that requires escaping. /// /// Unescaped values require further processing before writing. Unescaped(SymbolId), /// Value that either has already been escaped or is known not to /// require escaping. /// /// Escaped values can be written as-is without any further processing. Escaped(SymbolId), } impl AttrValue { /// Construct a constant escaped attribute from a static C-style symbol. pub const fn st_cid(sym: CIdentStaticSymbolId) -> Self { Self::Escaped(sym.as_sym()) } /// Construct a constant escaped attribute from a static URI symbol. /// /// URIs are expected _not_ to contain quotes. pub const fn st_uri(sym: UriStaticSymbolId) -> Self { Self::Escaped(sym.as_sym()) } } impl Display for AttrValue { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Escaped(value) => value.fmt(f), _ => todo!("AttrValue::Unescaped fmt"), } } } /// Lightly-structured XML tokens with associated [`Span`]s. /// /// This is a streamable IR for XML. /// A writer requires knowledge only of a previous state, /// such as whether a node is open, /// and so this IR can be processed by a simple state machine /// (see [`writer::WriterState`]). #[derive(Debug, Clone, PartialEq, Eq)] pub enum Token { /// Opening tag of an element. Open(QName, Span), /// Closing tag of an element. /// /// If the name is [`None`], /// then the tag is self-closing. /// This is intended primarily as a safety measure: /// It allows writers to act as simple state machines without having /// to ensure balancing by indicating that a node was intended to /// self-close. /// Otherwise, /// we wouldn't know whether to self-close or to close and then /// create a new closing tag; /// if we blindly did the former, /// we risk losing a closing tag when it wasn't intended. /// Instead of losing tags, /// writers can error, /// indicating a bug in the stream. /// /// The reason for using an option here rather than a variant is to /// simplify pattern matching, /// given especially that bindings after `@` in patterns have not /// yet been stabalized at the time of writing (but are very /// close!). Close(Option, Span), /// Element attribute name. AttrName(QName, Span), /// Element attribute value. AttrValue(AttrValue, Span), /// A portion of an element attribute value. /// /// This allows for concatenating values into an attribute value without /// having to copy values. /// The last fragment must be a [`Token::AttrValue`]. /// /// Since each fragment contains a span, /// this also potentially gives higher resolution for the origin of /// components of generated attribute values. AttrValueFragment(AttrValue, Span), /// A delimiter indicating that attribute processing has ended and the /// next token will be either a child node or [`Token::Close`]. /// /// This allows for streaming attribute collection without any /// lookahead, /// which would otherwise require an iterator supporting a `peek` /// operation. /// /// This is mandatory for _readers_ to produce, /// but _writers must ignore it and not require it to be present_, /// allowing for the reduction of token counts for generated XIR in /// situations where we know that it will not be further parsed. AttrEnd, /// Comment node. Comment(Text, Span), /// Character data as part of an element. /// /// See also [`CData`](Token::CData) variant. Text(Text, Span), /// CData node (``). /// /// See also [`Text`](Token::Text) variant. /// /// _Warning: It is up to the caller to ensure that the string `]]>` is /// not present in the text!_ /// This is intended for reading existing XML data where CData is /// already present, /// not for producing new CData safely! CData(Text, Span), /// Similar to `Text`, /// but intended for use where only whitespace is allowed, /// such as alignment of attributes. Whitespace(Whitespace, Span), } impl Display for Token { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Open(qname, span) => write!(f, "`<{}>` at {}", qname, span), Self::Close(Some(qname), span) => { write!(f, "`` at {}", qname, span) } // Its context is contained within the Open, // and hopefully any user-visible errors will display that instead. Self::Close(None, span) => { write!(f, "self-closing tag at {}", span) } Self::AttrName(qname, span) => { write!(f, "`@{}` at {}", qname, span) } Self::AttrValue(attr_val, span) => { write!(f, "attribute value `{}` at {}", attr_val, span) } Self::AttrValueFragment(attr_val, span) => { write!(f, "attribute value fragment `{}` at {}", attr_val, span) } Self::AttrEnd => write!(f, "end of attributes"), // TODO: Safe truncated comment. Self::Comment(_, span) => write!(f, "comment at {}", span), // TODO: Safe truncated text. Self::Text(_, span) => write!(f, "text at {}", span), // TODO: Safe truncated CDATA. Self::CData(_, span) => write!(f, "CDATA at {}", span), Self::Whitespace(ws, span) => write!(f, "`{}` at {}", ws, span), } } } #[cfg(test)] mod test { use super::*; use crate::sym::GlobalSymbolIntern; use std::convert::TryInto; type TestResult = Result<(), Box>; lazy_static! { static ref S: Span = Span::from_byte_interval((0, 0), "test case".intern()); } mod name { use super::*; #[test] fn ncname_comparable_to_sym() { let foo = "foo".intern(); assert_eq!(NCName(foo), foo); } #[test] fn ncname_try_into_from_str_no_colon() -> TestResult { let name: NCName = "no-colon".try_into()?; assert_eq!(name, "no-colon".intern()); Ok(()) } #[test] fn ncname_try_into_from_str_fails_with_colon() { assert_eq!( NCName::try_from("look:a-colon"), Err(Error::NCColon("look:a-colon".into())) ); } #[test] fn ncname_from_byte_slice() -> TestResult { let name: NCName = (b"no-colon" as &[u8]).try_into()?; assert_eq!(name, "no-colon".intern()); Ok(()) } #[test] fn ncname_from_byte_slice_fails_with_colon() { assert_eq!( NCName::try_from(b"a:colon" as &[u8]), Err(Error::NCColon("a:colon".into())) ); } #[test] fn local_name_from_local_part_only() -> TestResult { let name = QName::new_local("foo".try_into()?); assert_eq!(name.local_name(), "foo".try_into()?); assert_eq!(None, name.prefix()); Ok(()) } #[test] fn local_name_from_option_tuple() -> TestResult { let name: QName = (Option::<&str>::None, "foo").try_into()?; assert_eq!(name.local_name(), "foo".try_into()?); assert_eq!(None, name.prefix()); Ok(()) } #[test] fn fully_qualified_name() -> TestResult { let name: QName = ("foons", "foo").try_into()?; assert_eq!(name.prefix(), Some("foons".try_into()?)); assert_eq!(name.local_name(), "foo".try_into()?); Ok(()) } } #[test] fn whitespace() -> TestResult { assert_eq!(Whitespace::try_from(" ")?, " ".try_into()?); assert_eq!(Whitespace::try_from(" \t ")?, " \t ".try_into()?); assert_eq!( Whitespace::try_from("not ws!"), Err(Error::NotWhitespace("not ws!".into())) ); Ok(()) } #[test] fn whitespace_as_text() -> TestResult { assert_eq!( Text::Escaped(" ".intern()), Whitespace::try_from(" ")?.into(), ); Ok(()) } }