2021-08-20 10:09:55 -04:00
|
|
|
// XML IR (XIR)
|
|
|
|
//
|
|
|
|
// Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
|
|
|
|
//
|
|
|
|
// This file is part of TAME.
|
|
|
|
//
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
|
|
// it under the terms of the GNU General Public License as published by
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
// (at your option) any later version.
|
|
|
|
//
|
|
|
|
// This program is distributed in the hope that it will be useful,
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU General Public License
|
|
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
//! Intermediate representation (IR) of an XML document.
|
|
|
|
//!
|
|
|
|
//! XIR serves not only as a TAMER-specific IR,
|
|
|
|
//! but also as an abstraction layer atop whatever XML library is
|
|
|
|
//! used (e.g. `quick_xml`).
|
|
|
|
//! XIR is _not_ intended to be comprehensive,
|
|
|
|
//! or even general-purpose---it
|
|
|
|
//! exists to solve concerns specific to TAMER's construction.
|
|
|
|
//!
|
2021-10-21 16:17:17 -04:00
|
|
|
//! Parsing and Safety
|
|
|
|
//! ==================
|
|
|
|
//! Many XIR elements know how to safely parse into themselves,
|
|
|
|
//! exposing [`TryFrom`] traits that will largely do the right thing for
|
|
|
|
//! you.
|
|
|
|
//! For example,
|
|
|
|
//! [`QName`] is able to construct itself from a byte slice and from a
|
|
|
|
//! string tuple,
|
|
|
|
//! among other things.
|
|
|
|
//!
|
|
|
|
//! ```
|
2021-11-04 16:12:15 -04:00
|
|
|
//! use tamer::xir::QName;
|
2021-10-21 16:17:17 -04:00
|
|
|
//! use tamer::sym::GlobalSymbolIntern;
|
|
|
|
//!
|
2021-11-04 16:12:15 -04:00
|
|
|
//!# fn main() -> Result<(), tamer::xir::Error> {
|
2021-10-21 16:17:17 -04:00
|
|
|
//! let src = "foo:bar".as_bytes();
|
|
|
|
//! let qname = QName::try_from(src)?;
|
|
|
|
//!
|
|
|
|
//! assert_eq!(qname, ("foo", "bar").try_into()?);
|
|
|
|
//!
|
|
|
|
//!# Ok(())
|
|
|
|
//!# }
|
|
|
|
//! ```
|
|
|
|
//!
|
|
|
|
//! To parse an entire XML document,
|
|
|
|
//! see [`reader`].
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
use crate::span::Span;
|
2021-09-28 14:52:31 -04:00
|
|
|
use crate::sym::{
|
2021-10-21 16:17:17 -04:00
|
|
|
st_as_sym, CIdentStaticSymbolId, GlobalSymbolIntern,
|
|
|
|
GlobalSymbolInternBytes, StaticSymbolId, SymbolId, TameIdentStaticSymbolId,
|
2021-09-28 14:52:31 -04:00
|
|
|
};
|
2021-10-21 16:17:17 -04:00
|
|
|
use memchr::memchr;
|
2021-08-20 10:09:55 -04:00
|
|
|
use std::convert::{TryFrom, TryInto};
|
2021-11-02 13:55:33 -04:00
|
|
|
use std::fmt::Display;
|
2021-08-20 10:09:55 -04:00
|
|
|
use std::ops::Deref;
|
|
|
|
|
2021-10-21 16:17:17 -04:00
|
|
|
mod error;
|
|
|
|
pub use error::Error;
|
|
|
|
|
tamer: xir::escape: Remove XirString in favor of Escaper
This rewrites a good portion of the previous commit.
Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.
Given that, we need only unescape on read and escape on write. This is
customary, so why didn't I do that to begin with?
The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming. However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around. If we share the Escaper between _all_
readers and the writer, the result is that
1. Duplicate strings between source files and object files (many of which
are read by both the linker and compiler) avoid re-unescaping; and
2. Writers can use this cache to avoid re-escaping when we've already seen
the escaped variant of the string during read.
The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.
DEV-11081
2021-11-12 13:59:14 -05:00
|
|
|
mod escape;
|
2021-11-12 16:07:57 -05:00
|
|
|
pub use escape::{DefaultEscaper, Escaper};
|
tamer: xir::XirString: WIP implementation (likely going away)
I'm not fond of this implementation, which is why it's not fully
completed. I wanted to commit this for future reference, and take the
opportunity to explain why I don't like it.
First: this task started as an idea to implement a third variant to
AttrValue and friends that indicates that a value is fixed, in the sense of
a fixed-point function: escaped or unescaped, its value is the same. This
would allow us to skip wasteful escape/unescape operations.
In doing so, it became obvious that there's no need to leak this information
through the API, and indeed, no part of the system should care. When we
read XML, it should be unescaped, and when we write, it should be
escaped. The reason that this didn't quite happen to begin with was an
optimization: I'll be creating an echo writer in place of the current
filesystem-based copy in tamec shortly, and this would allow streaming XIR
directly from the reader to the writer without any unescaping or
re-escaping.
When we unescape, we know the value that it came from, so we could simply
store both symbols---they're 32-bit, so it results in a nicely compressed
64-bit value, so it's essentially cost-free, as long as we accept the
expense of internment. This is `XirString`. Then, when we want to escape
or unescape, we first check to see whether a symbol already exists and, if
so, use it.
While this works well for echoing streams, it won't work all that well in
practice: the unescaped SymbolId will be taken and the XirString discarded,
since nothing after XIR should be coupled with it. Then, when we later
construct a XIR stream for writing, XirString will no longer be available
and our previously known escape is lost, so the writer will have to
re-escape.
Further, if we look at XirString's generic for the XirStringEscaper---it
uses phantom, which hints that maybe it's not in the best place. Indeed,
I've already acknowledged that only a reader unescapes and only a writer
escapes, and that the rest of the system works with normal (unescaped)
values, so only readers and writers should be part of this process. I also
already acknowledged that XirString would be lost and only the unescaped
SymbolId would be used.
So what's the point of XirString, then, if it won't be a useful optimization
beyond the temporary echo writer?
Instead, we can take the XirStringWriter and implement two caches on that:
mapping SymbolId from escaped->unescaped and vice-versa. These can be
simple vectors, since SymbolId is a 32-bit value we will not have much
wasted space for symbols that never get read or written. We could even
optimize for preinterned symbols using markers, though I'll probably not do
so, and I'll explain why later.
If we do _that_, we get even _better_ optimizations through caching that
_will_ apply in the general case (so, not just for echo), and we're able to
ditch XirString entirely and simply use a SymbolId. This makes for a much
more friendly API that isn't leaking implementation details, though it
_does_ put an onus on the caller to pass the encoder to both the reader and
the writer, _if_ it wants to take advantage of a cache. But that burden is
not significant (and is, again, optional if we don't want it).
So, that'll be the next step.
2021-11-10 09:42:18 -05:00
|
|
|
|
2022-03-17 16:10:56 -04:00
|
|
|
pub mod attr;
|
2022-03-17 12:20:20 -04:00
|
|
|
pub mod flat;
|
2021-10-11 09:34:17 -04:00
|
|
|
pub mod iter;
|
2021-12-23 13:17:18 -05:00
|
|
|
pub mod parse;
|
2021-09-28 14:52:31 -04:00
|
|
|
pub mod pred;
|
2021-10-21 16:17:17 -04:00
|
|
|
pub mod reader;
|
2021-09-08 13:53:47 -04:00
|
|
|
pub mod tree;
|
2021-08-20 10:09:55 -04:00
|
|
|
pub mod writer;
|
|
|
|
|
2021-10-28 21:21:30 -04:00
|
|
|
/// An infallible [`Token`] stream.
|
|
|
|
///
|
|
|
|
/// If the token stream originates from an operation that could potentially
|
|
|
|
/// fail and ought to be propagated,
|
|
|
|
/// use [`TokenResultStream`].
|
|
|
|
///
|
|
|
|
/// The name "stream" in place of "iterator" is intended to convey that this
|
|
|
|
/// type is expected to be processed in real-time as a stream,
|
|
|
|
/// not read into memory.
|
|
|
|
pub trait TokenStream = Iterator<Item = Token>;
|
|
|
|
|
|
|
|
/// A [`Token`] stream that may encounter errors during parsing.
|
|
|
|
///
|
|
|
|
/// If the stream cannot fail,
|
|
|
|
/// consider using [`TokenStream`].
|
|
|
|
pub trait TokenResultStream = Iterator<Item = Result<Token, Error>>;
|
|
|
|
|
2021-10-11 11:56:03 -04:00
|
|
|
/// A static symbol that can be safely converted into a [`QName`] without
|
|
|
|
/// any checks.
|
|
|
|
///
|
|
|
|
/// This must only be implemented on static symbol types that are known to
|
|
|
|
/// be valid QNames.
|
2021-10-02 00:50:20 -04:00
|
|
|
pub trait QNameCompatibleStaticSymbolId: StaticSymbolId {}
|
2021-10-11 11:56:03 -04:00
|
|
|
|
2021-10-02 00:50:20 -04:00
|
|
|
impl QNameCompatibleStaticSymbolId for CIdentStaticSymbolId {}
|
2021-10-05 16:13:47 -04:00
|
|
|
impl QNameCompatibleStaticSymbolId for TameIdentStaticSymbolId {}
|
2021-10-02 00:50:20 -04:00
|
|
|
|
2021-10-21 16:17:17 -04:00
|
|
|
// Expands a single `NAME = [PREFIX]:LOCAL` entry into a `const` `QName`
// item.
#[doc(hidden)]
macro_rules! qname_const_inner {
    // Local part only (no prefix).
    ($const_name:ident = :$local_sym:ident) => {
        const $const_name: crate::xir::QName =
            crate::xir::QName::st_cid_local(&$local_sym);
    };

    // Both a prefix and a local part.
    ($const_name:ident = $prefix_sym:ident:$local_sym:ident) => {
        const $const_name: crate::xir::QName =
            crate::xir::QName::st_cid(&$prefix_sym, &$local_sym);
    };
}
|
|
|
|
|
|
|
|
/// Define any number of [`QName`] constants.
///
/// Each entry is written as `NAME: [PREFIX]:LOCAL,`
/// (the trailing comma is required after every entry),
/// where `PREFIX` may be omitted.
///
/// See [`crate::sym::st`] for usable symbol constants.
#[macro_export]
macro_rules! qname_const {
    ($($const_name:ident: $($prefix_sym:ident)? : $local_sym:ident,)*) => {
        $(
            qname_const_inner!($const_name = $($prefix_sym)?:$local_sym);
        )*
    }
}
|
|
|
|
|
2021-10-11 11:51:51 -04:00
|
|
|
/// XML Name minus `":"`.
|
|
|
|
///
|
|
|
|
/// The intent is to check a string for validity _before_ interning;
|
|
|
|
/// otherwise,
|
|
|
|
/// the string would have to be first retrieved from the intern pool
|
|
|
|
/// for comparison,
|
|
|
|
/// which is not an operation we want to do implicitly.
|
|
|
|
/// Those methods will be created as they are needed.
|
|
|
|
///
|
|
|
|
/// See <https://www.w3.org/TR/REC-xml-names/#NT-NCName>.
|
|
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
|
|
pub struct NCName(SymbolId);
|
2021-08-20 10:09:55 -04:00
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl NCName {
|
2021-08-20 10:09:55 -04:00
|
|
|
/// Create a new NCName from a symbol without validating that the symbol
|
|
|
|
/// is a valid NCName.
|
|
|
|
///
|
|
|
|
/// Safety
|
|
|
|
/// ======
|
|
|
|
/// This is not unsafe in the traditional sense;
|
|
|
|
/// it's unsafe in a sense similar to non-UTF-8 `str` slices,
|
|
|
|
/// in that it is expected that an `NCName` means that you do not
|
|
|
|
/// have to worry about whether it's syntatically valid as XML.
|
2021-09-23 14:52:53 -04:00
|
|
|
pub unsafe fn new_unchecked(value: SymbolId) -> Self {
|
2021-08-20 10:09:55 -04:00
|
|
|
Self(value)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-10-21 16:17:17 -04:00
|
|
|
impl TryFrom<&[u8]> for NCName {
|
|
|
|
type Error = Error;
|
|
|
|
|
|
|
|
/// Attempt to parse a byte slice into an [`NCName`].
|
|
|
|
///
|
|
|
|
/// If the slice contains `b':'`,
|
|
|
|
/// an error will be produced.
|
|
|
|
/// No other checks are performed beyond checking that the byte sequence
|
|
|
|
/// represents a valid UTF-8 string.
|
|
|
|
/// The string will be interned for you.
|
|
|
|
fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
|
|
|
|
match value.contains(&b':') {
|
|
|
|
true => Err(Error::NCColon(value.to_owned())),
|
|
|
|
false => Ok(NCName(value.intern_utf8()?)),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-10-11 11:51:51 -04:00
|
|
|
impl Deref for NCName {
|
|
|
|
type Target = SymbolId;
|
|
|
|
|
|
|
|
fn deref(&self) -> &Self::Target {
|
|
|
|
&self.0
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl PartialEq<SymbolId> for NCName {
|
|
|
|
fn eq(&self, other: &SymbolId) -> bool {
|
|
|
|
self.0 == *other
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl TryFrom<&str> for NCName {
|
2021-08-20 10:09:55 -04:00
|
|
|
type Error = Error;
|
|
|
|
|
|
|
|
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
|
|
|
if value.contains(':') {
|
|
|
|
return Err(Error::NCColon(value.into()));
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(Self(value.intern()))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-10-11 11:56:03 -04:00
|
|
|
/// Namespace prefix of a [`QName`].
|
2021-08-20 10:09:55 -04:00
|
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
2021-09-23 14:52:53 -04:00
|
|
|
pub struct Prefix(NCName);
|
2021-10-11 11:56:03 -04:00
|
|
|
|
|
|
|
/// Local name portion of a [`QName`].
|
2021-08-20 10:09:55 -04:00
|
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
2021-09-23 14:52:53 -04:00
|
|
|
pub struct LocalPart(NCName);
|
2021-08-20 10:09:55 -04:00
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl Deref for Prefix {
|
|
|
|
type Target = SymbolId;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
fn deref(&self) -> &Self::Target {
|
|
|
|
self.0.deref()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl Deref for LocalPart {
|
|
|
|
type Target = SymbolId;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
fn deref(&self) -> &Self::Target {
|
|
|
|
self.0.deref()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl From<NCName> for Prefix {
|
|
|
|
fn from(name: NCName) -> Self {
|
2021-08-20 10:09:55 -04:00
|
|
|
Self(name)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl From<NCName> for LocalPart {
|
|
|
|
fn from(name: NCName) -> Self {
|
2021-08-20 10:09:55 -04:00
|
|
|
Self(name)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl TryFrom<&str> for Prefix {
|
2021-08-20 10:09:55 -04:00
|
|
|
type Error = Error;
|
|
|
|
|
|
|
|
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
|
|
|
Ok(Self(value.try_into()?))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl TryFrom<&str> for LocalPart {
|
2021-08-20 10:09:55 -04:00
|
|
|
type Error = Error;
|
|
|
|
|
|
|
|
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
|
|
|
Ok(Self(value.try_into()?))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-02 13:55:33 -04:00
|
|
|
impl Display for Prefix {
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
self.0.fmt(f)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Display for LocalPart {
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
self.0.fmt(f)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-10-11 11:56:03 -04:00
|
|
|
/// A sequence of one or more whitespace characters.
|
|
|
|
///
|
|
|
|
/// Whitespace here is expected to consist of `[ \n\t\r]`
|
|
|
|
/// (where the first character in that class is a space).
|
2021-10-11 10:33:24 -04:00
|
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
2021-09-23 14:52:53 -04:00
|
|
|
pub struct Whitespace(SymbolId);
|
2021-08-20 10:09:55 -04:00
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl Deref for Whitespace {
|
|
|
|
type Target = SymbolId;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
fn deref(&self) -> &Self::Target {
|
|
|
|
&self.0
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl TryFrom<&str> for Whitespace {
|
2021-08-20 10:09:55 -04:00
|
|
|
type Error = Error;
|
|
|
|
|
|
|
|
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
|
|
|
// We do not expect this to ever be a large value based on how we
|
|
|
|
// use it.
|
|
|
|
// If it is, well, someone's doing something they ought not to be
|
|
|
|
// and we're not going to optimize for it.
|
|
|
|
if !value.as_bytes().iter().all(u8::is_ascii_whitespace) {
|
|
|
|
return Err(Error::NotWhitespace(value.into()));
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(Self(value.intern()))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-15 23:47:14 -05:00
|
|
|
impl From<Whitespace> for SymbolId {
|
2021-09-23 14:52:53 -04:00
|
|
|
fn from(ws: Whitespace) -> Self {
|
2021-11-15 23:47:14 -05:00
|
|
|
ws.0
|
2021-08-20 10:09:55 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-03 14:54:37 -04:00
|
|
|
impl Display for Whitespace {
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
self.0.fmt(f)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-20 10:09:55 -04:00
|
|
|
/// A qualified name (namespace prefix and local name).
|
|
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
2021-09-23 14:52:53 -04:00
|
|
|
pub struct QName(Option<Prefix>, LocalPart);
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
// Since we implement Copy, ensure size matches our expectations:
|
2021-09-23 14:52:53 -04:00
|
|
|
const_assert!(std::mem::size_of::<QName>() <= std::mem::size_of::<usize>());
|
2021-08-20 10:09:55 -04:00
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl QName {
|
2021-08-20 10:09:55 -04:00
|
|
|
/// Create a new fully-qualified name (including both a namespace URI
|
|
|
|
/// and local name).
|
2021-10-21 16:17:17 -04:00
|
|
|
pub fn new(prefix: Option<Prefix>, local_name: LocalPart) -> Self {
|
|
|
|
Self(prefix, local_name)
|
2021-08-20 10:09:55 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Create a new name from a local name only.
|
|
|
|
///
|
|
|
|
/// This should only be used for attributes in TAMER,
|
|
|
|
/// since all elements should have an associated namespace.
|
|
|
|
///
|
|
|
|
/// _(If this is ever not true (e.g. due to new targets),
|
|
|
|
/// please update this comment.)_
|
2021-09-23 14:52:53 -04:00
|
|
|
pub fn new_local(local_name: LocalPart) -> Self {
|
2021-08-20 10:09:55 -04:00
|
|
|
Self(None, local_name)
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Fully qualified namespace associated with a name.
|
2021-09-23 14:52:53 -04:00
|
|
|
pub fn prefix(&self) -> Option<Prefix> {
|
2021-08-20 10:09:55 -04:00
|
|
|
self.0
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Local part of a name (name without namespace).
|
2021-09-23 14:52:53 -04:00
|
|
|
pub fn local_name(&self) -> LocalPart {
|
2021-08-20 10:09:55 -04:00
|
|
|
self.1
|
|
|
|
}
|
2021-09-28 14:52:31 -04:00
|
|
|
|
|
|
|
/// Construct a constant QName from static C-style symbols.
|
2021-10-05 16:13:47 -04:00
|
|
|
pub const fn st_cid<T, U>(prefix_sym: &T, local_sym: &U) -> Self
|
|
|
|
where
|
|
|
|
T: QNameCompatibleStaticSymbolId,
|
|
|
|
U: QNameCompatibleStaticSymbolId,
|
|
|
|
{
|
2021-09-28 14:52:31 -04:00
|
|
|
Self(
|
2021-10-05 16:13:47 -04:00
|
|
|
Some(Prefix(NCName(st_as_sym(prefix_sym)))),
|
|
|
|
LocalPart(NCName(st_as_sym(local_sym))),
|
2021-09-28 14:52:31 -04:00
|
|
|
)
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Construct a constant QName with a local name only from a static
|
|
|
|
/// C-style symbol.
|
2021-10-05 16:13:47 -04:00
|
|
|
pub const fn st_cid_local<T: QNameCompatibleStaticSymbolId>(
|
|
|
|
local_sym: &T,
|
|
|
|
) -> Self {
|
|
|
|
Self(None, LocalPart(NCName(st_as_sym(local_sym))))
|
2021-09-28 14:52:31 -04:00
|
|
|
}
|
2021-08-20 10:09:55 -04:00
|
|
|
}
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl<P, L> TryFrom<(P, L)> for QName
|
2021-08-20 10:09:55 -04:00
|
|
|
where
|
2021-09-23 14:52:53 -04:00
|
|
|
P: TryInto<Prefix>,
|
|
|
|
L: TryInto<LocalPart, Error = P::Error>,
|
2021-08-20 10:09:55 -04:00
|
|
|
{
|
|
|
|
type Error = P::Error;
|
|
|
|
|
|
|
|
fn try_from(value: (P, L)) -> Result<Self, Self::Error> {
|
|
|
|
Ok(Self(Some(value.0.try_into()?), value.1.try_into()?))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl<P, L> TryFrom<(Option<P>, L)> for QName
|
2021-09-08 13:53:47 -04:00
|
|
|
where
|
2021-09-23 14:52:53 -04:00
|
|
|
P: TryInto<Prefix>,
|
|
|
|
L: TryInto<LocalPart, Error = P::Error>,
|
2021-09-08 13:53:47 -04:00
|
|
|
{
|
|
|
|
type Error = P::Error;
|
|
|
|
|
|
|
|
fn try_from(value: (Option<P>, L)) -> Result<Self, Self::Error> {
|
|
|
|
let ns = match value.0 {
|
|
|
|
None => None,
|
|
|
|
Some(ns) => Some(ns.try_into()?),
|
|
|
|
};
|
|
|
|
|
|
|
|
Ok(Self(ns, value.1.try_into()?))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
impl TryFrom<&str> for QName {
|
2021-09-08 13:53:47 -04:00
|
|
|
type Error = Error;
|
|
|
|
|
|
|
|
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
|
|
|
Ok(QName(None, value.try_into()?))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-10-21 16:17:17 -04:00
|
|
|
impl TryFrom<&[u8]> for QName {
|
|
|
|
type Error = Error;
|
|
|
|
|
|
|
|
/// Attempt to parse a byte slice into a [`QName`].
|
|
|
|
///
|
|
|
|
/// The byte slice must represent a valid QName in UTF-8.
|
|
|
|
/// If a colon is present,
|
|
|
|
/// it delimits the namespace [`Prefix`] and [`LocalPart`],
|
|
|
|
/// and therefore must not be in the first or last byte position.
|
|
|
|
fn try_from(name: &[u8]) -> Result<Self, Self::Error> {
|
|
|
|
match memchr(b':', name) {
|
|
|
|
// Leading colon means we're missing a prefix, trailing means
|
|
|
|
// that we have no local part.
|
|
|
|
Some(pos) if pos == 0 || pos == name.len() - 1 => {
|
|
|
|
Err(Error::InvalidQName(name.to_owned()))
|
|
|
|
}
|
|
|
|
|
|
|
|
// There is _at least_ one colon in the string.
|
|
|
|
Some(pos) => {
|
|
|
|
// The prefix is before the first colon,
|
|
|
|
// and so itself must not contain a colon and is therefore
|
|
|
|
// a valid NCName.
|
|
|
|
let prefix = NCName(name[..pos].intern_utf8()?);
|
|
|
|
|
|
|
|
// But there could be a _second_ colon,
|
|
|
|
// so the local part requires validation.
|
|
|
|
let local = NCName::try_from(&name[(pos + 1)..])?;
|
|
|
|
|
|
|
|
Ok(Self::new(Some(prefix.into()), local.into()))
|
|
|
|
}
|
|
|
|
|
|
|
|
// There are no colons in the string, so the entire string is
|
|
|
|
// both a local part and a valid NCName.
|
|
|
|
None => Ok(Self::new(None, NCName(name.intern_utf8()?).into())),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-02 13:55:33 -04:00
|
|
|
impl Display for QName {
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
match self {
|
|
|
|
QName(Some(local), suffix) => write!(f, "{}:{}", local, suffix),
|
|
|
|
QName(None, suffix) => suffix.fmt(f),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-20 10:30:27 -04:00
|
|
|
/// Lightly-structured XML tokens with associated [`Span`]s.
|
|
|
|
///
|
|
|
|
/// This is a streamable IR for XML.
|
|
|
|
/// A writer requires knowledge only of a previous state,
|
|
|
|
/// such as whether a node is open,
|
|
|
|
/// and so this IR can be processed by a simple state machine
|
|
|
|
/// (see [`writer::WriterState`]).
|
2021-10-11 10:33:24 -04:00
|
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
2021-09-23 14:52:53 -04:00
|
|
|
pub enum Token {
|
2021-08-20 10:09:55 -04:00
|
|
|
/// Opening tag of an element.
|
2021-09-23 14:52:53 -04:00
|
|
|
Open(QName, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
/// Closing tag of an element.
|
|
|
|
///
|
2021-09-13 12:58:54 -04:00
|
|
|
/// If the name is [`None`],
|
|
|
|
/// then the tag is self-closing.
|
2021-08-20 10:09:55 -04:00
|
|
|
/// This is intended primarily as a safety measure:
|
|
|
|
/// It allows writers to act as simple state machines without having
|
|
|
|
/// to ensure balancing by indicating that a node was intended to
|
|
|
|
/// self-close.
|
|
|
|
/// Otherwise,
|
|
|
|
/// we wouldn't know whether to self-close or to close and then
|
|
|
|
/// create a new closing tag;
|
|
|
|
/// if we blindly did the former,
|
|
|
|
/// we risk losing a closing tag when it wasn't intended.
|
|
|
|
/// Instead of losing tags,
|
|
|
|
/// writers can error,
|
|
|
|
/// indicating a bug in the stream.
|
2021-09-13 12:58:54 -04:00
|
|
|
///
|
|
|
|
/// The reason for using an option here rather than a variant is to
|
|
|
|
/// simplify pattern matching,
|
|
|
|
/// given especially that bindings after `@` in patterns have not
|
|
|
|
/// yet been stabalized at the time of writing (but are very
|
|
|
|
/// close!).
|
2021-09-23 14:52:53 -04:00
|
|
|
Close(Option<QName>, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
2021-09-21 00:13:03 -04:00
|
|
|
/// Element attribute name.
|
2021-09-23 14:52:53 -04:00
|
|
|
AttrName(QName, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
2021-09-21 00:13:03 -04:00
|
|
|
/// Element attribute value.
|
tamer: xir::escape: Remove XirString in favor of Escaper
This rewrites a good portion of the previous commit.
Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.
Given that, we need only unescape on read and escape on write. This is
customary, so why didn't I do that to begin with?
The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming. However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around. If we share the Escaper between _all_
readers and the writer, the result is that
1. Duplicate strings between source files and object files (many of which
are read by both the linker and compiler) avoid re-unescaping; and
2. Writers can use this cache to avoid re-escaping when we've already seen
the escaped variant of the string during read.
The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.
DEV-11081
2021-11-12 13:59:14 -05:00
|
|
|
AttrValue(SymbolId, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
2021-09-21 00:13:03 -04:00
|
|
|
/// A portion of an element attribute value.
|
|
|
|
///
|
|
|
|
/// This allows for concatenating values into an attribute value without
|
|
|
|
/// having to copy values.
|
|
|
|
/// The last fragment must be a [`Token::AttrValue`].
|
|
|
|
///
|
2021-09-21 10:43:23 -04:00
|
|
|
/// Since each fragment contains a span,
|
|
|
|
/// this also potentially gives higher resolution for the origin of
|
|
|
|
/// components of generated attribute values.
|
2021-12-06 14:26:58 -05:00
|
|
|
///
|
|
|
|
/// _This should be used only for writing._
|
|
|
|
/// These will never be encountered during reading,
|
|
|
|
/// and so to keep the parsers and IRs simple,
|
|
|
|
/// there is no support for fragments beyond XIR.
|
|
|
|
/// (There was in the past,
|
|
|
|
/// but it was removed.)
|
tamer: xir::escape: Remove XirString in favor of Escaper
This rewrites a good portion of the previous commit.
Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.
Given that, we need only unescape on read and escape on write. This is
customary, so why didn't I do that to begin with?
The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming. However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around. If we share the Escaper between _all_
readers and the writer, the result is that
1. Duplicate strings between source files and object files (many of which
are read by both the linker and compiler) avoid re-unescaping; and
2. Writers can use this cache to avoid re-escaping when we've already seen
the escaped variant of the string during read.
The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.
DEV-11081
2021-11-12 13:59:14 -05:00
|
|
|
AttrValueFragment(SymbolId, Span),
|
2021-09-21 00:13:03 -04:00
|
|
|
|
2021-08-20 10:09:55 -04:00
|
|
|
/// Comment node.
|
2021-11-15 23:47:14 -05:00
|
|
|
Comment(SymbolId, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
/// Character data as part of an element.
|
|
|
|
///
|
2021-08-20 10:30:27 -04:00
|
|
|
/// See also [`CData`](Token::CData) variant.
|
2021-11-15 23:47:14 -05:00
|
|
|
Text(SymbolId, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
/// CData node (`<![CDATA[...]]>`).
|
|
|
|
///
|
|
|
|
/// _Warning: It is up to the caller to ensure that the string `]]>` is
|
|
|
|
/// not present in the text!_
|
|
|
|
/// This is intended for reading existing XML data where CData is
|
|
|
|
/// already present,
|
|
|
|
/// not for producing new CData safely!
|
2021-11-15 23:47:14 -05:00
|
|
|
CData(SymbolId, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
/// Similar to `Text`,
|
|
|
|
/// but intended for use where only whitespace is allowed,
|
|
|
|
/// such as alignment of attributes.
|
2021-09-23 14:52:53 -04:00
|
|
|
Whitespace(Whitespace, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
}
|
|
|
|
|
2021-11-03 14:54:37 -04:00
|
|
|
impl Display for Token {
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
match self {
|
|
|
|
Self::Open(qname, span) => write!(f, "`<{}>` at {}", qname, span),
|
|
|
|
Self::Close(Some(qname), span) => {
|
|
|
|
write!(f, "`</{}>` at {}", qname, span)
|
|
|
|
}
|
|
|
|
// Its context is contained within the Open,
|
|
|
|
// and hopefully any user-visible errors will display that instead.
|
|
|
|
Self::Close(None, span) => {
|
|
|
|
write!(f, "self-closing tag at {}", span)
|
|
|
|
}
|
|
|
|
Self::AttrName(qname, span) => {
|
|
|
|
write!(f, "`@{}` at {}", qname, span)
|
|
|
|
}
|
|
|
|
Self::AttrValue(attr_val, span) => {
|
|
|
|
write!(f, "attribute value `{}` at {}", attr_val, span)
|
|
|
|
}
|
|
|
|
Self::AttrValueFragment(attr_val, span) => {
|
|
|
|
write!(f, "attribute value fragment `{}` at {}", attr_val, span)
|
|
|
|
}
|
|
|
|
// TODO: Safe truncated comment.
|
|
|
|
Self::Comment(_, span) => write!(f, "comment at {}", span),
|
|
|
|
// TODO: Safe truncated text.
|
|
|
|
Self::Text(_, span) => write!(f, "text at {}", span),
|
|
|
|
// TODO: Safe truncated CDATA.
|
|
|
|
Self::CData(_, span) => write!(f, "CDATA at {}", span),
|
|
|
|
Self::Whitespace(ws, span) => write!(f, "`{}` at {}", ws, span),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-12-06 14:48:55 -05:00
|
|
|
impl Token {
|
|
|
|
/// Retrieve the [`Span`] associated with a given [`Token`].
|
|
|
|
///
|
|
|
|
/// Every token has an associated span.
|
|
|
|
pub fn span(&self) -> Span {
|
|
|
|
use Token::*;
|
|
|
|
|
|
|
|
match self {
|
|
|
|
Open(_, span)
|
|
|
|
| Close(_, span)
|
|
|
|
| AttrName(_, span)
|
|
|
|
| AttrValue(_, span)
|
|
|
|
| AttrValueFragment(_, span)
|
|
|
|
| Comment(_, span)
|
|
|
|
| Text(_, span)
|
|
|
|
| CData(_, span)
|
|
|
|
| Whitespace(_, span) => *span,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl From<Token> for Span {
|
|
|
|
fn from(tok: Token) -> Self {
|
|
|
|
tok.span()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-20 10:09:55 -04:00
|
|
|
#[cfg(test)]
mod test {
    use super::*;
    use crate::sym::GlobalSymbolIntern;
    use std::convert::TryInto;

    type TestResult = Result<(), Box<dyn std::error::Error>>;

    lazy_static! {
        static ref S: Span =
            Span::from_byte_interval((0, 0), "test case".intern());
    }

    mod name {
        use super::*;

        // An NCName can be compared directly against the symbol it wraps.
        #[test]
        fn ncname_comparable_to_sym() {
            let sym = "foo".intern();
            assert_eq!(NCName(sym), sym);
        }

        #[test]
        fn ncname_try_into_from_str_no_colon() -> TestResult {
            let given = "no-colon";
            let name: NCName = given.try_into()?;

            assert_eq!(name, given.intern());
            Ok(())
        }

        #[test]
        fn ncname_try_into_from_str_fails_with_colon() {
            let result = NCName::try_from("look:a-colon");

            assert_eq!(result, Err(Error::NCColon("look:a-colon".into())));
        }

        #[test]
        fn ncname_from_byte_slice() -> TestResult {
            let bytes: &[u8] = b"no-colon";
            let name: NCName = bytes.try_into()?;

            assert_eq!(name, "no-colon".intern());
            Ok(())
        }

        #[test]
        fn ncname_from_byte_slice_fails_with_colon() {
            let bytes: &[u8] = b"a:colon";

            assert_eq!(
                NCName::try_from(bytes),
                Err(Error::NCColon("a:colon".into()))
            );
        }

        #[test]
        fn local_name_from_local_part_only() -> TestResult {
            let qname = QName::new_local("foo".try_into()?);

            assert_eq!(qname.local_name(), "foo".try_into()?);
            assert_eq!(None, qname.prefix());

            Ok(())
        }

        #[test]
        fn local_name_from_option_tuple() -> TestResult {
            let qname: QName = (Option::<&str>::None, "foo").try_into()?;

            assert_eq!(qname.local_name(), "foo".try_into()?);
            assert_eq!(None, qname.prefix());

            Ok(())
        }

        #[test]
        fn fully_qualified_name() -> TestResult {
            let qname: QName = ("foons", "foo").try_into()?;

            assert_eq!(qname.prefix(), Some("foons".try_into()?));
            assert_eq!(qname.local_name(), "foo".try_into()?);

            Ok(())
        }
    }

    #[test]
    fn whitespace() -> TestResult {
        // Valid whitespace converts identically regardless of which
        // conversion trait is used.
        for ws in [" ", " \t "].iter().copied() {
            assert_eq!(Whitespace::try_from(ws)?, ws.try_into()?);
        }

        assert_eq!(
            Whitespace::try_from("not ws!"),
            Err(Error::NotWhitespace("not ws!".into()))
        );

        Ok(())
    }

    #[test]
    fn whitespace_as_text() -> TestResult {
        let ws = Whitespace::try_from(" ")?;

        assert_eq!(" ".intern(), ws.into());

        Ok(())
    }
}
|