(
local_sym: &T,
) -> Self {
Self(None, LocalPart(NCName(st_as_sym(local_sym))))
}
}
impl TryFrom<(P, L)> for QName
where
P: TryInto,
L: TryInto,
{
type Error = P::Error;
fn try_from(value: (P, L)) -> Result {
Ok(Self(Some(value.0.try_into()?), value.1.try_into()?))
}
}
impl TryFrom<(Option
, L)> for QName
where
P: TryInto,
L: TryInto,
{
type Error = P::Error;
fn try_from(value: (Option, L)) -> Result {
let ns = match value.0 {
None => None,
Some(ns) => Some(ns.try_into()?),
};
Ok(Self(ns, value.1.try_into()?))
}
}
impl TryFrom<&str> for QName {
type Error = Error;
fn try_from(value: &str) -> Result {
Ok(QName(None, value.try_into()?))
}
}
impl TryFrom<&[u8]> for QName {
type Error = Error;
/// Attempt to parse a byte slice into a [`QName`].
///
/// The byte slice must represent a valid QName in UTF-8.
/// If a colon is present,
/// it delimits the namespace [`Prefix`] and [`LocalPart`],
/// and therefore must not be in the first or last byte position.
fn try_from(name: &[u8]) -> Result {
match memchr(b':', name) {
// Leading colon means we're missing a prefix, trailing means
// that we have no local part.
Some(pos) if pos == 0 || pos == name.len() - 1 => {
Err(Error::InvalidQName(name.to_owned()))
}
// There is _at least_ one colon in the string.
Some(pos) => {
// The prefix is before the first colon,
// and so itself must not contain a colon and is therefore
// a valid NCName.
let prefix = NCName(name[..pos].intern_utf8()?);
// But there could be a _second_ colon,
// so the local part requires validation.
let local = NCName::try_from(&name[(pos + 1)..])?;
Ok(Self::new(Some(prefix.into()), local.into()))
}
// There are no colons in the string, so the entire string is
// both a local part and a valid NCName.
None => Ok(Self::new(None, NCName(name.intern_utf8()?).into())),
}
}
}
impl Display for QName {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
QName(Some(local), suffix) => write!(f, "{}:{}", local, suffix),
QName(None, suffix) => suffix.fmt(f),
}
}
}
/// Represents text and its escaped state.
///
/// Being explicit about the state of escaping allows us to skip checks when
/// we know that the generated text could not possibly require escaping.
/// This does, however, put the onus on the caller to ensure that they got
/// the escaping status correct.
/// (TODO: More information on why this burden isn't all that bad,
/// despite the risk.)
///
/// Both variants hold the text as an interned [`SymbolId`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Text {
/// Text node that requires escaping.
///
/// Unescaped text requires further processing before writing.
///
/// Note that,
/// since the unescaped text is interned,
/// it may be wasteful to intern a large text node with the intent of
/// escaping and re-interning later.
/// Instead,
/// if escaping is only needed for writing,
/// it is likely better to leave it to the writer to escape,
/// which does _not_ require interning of the resulting string.
Unescaped(SymbolId),
/// Text node that either has already been escaped or is known not to
/// require escaping.
///
/// Escaped text can be written as-is without any further processing.
Escaped(SymbolId),
}
/// Represents an attribute value and its escaped contents.
///
/// Being explicit about the state of escaping allows us to skip checks when
/// we know that the generated text could not possibly require escaping.
/// This does, however, put the onus on the caller to ensure that they got
/// the escaping status correct.
///
/// This is a newtype over [`XirString`];
/// the wrapped value is private to this type.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct AttrValue(XirString);
impl AttrValue {
/// Construct a constant escaped attribute from a static C-style symbol.
pub const fn st_cid(sym: CIdentStaticSymbolId) -> Self {
Self(XirString::st_cid(sym))
}
/// Construct a constant escaped attribute from a static URI symbol.
///
/// URIs are expected _not_ to contain quotes.
pub const fn st_uri(sym: UriStaticSymbolId) -> Self {
Self(XirString::st_uri(sym))
}
}
impl> From for AttrValue {
fn from(s: T) -> Self {
Self(s.into())
}
}
impl Into for AttrValue {
fn into(self) -> SymbolId {
self.0.into()
}
}
impl Display for AttrValue {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
/// Lightly-structured XML tokens with associated [`Span`]s.
///
/// This is a streamable IR for XML.
/// A writer requires knowledge only of a previous state,
/// such as whether a node is open,
/// and so this IR can be processed by a simple state machine
/// (see [`writer::WriterState`]).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
/// Opening tag of an element.
Open(QName, Span),
/// Closing tag of an element.
///
/// If the name is [`None`],
/// then the tag is self-closing.
/// This is intended primarily as a safety measure:
/// It allows writers to act as simple state machines without having
/// to ensure balancing by indicating that a node was intended to
/// self-close.
/// Otherwise,
/// we wouldn't know whether to self-close or to close and then
/// create a new closing tag;
/// if we blindly did the former,
/// we risk losing a closing tag when it wasn't intended.
/// Instead of losing tags,
/// writers can error,
/// indicating a bug in the stream.
///
/// The reason for using an option here rather than a variant is to
/// simplify pattern matching,
/// given especially that bindings after `@` in patterns have not
/// yet been stabalized at the time of writing (but are very
/// close!).
Close(Option, Span),
/// Element attribute name.
AttrName(QName, Span),
/// Element attribute value.
AttrValue(AttrValue, Span),
/// A portion of an element attribute value.
///
/// This allows for concatenating values into an attribute value without
/// having to copy values.
/// The last fragment must be a [`Token::AttrValue`].
///
/// Since each fragment contains a span,
/// this also potentially gives higher resolution for the origin of
/// components of generated attribute values.
AttrValueFragment(AttrValue, Span),
/// A delimiter indicating that attribute processing has ended and the
/// next token will be either a child node or [`Token::Close`].
///
/// This allows for streaming attribute collection without any
/// lookahead,
/// which would otherwise require an iterator supporting a `peek`
/// operation.
///
/// This is mandatory for _readers_ to produce,
/// but _writers must ignore it and not require it to be present_,
/// allowing for the reduction of token counts for generated XIR in
/// situations where we know that it will not be further parsed.
AttrEnd,
/// Comment node.
Comment(Text, Span),
/// Character data as part of an element.
///
/// See also [`CData`](Token::CData) variant.
Text(Text, Span),
/// CData node (``).
///
/// See also [`Text`](Token::Text) variant.
///
/// _Warning: It is up to the caller to ensure that the string `]]>` is
/// not present in the text!_
/// This is intended for reading existing XML data where CData is
/// already present,
/// not for producing new CData safely!
CData(Text, Span),
/// Similar to `Text`,
/// but intended for use where only whitespace is allowed,
/// such as alignment of attributes.
Whitespace(Whitespace, Span),
}
/// Human-readable description of a token and where it came from.
///
/// Every variant that carries a [`Span`] ends with `at <span>`.
/// The contents of comment, text and CDATA nodes are deliberately not
///   rendered;
///     only the kind of node and its span are shown.
impl Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Open(qname, span) => write!(f, "`<{}>` at {}", qname, span),

            // A named close is rendered as a closing tag,
            //   mirroring the opening tag above.
            Self::Close(Some(qname), span) => {
                write!(f, "`</{}>` at {}", qname, span)
            }

            // Its context is contained within the Open,
            // and hopefully any user-visible errors will display that instead.
            Self::Close(None, span) => {
                write!(f, "self-closing tag at {}", span)
            }

            Self::AttrName(qname, span) => {
                write!(f, "`@{}` at {}", qname, span)
            }

            Self::AttrValue(attr_val, span) => {
                write!(f, "attribute value `{}` at {}", attr_val, span)
            }

            Self::AttrValueFragment(attr_val, span) => {
                write!(f, "attribute value fragment `{}` at {}", attr_val, span)
            }

            Self::AttrEnd => write!(f, "end of attributes"),

            // TODO: Safe truncated comment.
            Self::Comment(_, span) => write!(f, "comment at {}", span),

            // TODO: Safe truncated text.
            Self::Text(_, span) => write!(f, "text at {}", span),

            // TODO: Safe truncated CDATA.
            Self::CData(_, span) => write!(f, "CDATA at {}", span),

            Self::Whitespace(ws, span) => write!(f, "`{}` at {}", ws, span),
        }
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use crate::sym::GlobalSymbolIntern;
    use std::convert::TryInto;

    // Allows tests to use `?` on any fallible conversion under test.
    type TestResult = Result<(), Box<dyn std::error::Error>>;

    lazy_static! {
        static ref S: Span =
            Span::from_byte_interval((0, 0), "test case".intern());
    }

    mod name {
        use super::*;

        #[test]
        fn ncname_comparable_to_sym() {
            let foo = "foo".intern();
            assert_eq!(NCName(foo), foo);
        }

        #[test]
        fn ncname_try_into_from_str_no_colon() -> TestResult {
            let name: NCName = "no-colon".try_into()?;
            assert_eq!(name, "no-colon".intern());

            Ok(())
        }

        #[test]
        fn ncname_try_into_from_str_fails_with_colon() {
            assert_eq!(
                NCName::try_from("look:a-colon"),
                Err(Error::NCColon("look:a-colon".into()))
            );
        }

        #[test]
        fn ncname_from_byte_slice() -> TestResult {
            let name: NCName = (b"no-colon" as &[u8]).try_into()?;
            assert_eq!(name, "no-colon".intern());

            Ok(())
        }

        #[test]
        fn ncname_from_byte_slice_fails_with_colon() {
            assert_eq!(
                NCName::try_from(b"a:colon" as &[u8]),
                Err(Error::NCColon("a:colon".into()))
            );
        }

        #[test]
        fn local_name_from_local_part_only() -> TestResult {
            let name = QName::new_local("foo".try_into()?);

            assert_eq!(name.local_name(), "foo".try_into()?);
            assert_eq!(None, name.prefix());

            Ok(())
        }

        #[test]
        fn local_name_from_option_tuple() -> TestResult {
            let name: QName = (Option::<&str>::None, "foo").try_into()?;

            assert_eq!(name.local_name(), "foo".try_into()?);
            assert_eq!(None, name.prefix());

            Ok(())
        }

        #[test]
        fn fully_qualified_name() -> TestResult {
            let name: QName = ("foons", "foo").try_into()?;

            assert_eq!(name.prefix(), Some("foons".try_into()?));
            assert_eq!(name.local_name(), "foo".try_into()?);

            Ok(())
        }
    }

    #[test]
    fn whitespace() -> TestResult {
        assert_eq!(Whitespace::try_from(" ")?, " ".try_into()?);
        assert_eq!(Whitespace::try_from(" \t ")?, " \t ".try_into()?);

        assert_eq!(
            Whitespace::try_from("not ws!"),
            Err(Error::NotWhitespace("not ws!".into()))
        );

        Ok(())
    }

    #[test]
    fn whitespace_as_text() -> TestResult {
        assert_eq!(
            Text::Escaped(" ".intern()),
            Whitespace::try_from(" ")?.into(),
        );

        Ok(())
    }
}