(
local_sym: &T,
) -> Self {
Self(None, LocalPart(NCName(st_as_sym(local_sym))))
}
}
impl TryFrom<(P, L)> for QName
where
P: TryInto,
L: TryInto,
{
type Error = P::Error;
fn try_from(value: (P, L)) -> Result {
Ok(Self(Some(value.0.try_into()?), value.1.try_into()?))
}
}
impl TryFrom<(Option
, L)> for QName
where
P: TryInto,
L: TryInto,
{
type Error = P::Error;
fn try_from(value: (Option, L)) -> Result {
let ns = match value.0 {
None => None,
Some(ns) => Some(ns.try_into()?),
};
Ok(Self(ns, value.1.try_into()?))
}
}
impl TryFrom<&str> for QName {
type Error = SpanlessError;
fn try_from(value: &str) -> Result {
Ok(QName(None, value.try_into()?))
}
}
impl TryFrom<&[u8]> for QName {
type Error = SpanlessError;
/// Attempt to parse a byte slice into a [`QName`].
///
/// The byte slice must represent a valid QName in UTF-8.
/// If a colon is present,
/// it delimits the namespace [`Prefix`] and [`LocalPart`],
/// and therefore must not be in the first or last byte position.
fn try_from(name: &[u8]) -> Result {
match memchr(b':', name) {
// Leading colon means we're missing a prefix, trailing means
// that we have no local part.
Some(pos) if pos == 0 || pos == name.len() - 1 => {
Err(SpanlessError::InvalidQName(name.intern_utf8()?))
}
// There is _at least_ one colon in the string.
Some(pos) => {
// The prefix is before the first colon,
// and so itself must not contain a colon and is therefore
// a valid NCName.
let prefix = NCName(name[..pos].intern_utf8()?);
// But there could be a _second_ colon,
// so the local part requires validation.
let local = NCName::try_from(&name[(pos + 1)..])?;
Ok(Self::new(Some(prefix.into()), local.into()))
}
// There are no colons in the string, so the entire string is
// both a local part and a valid NCName.
None => Ok(Self::new(None, NCName(name.intern_utf8()?).into())),
}
}
}
/// Render a [`QName`] as `prefix:local` when a prefix is present,
///   and as the bare local part otherwise.
impl Display for QName {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let QName(maybe_prefix, local_part) = self;

        if let Some(prefix) = maybe_prefix {
            write!(f, "{}:{}", prefix, local_part)
        } else {
            // No prefix and therefore no delimiter;
            //   defer entirely to the local part.
            local_part.fmt(f)
        }
    }
}
/// Span of an element's opening (starting) tag,
///   paired with the byte length of the element name.
///
/// See [`EleSpan`] for more information.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct OpenSpan(Span, EleNameLen);

impl OpenSpan {
    /// Create an [`OpenSpan`] from a tag span alone,
    ///   recording an element name length of zero.
    pub fn without_name_span(span: Span) -> Self {
        let name_len: EleNameLen = 0;

        OpenSpan(span, name_len)
    }
}
/// Span of an element's closing (ending) tag,
///   paired with the byte length of the element name.
///
/// See [`EleSpan`] for more information.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct CloseSpan(Span, EleNameLen);

impl CloseSpan {
    /// A [`CloseSpan`] representing the closing of an empty tag.
    ///
    /// This type of span has no element name,
    ///   and so its name length is zero.
    pub fn empty(span: Span) -> Self {
        CloseSpan(span, 0)
    }

    /// Create a [`CloseSpan`] from a tag span alone,
    ///   recording an element name length of zero.
    pub fn without_name_span(span: Span) -> Self {
        let name_len: EleNameLen = 0;

        CloseSpan(span, name_len)
    }
}
/// Number of bytes representing the name of the element.
///
/// A value of `0` is what [`OpenSpan::without_name_span`] and
/// [`CloseSpan::without_name_span`] record when only a tag span is
/// provided.
pub type EleNameLen = SpanLenSize;
/// Spans associated with an element opening or closing tag.
///
/// The diagram below illustrates the behavior of [`EleSpan`].
/// Spans are represented by `[---]` intervals,
/// with the byte offset at each end,
/// and the single-letter span name centered below the interval.
/// The element and attribute names shown are illustrative only.
///
/// ```text
///   <open  >   <open attr="v">   </close  >   <empty   />
///   |[--]  |   |[--]             | [---]  |   |[---] ' []
///   |1  4  |   |1  4             | 2   6  |   |1   5 ' 9`10
///   | N    |   | N |             |   N    |   |  N  | ' T
///   |      |   |   |             |        |   |    | '
///   [------]   [---]             [--------]   [----] '
///   0      7   0   4             0        9   0    5 '
///      T         T                   T          T    '
/// ```
///
/// Above we have
///
/// - `T` = [`EleSpan::span`]; and
/// - `N` = [`EleSpan::name_span`].
///
/// The purpose of the `T` span is to represent the entire token that has
/// been emitted by XIR.
/// If an opening tag does not contain any attributes,
/// then `T` represents the entire opening tag with both the opening and
/// closing angle brackets.
/// If an opening tag is expected to contain attributes,
/// then only the opening angle bracket and the element name are
/// included.
/// A closing tag is entirely contained by `T`.
///
/// The empty tag is separated into two tokens in XIR---a
/// [`Token::Open`] and a [`Token::Close`] with a [`None`] for the name.
/// Unlike a typical closing tag,
/// there is no `N` span available for the closing token,
/// and so requesting one via [`EleSpan::name_span`] will simply
/// return the `T` span,
/// rather than complicating the API with an [`Option`].
/// It is generally assumed that reporting on element names will occur
/// within the context of the _opening_ tag.
///
/// The tag may contain whitespace following the element name,
/// as permitted by `STag` and `ETag` in the
/// [XML specification][xmlspec-tag].
///
/// [xmlspec-tag]: https://www.w3.org/TR/xml/#dt-stag
pub trait EleSpan {
/// A [`Span`] encompassing the entire element token,
/// whether opening or closing.
///
/// Note that what exactly this token represents varies;
/// see the trait-level documentation.
fn span(&self) -> Span;
/// Span representing the relevant portion of the element tag.
///
/// This is a more descriptive alias of [`EleSpan::span`] that may be
/// appropriate in certain contexts.
/// The default implementation simply delegates to [`EleSpan::span`].
fn tag_span(&self) -> Span {
self.span()
}
/// A [`Span`] representing only the element name,
/// if available.
///
/// An element name is _not_ available for empty tags.
/// Rather than complicating the API with [`Option`],
/// [`EleSpan::span`] is returned instead.
fn name_span(&self) -> Span;
}
impl EleSpan for OpenSpan {
    /// The span of the opening tag token itself.
    fn span(&self) -> Span {
        let Self(tag, _) = self;
        *tag
    }

    /// The span of the element name within the opening tag.
    ///
    /// The name begins one byte past the start of the tag,
    ///   immediately following the opening `<`.
    ///
    /// If the recorded name length is 0,
    ///   then the result is a 0-length span at the location that the
    ///   element name ought to be,
    ///     and so the resulting span will still be useful.
    /// This should not happen for tokens read using XIR,
    ///   but may happen for system-generated tokens.
    fn name_span(&self) -> Span {
        let Self(tag, name_len) = self;

        // Skip over the leading `<`.
        let name_offset = tag.offset().saturating_add(1);

        tag.context().span(name_offset, *name_len)
    }
}
impl EleSpan for CloseSpan {
    /// The span of the closing tag token itself.
    fn span(&self) -> Span {
        let Self(tag, _) = self;
        *tag
    }

    /// The span of the element name within the closing tag,
    ///   or the tag span itself if there is no name.
    fn name_span(&self) -> Span {
        let Self(tag, name_len) = self;

        // If the length of the element name is 0,
        //   then this must be an empty tag (`/>`),
        //   which contains no independent element name.
        if *name_len == 0 {
            return self.span();
        }

        // The name begins two bytes past the start of the tag,
        //   immediately following the leading `</`.
        let name_offset = tag.offset().saturating_add(2);

        tag.context().span(name_offset, *name_len)
    }
}
impl From for Span {
fn from(value: OpenSpan) -> Self {
value.span()
}
}
impl From for Span {
fn from(value: CloseSpan) -> Self {
value.span()
}
}
/// Lightly-structured XML tokens with associated [`Span`]s.
///
/// This is a streamable IR for XML.
/// A writer requires knowledge only of a previous state,
/// such as whether a node is open,
/// and so this IR can be processed by a simple state machine
/// (see [`writer::WriterState`]).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
/// Opening tag of an element.
Open(QName, OpenSpan),
/// Closing tag of an element.
///
/// If the name is [`None`],
/// then the tag is self-closing.
/// This is intended primarily as a safety measure:
/// It allows writers to act as simple state machines without having
/// to ensure balancing by indicating that a node was intended to
/// self-close.
/// Otherwise,
/// we wouldn't know whether to self-close or to close and then
/// create a new closing tag;
/// if we blindly did the former,
/// we risk losing a closing tag when it wasn't intended.
/// Instead of losing tags,
/// writers can error,
/// indicating a bug in the stream.
///
/// The reason for using an option here rather than a variant is to
/// simplify pattern matching,
/// given especially that bindings after `@` in patterns have not
/// yet been stabalized at the time of writing (but are very
/// close!).
Close(Option, CloseSpan),
/// Element attribute name.
AttrName(QName, Span),
/// Element attribute value.
AttrValue(SymbolId, Span),
/// A portion of an element attribute value.
///
/// This allows for concatenating values into an attribute value without
/// having to copy values.
/// The last fragment must be a [`Token::AttrValue`].
///
/// Since each fragment contains a span,
/// this also potentially gives higher resolution for the origin of
/// components of generated attribute values.
///
/// _This should be used only for writing._
/// These will never be encountered during reading,
/// and so to keep the parsers and IRs simple,
/// there is no support for fragments beyond XIR.
/// (There was in the past,
/// but it was removed.)
AttrValueFragment(SymbolId, Span),
/// Comment node.
Comment(SymbolId, Span),
/// Character data as part of an element.
///
/// See also [`CData`](Token::CData) variant.
Text(SymbolId, Span),
/// CData node (``).
///
/// _Warning: It is up to the caller to ensure that the string `]]>` is
/// not present in the text!_
/// This is intended for reading existing XML data where CData is
/// already present,
/// not for producing new CData safely!
CData(SymbolId, Span),
}
/// Brief, human-readable description of a [`Token`] suitable for
///   diagnostic messages.
impl Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // _Do not_ render large amounts of text here;
        //   this is not only a risk depending on what is output,
        //   but the diagnostic system also quotes source lines to provide
        //   the necessary context.
        match self {
            Self::Open(name, _) => OpenXmlEle::fmt(name, f),

            Self::Close(Some(name), _) => CloseXmlEle::fmt(name, f),

            // A self-closing tag has no name of its own to display.
            // Its context is contained within the Open,
            //   and hopefully any user-visible errors will display that
            //   instead.
            Self::Close(None, _) => f.write_str("/>"),

            Self::AttrName(name, _) => XmlAttr::fmt(name, f),

            Self::AttrValue(value, _) => XmlAttrValueQuote::fmt(value, f),

            Self::AttrValueFragment(value, _) => {
                let quoted = XmlAttrValueQuote::wrap(value);
                write!(f, "value fragment {}", quoted)
            }

            // The contents of these nodes are intentionally omitted.
            Self::Comment(..) => f.write_str("comment"),
            Self::Text(..) => f.write_str("text"),
            Self::CData(..) => f.write_str("CDATA"),
        }
    }
}
impl crate::parse::Token for Token {
fn ir_name() -> &'static str {
"XIR"
}
/// Retrieve the [`Span`] associated with a given [`Token`].
///
/// Every token has an associated span.
fn span(&self) -> Span {
use Token::*;
match self {
Open(_, OpenSpan(span, _))
| Close(_, CloseSpan(span, _))
| AttrName(_, span)
| AttrValue(_, span)
| AttrValueFragment(_, span)
| Comment(_, span)
| Text(_, span)
| CData(_, span) => *span,
}
}
}
// Marks [`Token`] as a `crate::parse::Object`;
// the impl body is empty, so no trait items are defined here.
impl crate::parse::Object for Token {}
#[cfg(test)]
pub mod test {
    use super::*;
    use crate::convert::ExpectInto;
    use crate::sym::GlobalSymbolIntern;
    use std::convert::TryInto;
    use std::fmt::Debug;

    type TestResult = Result<(), Box<dyn std::error::Error>>;

    // Prefer [`open`] below when possible.
    impl From<Span> for OpenSpan {
        fn from(span: Span) -> Self {
            Self::without_name_span(span)
        }
    }

    // Prefer [`close`] below when possible.
    impl From<Span> for CloseSpan {
        fn from(span: Span) -> Self {
            Self::without_name_span(span)
        }
    }

    /// Hastily and lazily produce a [`Token::Open`].
    ///
    /// This function is not suitable for production use as it does not
    ///   produce a complete [`OpenSpan`]
    ///     (when given a bare [`Span`],
    ///       the name length is zero).
    ///
    /// # Panics
    /// Panics via [`ExpectInto::unwrap_into`] if `qname` cannot be
    ///   converted into a [`QName`].
    pub fn open<Q: TryInto<QName>, S: Into<OpenSpan>>(
        qname: Q,
        span: S,
    ) -> Token
    where
        <Q as TryInto<QName>>::Error: Debug,
    {
        Token::Open(qname.unwrap_into(), span.into())
    }

    /// Hastily and lazily produce a [`Token::Close`] for an empty tag.
    ///
    /// This is [`close`] with the omission of the `qname` argument;
    ///   the type parameter `Q` cannot be inferred if the value is
    ///   [`None`].
    ///
    /// This function is not suitable for production use as it does not
    ///   produce a complete [`CloseSpan`].
    pub fn close_empty<S: Into<CloseSpan>>(span: S) -> Token {
        Token::Close(None, span.into())
    }

    /// Hastily and lazily produce a [`Token::Close`].
    ///
    /// See also [`close_empty`] if `Q` cannot be inferred.
    ///
    /// This function is not suitable for production use as it does not
    ///   produce a complete [`CloseSpan`].
    ///
    /// # Panics
    /// Panics via [`ExpectInto::unwrap_into`] if a provided `qname`
    ///   cannot be converted into a [`QName`].
    pub fn close<Q: TryInto<QName>, S: Into<CloseSpan>>(
        qname: Option<Q>,
        span: S,
    ) -> Token
    where
        <Q as TryInto<QName>>::Error: Debug,
    {
        Token::Close(qname.map(ExpectInto::unwrap_into), span.into())
    }

    mod name {
        use super::*;

        #[test]
        fn ncname_comparable_to_sym() {
            let foo = "foo".intern();
            assert_eq!(NCName(foo), foo);
        }

        #[test]
        fn ncname_try_into_from_str_no_colon() -> TestResult {
            let name: NCName = "no-colon".try_into()?;
            assert_eq!(name, "no-colon".intern());
            Ok(())
        }

        #[test]
        fn ncname_try_into_from_str_fails_with_colon() {
            assert_eq!(
                NCName::try_from("look:a-colon"),
                Err(SpanlessError::NCColon("look:a-colon".into()))
            );
        }

        #[test]
        fn ncname_from_byte_slice() -> TestResult {
            let name: NCName = (b"no-colon" as &[u8]).try_into()?;
            assert_eq!(name, "no-colon".intern());
            Ok(())
        }

        #[test]
        fn ncname_from_byte_slice_fails_with_colon() {
            assert_eq!(
                NCName::try_from(b"a:colon" as &[u8]),
                Err(SpanlessError::NCColon("a:colon".into()))
            );
        }

        #[test]
        fn local_name_from_local_part_only() -> TestResult {
            let name = QName::new_local("foo".try_into()?);
            assert_eq!(name.local_name(), "foo".try_into()?);
            assert_eq!(None, name.prefix());
            Ok(())
        }

        #[test]
        fn local_name_from_option_tuple() -> TestResult {
            let name: QName = (Option::<&str>::None, "foo").try_into()?;
            assert_eq!(name.local_name(), "foo".try_into()?);
            assert_eq!(None, name.prefix());
            Ok(())
        }

        #[test]
        fn fully_qualified_name() -> TestResult {
            let name: QName = ("foons", "foo").try_into()?;
            assert_eq!(name.prefix(), Some("foons".try_into()?));
            assert_eq!(name.local_name(), "foo".try_into()?);
            Ok(())
        }
    }

    mod ele_span {
        use super::*;
        use crate::span::dummy::DUMMY_CONTEXT as DC;

        #[test]
        fn open_without_attrs() {
            // See docblock for [`EleSpan`].
            const T: Span = DC.span(0, 8); // Relevant portion of tag
            const N: Span = DC.span(1, 4); // Element name

            let sut = OpenSpan(T, N.len());

            assert_eq!(sut.span(), T);
            assert_eq!(sut.name_span(), N);
        }

        #[test]
        fn open_with_attrs() {
            // See docblock for [`EleSpan`].
            const T: Span = DC.span(0, 5); // Relevant portion of tag
            const N: Span = DC.span(1, 4); // Element name

            let sut = OpenSpan(T, N.len());

            assert_eq!(sut.span(), T);
            assert_eq!(sut.name_span(), N);
        }

        #[test]
        fn close() {
            // See docblock for [`EleSpan`].
            const T: Span = DC.span(0, 10); // Relevant portion of tag
            const N: Span = DC.span(2, 5); // Element name

            let sut = CloseSpan(T, N.len());

            assert_eq!(sut.span(), T);
            assert_eq!(sut.name_span(), N);
        }

        #[test]
        fn close_empty() {
            // See docblock for [`EleSpan`].
            const T: Span = DC.span(9, 2); // Relevant portion of tag

            let sut = CloseSpan(T, 0);

            assert_eq!(sut.span(), T);

            // There is no name,
            //   only Zuul.
            assert_eq!(sut.name_span(), T);
        }
    }
}