tame/tamer/src/xir.rs

// XML IR (XIR)
//
//  Copyright (C) 2014-2023 Ryan Specialty, LLC.
//
//  This file is part of TAME.
//
//  This program is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program.  If not, see <http://www.gnu.org/licenses/>.

//! Intermediate representation (IR) of an XML document.
//!
//! XIR serves not only as a TAMER-specific IR,
//!   but also as an abstraction layer atop of whatever XML library is
//!   used (e.g. `quick_xml`).
//! XIR is _not_ intended to be comprehensive,
//!   or even general-purpose---it
//!     exists to solve concerns specific to TAMER's construction.
//!
//! To parse an entire XML document,
//!   see [`reader`].
//!
//! _Note:_ XIR refers to "opening" and "closing" tags,
//!   as opposed to "start" and "end" as used in the XML specification.
//! TAMER uses a uniform terminology for all delimited data.

use crate::fmt::DisplayWrapper;
use crate::span::{Span, SpanLenSize};
use crate::sym::{
    st_as_sym, GlobalSymbolIntern, GlobalSymbolInternBytes, SymbolId,
};
use memchr::memchr;
use std::convert::{TryFrom, TryInto};
use std::fmt::Display;
use std::ops::Deref;

mod error;
pub use error::Error;

mod escape;
pub use escape::{DefaultEscaper, Escaper};

use error::SpanlessError;
use st::qname::QNameCompatibleStaticSymbolId;

use self::fmt::{CloseXmlEle, OpenXmlEle, XmlAttr, XmlAttrValueQuote};

pub mod attr;
pub mod autoclose;
pub mod flat;
pub mod fmt;
pub mod iter;
pub mod pred;
pub mod reader;
pub mod st;
pub mod tree;
pub mod writer;

#[macro_use]
pub mod parse;

/// An infallible [`Token`] stream.
///
/// If the token stream originates from an operation that could potentially
///   fail and ought to be propagated,
///     use [`TokenResultStream`].
///
/// The name "stream" in place of "iterator" is intended to convey that this
///   type is expected to be processed in real-time as a stream,
///     not read into memory.
pub trait TokenStream = Iterator<Item = Token>;

/// A [`Token`] stream that may encounter errors during parsing.
///
/// If the stream cannot fail,
///   consider using [`TokenStream`].
pub trait TokenResultStream = Iterator<Item = Result<Token, Error>>;

/// XML Name minus `":"`.
///
/// The intent is to check a string for validity _before_ interning;
///   otherwise,
///     the string would have to be first retrieved from the intern pool
///     for comparison,
///       which is not an operation we want to do implicitly.
/// Those methods will be created as they are needed.
///
/// See <https://www.w3.org/TR/REC-xml-names/#NT-NCName>.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct NCName(SymbolId);

impl NCName {
    /// Create a new NCName from a symbol without validating that the symbol
    ///   is a valid NCName.
    ///
    /// Safety
    /// ======
    /// This is not unsafe in the traditional sense;
    ///   it's unsafe in a sense similar to non-UTF-8 `str` slices,
    ///     in that it is expected that an `NCName` means that you do not
    ///     have to worry about whether it's syntatically valid as XML.
    pub unsafe fn new_unchecked(value: SymbolId) -> Self {
        Self(value)
    }
}

impl TryFrom<&[u8]> for NCName {
    type Error = SpanlessError;

    /// Attempt to parse a byte slice into an [`NCName`].
    ///
    /// If the slice contains `b':'`,
    ///   an error will be produced.
    /// No other checks are performed beyond checking that the byte sequence
    ///   represents a valid UTF-8 string.
    /// The string will be interned for you.
    fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
        match value.contains(&b':') {
            true => Err(SpanlessError::NCColon(value.intern_utf8()?)),
            false => Ok(NCName(value.intern_utf8()?)),
        }
    }
}

impl Deref for NCName {
    type Target = SymbolId;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl PartialEq<SymbolId> for NCName {
    fn eq(&self, other: &SymbolId) -> bool {
        self.0 == *other
    }
}

impl TryFrom<&str> for NCName {
    type Error = SpanlessError;

    fn try_from(value: &str) -> Result<Self, Self::Error> {
        if value.contains(':') {
            return Err(SpanlessError::NCColon(value.into()));
        }

        Ok(Self(value.intern()))
    }
}

/// Namespace prefix of a [`QName`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Prefix(NCName);

impl Prefix {
    /// Construct a constant [`Prefix`] from a static C-style symbol.
    pub const fn st_cid<T: QNameCompatibleStaticSymbolId>(
        prefix_sym: &T,
    ) -> Self {
        Self(NCName(st_as_sym(prefix_sym)))
    }
}

/// Local name portion of a [`QName`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LocalPart(NCName);

impl Deref for Prefix {
    type Target = SymbolId;

    fn deref(&self) -> &Self::Target {
        self.0.deref()
    }
}

impl Deref for LocalPart {
    type Target = SymbolId;

    fn deref(&self) -> &Self::Target {
        self.0.deref()
    }
}

impl From<NCName> for Prefix {
    fn from(name: NCName) -> Self {
        Self(name)
    }
}

impl From<NCName> for LocalPart {
    fn from(name: NCName) -> Self {
        Self(name)
    }
}

impl TryFrom<&str> for Prefix {
    type Error = SpanlessError;

    fn try_from(value: &str) -> Result<Self, Self::Error> {
        Ok(Self(value.try_into()?))
    }
}

impl TryFrom<&str> for LocalPart {
    type Error = SpanlessError;

    fn try_from(value: &str) -> Result<Self, Self::Error> {
        Ok(Self(value.try_into()?))
    }
}

impl Display for Prefix {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.0.fmt(f)
    }
}

impl Display for LocalPart {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.0.fmt(f)
    }
}

/// A qualified name (namespace prefix and local name).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct QName(Option<Prefix>, LocalPart);

// Since we implement Copy, ensure size matches our expectations:
const_assert!(std::mem::size_of::<QName>() <= std::mem::size_of::<usize>());

impl QName {
    /// Create a new fully-qualified name (including both a namespace URI
    ///   and local name).
    pub fn new(prefix: Option<Prefix>, local_name: LocalPart) -> Self {
        Self(prefix, local_name)
    }

    /// Create a new name from a local name only.
    ///
    /// This should only be used for attributes in TAMER,
    ///   since all elements should have an associated namespace.
    ///
    /// _(If this is ever not true (e.g. due to new targets),
    ///   please update this comment.)_
    pub fn new_local(local_name: LocalPart) -> Self {
        Self(None, local_name)
    }

    /// Fully qualified namespace associated with a name.
    pub fn prefix(&self) -> Option<Prefix> {
        self.0
    }

    /// Local part of a name (name without namespace).
    pub fn local_name(&self) -> LocalPart {
        self.1
    }

    /// Construct a constant QName from static C-style symbols.
    pub const fn st_cid<T, U>(prefix_sym: &T, local_sym: &U) -> Self
    where
        T: QNameCompatibleStaticSymbolId,
        U: QNameCompatibleStaticSymbolId,
    {
        Self(
            Some(Prefix(NCName(st_as_sym(prefix_sym)))),
            LocalPart(NCName(st_as_sym(local_sym))),
        )
    }

    /// Construct a constant QName with a local name only from a static
    ///   C-style symbol.
    pub const fn st_cid_local<T: QNameCompatibleStaticSymbolId>(
        local_sym: &T,
    ) -> Self {
        Self(None, LocalPart(NCName(st_as_sym(local_sym))))
    }
}

impl<P, L> TryFrom<(P, L)> for QName
where
    P: TryInto<Prefix>,
    L: TryInto<LocalPart, Error = P::Error>,
{
    type Error = P::Error;

    fn try_from(value: (P, L)) -> Result<Self, Self::Error> {
        Ok(Self(Some(value.0.try_into()?), value.1.try_into()?))
    }
}

impl<P, L> TryFrom<(Option<P>, L)> for QName
where
    P: TryInto<Prefix>,
    L: TryInto<LocalPart, Error = P::Error>,
{
    type Error = P::Error;

    fn try_from(value: (Option<P>, L)) -> Result<Self, Self::Error> {
        let ns = match value.0 {
            None => None,
            Some(ns) => Some(ns.try_into()?),
        };

        Ok(Self(ns, value.1.try_into()?))
    }
}

impl TryFrom<&str> for QName {
    type Error = SpanlessError;

    fn try_from(value: &str) -> Result<Self, Self::Error> {
        Ok(QName(None, value.try_into()?))
    }
}

impl TryFrom<&[u8]> for QName {
    type Error = SpanlessError;

    /// Attempt to parse a byte slice into a [`QName`].
    ///
    /// The byte slice must represent a valid QName in UTF-8.
    /// If a colon is present,
    ///   it delimits the namespace [`Prefix`] and [`LocalPart`],
    ///   and therefore must not be in the first or last byte position.
    fn try_from(name: &[u8]) -> Result<Self, Self::Error> {
        match memchr(b':', name) {
            // Leading colon means we're missing a prefix, trailing means
            //   that we have no local part.
            Some(pos) if pos == 0 || pos == name.len() - 1 => {
                Err(SpanlessError::InvalidQName(name.intern_utf8()?))
            }

            // There is _at least_ one colon in the string.
            Some(pos) => {
                // The prefix is before the first colon,
                //   and so itself must not contain a colon and is therefore
                //   a valid NCName.
                let prefix = NCName(name[..pos].intern_utf8()?);

                // But there could be a _second_ colon,
                //   so the local part requires validation.
                let local = NCName::try_from(&name[(pos + 1)..])?;

                Ok(Self::new(Some(prefix.into()), local.into()))
            }

            // There are no colons in the string, so the entire string is
            //   both a local part and a valid NCName.
            None => Ok(Self::new(None, NCName(name.intern_utf8()?).into())),
        }
    }
}

impl Display for QName {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            QName(Some(local), suffix) => write!(f, "{}:{}", local, suffix),
            QName(None, suffix) => suffix.fmt(f),
        }
    }
}

/// A span representing an opening (starting) element tag.
///
/// See [`EleSpan`] for more information.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct OpenSpan(Span, EleNameLen);

impl OpenSpan {
    pub fn without_name_span(span: Span) -> Self {
        Self(span, 0)
    }
}

/// A span representing a closing (ending) element tag.
///
/// See [`EleSpan`] for more information.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct CloseSpan(Span, EleNameLen);

impl CloseSpan {
    /// A [`CloseSpan`] representing the closing of an empty tag.
    ///
    /// This type of span has no element name.
    pub fn empty(span: Span) -> Self {
        Self::without_name_span(span)
    }

    pub fn without_name_span(span: Span) -> Self {
        Self(span, 0)
    }
}

/// Number of bytes representing the name of the element.
pub type EleNameLen = SpanLenSize;

/// Spans associated with an element opening or closing tag.
///
/// The diagram below illustrates the behavior of [`EleSpan`].
/// Spans are represented by `[---]` intervals,
///   with the byte offset at each end,
///   and the single-letter span name centered below the interval.
///
/// ```text
///   <open  >          <open ...>     </close  >          <empty ' />
///   |[--]  |          |[--]          | [---]  |          |[---] ' []
///   |1  4  |          |1  4          | 2   6  |          |1   5 ' 9`10
///   | N    |          | N |          |   N    |          |  N | ' T
///   |      |          |   |          |        |          |    | '
///   [------]          [---]          [--------]          [----] '
///   0      7          0   4          0        9          0    5 '
///      T                T                T                 T    '
/// ```
///
/// Above we have
///
///   - `T` = [`EleSpan::span`]; and
///   - `N` = [`EleSpan::name_span`].
///
/// The purpose of the `T` span is to represent the entire token that has
///   been emitted by XIR.
/// If an opening tag does not contain any attributes,
///   then `T` represents the entire opening tag with both the opening and
///   closing angle brackets.
/// If an opening tag is expected to contain attributes,
///   then only the opening angle bracket is included.
/// A closing tag is entirely contained by `T`.
///
/// The empty tag is separated into two tokens in XIR---a
///   [`Token::Open`] and a [`Token::Close`] with a [`None`] for the name.
/// Unlike a typical closing tag,
///   there is no `N` span available for the closing token,
///     and so requesting one via [`EleSpan::name_span`] will simply
///     return the `T` span,
///       rather than complicating the API with an [`Option`].
/// It is generally assumed that reporting on element names will occur
///   within the context of the _opening_ tag.
///
/// The tag may contain whitespace following the element name,
///   as permitted by `STag` and `ETag` in the
///     [XML specification][xmlspec-tag].
///
/// [xmlspec-tag]: https://www.w3.org/TR/xml/#dt-stag
pub trait EleSpan {
    /// A [`Span`] encompassing the entire opening element token.
    ///
    /// Note that what exactly this token represents varies.
    fn span(&self) -> Span;

    /// Span representing the relevant portion of the element tag.
    ///
    /// This is a more descriptive alias of [`EleSpan::span`] that may be
    ///   appropriate in certain contexts.
    fn tag_span(&self) -> Span {
        self.span()
    }

    /// A [`Span`] representing only the element name,
    ///   if available.
    ///
    /// An element name is _not_ available for empty tags.
    /// Rather than complicating the API with [`Option`],
    ///   [`EleSpan::span`] is returned instead.
    fn name_span(&self) -> Span;
}

impl EleSpan for OpenSpan {
    fn span(&self) -> Span {
        match self {
            Self(t, _) => *t,
        }
    }

    fn name_span(&self) -> Span {
        match self {
            // <open  ...>
            //  ^^^^ offset '<' and length of name
            //
            // If the length is 0,
            //   then this will result in a 0-length span at the location
            //   that the element name ought to be,
            //     and so the resulting span will still be useful.
            // This should not happen for tokens read using XIR,
            //   but may happen for system-generated tokens.
            Self(t, name_len) => {
                t.context().span(t.offset().saturating_add(1), *name_len)
            }
        }
    }
}

impl EleSpan for CloseSpan {
    fn span(&self) -> Span {
        match self {
            Self(t, _) => *t,
        }
    }

    fn name_span(&self) -> Span {
        match self {
            // If the length of the element name is 0,
            //   then this must be an empty tag,
            //     which contains no independent element name.
            //
            // <foo ' />
            //      ' ^^
            Self(_t, 0) => self.span(),

            // </close  >
            //   ^^^^^ offset '</' and length of name
            Self(t, name_len) => {
                t.context().span(t.offset().saturating_add(2), *name_len)
            }
        }
    }
}

impl From<OpenSpan> for Span {
    fn from(value: OpenSpan) -> Self {
        value.span()
    }
}

impl From<CloseSpan> for Span {
    fn from(value: CloseSpan) -> Self {
        value.span()
    }
}

/// Lightly-structured XML tokens with associated [`Span`]s.
///
/// This is a streamable IR for XML.
/// A writer requires knowledge only of a previous state,
///   such as whether a node is open,
///   and so this IR can be processed by a simple state machine
///     (see [`writer::WriterState`]).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// Opening tag of an element.
    Open(QName, OpenSpan),

    /// Closing tag of an element.
    ///
    /// If the name is [`None`],
    ///   then the tag is self-closing.
    /// This is intended primarily as a safety measure:
    ///   It allows writers to act as simple state machines without having
    ///     to ensure balancing by indicating that a node was intended to
    ///     self-close.
    ///   Otherwise,
    ///     we wouldn't know whether to self-close or to close and then
    ///     create a new closing tag;
    ///       if we blindly did the former,
    ///         we risk losing a closing tag when it wasn't intended.
    ///   Instead of losing tags,
    ///     writers can error,
    ///       indicating a bug in the stream.
    ///
    /// The reason for using an option here rather than a variant is to
    ///   simplify pattern matching,
    ///     given especially that bindings after `@` in patterns have not
    ///     yet been stabalized at the time of writing (but are very
    ///     close!).
    Close(Option<QName>, CloseSpan),

    /// Element attribute name.
    AttrName(QName, Span),

    /// Element attribute value.
    AttrValue(SymbolId, Span),

    /// A portion of an element attribute value.
    ///
    /// This allows for concatenating values into an attribute value without
    ///   having to copy values.
    /// The last fragment must be a [`Token::AttrValue`].
    ///
    /// Since each fragment contains a span,
    ///   this also potentially gives higher resolution for the origin of
    ///   components of generated attribute values.
    ///
    /// _This should be used only for writing._
    /// These will never be encountered during reading,
    ///   and so to keep the parsers and IRs simple,
    ///   there is no support for fragments beyond XIR.
    /// (There was in the past,
    ///    but it was removed.)
    AttrValueFragment(SymbolId, Span),

    /// Comment node.
    Comment(SymbolId, Span),

    /// Character data as part of an element.
    ///
    /// See also [`CData`](Token::CData) variant.
    Text(SymbolId, Span),

    /// CData node (`<![CDATA[...]]>`).
    ///
    /// _Warning: It is up to the caller to ensure that the string `]]>` is
    ///   not present in the text!_
    /// This is intended for reading existing XML data where CData is
    ///   already present,
    ///     not for producing new CData safely!
    CData(SymbolId, Span),
}

impl Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // _Do not_ render large amounts of text here;
        //   this is not only a risk depending on what is output,
        //     but the diagnostic system also quote source lines to provide
        //     the necessary context.
        match self {
            Self::Open(qname, _) => OpenXmlEle::fmt(qname, f),
            Self::Close(Some(qname), _) => CloseXmlEle::fmt(qname, f),
            // Its context is contained within the Open,
            //   and hopefully any user-visible errors will display that instead.
            Self::Close(None, _) => {
                write!(f, "/>")
            }
            Self::AttrName(qname, _) => XmlAttr::fmt(qname, f),
            Self::AttrValue(attr_val, _) => XmlAttrValueQuote::fmt(attr_val, f),
            Self::AttrValueFragment(attr_val, _) => {
                write!(
                    f,
                    "value fragment {}",
                    XmlAttrValueQuote::wrap(attr_val)
                )
            }
            Self::Comment(..) => write!(f, "comment"),
            Self::Text(..) => write!(f, "text"),
            Self::CData(..) => write!(f, "CDATA"),
        }
    }
}

impl crate::parse::Token for Token {
    fn ir_name() -> &'static str {
        "XIR"
    }

    /// Retrieve the [`Span`] associated with a given [`Token`].
    ///
    /// Every token has an associated span.
    fn span(&self) -> Span {
        use Token::*;

        match self {
            Open(_, OpenSpan(span, _))
            | Close(_, CloseSpan(span, _))
            | AttrName(_, span)
            | AttrValue(_, span)
            | AttrValueFragment(_, span)
            | Comment(_, span)
            | Text(_, span)
            | CData(_, span) => *span,
        }
    }
}

impl crate::parse::Object for Token {}

#[cfg(test)]
pub mod test {
    use super::*;
    use crate::convert::ExpectInto;
    use crate::sym::GlobalSymbolIntern;
    use std::convert::TryInto;
    use std::fmt::Debug;

    type TestResult = Result<(), Box<dyn std::error::Error>>;

    // Prefer [`open`] below when possible.
    impl From<Span> for OpenSpan {
        fn from(span: Span) -> Self {
            Self::without_name_span(span)
        }
    }

    // Prefer [`close`] below when possible.
    impl From<Span> for CloseSpan {
        fn from(span: Span) -> Self {
            Self::without_name_span(span)
        }
    }

    /// Hastily and lazily produce a [`XirfToken::Open`].
    ///
    /// This function is not suitable for production use as it does not
    ///   produce a complete [`OpenSpan`].
    pub fn open<Q: TryInto<QName>, S: Into<OpenSpan>>(
        qname: Q,
        span: S,
    ) -> Token
    where
        <Q as TryInto<QName>>::Error: Debug,
    {
        Token::Open(qname.unwrap_into(), span.into())
    }

    /// Hastily and lazily produce a [`XirfToken::Close`] for an empty tag.
    ///
    /// This is [`close`] with the omission of the `qname` argument; the
    ///   type parameter `Q` cannot be inferred if the value is [`None`].
    ///
    /// This function is not suitable for production use as it does not
    ///   produce a complete [`OpenSpan`].
    pub fn close_empty<S: Into<CloseSpan>>(span: S) -> Token {
        Token::Close(None, span.into())
    }

    /// Hastily and lazily produce a [`XirfToken::Close`].
    ///
    /// See also [`close_empty`] if `Q` cannot be inferred.
    ///
    /// This function is not suitable for production use as it does not
    ///   produce a complete [`OpenSpan`].
    pub fn close<Q: TryInto<QName>, S: Into<CloseSpan>>(
        qname: Option<Q>,
        span: S,
    ) -> Token
    where
        <Q as TryInto<QName>>::Error: Debug,
    {
        Token::Close(qname.map(ExpectInto::unwrap_into), span.into())
    }

    mod name {
        use super::*;

        #[test]
        fn ncname_comparable_to_sym() {
            let foo = "foo".intern();
            assert_eq!(NCName(foo), foo);
        }

        #[test]
        fn ncname_try_into_from_str_no_colon() -> TestResult {
            let name: NCName = "no-colon".try_into()?;
            assert_eq!(name, "no-colon".intern());
            Ok(())
        }

        #[test]
        fn ncname_try_into_from_str_fails_with_colon() {
            assert_eq!(
                NCName::try_from("look:a-colon"),
                Err(SpanlessError::NCColon("look:a-colon".into()))
            );
        }

        #[test]
        fn ncname_from_byte_slice() -> TestResult {
            let name: NCName = (b"no-colon" as &[u8]).try_into()?;
            assert_eq!(name, "no-colon".intern());
            Ok(())
        }

        #[test]
        fn ncname_from_byte_slice_fails_with_colon() {
            assert_eq!(
                NCName::try_from(b"a:colon" as &[u8]),
                Err(SpanlessError::NCColon("a:colon".into()))
            );
        }

        #[test]
        fn local_name_from_local_part_only() -> TestResult {
            let name = QName::new_local("foo".try_into()?);

            assert_eq!(name.local_name(), "foo".try_into()?);
            assert_eq!(None, name.prefix());

            Ok(())
        }

        #[test]
        fn local_name_from_option_tuple() -> TestResult {
            let name: QName = (Option::<&str>::None, "foo").try_into()?;

            assert_eq!(name.local_name(), "foo".try_into()?);
            assert_eq!(None, name.prefix());

            Ok(())
        }

        #[test]
        fn fully_qualified_name() -> TestResult {
            let name: QName = ("foons", "foo").try_into()?;

            assert_eq!(name.prefix(), Some("foons".try_into()?));
            assert_eq!(name.local_name(), "foo".try_into()?);

            Ok(())
        }
    }

    mod ele_span {
        use super::*;
        use crate::span::dummy::DUMMY_CONTEXT as DC;

        #[test]
        fn open_without_attrs() {
            // See docblock for [`EleSpan`].
            const T: Span = DC.span(0, 8); // Relevant portion of tag
            const N: Span = DC.span(1, 4); // Element name

            let sut = OpenSpan(T, N.len());

            assert_eq!(sut.span(), T);
            assert_eq!(sut.name_span(), N);
        }

        #[test]
        fn open_with_attrs() {
            // See docblock for [`EleSpan`].
            const T: Span = DC.span(0, 5); // Relevant portion of tag
            const N: Span = DC.span(1, 4); // Element name

            let sut = OpenSpan(T, N.len());

            assert_eq!(sut.span(), T);
            assert_eq!(sut.name_span(), N);
        }

        #[test]
        fn close() {
            // See docblock for [`EleSpan`].
            const T: Span = DC.span(0, 10); // Relevant portion of tag
            const N: Span = DC.span(2, 5); //  Element name

            let sut = CloseSpan(T, N.len());

            assert_eq!(sut.span(), T);
            assert_eq!(sut.name_span(), N);
        }

        #[test]
        fn close_empty() {
            // See docblock for [`EleSpan`].
            const T: Span = DC.span(9, 2); // Relevant portion of tag

            let sut = CloseSpan(T, 0);

            assert_eq!(sut.span(), T);
            // There is no name,
            //   only Zuul.
            assert_eq!(sut.name_span(), T);
        }
    }
}
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								// XML IR (XIR)
 								//
-												Copyright year and name update

Ryan Specialty Group (RSG) rebranded to Ryan Specialty after its IPO.

											
										
										
											2023-01-17 23:09:25 -05:00
+								//  Copyright (C) 2014-2023 Ryan Specialty, LLC.
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								//
 								//  This file is part of TAME.
 								//
 								//  This program is free software: you can redistribute it and/or modify
 								//  it under the terms of the GNU General Public License as published by
 								//  the Free Software Foundation, either version 3 of the License, or
 								//  (at your option) any later version.
 								//
 								//  This program is distributed in the hope that it will be useful,
 								//  but WITHOUT ANY WARRANTY; without even the implied warranty of
 								//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 								//  GNU General Public License for more details.
 								//
 								//  You should have received a copy of the GNU General Public License
 								//  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 								//! Intermediate representation (IR) of an XML document.
 								//!
 								//! XIR serves not only as a TAMER-specific IR,
 								//!   but also as an abstraction layer atop of whatever XML library is
 								//!   used (e.g. `quick_xml`).
 								//! XIR is _not_ intended to be comprehensive,
 								//!   or even general-purpose---it
 								//!     exists to solve concerns specific to TAMER's construction.
 								//!
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								//! To parse an entire XML document,
 								//!   see [`reader`].
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								//!
 								//! _Note:_ XIR refers to "opening" and "closing" tags,
 								//!   as opposed to "start" and "end" as used in the XML specification.
 								//! TAMER uses a uniform terminology for all delimited data.
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
-												tamer: xir: Format tokens without tt quotes

Whether or not quoting is appropriate depends on context, and that parent
context is already performing the quoting.  For example:

  error: expected `</rater>`, but found `<import>`
    --> /home/[...]/foo.xml:2:1
     |
   2 | <rater xmlns="http://www.lovullo.com/rater"
     | ------ note: element starts here

    --> /home/[...]/foo.xml:7:3
     |
   7 |   <import package="/rater/core/base" />
     |   ^^^^^^^ error: expected `</rater>`

In these cases (obviously I'm still working on the parser, since this is
nonsense), the parser is responsible for quoting the token "<import>".

DEV-7145

											
										
										
											2022-07-29 00:44:58 -04:00
+								use crate::fmt::DisplayWrapper;
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								use crate::span::{Span, SpanLenSize};
-												tamer: Start of XIR-based xmle writer

This has been a long time coming, and has been repeatedly stashed as other
parts of the system have evolved to support it.  The introduction of the XIR
tree was to write tests for this (which are sloppy atm).

This currently writes out the `xmle` header and _most_ of the `l:dep`
section; it's missing the object-type-specific attributes.  There is,
relatively speaking, not much more work to do here.

The feature flag `wip-xir-xmle-writer` was introduced to toggle this system
in place of `XmleWriter`.  Initial benchmarks show that it will be
competitive with the quick-xml-based writer, but remember that is not the
goal: the purpose of this is to test XIR in a production system before we
continue to implement it for a frontend, and to refactor so that we do not
have multiple implementations writing XML files (once we echo the source XML
files).

I'm excited to get this done with so that I can move on.  This has been
rather exhausting.

											
										
										
											2021-09-28 14:52:31 -04:00
+								use crate::sym::{
-												tamer: xir::st::qname: New module

This moves and deduplicates the static `QName`s into a common area.

DEV-7145

											
										
										
											2022-06-03 14:34:08 -04:00
+								    st_as_sym, GlobalSymbolIntern, GlobalSymbolInternBytes, SymbolId,
-												tamer: Start of XIR-based xmle writer

This has been a long time coming, and has been repeatedly stashed as other
parts of the system have evolved to support it.  The introduction of the XIR
tree was to write tests for this (which are sloppy atm).

This currently writes out the `xmle` header and _most_ of the `l:dep`
section; it's missing the object-type-specific attributes.  There is,
relatively speaking, not much more work to do here.

The feature flag `wip-xir-xmle-writer` was introduced to toggle this system
in place of `XmleWriter`.  Initial benchmarks show that it will be
competitive with the quick-xml-based writer, but remember that is not the
goal: the purpose of this is to test XIR in a production system before we
continue to implement it for a frontend, and to refactor so that we do not
have multiple implementations writing XML files (once we echo the source XML
files).

I'm excited to get this done with so that I can move on.  This has been
rather exhausting.

											
										
										
											2021-09-28 14:52:31 -04:00
+								};
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								use memchr::memchr;
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								use std::convert::{TryFrom, TryInto};
-												tamer: ir::xir::{QName, Prefix, LocalName}: Implement Display

These will be shown in error messages and need user-friendly
representations.

DEV-10863

											
										
										
											2021-11-02 13:55:33 -04:00
+								use std::fmt::Display;
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								use std::ops::Deref;
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								mod error;
 								pub use error::Error;
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								mod escape;
-												tamer: xir::escape::CachingEscaper: New Escaper

As promised, this will cache previously seen escaped/unescaped values by
creating a two-way mapping between them.

DEV-11081

											
										
										
											2021-11-12 16:07:57 -05:00
+								pub use escape::{DefaultEscaper, Escaper};
-												tamer: xir::XirString: WIP implementation (likely going away)

I'm not fond of this implementation, which is why it's not fully
completed.  I wanted to commit this for future reference, and take the
opportunity to explain why I don't like it.

First: this task started as an idea to implement a third variant to
AttrValue and friends that indicates that a value is fixed, in the sense of
a fixed-point function: escaped or unescaped, its value is the same.  This
would allow us to skip wasteful escape/unescape operations.

In doing so, it became obvious that there's no need to leak this information
through the API, and indeed, no part of the system should care.  When we
read XML, it should be unescaped, and when we write, it should be
escaped.  The reason that this didn't quite happen to begin with was an
optimization: I'll be creating an echo writer in place of the current
filesystem-based copy in tamec shortly, and this would allow streaming XIR
directly from the reader to the writer without any unescaping or
re-escaping.

When we unescape, we know the value that it came from, so we could simply
store both symbols---they're 32-bit, so it results in a nicely compressed
64-bit value, so it's essentially cost-free, as long as we accept the
expense of internment.  This is `XirString`.  Then, when we want to escape
or unescape, we first check to see whether a symbol already exists and, if
so, use it.

While this works well for echoing streams, it won't work all that well in
practice: the unescaped SymbolId will be taken and the XirString discarded,
since nothing after XIR should be coupled with it.  Then, when we later
construct a XIR stream for writting, XirString will no longer be available
and our previously known escape is lost, so the writer will have to
re-escape.

Further, if we look at XirString's generic for the XirStringEscaper---it
uses phantom, which hints that maybe it's not in the best place.  Indeed,
I've already acknowledged that only a reader unescapes and only a writer
escapes, and that the rest of the system works with normal (unescaped)
values, so only readers and writers should be part of this process.  I also
already acknowledged that XirString would be lost and only the unescaped
SymbolId would be used.

So what's the point of XirString, then, if it won't be a useful optimization
beyond the temporary echo writer?

Instead, we can take the XirStringWriter and implement two caches on that:
mapping SymbolId from escaped->unescaped and vice-versa.  These can be
simple vectors, since SymbolId is a 32-bit value we will not have much
wasted space for symbols that never get read or written.  We could even
optimize for preinterned symbols using markers, though I'll probably not do
so, and I'll explain why later.

If we do _that_, we get even _better_ optimizations through caching that
_will_ apply in the general case (so, not just for echo), and we're able to
ditch XirString entirely and simply use a SymbolId.  This makes for a much
more friendly API that isn't leaking implementation details, though it
_does_ put an onus on the caller to pass the encoder to both the reader and
the writer, _if_ it wants to take advantage of a cache.  But that burden is
not significant (and is, again, optional if we don't want it).

So, that'll be the next step.

											
										
										
											2021-11-10 09:42:18 -05:00
-												tamer: xir::st::qname: New module

This moves and deduplicates the static `QName`s into a common area.

DEV-7145

											
										
										
											2022-06-03 14:34:08 -04:00
+								use error::SpanlessError;
 								use st::qname::QNameCompatibleStaticSymbolId;
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
-												tamer: xir: Format tokens without tt quotes

Whether or not quoting is appropriate depends on context, and that parent
context is already performing the quoting.  For example:

  error: expected `</rater>`, but found `<import>`
    --> /home/[...]/foo.xml:2:1
     |
   2 | <rater xmlns="http://www.lovullo.com/rater"
     | ------ note: element starts here

    --> /home/[...]/foo.xml:7:3
     |
   7 |   <import package="/rater/core/base" />
     |   ^^^^^^^ error: expected `</rater>`

In these cases (obviously I'm still working on the parser, since this is
nonsense), the parser is responsible for quoting the token "<import>".

DEV-7145

											
										
										
											2022-07-29 00:44:58 -04:00
+								use self::fmt::{CloseXmlEle, OpenXmlEle, XmlAttr, XmlAttrValueQuote};
-												tamer: xir::{tree::=>}attr: Move

With the introduction of XIRF, attribute parsing is no longer a XIRT thing.

DEV-10863

											
										
										
											2022-03-17 16:10:56 -04:00
+								pub mod attr;
-												tamer: xir::autoclose: New lowering operation

This lowering operation is intended to allow me to write a more concise and
clear mapping from the graph to XIRF, without having to worry about
balancing tags, which really complicated the implementation.

This has details docs; see that for more information.

I can't help but be reminded of Wisp (the whitespace-based Lisp-like
syntax).  Which is unfortunate, because I'm not fond of Wisp; I like my
parenthesis.

DEV-13708

											
										
										
											2023-02-14 16:04:17 -05:00
+								pub mod autoclose;
-												tamer: xir::flat: Initial XIRF implementation

This introduces XIR Flat (XIRF), which is conceptually between XIR and
XIRT.  This provides a more appropriate level of abstraction for further
lowering operations to parse against, and removes the need for other parsers
to perform their own validations (inappropriately) to ensure well-formed
XML.

There is still some cleanup worth doing, including moving some of the
parsing responsibility up a level back into the XIR parser.

DEV-10863

											
										
										
											2022-03-17 12:20:20 -04:00
+								pub mod flat;
-												tamer: fmt: New type-based formatting system

This is partly an experiment, but is designed to simplify producing English
sentences in various contexts.  It makes use of a not only unstable, but
incomplete, Rust feature---adt_const_params, for a static str const type
parameter.  Hopefully that ends up being stabalized.

This uses types, but it's the same as function composition due to Rust's
monomorphization.

DEV-7145

											
										
										
											2022-06-10 16:28:15 -04:00
+								pub mod fmt;
-												tamer: obj::xmle::xir: Extract ElemWrap into ir::xir::iter

											
										
										
											2021-10-11 09:34:17 -04:00
+								pub mod iter;
-												tamer: Start of XIR-based xmle writer

This has been a long time coming, and has been repeatedly stashed as other
parts of the system have evolved to support it.  The introduction of the XIR
tree was to write tests for this (which are sloppy atm).

This currently writes out the `xmle` header and _most_ of the `l:dep`
section; it's missing the object-type-specific attributes.  There is,
relatively speaking, not much more work to do here.

The feature flag `wip-xir-xmle-writer` was introduced to toggle this system
in place of `XmleWriter`.  Initial benchmarks show that it will be
competitive with the quick-xml-based writer, but remember that is not the
goal: the purpose of this is to test XIR in a production system before we
continue to implement it for a frontend, and to refactor so that we do not
have multiple implementations writing XML files (once we echo the source XML
files).

I'm excited to get this done with so that I can move on.  This has been
rather exhausting.

											
										
										
											2021-09-28 14:52:31 -04:00
+								pub mod pred;
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								pub mod reader;
-												tamer: xir::st::qname: New module

This moves and deduplicates the static `QName`s into a common area.

DEV-7145

											
										
										
											2022-06-03 14:34:08 -04:00
+								pub mod st;
-												tamer: ir::xir::tree: Introduce XIR tree

This begins to introduce the XIR tree.  I was originally going to wait on
this until after implementing the xmle writer in terms of XIR, but writing
unit tests is too much of a pain on the stream, so now is as good of a time
as any.

This has very limited support so far; it'll be added to as time goes on.

											
										
										
											2021-09-08 13:53:47 -04:00
+								pub mod tree;
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								pub mod writer;
-												tamer: xir::parse: Attribute parser generator

This is the first parser generator for the parsing framework.  I've been
waiting quite a while to do this because I wanted to be sure that I
understood how I intended to write the attribute parsers manually.  Now that
I'm about to start parsing source XML files, it is necessary to have a
parser generator.

Typically one thinks of a parser generator as a separate program that
generates code for some language, but that is not always the case---that
represents a lack of expressiveness in the language itself (e.g. C).  Here,
I simply use Rust's macro system, which should be a concept familiar to
someone coming from a language like Lisp.

This also resolves where I stand on parser combinators with respect to this
abstraction: they both accomplish the exact same thing (composition of
smaller parsers), but this abstraction doesn't do so in the typical
functional way.  But the end result is the same.

The parser generated by this abstraction will be optimized an inlined in the
same manner as the hand-written parsers.  Since they'll be tightly coupled
with an element parser (which too will have a parser generator), I expect
that most attribute parsers will simply be inlined; they exist as separate
parsers conceptually, for the same reason that you'd use parser combinators.

It's worth mentioning that this awkward reliance on dead state for a
lookahead token to determine when aggregation is complete rubs me the wrong
way, but resolving it would involve reintroducing the XIR AttrEnd that I had
previously removed.  I'll keep fighting with myself on this, but I want to
get a bit further before I determine if it's worth the tradeoff of
reintroducing (more complex IR but simplified parsing).

DEV-7145

											
										
										
											2022-06-13 11:17:21 -04:00
+								#[macro_use]
 								pub mod parse;
-												tamer: Begin XIR-based xmlo reader impl

There isn't a whole lot here, but there is additional work needed in various
places to support upcoming changes and so I want to get this commited to
ease the cognitive burden of what I have thusfar.  And to stop stashing.  We
have a feature flag for a reason.

DEV-10863

											
										
										
											2021-10-28 21:21:30 -04:00
+								/// An infallible [`Token`] stream.
 								///
 								/// If the token stream originates from an operation that could potentially
 								///   fail and ought to be propagated,
 								///     use [`TokenResultStream`].
 								///
 								/// The name "stream" in place of "iterator" is intended to convey that this
 								///   type is expected to be processed in real-time as a stream,
 								///     not read into memory.
 								pub trait TokenStream = Iterator<Item = Token>;
 								/// A [`Token`] stream that may encounter errors during parsing.
 								///
 								/// If the stream cannot fail,
 								///   consider using [`TokenStream`].
 								pub trait TokenResultStream = Iterator<Item = Result<Token, Error>>;
-												tamer: ir::xir (newtype_symbol!): Remove for now

This does not belong here and was more of a POC at the time.  It can be
added later on when I have the time; I have to move on.

											
										
										
											2021-10-11 11:51:51 -04:00
+								/// XML Name minus `":"`.
 								///
 								/// The intent is to check a string for validity _before_ interning;
 								///   otherwise,
 								///     the string would have to be first retrieved from the intern pool
 								///     for comparison,
 								///       which is not an operation we want to do implicitly.
 								/// Those methods will be created as they are needed.
 								///
 								/// See <https://www.w3.org/TR/REC-xml-names/#NT-NCName>.
 								#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 								pub struct NCName(SymbolId);
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								impl NCName {
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								    /// Create a new NCName from a symbol without validating that the symbol
 								    ///   is a valid NCName.
 								    ///
 								    /// Safety
 								    /// ======
 								    /// This is not unsafe in the traditional sense;
 								    ///   it's unsafe in a sense similar to non-UTF-8 `str` slices,
 								    ///     in that it is expected that an `NCName` means that you do not
 								    ///     have to worry about whether it's syntatically valid as XML.
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								    pub unsafe fn new_unchecked(value: SymbolId) -> Self {
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								        Self(value)
 								    }
 								}
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								impl TryFrom<&[u8]> for NCName {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								    type Error = SpanlessError;
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
 								    /// Attempt to parse a byte slice into an [`NCName`].
 								    ///
 								    /// If the slice contains `b':'`,
 								    ///   an error will be produced.
 								    /// No other checks are performed beyond checking that the byte sequence
 								    ///   represents a valid UTF-8 string.
 								    /// The string will be interned for you.
 								    fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
 								        match value.contains(&b':') {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								            true => Err(SpanlessError::NCColon(value.intern_utf8()?)),
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								            false => Ok(NCName(value.intern_utf8()?)),
 								        }
 								    }
 								}
-												tamer: ir::xir (newtype_symbol!): Remove for now

This does not belong here and was more of a POC at the time.  It can be
added later on when I have the time; I have to move on.

											
										
										
											2021-10-11 11:51:51 -04:00
+								impl Deref for NCName {
 								    type Target = SymbolId;
 								    fn deref(&self) -> &Self::Target {
 								        &self.0
 								    }
 								}
 								impl PartialEq<SymbolId> for NCName {
 								    fn eq(&self, other: &SymbolId) -> bool {
 								        self.0 == *other
 								    }
 								}
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								impl TryFrom<&str> for NCName {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								    type Error = SpanlessError;
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
 								    fn try_from(value: &str) -> Result<Self, Self::Error> {
 								        if value.contains(':') {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								            return Err(SpanlessError::NCColon(value.into()));
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								        }
 								        Ok(Self(value.intern()))
 								    }
 								}
-												tamer: ir::xir: Add missing docs for QName, Prefix, LocalName

The docs still need to be improved, but they can be touched as we go.

This concludes the initial development of XIR.  That was much more involved
that I had originally intended, but the result is good.

DEV-10561

											
										
										
											2021-10-11 11:56:03 -04:00
+								/// Namespace prefix of a [`QName`].
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								pub struct Prefix(NCName);
-												tamer: ir::xir: Add missing docs for QName, Prefix, LocalName

The docs still need to be improved, but they can be touched as we go.

This concludes the initial development of XIR.  That was much more involved
that I had originally intended, but the result is good.

DEV-10561

											
										
										
											2021-10-11 11:56:03 -04:00
-												tamer: xir::st: Static namespace prefixes (c and t)

In particular, `t:*` will be recognized by NIR for short-hand template
application.  These will be utilized in an upcoming commit.

DEV-7145

											
										
										
											2022-08-10 16:33:46 -04:00
+								impl Prefix {
 								    /// Construct a constant [`Prefix`] from a static C-style symbol.
 								    pub const fn st_cid<T: QNameCompatibleStaticSymbolId>(
 								        prefix_sym: &T,
 								    ) -> Self {
 								        Self(NCName(st_as_sym(prefix_sym)))
 								    }
 								}
-												tamer: ir::xir: Add missing docs for QName, Prefix, LocalName

The docs still need to be improved, but they can be touched as we go.

This concludes the initial development of XIR.  That was much more involved
that I had originally intended, but the result is good.

DEV-10561

											
										
										
											2021-10-11 11:56:03 -04:00
+								/// Local name portion of a [`QName`].
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								pub struct LocalPart(NCName);
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								impl Deref for Prefix {
 								    type Target = SymbolId;
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
 								    fn deref(&self) -> &Self::Target {
 								        self.0.deref()
 								    }
 								}
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								impl Deref for LocalPart {
 								    type Target = SymbolId;
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
 								    fn deref(&self) -> &Self::Target {
 								        self.0.deref()
 								    }
 								}
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								impl From<NCName> for Prefix {
 								    fn from(name: NCName) -> Self {
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								        Self(name)
 								    }
 								}
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								impl From<NCName> for LocalPart {
 								    fn from(name: NCName) -> Self {
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								        Self(name)
 								    }
 								}
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								impl TryFrom<&str> for Prefix {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								    type Error = SpanlessError;
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
 								    fn try_from(value: &str) -> Result<Self, Self::Error> {
 								        Ok(Self(value.try_into()?))
 								    }
 								}
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								impl TryFrom<&str> for LocalPart {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								    type Error = SpanlessError;
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
 								    fn try_from(value: &str) -> Result<Self, Self::Error> {
 								        Ok(Self(value.try_into()?))
 								    }
 								}
-												tamer: ir::xir::{QName, Prefix, LocalName}: Implement Display

These will be shown in error messages and need user-friendly
representations.

DEV-10863

											
										
										
											2021-11-02 13:55:33 -04:00
+								impl Display for Prefix {
 								    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 								        self.0.fmt(f)
 								    }
 								}
 								impl Display for LocalPart {
 								    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 								        self.0.fmt(f)
 								    }
 								}
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								/// A qualified name (namespace prefix and local name).
 								#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								pub struct QName(Option<Prefix>, LocalPart);
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
 								// Since we implement Copy, ensure size matches our expectations:
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								const_assert!(std::mem::size_of::<QName>() <= std::mem::size_of::<usize>());
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								impl QName {
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								    /// Create a new fully-qualified name (including both a namespace URI
 								    ///   and local name).
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								    pub fn new(prefix: Option<Prefix>, local_name: LocalPart) -> Self {
 								        Self(prefix, local_name)
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								    }
 								    /// Create a new name from a local name only.
 								    ///
 								    /// This should only be used for attributes in TAMER,
 								    ///   since all elements should have an associated namespace.
 								    ///
 								    /// _(If this is ever not true (e.g. due to new targets),
 								    ///   please update this comment.)_
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								    pub fn new_local(local_name: LocalPart) -> Self {
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								        Self(None, local_name)
 								    }
 								    /// Fully qualified namespace associated with a name.
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								    pub fn prefix(&self) -> Option<Prefix> {
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								        self.0
 								    }
 								    /// Local part of a name (name without namespace).
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								    pub fn local_name(&self) -> LocalPart {
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								        self.1
 								    }
-												tamer: Start of XIR-based xmle writer

This has been a long time coming, and has been repeatedly stashed as other
parts of the system have evolved to support it.  The introduction of the XIR
tree was to write tests for this (which are sloppy atm).

This currently writes out the `xmle` header and _most_ of the `l:dep`
section; it's missing the object-type-specific attributes.  There is,
relatively speaking, not much more work to do here.

The feature flag `wip-xir-xmle-writer` was introduced to toggle this system
in place of `XmleWriter`.  Initial benchmarks show that it will be
competitive with the quick-xml-based writer, but remember that is not the
goal: the purpose of this is to test XIR in a production system before we
continue to implement it for a frontend, and to refactor so that we do not
have multiple implementations writing XML files (once we echo the source XML
files).

I'm excited to get this done with so that I can move on.  This has been
rather exhausting.

											
										
										
											2021-09-28 14:52:31 -04:00
 								    /// Construct a constant QName from static C-style symbols.
-												tamer: obj::xmle::xir: Write l:map-from

This contains some awkward coupling for opening and closing tags to reduce
the complexity of the `Iterator` types that must be manually
specified.  That may be addressed shortly.

											
										
										
											2021-10-05 16:13:47 -04:00
+								    pub const fn st_cid<T, U>(prefix_sym: &T, local_sym: &U) -> Self
 								    where
 								        T: QNameCompatibleStaticSymbolId,
 								        U: QNameCompatibleStaticSymbolId,
 								    {
-												tamer: Start of XIR-based xmle writer

This has been a long time coming, and has been repeatedly stashed as other
parts of the system have evolved to support it.  The introduction of the XIR
tree was to write tests for this (which are sloppy atm).

This currently writes out the `xmle` header and _most_ of the `l:dep`
section; it's missing the object-type-specific attributes.  There is,
relatively speaking, not much more work to do here.

The feature flag `wip-xir-xmle-writer` was introduced to toggle this system
in place of `XmleWriter`.  Initial benchmarks show that it will be
competitive with the quick-xml-based writer, but remember that is not the
goal: the purpose of this is to test XIR in a production system before we
continue to implement it for a frontend, and to refactor so that we do not
have multiple implementations writing XML files (once we echo the source XML
files).

I'm excited to get this done with so that I can move on.  This has been
rather exhausting.

											
										
										
											2021-09-28 14:52:31 -04:00
+								        Self(
-												tamer: obj::xmle::xir: Write l:map-from

This contains some awkward coupling for opening and closing tags to reduce
the complexity of the `Iterator` types that must be manually
specified.  That may be addressed shortly.

											
										
										
											2021-10-05 16:13:47 -04:00
+								            Some(Prefix(NCName(st_as_sym(prefix_sym)))),
 								            LocalPart(NCName(st_as_sym(local_sym))),
-												tamer: Start of XIR-based xmle writer

This has been a long time coming, and has been repeatedly stashed as other
parts of the system have evolved to support it.  The introduction of the XIR
tree was to write tests for this (which are sloppy atm).

This currently writes out the `xmle` header and _most_ of the `l:dep`
section; it's missing the object-type-specific attributes.  There is,
relatively speaking, not much more work to do here.

The feature flag `wip-xir-xmle-writer` was introduced to toggle this system
in place of `XmleWriter`.  Initial benchmarks show that it will be
competitive with the quick-xml-based writer, but remember that is not the
goal: the purpose of this is to test XIR in a production system before we
continue to implement it for a frontend, and to refactor so that we do not
have multiple implementations writing XML files (once we echo the source XML
files).

I'm excited to get this done with so that I can move on.  This has been
rather exhausting.

											
										
										
											2021-09-28 14:52:31 -04:00
+								        )
 								    }
 								    /// Construct a constant QName with a local name only from a static
 								    ///   C-style symbol.
-												tamer: obj::xmle::xir: Write l:map-from

This contains some awkward coupling for opening and closing tags to reduce
the complexity of the `Iterator` types that must be manually
specified.  That may be addressed shortly.

											
										
										
											2021-10-05 16:13:47 -04:00
+								    pub const fn st_cid_local<T: QNameCompatibleStaticSymbolId>(
 								        local_sym: &T,
 								    ) -> Self {
 								        Self(None, LocalPart(NCName(st_as_sym(local_sym))))
-												tamer: Start of XIR-based xmle writer

This has been a long time coming, and has been repeatedly stashed as other
parts of the system have evolved to support it.  The introduction of the XIR
tree was to write tests for this (which are sloppy atm).

This currently writes out the `xmle` header and _most_ of the `l:dep`
section; it's missing the object-type-specific attributes.  There is,
relatively speaking, not much more work to do here.

The feature flag `wip-xir-xmle-writer` was introduced to toggle this system
in place of `XmleWriter`.  Initial benchmarks show that it will be
competitive with the quick-xml-based writer, but remember that is not the
goal: the purpose of this is to test XIR in a production system before we
continue to implement it for a frontend, and to refactor so that we do not
have multiple implementations writing XML files (once we echo the source XML
files).

I'm excited to get this done with so that I can move on.  This has been
rather exhausting.

											
										
										
											2021-09-28 14:52:31 -04:00
+								    }
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								}
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								impl<P, L> TryFrom<(P, L)> for QName
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								where
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								    P: TryInto<Prefix>,
 								    L: TryInto<LocalPart, Error = P::Error>,
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								{
 								    type Error = P::Error;
 								    fn try_from(value: (P, L)) -> Result<Self, Self::Error> {
 								        Ok(Self(Some(value.0.try_into()?), value.1.try_into()?))
 								    }
 								}
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								impl<P, L> TryFrom<(Option<P>, L)> for QName
-												tamer: ir::xir::tree: Introduce XIR tree

This begins to introduce the XIR tree.  I was originally going to wait on
this until after implementing the xmle writer in terms of XIR, but writing
unit tests is too much of a pain on the stream, so now is as good of a time
as any.

This has very limited support so far; it'll be added to as time goes on.

											
										
										
											2021-09-08 13:53:47 -04:00
+								where
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								    P: TryInto<Prefix>,
 								    L: TryInto<LocalPart, Error = P::Error>,
-												tamer: ir::xir::tree: Introduce XIR tree

This begins to introduce the XIR tree.  I was originally going to wait on
this until after implementing the xmle writer in terms of XIR, but writing
unit tests is too much of a pain on the stream, so now is as good of a time
as any.

This has very limited support so far; it'll be added to as time goes on.

											
										
										
											2021-09-08 13:53:47 -04:00
+								{
 								    type Error = P::Error;
 								    fn try_from(value: (Option<P>, L)) -> Result<Self, Self::Error> {
 								        let ns = match value.0 {
 								            None => None,
 								            Some(ns) => Some(ns.try_into()?),
 								        };
 								        Ok(Self(ns, value.1.try_into()?))
 								    }
 								}
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								impl TryFrom<&str> for QName {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								    type Error = SpanlessError;
-												tamer: ir::xir::tree: Introduce XIR tree

This begins to introduce the XIR tree.  I was originally going to wait on
this until after implementing the xmle writer in terms of XIR, but writing
unit tests is too much of a pain on the stream, so now is as good of a time
as any.

This has very limited support so far; it'll be added to as time goes on.

											
										
										
											2021-09-08 13:53:47 -04:00
 								    fn try_from(value: &str) -> Result<Self, Self::Error> {
 								        Ok(QName(None, value.try_into()?))
 								    }
 								}
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								impl TryFrom<&[u8]> for QName {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								    type Error = SpanlessError;
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
 								    /// Attempt to parse a byte slice into a [`QName`].
 								    ///
 								    /// The byte slice must represent a valid QName in UTF-8.
 								    /// If a colon is present,
 								    ///   it delimits the namespace [`Prefix`] and [`LocalPart`],
 								    ///   and therefore must not be in the first or last byte position.
 								    fn try_from(name: &[u8]) -> Result<Self, Self::Error> {
 								        match memchr(b':', name) {
 								            // Leading colon means we're missing a prefix, trailing means
 								            //   that we have no local part.
 								            Some(pos) if pos == 0 || pos == name.len() - 1 => {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                Err(SpanlessError::InvalidQName(name.intern_utf8()?))
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								            }
 								            // There is _at least_ one colon in the string.
 								            Some(pos) => {
 								                // The prefix is before the first colon,
 								                //   and so itself must not contain a colon and is therefore
 								                //   a valid NCName.
 								                let prefix = NCName(name[..pos].intern_utf8()?);
 								                // But there could be a _second_ colon,
 								                //   so the local part requires validation.
 								                let local = NCName::try_from(&name[(pos + 1)..])?;
 								                Ok(Self::new(Some(prefix.into()), local.into()))
 								            }
 								            // There are no colons in the string, so the entire string is
 								            //   both a local part and a valid NCName.
 								            None => Ok(Self::new(None, NCName(name.intern_utf8()?).into())),
 								        }
 								    }
 								}
-												tamer: ir::xir::{QName, Prefix, LocalName}: Implement Display

These will be shown in error messages and need user-friendly
representations.

DEV-10863

											
										
										
											2021-11-02 13:55:33 -04:00
+								impl Display for QName {
 								    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 								        match self {
 								            QName(Some(local), suffix) => write!(f, "{}:{}", local, suffix),
 								            QName(None, suffix) => suffix.fmt(f),
 								        }
 								    }
 								}
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								/// A span representing an opening (starting) element tag.
 								///
 								/// See [`EleSpan`] for more information.
 								#[derive(Debug, PartialEq, Eq, Clone, Copy)]
 								pub struct OpenSpan(Span, EleNameLen);
 								impl OpenSpan {
 								    pub fn without_name_span(span: Span) -> Self {
 								        Self(span, 0)
 								    }
 								}
 								/// A span representing a closing (ending) element tag.
 								///
 								/// See [`EleSpan`] for more information.
 								#[derive(Debug, PartialEq, Eq, Clone, Copy)]
 								pub struct CloseSpan(Span, EleNameLen);
 								impl CloseSpan {
 								    /// A [`CloseSpan`] representing the closing of an empty tag.
 								    ///
 								    /// This type of span has no element name.
 								    pub fn empty(span: Span) -> Self {
 								        Self::without_name_span(span)
 								    }
 								    pub fn without_name_span(span: Span) -> Self {
 								        Self(span, 0)
 								    }
 								}
-												tamer: Xirf::Text refinement

This teaches XIRF to optionally refine Text into RefinedText, which
determines whether the given SymbolId represents entirely whitespace.

This is something I've been putting off for some time, but now that I'm
parsing source language for NIR, it is necessary, in that we can only permit
whitespace Text nodes in certain contexts.

The idea is to capture the most common whitespace as preinterned
symbols.  Note that this heuristic ought to be determined from scanning a
codebase, which I haven't done yet; this is just an initial list.

The fallback is to look up the string associated with the SymbolId and
perform a linear scan, aborting on the first non-whitespace character.  This
combination of checks should be sufficiently performant for now considering
that this is only being run on source files, which really are not all that
large.  (They become large when template-expanded.)  I'll optimize further
if I notice it show up during profiling.

This also frees XIR itself from being concerned by Whitespace.  Initially I
had used quick-xml's whitespace trimming, but it messed up my span
calculations, and those were a pain in the ass to implement to begin with,
since I had to resort to pointer arithmetic.  I'd rather avoid tweaking it.

tameld will not check for whitespace, since it's not important---xmlo files,
if malformed, are the fault of the compiler; we can ignore text nodes except
in the context of code fragments, where they are never whitespace (unless
that's also a compiler bug).

Onward and yonward.

DEV-7145

											
										
										
											2022-07-27 15:49:38 -04:00
+								/// Number of bytes representing the name of the element.
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								pub type EleNameLen = SpanLenSize;
 								/// Spans associated with an element opening or closing tag.
 								///
 								/// The diagram below illustrates the behavior of [`EleSpan`].
 								/// Spans are represented by `[---]` intervals,
 								///   with the byte offset at each end,
 								///   and the single-letter span name centered below the interval.
 								///
 								/// ```text
 								///   <open  >          <open ...>     </close  >          <empty ' />
 								///   |[--]  |          |[--]          | [---]  |          |[---] ' []
 								///   |1  4  |          |1  4          | 2   6  |          |1   5 ' 9`10
 								///   | N    |          | N |          |   N    |          |  N | ' T
 								///   |      |          |   |          |        |          |    | '
 								///   [------]          [---]          [--------]          [----] '
 								///   0      7          0   4          0        9          0    5 '
 								///      T                T                T                 T    '
 								/// ```
 								///
 								/// Above we have
 								///
 								///   - `T` = [`EleSpan::span`]; and
 								///   - `N` = [`EleSpan::name_span`].
 								///
 								/// The purpose of the `T` span is to represent the entire token that has
 								///   been emitted by XIR.
 								/// If an opening tag does not contain any attributes,
 								///   then `T` represents the entire opening tag with both the opening and
 								///   closing angle brackets.
 								/// If an opening tag is expected to contain attributes,
 								///   then only the opening angle bracket is included.
 								/// A closing tag is entirely contained by `T`.
 								///
 								/// The empty tag is separated into two tokens in XIR---a
 								///   [`Token::Open`] and a [`Token::Close`] with a [`None`] for the name.
 								/// Unlike a typical closing tag,
 								///   there is no `N` span available for the closing token,
 								///     and so requesting one via [`EleSpan::name_span`] will simply
 								///     return the `T` span,
 								///       rather than complicating the API with an [`Option`].
 								/// It is generally assumed that reporting on element names will occur
 								///   within the context of the _opening_ tag.
 								///
 								/// The tag may contain whitespace following the element name,
 								///   as permitted by `STag` and `ETag` in the
 								///     [XML specification][xmlspec-tag].
 								///
 								/// [xmlspec-tag]: https://www.w3.org/TR/xml/#dt-stag
 								pub trait EleSpan {
 								    /// A [`Span`] encompassing the entire opening element token.
 								    ///
 								    /// Note that what exactly this token represents varies.
 								    fn span(&self) -> Span;
 								    /// Span representing the relevant portion of the element tag.
 								    ///
 								    /// This is a more descriptive alias of [`EleSpan::span`] that may be
 								    ///   appropriate in certain contexts.
 								    fn tag_span(&self) -> Span {
 								        self.span()
 								    }
 								    /// A [`Span`] representing only the element name,
 								    ///   if available.
 								    ///
 								    /// An element name is _not_ available for empty tags.
 								    /// Rather than complicating the API with [`Option`],
 								    ///   [`EleSpan::span`] is returned instead.
 								    fn name_span(&self) -> Span;
 								}
 								impl EleSpan for OpenSpan {
 								    fn span(&self) -> Span {
 								        match self {
 								            Self(t, _) => *t,
 								        }
 								    }
 								    fn name_span(&self) -> Span {
 								        match self {
 								            // <open  ...>
 								            //  ^^^^ offset '<' and length of name
 								            //
 								            // If the length is 0,
 								            //   then this will result in a 0-length span at the location
 								            //   that the element name ought to be,
 								            //     and so the resulting span will still be useful.
 								            // This should not happen for tokens read using XIR,
 								            //   but may happen for system-generated tokens.
 								            Self(t, name_len) => {
 								                t.context().span(t.offset().saturating_add(1), *name_len)
 								            }
 								        }
 								    }
 								}
 								impl EleSpan for CloseSpan {
 								    fn span(&self) -> Span {
 								        match self {
 								            Self(t, _) => *t,
 								        }
 								    }
 								    fn name_span(&self) -> Span {
 								        match self {
 								            // If the length of the element name is 0,
 								            //   then this must be an empty tag,
 								            //     which contains no independent element name.
 								            //
 								            // <foo ' />
 								            //      ' ^^
 								            Self(_t, 0) => self.span(),
 								            // </close  >
 								            //   ^^^^^ offset '</' and length of name
 								            Self(t, name_len) => {
 								                t.context().span(t.offset().saturating_add(2), *name_len)
 								            }
 								        }
 								    }
 								}
-												tamer: nir: Basic rate block translation

This commit is what I've been sitting on for testing some of the recent
changes; it is a very basic demonstration of lowering all the way down
from source XML files into the ASG.  This can be run on real files to
observe, beyond unit tests, how the system reacts.

Once this outputs data from the graph, we'll finally have tamec end-to-end
and can just keep filling the gaps.

I'm hoping to roll the desugaring process into NirToAir rather than having a
separate process as originally planned a couple of months back.

This also introduces the `wip-nir-to-air` feature flag.  Currently,
interpolation will cause a `Nir::BindIdent` to be emitted in blocks that
aren't yet emitting NIR, and so results in an invalid parse.

DEV-13159

											
										
										
											2023-01-23 16:30:25 -05:00
+								impl From<OpenSpan> for Span {
 								    fn from(value: OpenSpan) -> Self {
 								        value.span()
 								    }
 								}
 								impl From<CloseSpan> for Span {
 								    fn from(value: CloseSpan) -> Self {
 								        value.span()
 								    }
 								}
-												tamer: xir: {NodeStream=>Token}

I decided not to do this in a previous commit because I had documented
"NodeStream" elsewhere, so I'd like it to be in the Git history to
understand its evolution.

This never was a "Node" stream beyond the initial concept phase, because it
represents tokens that aren't themselves nodes.  It is intended to generate
XML nodes, but may need to accommodate non-nodes (e.g. XML declarations) in
the future.

The name originated from `Node`, which was a tree-based IR that was
initially conceived, but removed because it's not yet needed.  What we need
is a streaming IR for xmle writing, and then for reading and echoing back
out XML for the new frontend.

											
										
										
											2021-08-20 10:30:27 -04:00
+								/// Lightly-structured XML tokens with associated [`Span`]s.
 								///
 								/// This is a streamable IR for XML.
 								/// A writer requires knowledge only of a previous state,
 								///   such as whether a node is open,
 								///   and so this IR can be processed by a simple state machine
 								///     (see [`writer::WriterState`]).
-												tamer: obj::xmle::xir: Extract ElemWrap into ir::xir::iter

This generalizes it a bit and provides tests, which was always the intent;
the existing code was POC to determine if this could be done without
performance degradation (see that commit for more information).

											
										
										
											2021-10-11 10:33:24 -04:00
+								#[derive(Debug, Clone, PartialEq, Eq)]
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								pub enum Token {
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								    /// Opening tag of an element.
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								    Open(QName, OpenSpan),
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
 								    /// Closing tag of an element.
 								    ///
-												tamer: ir::xir: Combine Token::{SelfClose, Close} variants

This removes `SelfClose` and merges it with `Close` by making the first
parameter an `Option`.  This isn't really ideal, but it really simplifies
pattern matching, especially for the next commit.  I'll have more details
there.

The primary motivation was lack of stabalization for binding after `@` in
matches, e.g. `Foo(name, ele) | ele @ Element { name, .. }`.  It looks like
it's ready, though; maybe next Rust release?

  https://github.com/rust-lang/rust/issues/65490

I don't know if I'll revert this change after then.  This seems plenty
clear, albeit more verbose.

											
										
										
											2021-09-13 12:58:54 -04:00
+								    /// If the name is [`None`],
 								    ///   then the tag is self-closing.
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								    /// This is intended primarily as a safety measure:
 								    ///   It allows writers to act as simple state machines without having
 								    ///     to ensure balancing by indicating that a node was intended to
 								    ///     self-close.
 								    ///   Otherwise,
 								    ///     we wouldn't know whether to self-close or to close and then
 								    ///     create a new closing tag;
 								    ///       if we blindly did the former,
 								    ///         we risk losing a closing tag when it wasn't intended.
 								    ///   Instead of losing tags,
 								    ///     writers can error,
 								    ///       indicating a bug in the stream.
-												tamer: ir::xir: Combine Token::{SelfClose, Close} variants

This removes `SelfClose` and merges it with `Close` by making the first
parameter an `Option`.  This isn't really ideal, but it really simplifies
pattern matching, especially for the next commit.  I'll have more details
there.

The primary motivation was lack of stabalization for binding after `@` in
matches, e.g. `Foo(name, ele) | ele @ Element { name, .. }`.  It looks like
it's ready, though; maybe next Rust release?

  https://github.com/rust-lang/rust/issues/65490

I don't know if I'll revert this change after then.  This seems plenty
clear, albeit more verbose.

											
										
										
											2021-09-13 12:58:54 -04:00
+								    ///
 								    /// The reason for using an option here rather than a variant is to
 								    ///   simplify pattern matching,
 								    ///     given especially that bindings after `@` in patterns have not
 								    ///     yet been stabalized at the time of writing (but are very
 								    ///     close!).
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								    Close(Option<QName>, CloseSpan),
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
-												tamer: ir::xir: Add Token::AttrValueFragment with writer support

This is implemented only for the writer, since its use case is to be able to
concatenate strings without copying during writing.

It doesn't really make sense to support this in XIR Tree, since a reader
should never produce this.  But if we ever run into this (e.g. due to some
internal processing pipeline), we'll address it then; XIR Tree might have to
do copying, then, but should probably wait until encountering all fragments
before interning.  That'd be a distraction right now.

											
										
										
											2021-09-21 00:13:03 -04:00
+								    /// Element attribute name.
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								    AttrName(QName, Span),
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
-												tamer: ir::xir: Add Token::AttrValueFragment with writer support

This is implemented only for the writer, since its use case is to be able to
concatenate strings without copying during writing.

It doesn't really make sense to support this in XIR Tree, since a reader
should never produce this.  But if we ever run into this (e.g. due to some
internal processing pipeline), we'll address it then; XIR Tree might have to
do copying, then, but should probably wait until encountering all fragments
before interning.  That'd be a distraction right now.

											
										
										
											2021-09-21 00:13:03 -04:00
+								    /// Element attribute value.
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								    AttrValue(SymbolId, Span),
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
-												tamer: ir::xir: Add Token::AttrValueFragment with writer support

This is implemented only for the writer, since its use case is to be able to
concatenate strings without copying during writing.

It doesn't really make sense to support this in XIR Tree, since a reader
should never produce this.  But if we ever run into this (e.g. due to some
internal processing pipeline), we'll address it then; XIR Tree might have to
do copying, then, but should probably wait until encountering all fragments
before interning.  That'd be a distraction right now.

											
										
										
											2021-09-21 00:13:03 -04:00
+								    /// A portion of an element attribute value.
 								    ///
 								    /// This allows for concatenating values into an attribute value without
 								    ///   having to copy values.
 								    /// The last fragment must be a [`Token::AttrValue`].
 								    ///
-												tamer: ir::xir::tree: Extract Attr{,List} into new module

The `tree` module is getting more difficult to navigate.  The tests still
remain where they were, since a bunch of concerns are mixed together.  Any
tests specific only to this module will be added here.

											
										
										
											2021-09-21 10:43:23 -04:00
+								    /// Since each fragment contains a span,
 								    ///   this also potentially gives higher resolution for the origin of
 								    ///   components of generated attribute values.
-												tamer: xir: Remove Attr::Extensible

This removes XIRT support for attribute fragments.  The reason is that
because this is a write-only operation---fragments are used to concatenate
SymbolIds without reallocation, which can only happen if we are generating
XIR internally.

Given that this cannot happen during read, it was a mistake to complicate
the parsers.  But it makes sense why I did originally, given that the XIRT
parser was written for simplifying test cases.  But now that we want parsers
for real, and are writing production-quality parsers, this extra complexity
is very undesirable.

As a bonus, we also avoid any potential for heap allocations related to
attributes.  Granted, they didn't _really_ exist to begin with, but it was
part of XIRT, and was ugly.

DEV-11268

											
										
										
											2021-12-06 14:26:58 -05:00
+								    ///
 								    /// _This should be used only for writing._
 								    /// These will never be encountered during reading,
 								    ///   and so to keep the parsers and IRs simple,
 								    ///   there is no support for fragments beyond XIR.
 								    /// (There was in the past,
 								    ///    but it was removed.)
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								    AttrValueFragment(SymbolId, Span),
-												tamer: ir::xir: Add Token::AttrValueFragment with writer support

This is implemented only for the writer, since its use case is to be able to
concatenate strings without copying during writing.

It doesn't really make sense to support this in XIR Tree, since a reader
should never produce this.  But if we ever run into this (e.g. due to some
internal processing pipeline), we'll address it then; XIR Tree might have to
do copying, then, but should probably wait until encountering all fragments
before interning.  That'd be a distraction right now.

											
										
										
											2021-09-21 00:13:03 -04:00
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								    /// Comment node.
-												tamer: xir: Remove Text enum

Like previous commits, this replaces the explicit escaping context with the
convention that all values retrieved from `xir` are unescaped on read and
escaped on write.

Comments are a notable TODO, since we must escape only `--`.

CData is also an issue.  I had _expected_ to use it as a means to avoid
unescaping fragments, but I had forgotten that quick_xml hard-codes escaping
on read, so that it can re-use BytesStart!  That is terribly unfortunate,
and may result in us having to re-implement our own read method in the
future to avoid this nonsense.  So I'm just leaving it as a TODO for now.

DEV-11081

											
										
										
											2021-11-15 23:47:14 -05:00
+								    Comment(SymbolId, Span),
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
 								    /// Character data as part of an element.
 								    ///
-												tamer: xir: {NodeStream=>Token}

I decided not to do this in a previous commit because I had documented
"NodeStream" elsewhere, so I'd like it to be in the Git history to
understand its evolution.

This never was a "Node" stream beyond the initial concept phase, because it
represents tokens that aren't themselves nodes.  It is intended to generate
XML nodes, but may need to accommodate non-nodes (e.g. XML declarations) in
the future.

The name originated from `Node`, which was a tree-based IR that was
initially conceived, but removed because it's not yet needed.  What we need
is a streaming IR for xmle writing, and then for reading and echoing back
out XML for the new frontend.

											
										
										
											2021-08-20 10:30:27 -04:00
+								    /// See also [`CData`](Token::CData) variant.
-												tamer: xir: Remove Text enum

Like previous commits, this replaces the explicit escaping context with the
convention that all values retrieved from `xir` are unescaped on read and
escaped on write.

Comments are a notable TODO, since we must escape only `--`.

CData is also an issue.  I had _expected_ to use it as a means to avoid
unescaping fragments, but I had forgotten that quick_xml hard-codes escaping
on read, so that it can re-use BytesStart!  That is terribly unfortunate,
and may result in us having to re-implement our own read method in the
future to avoid this nonsense.  So I'm just leaving it as a TODO for now.

DEV-11081

											
										
										
											2021-11-15 23:47:14 -05:00
+								    Text(SymbolId, Span),
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
 								    /// CData node (`<![CDATA[...]]>`).
 								    ///
 								    /// _Warning: It is up to the caller to ensure that the string `]]>` is
 								    ///   not present in the text!_
 								    /// This is intended for reading existing XML data where CData is
 								    ///   already present,
 								    ///     not for producing new CData safely!
-												tamer: xir: Remove Text enum

Like previous commits, this replaces the explicit escaping context with the
convention that all values retrieved from `xir` are unescaped on read and
escaped on write.

Comments are a notable TODO, since we must escape only `--`.

CData is also an issue.  I had _expected_ to use it as a means to avoid
unescaping fragments, but I had forgotten that quick_xml hard-codes escaping
on read, so that it can re-use BytesStart!  That is terribly unfortunate,
and may result in us having to re-implement our own read method in the
future to avoid this nonsense.  So I'm just leaving it as a TODO for now.

DEV-11081

											
										
										
											2021-11-15 23:47:14 -05:00
+								    CData(SymbolId, Span),
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								}
-												tamer: ir::xir::Token: Implement Display

This also modifies xir::tree errors to use Display instead of Debug when
rendering error output.

DEV-10863

											
										
										
											2021-11-03 14:54:37 -04:00
+								impl Display for Token {
 								    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-												tamer: xir::Token: Remove span from Display

This was missed when removing it from other Display impls when the new
diagnostic system was introduced.  Raw `Span`s display byte offsets and the
context, which is no longer desirable as part of an error message.

DEV-10936

											
										
										
											2022-05-03 09:09:55 -04:00
+								        // _Do not_ render large amounts of text here;
 								        //   this is not only a risk depending on what is output,
 								        //     but the diagnostic system also quote source lines to provide
 								        //     the necessary context.
-												tamer: ir::xir::Token: Implement Display

This also modifies xir::tree errors to use Display instead of Debug when
rendering error output.

DEV-10863

											
										
										
											2021-11-03 14:54:37 -04:00
+								        match self {
-												tamer: xir: Format tokens without tt quotes

Whether or not quoting is appropriate depends on context, and that parent
context is already performing the quoting.  For example:

  error: expected `</rater>`, but found `<import>`
    --> /home/[...]/foo.xml:2:1
     |
   2 | <rater xmlns="http://www.lovullo.com/rater"
     | ------ note: element starts here

    --> /home/[...]/foo.xml:7:3
     |
   7 |   <import package="/rater/core/base" />
     |   ^^^^^^^ error: expected `</rater>`

In these cases (obviously I'm still working on the parser, since this is
nonsense), the parser is responsible for quoting the token "<import>".

DEV-7145

											
										
										
											2022-07-29 00:44:58 -04:00
+								            Self::Open(qname, _) => OpenXmlEle::fmt(qname, f),
 								            Self::Close(Some(qname), _) => CloseXmlEle::fmt(qname, f),
-												tamer: ir::xir::Token: Implement Display

This also modifies xir::tree errors to use Display instead of Debug when
rendering error output.

DEV-10863

											
										
										
											2021-11-03 14:54:37 -04:00
+								            // Its context is contained within the Open,
 								            //   and hopefully any user-visible errors will display that instead.
-												tamer: xir::Token: Remove span from Display

This was missed when removing it from other Display impls when the new
diagnostic system was introduced.  Raw `Span`s display byte offsets and the
context, which is no longer desirable as part of an error message.

DEV-10936

											
										
										
											2022-05-03 09:09:55 -04:00
+								            Self::Close(None, _) => {
-												tamer: xir: Format tokens without tt quotes

Whether or not quoting is appropriate depends on context, and that parent
context is already performing the quoting.  For example:

  error: expected `</rater>`, but found `<import>`
    --> /home/[...]/foo.xml:2:1
     |
   2 | <rater xmlns="http://www.lovullo.com/rater"
     | ------ note: element starts here

    --> /home/[...]/foo.xml:7:3
     |
   7 |   <import package="/rater/core/base" />
     |   ^^^^^^^ error: expected `</rater>`

In these cases (obviously I'm still working on the parser, since this is
nonsense), the parser is responsible for quoting the token "<import>".

DEV-7145

											
										
										
											2022-07-29 00:44:58 -04:00
+								                write!(f, "/>")
-												tamer: ir::xir::Token: Implement Display

This also modifies xir::tree errors to use Display instead of Debug when
rendering error output.

DEV-10863

											
										
										
											2021-11-03 14:54:37 -04:00
+								            }
-												tamer: xir: Format tokens without tt quotes

Whether or not quoting is appropriate depends on context, and that parent
context is already performing the quoting.  For example:

  error: expected `</rater>`, but found `<import>`
    --> /home/[...]/foo.xml:2:1
     |
   2 | <rater xmlns="http://www.lovullo.com/rater"
     | ------ note: element starts here

    --> /home/[...]/foo.xml:7:3
     |
   7 |   <import package="/rater/core/base" />
     |   ^^^^^^^ error: expected `</rater>`

In these cases (obviously I'm still working on the parser, since this is
nonsense), the parser is responsible for quoting the token "<import>".

DEV-7145

											
										
										
											2022-07-29 00:44:58 -04:00
+								            Self::AttrName(qname, _) => XmlAttr::fmt(qname, f),
 								            Self::AttrValue(attr_val, _) => XmlAttrValueQuote::fmt(attr_val, f),
-												tamer: xir::Token: Remove span from Display

This was missed when removing it from other Display impls when the new
diagnostic system was introduced.  Raw `Span`s display byte offsets and the
context, which is no longer desirable as part of an error message.

DEV-10936

											
										
										
											2022-05-03 09:09:55 -04:00
+								            Self::AttrValueFragment(attr_val, _) => {
-												tamer: xir: Format tokens without tt quotes

Whether or not quoting is appropriate depends on context, and that parent
context is already performing the quoting.  For example:

  error: expected `</rater>`, but found `<import>`
    --> /home/[...]/foo.xml:2:1
     |
   2 | <rater xmlns="http://www.lovullo.com/rater"
     | ------ note: element starts here

    --> /home/[...]/foo.xml:7:3
     |
   7 |   <import package="/rater/core/base" />
     |   ^^^^^^^ error: expected `</rater>`

In these cases (obviously I'm still working on the parser, since this is
nonsense), the parser is responsible for quoting the token "<import>".

DEV-7145

											
										
										
											2022-07-29 00:44:58 -04:00
+								                write!(
 								                    f,
 								                    "value fragment {}",
 								                    XmlAttrValueQuote::wrap(attr_val)
 								                )
-												tamer: ir::xir::Token: Implement Display

This also modifies xir::tree errors to use Display instead of Debug when
rendering error output.

DEV-10863

											
										
										
											2021-11-03 14:54:37 -04:00
+								            }
-												tamer: xir::Token: Remove span from Display

This was missed when removing it from other Display impls when the new
diagnostic system was introduced.  Raw `Span`s display byte offsets and the
context, which is no longer desirable as part of an error message.

DEV-10936

											
										
										
											2022-05-03 09:09:55 -04:00
+								            Self::Comment(..) => write!(f, "comment"),
 								            Self::Text(..) => write!(f, "text"),
 								            Self::CData(..) => write!(f, "CDATA"),
-												tamer: ir::xir::Token: Implement Display

This also modifies xir::tree errors to use Display instead of Debug when
rendering error output.

DEV-10863

											
										
										
											2021-11-03 14:54:37 -04:00
+								        }
 								    }
 								}
-												tamer: xir::parse: Attribute parser generator

This is the first parser generator for the parsing framework.  I've been
waiting quite a while to do this because I wanted to be sure that I
understood how I intended to write the attribute parsers manually.  Now that
I'm about to start parsing source XML files, it is necessary to have a
parser generator.

Typically one thinks of a parser generator as a separate program that
generates code for some language, but that is not always the case---that
represents a lack of expressiveness in the language itself (e.g. C).  Here,
I simply use Rust's macro system, which should be a concept familiar to
someone coming from a language like Lisp.

This also resolves where I stand on parser combinators with respect to this
abstraction: they both accomplish the exact same thing (composition of
smaller parsers), but this abstraction doesn't do so in the typical
functional way.  But the end result is the same.

The parser generated by this abstraction will be optimized an inlined in the
same manner as the hand-written parsers.  Since they'll be tightly coupled
with an element parser (which too will have a parser generator), I expect
that most attribute parsers will simply be inlined; they exist as separate
parsers conceptually, for the same reason that you'd use parser combinators.

It's worth mentioning that this awkward reliance on dead state for a
lookahead token to determine when aggregation is complete rubs me the wrong
way, but resolving it would involve reintroducing the XIR AttrEnd that I had
previously removed.  I'll keep fighting with myself on this, but I want to
get a bit further before I determine if it's worth the tradeoff of
reintroducing (more complex IR but simplified parsing).

DEV-7145

											
										
										
											2022-06-13 11:17:21 -04:00
+								impl crate::parse::Token for Token {
-												tamer: parser::Parser: cfg(test) tracing

This produces useful parse traces that are output as part of a failing test
case.  The parser generator macros can be a bit confusing to deal with when
things go wrong, so this helps to clarify matters.

This is _not_ intended to be machine-readable, but it does show that it
would be possible to generate machine-readable output to visualize the
entire lowering pipeline.  Perhaps something for the future.

I left these inline in Parser::feed_tok because they help to elucidate what
is going on, just by reading what the trace would output---that is, it helps
to make the method more self-documenting, albeit a tad bit more
verbose.  But with that said, it should probably be extracted at some point;
I don't want this to set a precedent where composition is feasible.

Here's an example from test cases:

  [Parser::feed_tok] (input IR: XIRF)
  |  ==> Parser before tok is parsing attributes for `package`.
  |   |  Attrs_(SutAttrsState_ { ___ctx: (QName(None, LocalPart(NCName(SymbolId(46 "package")))), OpenSpan(Span { len: 0, offset: 0, ctx: Context(SymbolId(1 "#!DUMMY")) }, 10)), ___done: false })
  |
  |  ==> XIRF tok: `<unexpected>`
  |   |  Open(QName(None, LocalPart(NCName(SymbolId(82 "unexpected")))), OpenSpan(Span { len: 0, offset: 1, ctx: Context(SymbolId(1 "#!DUMMY")) }, 10), Depth(1))
  |
  |  ==> Parser after tok is expecting opening tag `<classify>`.
  |   |  ChildA(Expecting_)
  |   |  Lookahead: Some(Lookahead(Open(QName(None, LocalPart(NCName(SymbolId(82 "unexpected")))), OpenSpan(Span { len: 0, offset: 1, ctx: Context(SymbolId(1 "#!DUMMY")) }, 10), Depth(1))))
  = note: this trace was output as a debugging aid because `cfg(test)`.

  [Parser::feed_tok] (input IR: XIRF)
  |  ==> Parser before tok is expecting opening tag `<classify>`.
  |   |  ChildA(Expecting_)
  |
  |  ==> XIRF tok: `<unexpected>`
  |   |  Open(QName(None, LocalPart(NCName(SymbolId(82 "unexpected")))), OpenSpan(Span { len: 0, offset: 1, ctx: Context(SymbolId(1 "#!DUMMY")) }, 10), Depth(1))
  |
  |  ==> Parser after tok is attempting to recover by ignoring element with unexpected name `unexpected` (expected `classify`).
  |   |  ChildA(RecoverEleIgnore_(QName(None, LocalPart(NCName(SymbolId(82 "unexpected")))), OpenSpan(Span { len: 0, offset: 1, ctx: Context(SymbolId(1 "#!DUMMY")) }, 10), Depth(1)))
  |   |  Lookahead: None
  = note: this trace was output as a debugging aid because `cfg(test)`.

DEV-7145

											
										
										
											2022-07-18 14:32:34 -04:00
+								    fn ir_name() -> &'static str {
 								        "XIR"
 								    }
-												tamer: xir::Token::span: New method

This permits retrieving a Span from any Token variant.  To support this,
rather than having this return an Option, Token::AttrEnd was augmented with
a Span; this results in a much simpler and friendlier API.

DEV-11268

											
										
										
											2021-12-06 14:48:55 -05:00
+								    /// Retrieve the [`Span`] associated with a given [`Token`].
 								    ///
 								    /// Every token has an associated span.
-												tamer: xir::parse: Generalize input token type

This adds a `Token` type to `ParseState`.  Everything uses `xir::Token`
currently, but `XmloReader` will use `xir::flat::Object`.

Now that this has been generalized beyond XIR, the parser ought to be
hoisted up a level.

DEV-10863

											
										
										
											2022-03-18 15:26:05 -04:00
+								    fn span(&self) -> Span {
-												tamer: xir::Token::span: New method

This permits retrieving a Span from any Token variant.  To support this,
rather than having this return an Option, Token::AttrEnd was augmented with
a Span; this results in a much simpler and friendlier API.

DEV-11268

											
										
										
											2021-12-06 14:48:55 -05:00
+								        use Token::*;
 								        match self {
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								            Open(_, OpenSpan(span, _))
 								            | Close(_, CloseSpan(span, _))
-												tamer: xir::Token::span: New method

This permits retrieving a Span from any Token variant.  To support this,
rather than having this return an Option, Token::AttrEnd was augmented with
a Span; this results in a much simpler and friendlier API.

DEV-11268

											
										
										
											2021-12-06 14:48:55 -05:00
+								            | AttrName(_, span)
 								            | AttrValue(_, span)
 								            | AttrValueFragment(_, span)
 								            | Comment(_, span)
 								            | Text(_, span)
-												tamer: Xirf::Text refinement

This teaches XIRF to optionally refine Text into RefinedText, which
determines whether the given SymbolId represents entirely whitespace.

This is something I've been putting off for some time, but now that I'm
parsing source language for NIR, it is necessary, in that we can only permit
whitespace Text nodes in certain contexts.

The idea is to capture the most common whitespace as preinterned
symbols.  Note that this heuristic ought to be determined from scanning a
codebase, which I haven't done yet; this is just an initial list.

The fallback is to look up the string associated with the SymbolId and
perform a linear scan, aborting on the first non-whitespace character.  This
combination of checks should be sufficiently performant for now considering
that this is only being run on source files, which really are not all that
large.  (They become large when template-expanded.)  I'll optimize further
if I notice it show up during profiling.

This also frees XIR itself from being concerned by Whitespace.  Initially I
had used quick-xml's whitespace trimming, but it messed up my span
calculations, and those were a pain in the ass to implement to begin with,
since I had to resort to pointer arithmetic.  I'd rather avoid tweaking it.

tameld will not check for whitespace, since it's not important---xmlo files,
if malformed, are the fault of the compiler; we can ignore text nodes except
in the context of code fragments, where they are never whitespace (unless
that's also a compiler bug).

Onward and yonward.

DEV-7145

											
										
										
											2022-07-27 15:49:38 -04:00
+								            | CData(_, span) => *span,
-												tamer: xir::Token::span: New method

This permits retrieving a Span from any Token variant.  To support this,
rather than having this return an Option, Token::AttrEnd was augmented with
a Span; this results in a much simpler and friendlier API.

DEV-11268

											
										
										
											2021-12-06 14:48:55 -05:00
+								        }
 								    }
 								}
-												tamer: xir::parse: Attribute parser generator

This is the first parser generator for the parsing framework.  I've been
waiting quite a while to do this because I wanted to be sure that I
understood how I intended to write the attribute parsers manually.  Now that
I'm about to start parsing source XML files, it is necessary to have a
parser generator.

Typically one thinks of a parser generator as a separate program that
generates code for some language, but that is not always the case---that
represents a lack of expressiveness in the language itself (e.g. C).  Here,
I simply use Rust's macro system, which should be a concept familiar to
someone coming from a language like Lisp.

This also resolves where I stand on parser combinators with respect to this
abstraction: they both accomplish the exact same thing (composition of
smaller parsers), but this abstraction doesn't do so in the typical
functional way.  But the end result is the same.

The parser generated by this abstraction will be optimized an inlined in the
same manner as the hand-written parsers.  Since they'll be tightly coupled
with an element parser (which too will have a parser generator), I expect
that most attribute parsers will simply be inlined; they exist as separate
parsers conceptually, for the same reason that you'd use parser combinators.

It's worth mentioning that this awkward reliance on dead state for a
lookahead token to determine when aggregation is complete rubs me the wrong
way, but resolving it would involve reintroducing the XIR AttrEnd that I had
previously removed.  I'll keep fighting with myself on this, but I want to
get a bit further before I determine if it's worth the tradeoff of
reintroducing (more complex IR but simplified parsing).

DEV-7145

											
										
										
											2022-06-13 11:17:21 -04:00
+								impl crate::parse::Object for Token {}
-												tamer: Integrate xir::reader as a parser in the lowering pipeline

This allows `XmlXirReader` to be used in a `Lower` operation, just as
everything else, bringing me one step closer to a pipeline that can be
concisely represented; this is finally beginning to unify in a clear way,
though it is still a bit of a mess.

This causes `XmlXirReader` to _act_ like a `parse::Parser` in that it yields
a `ParsedResult`, but it does not use `parse::Parser` itself; that was the
_original_ plan: convert it into a `ParseState` where `XmlXirReader` became
a context, and force `Parser` to yield by feeding it a stream of tokens with
`repeat`, but that ended up performing poorly relative to this change.  I
did some investigation, which I might write about in the future, but for
now, this solution works just fine.

DEV-7145

											
										
										
											2022-06-02 10:30:44 -04:00
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								#[cfg(test)]
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								pub mod test {
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								    use super::*;
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								    use crate::convert::ExpectInto;
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								    use crate::sym::GlobalSymbolIntern;
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								    use std::convert::TryInto;
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								    use std::fmt::Debug;
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
 								    type TestResult = Result<(), Box<dyn std::error::Error>>;
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								    // Prefer [`open`] below when possible.
 								    impl From<Span> for OpenSpan {
 								        fn from(span: Span) -> Self {
 								            Self::without_name_span(span)
 								        }
 								    }
 								    // Prefer [`close`] below when possible.
 								    impl From<Span> for CloseSpan {
 								        fn from(span: Span) -> Self {
 								            Self::without_name_span(span)
 								        }
 								    }
 								    /// Hastily and lazily produce a [`XirfToken::Open`].
 								    ///
 								    /// This function is not suitable for production use as it does not
 								    ///   produce a complete [`OpenSpan`].
 								    pub fn open<Q: TryInto<QName>, S: Into<OpenSpan>>(
 								        qname: Q,
 								        span: S,
 								    ) -> Token
 								    where
 								        <Q as TryInto<QName>>::Error: Debug,
 								    {
 								        Token::Open(qname.unwrap_into(), span.into())
 								    }
 								    /// Hastily and lazily produce a [`XirfToken::Close`] for an empty tag.
 								    ///
 								    /// This is [`close`] with the omission of the `qname` argument; the
 								    ///   type parameter `Q` cannot be inferred if the value is [`None`].
 								    ///
 								    /// This function is not suitable for production use as it does not
 								    ///   produce a complete [`OpenSpan`].
 								    pub fn close_empty<S: Into<CloseSpan>>(span: S) -> Token {
 								        Token::Close(None, span.into())
 								    }
 								    /// Hastily and lazily produce a [`XirfToken::Close`].
 								    ///
 								    /// See also [`close_empty`] if `Q` cannot be inferred.
 								    ///
 								    /// This function is not suitable for production use as it does not
 								    ///   produce a complete [`OpenSpan`].
 								    pub fn close<Q: TryInto<QName>, S: Into<CloseSpan>>(
 								        qname: Option<Q>,
 								        span: S,
 								    ) -> Token
 								    where
 								        <Q as TryInto<QName>>::Error: Debug,
 								    {
 								        Token::Close(qname.map(ExpectInto::unwrap_into), span.into())
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								    }
 								    mod name {
 								        use super::*;
 								        #[test]
 								        fn ncname_comparable_to_sym() {
 								            let foo = "foo".intern();
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								            assert_eq!(NCName(foo), foo);
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								        }
 								        #[test]
 								        fn ncname_try_into_from_str_no_colon() -> TestResult {
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								            let name: NCName = "no-colon".try_into()?;
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								            assert_eq!(name, "no-colon".intern());
 								            Ok(())
 								        }
 								        #[test]
 								        fn ncname_try_into_from_str_fails_with_colon() {
 								            assert_eq!(
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								                NCName::try_from("look:a-colon"),
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                Err(SpanlessError::NCColon("look:a-colon".into()))
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								            );
 								        }
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								        #[test]
 								        fn ncname_from_byte_slice() -> TestResult {
 								            let name: NCName = (b"no-colon" as &[u8]).try_into()?;
 								            assert_eq!(name, "no-colon".intern());
 								            Ok(())
 								        }
 								        #[test]
 								        fn ncname_from_byte_slice_fails_with_colon() {
 								            assert_eq!(
 								                NCName::try_from(b"a:colon" as &[u8]),
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                Err(SpanlessError::NCColon("a:colon".into()))
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								            );
 								        }
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								        #[test]
 								        fn local_name_from_local_part_only() -> TestResult {
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								            let name = QName::new_local("foo".try_into()?);
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
 								            assert_eq!(name.local_name(), "foo".try_into()?);
 								            assert_eq!(None, name.prefix());
 								            Ok(())
 								        }
-												tamer: Start of XIR-based xmle writer

This has been a long time coming, and has been repeatedly stashed as other
parts of the system have evolved to support it.  The introduction of the XIR
tree was to write tests for this (which are sloppy atm).

This currently writes out the `xmle` header and _most_ of the `l:dep`
section; it's missing the object-type-specific attributes.  There is,
relatively speaking, not much more work to do here.

The feature flag `wip-xir-xmle-writer` was introduced to toggle this system
in place of `XmleWriter`.  Initial benchmarks show that it will be
competitive with the quick-xml-based writer, but remember that is not the
goal: the purpose of this is to test XIR in a production system before we
continue to implement it for a frontend, and to refactor so that we do not
have multiple implementations writing XML files (once we echo the source XML
files).

I'm excited to get this done with so that I can move on.  This has been
rather exhausting.

											
										
										
											2021-09-28 14:52:31 -04:00
+								        #[test]
 								        fn local_name_from_option_tuple() -> TestResult {
 								            let name: QName = (Option::<&str>::None, "foo").try_into()?;
 								            assert_eq!(name.local_name(), "foo".try_into()?);
 								            assert_eq!(None, name.prefix());
 								            Ok(())
 								        }
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								        #[test]
 								        fn fully_qualified_name() -> TestResult {
-												tamer: Remove Ix generalization throughout system

This had the writing on the wall all the same as the `'i` interner lifetime
that came before it.  It was too much of a maintenance burden trying to
accommodate both 16-bit and 32-bit symbols generically.

There is a situation where we do still want 16-bit symbols---the
`Span`.  Therefore, I have left generic support for symbol sizes, as well as
the different global interners, but `SymbolId` now defaults to 32-bit, as
does `Asg`.  Further, the size parameter has been removed from the rest of
the code, with the exception of `Span`.

This cleans things up quite a bit, and is much nicer to work with.  If we
want 16-bit symbols in the future for packing to increase CPU cache
performance, we can handle that situation then in that specific case; it's a
premature optimization that's not at all worth the effort here.

											
										
										
											2021-09-23 14:52:53 -04:00
+								            let name: QName = ("foons", "foo").try_into()?;
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
 								            assert_eq!(name.prefix(), Some("foons".try_into()?));
 								            assert_eq!(name.local_name(), "foo".try_into()?);
 								            Ok(())
 								        }
 								    }
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								    mod ele_span {
 								        use super::*;
-												tamer: span::dummy: New module to hold DUMMY_SPAN and derivatives

Various DUMMY_SPAN-derived spans are used by many test cases, so this
finally extracts them---something I've been meaning to do for some time.

This also places DUMMY_SPAN behind a `cfg(test)` directive to ensure that it
is _only_ used in tests; UNKNOWN_SPAN should be used when a span is actually
unknown, which may also be the case during development.

DEV-7145

											
										
										
											2022-07-29 10:27:46 -04:00
+								        use crate::span::dummy::DUMMY_CONTEXT as DC;
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
 								        #[test]
 								        fn open_without_attrs() {
 								            // See docblock for [`EleSpan`].
 								            const T: Span = DC.span(0, 8); // Relevant portion of tag
 								            const N: Span = DC.span(1, 4); // Element name
 								            let sut = OpenSpan(T, N.len());
 								            assert_eq!(sut.span(), T);
 								            assert_eq!(sut.name_span(), N);
 								        }
 								        #[test]
 								        fn open_with_attrs() {
 								            // See docblock for [`EleSpan`].
 								            const T: Span = DC.span(0, 5); // Relevant portion of tag
 								            const N: Span = DC.span(1, 4); // Element name
 								            let sut = OpenSpan(T, N.len());
 								            assert_eq!(sut.span(), T);
 								            assert_eq!(sut.name_span(), N);
 								        }
 								        #[test]
 								        fn close() {
 								            // See docblock for [`EleSpan`].
 								            const T: Span = DC.span(0, 10); // Relevant portion of tag
 								            const N: Span = DC.span(2, 5); //  Element name
 								            let sut = CloseSpan(T, N.len());
 								            assert_eq!(sut.span(), T);
 								            assert_eq!(sut.name_span(), N);
 								        }
 								        #[test]
 								        fn close_empty() {
 								            // See docblock for [`EleSpan`].
 								            const T: Span = DC.span(9, 2); // Relevant portion of tag
 								            let sut = CloseSpan(T, 0);
 								            assert_eq!(sut.span(), T);
 								            // There is no name,
 								            //   only Zuul.
 								            assert_eq!(sut.name_span(), T);
 								        }
 								    }
-												tamer: XIR: Working concept

This is a working streaming IR for XML.  I want to get this committed before
I go further cleaning it up and integrating it into the xmle writer.

This is lacking detailed documentation, and the names of things may end up
changing.

Initial benchmarks do show that it has a ~2x performance improvement over
quick-xml when dealing with two attributes on a node, and I suspect that
improvement will increase with the number of attributes.  We will see how it
compares in real-world benchmarks once the linker has been modified to use
it.

The goal isn't to _avoid_ quick-xml---it'll be used in the future for things
like escaping that would be a huge waste to implement ourselves.  It just so
happened that quick-xml was not beneficial for these changes; indeed, its
own writer is fairly simple for the portions that were implemented here, so
there's no use in fighting with its API, particularly around attributes and
our need to explicitly control whitespace (with the intent of handling code
formatters in the future).

To put this into perspective: the reason this work is being done isn't to
refactor the linker, or to speed it up, but to generalize XML writing and
provide a suitable IR for use in the compiler.  The first step of the
frontend is to essentially echo the XML token stream back out so we can
incrementally parse it and do something useful, to incrementally rewrite the
compiler in Rust.

											
										
										
											2021-08-20 10:09:55 -04:00
+								}