tame/tamer/src/xir/reader.rs

// XIR reader
//
//  Copyright (C) 2014-2022 Ryan Specialty Group, LLC.
//
//  This file is part of TAME.
//
//  This program is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program.  If not, see <http://www.gnu.org/licenses/>.

//! Parse XML files into a XIR [`Token`] stream.
//!
//! This uses [`quick_xml`] as the parser.

use super::{
    error::SpanlessError, CloseSpan, DefaultEscaper, Error, Escaper, OpenSpan,
    Token,
};
use crate::{
    parse::{ParseError, Parsed, ParsedObject, ParsedResult},
    span::Context,
    sym::{st::raw::WS_EMPTY, GlobalSymbolInternBytes},
};
use quick_xml::{
    self,
    events::{
        attributes::Attributes, BytesDecl, BytesStart, Event as QuickXmlEvent,
    },
    Error as QuickXmlError,
};
use std::{borrow::Cow, collections::VecDeque, io::BufRead, result};

/// Result of a reader operation,
///   whose error type is the [`Error`] imported from the parent module.
pub type Result<T> = result::Result<T, Error>;

/// Parse XML into a XIR [`Token`] stream.
///
/// This reader is intended to be used as an [`Iterator`].
///
/// The underlying reader produces events in chunks that are far too
///   large for XIR,
///     so most [`Token`]s retrieved via this call are buffered.
/// Parsing takes place when that buffer is exhausted and the next event
///   is requested from the underlying reader
///     (see [`XmlXirReader::refill_buf`]).
/// Errors can only occur during parsing,
///   and will never occur on buffered tokens.
///
/// [`None`] is returned only on EOF,
///   not on error.
///
/// Type parameters:
///   - `'s` is the lifetime of the borrowed escaper;
///   - `B` is the buffered input from which XML is read; and
///   - `S` is the [`Escaper`] used to unescape text and attribute values,
///       defaulting to [`DefaultEscaper`].
pub struct XmlXirReader<'s, B, S = DefaultEscaper>
where
    B: BufRead,
    S: Escaper,
{
    /// Inner parser.
    reader: quick_xml::Reader<B>,

    /// Parsing context for reader.
    ///
    /// All spans produced by this reader are created from this context.
    ctx: Context,

    /// Buffer for [`quick_xml::Reader`].
    ///
    /// This is cleared each time a new event is read,
    ///   so it holds the bytes of at most one event.
    readbuf: Vec<u8>,

    /// [`Token`] buffer populated upon receiving a new event from
    ///   `reader`.
    ///
    /// This buffer serves [`Iterator::next`] requests until it is
    ///   depleted,
    ///     after which [`XmlXirReader::refill_buf`] requests another token
    ///     from `reader`.
    ///
    /// Tokens are added to the front and removed from the back,
    ///   so they are yielded in the order in which they were produced.
    tokbuf: VecDeque<Token>,

    /// System for unescaping string data.
    escaper: &'s S,
}

impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
    /// Create a reader that parses XML from `reader`,
    ///   unescaping string data using `escaper` and
    ///   producing spans from `ctx`.
    pub fn new(reader: B, escaper: &'s S, ctx: Context) -> Self {
        // Largely arbitrary;
        //   [`Token`]s are small enough that it likely does not matter
        //   much.
        const TOKBUF_CAPACITY: usize = 32;

        let mut inner = quick_xml::Reader::from_reader(reader);

        // Malformed inputs must be representable and reconstructable by
        //   XIR,
        //     so mismatched tags are tolerated here and left for XIRT to
        //     report.
        inner.check_end_names(false);

        Self {
            reader: inner,
            ctx,
            readbuf: Vec::new(),
            tokbuf: VecDeque::with_capacity(TOKBUF_CAPACITY),
            escaper,
        }
    }

    /// Parse using the underlying [`quick_xml::Reader`] and populate the
    ///   [`Token`] buffer.
    ///
    /// This is intended to be invoked once the buffer has been depleted by
    ///   [`XmlXirReader::next`].
    ///
    /// Both the token buffer and the read buffer are cleared,
    ///   and then a single event is read from the underlying reader.
    /// The first [`Token`] derived from that event is returned directly
    ///   rather than being buffered;
    ///     any tokens that follow it
    ///       (attributes and the close of a self-closing tag)
    ///       are placed into the token buffer.
    /// Events that produce no tokens
    ///   (empty text,
    ///     a valid XML declaration,
    ///     processing instructions,
    ///     and doctypes)
    ///   are skipped by recursively reading the next event.
    ///
    /// [`None`] is returned only when the underlying reader reports EOF;
    ///   errors are returned as `Some(Err(..))`.
    ///
    /// Panics
    /// ======
    /// CDATA sections are not yet supported and will panic via [`todo!`].
    pub fn refill_buf(&mut self) -> Option<Result<Token>> {
        // Clear any previous buffer to free unneeded data.
        self.tokbuf.clear();
        self.readbuf.clear();

        let ctx = self.ctx;

        // Position before the event is read,
        //   which is used as the starting offset for spans below.
        let prev_pos = self.reader.buffer_position();

        match self.reader.read_event(&mut self.readbuf) {
            // TODO: To provide better spans and error messages,
            //   we need to map specific types of errors.
            // But we don't encounter much of anything here with how we make
            //   use of quick-xml.
            Err(inner) => Some(Err({
                let span = ctx.span_or_zz(prev_pos, 0);
                SpanlessError::from(inner).with_span(span)
            })),

            Ok(ev) => match ev {
                // This is the only time we'll consider the iterator to be
                //   done.
                QuickXmlEvent::Eof => None,

                // Self-closing tag:
                //   the open token is returned and the close token is
                //   buffered after any attribute tokens so that it is
                //   yielded last.
                QuickXmlEvent::Empty(ele) => Some(
                    Self::parse_element_open(
                        &self.escaper,
                        &mut self.tokbuf,
                        ele,
                        prev_pos,
                        ctx,
                        true,
                    )
                    .and_then(|open| {
                        let new_pos = self.reader.buffer_position();

                        // `<tag ... />`
                        //           ||
                        let span = ctx.span_or_zz(new_pos - 2, 2);

                        // Tag is self-closing, but this does not yet
                        //   handle whitespace before the `/`
                        //     (as indicated in the span above).
                        self.tokbuf.push_front(Token::Close(
                            None,
                            CloseSpan::empty(span),
                        ));

                        Ok(open)
                    }),
                ),

                QuickXmlEvent::Start(ele) => Some(Self::parse_element_open(
                    &self.escaper,
                    &mut self.tokbuf,
                    ele,
                    prev_pos,
                    ctx,
                    false,
                )),

                QuickXmlEvent::End(ele) => Some({
                    // Only whitespace is permitted following the element
                    //   name,
                    //     so we can simply take the delta of the buffer pos.
                    //
                    // </foo  >
                    // [------]  name + '<' + '/' + "  >"
                    let pos_delta = self.reader.buffer_position() - prev_pos;
                    let span = ctx.span_or_zz(prev_pos, pos_delta);
                    let name_len = ele.name().len();

                    ele.name()
                        .try_into()
                        .map_err(Error::from_with_span(span))
                        .and_then(|qname| {
                            Ok(Token::Close(
                                Some(qname),
                                CloseSpan(
                                    span,
                                    // A name length that does not fit the
                                    //   target type falls back to zero.
                                    name_len.try_into().unwrap_or(0),
                                ),
                            ))
                        })
                }),

                // quick_xml emits a useless text event if the first byte is
                //   a '<'.
                // This arm must precede the general `Text` arm below.
                QuickXmlEvent::Text(bytes) if bytes.escaped().is_empty() => {
                    self.refill_buf()
                }

                // quick_xml _escapes_ the unescaped CData before handing it
                //   off to us,
                //     which is a complete waste since we'd just have to
                //     unescape it again.
                QuickXmlEvent::CData(bytes) => todo!("CData: {:?}", bytes),

                QuickXmlEvent::Text(bytes) => Some({
                    // <text>foo bar</text>
                    //       [-----]
                    let span = ctx.span_or_zz(prev_pos, bytes.len());

                    bytes
                        .intern_utf8()
                        .map_err(Into::into)
                        .and_then(|sym| self.escaper.unescape(sym))
                        .map_err(Error::from_with_span(span))
                        .and_then(|unesc| Ok(Token::Text(unesc, span)))
                }),

                // Comments are _not_ returned escaped.
                QuickXmlEvent::Comment(bytes) => Some({
                    // <!-- foo -->
                    // [----------]  " foo " + "<!--" + "-->"
                    let span = ctx.span_or_zz(prev_pos, bytes.len() + 7);

                    bytes
                        .intern_utf8()
                        .map_err(Error::from_with_span(span))
                        .and_then(|comment| Ok(Token::Comment(comment, span)))
                }),

                // TODO: This must appear in the prologue.
                // A valid declaration produces no token,
                //   so move on to the next event.
                QuickXmlEvent::Decl(decl) => {
                    match Self::validate_decl(&decl, prev_pos, ctx) {
                        Err(x) => Some(Err(x)),
                        Ok(()) => self.refill_buf(),
                    }
                }

                // We do not support processor instructions or doctypes.
                // TODO: Convert this into an error/warning?
                // Previously `xml-stylesheet` was present in some older
                //   source files and may linger for a bit after cleanup.
                QuickXmlEvent::PI(..) | QuickXmlEvent::DocType(..) => {
                    self.refill_buf()
                }
            },
        }
    }

    /// Validate an that an XML declaration contains expected values.
    ///
    /// A declaration looks like `<?xml version="1.0" encoding="utf-8"?>`,
    ///   where `@encoding` is optional but `@version` is not.
    /// It may also contain `@standalone`,
    ///   but we do not check for that.
    ///
    /// We expect version 1.0 and UTF-8 encoding.
    /// Failing when these expectations are voilated helps to ensure that
    ///   people unfamiliar with the system do not have expectations that
    ///   are going to be unmet,
    ///     which may result in subtle (or even serious) problems.
    fn validate_decl(decl: &BytesDecl, pos: usize, ctx: Context) -> Result<()> {
        // Starts after `<?`, which we want to include.
        let decl_ptr = decl.as_ptr() as usize - 2 + pos;

        // Fallback span that covers the entire declaration.
        let decl_span = ctx.span_or_zz(pos, decl.len() + 4);

        let ver =
            &decl.version().map_err(Error::from_with_span(decl_span))?[..];

        // NB: `quick-xml` docs state that `version` returns the quotes,
        //   but it does not.
        if ver != b"1.0" {
            // <?xml version="X.Y"?>
            //                [-]
            let ver_pos = (ver.as_ptr() as usize) - decl_ptr;
            let span = ctx.span_or_zz(ver_pos, ver.len());

            Err(Error::UnsupportedXmlVersion(
                ver.intern_utf8().map_err(Error::from_with_span(span))?,
                span,
            ))?
        }

        if let Some(enc) = decl.encoding() {
            match &enc.map_err(Error::from_with_span(decl_span))?[..] {
                b"utf-8" | b"UTF-8" => (),
                invalid => {
                    let enc_pos = (invalid.as_ptr() as usize) - decl_ptr;
                    let span = ctx.span_or_zz(enc_pos, invalid.len());

                    Err(Error::UnsupportedEncoding(
                        invalid
                            .intern_utf8()
                            .map_err(Error::from_with_span(span))?,
                        span,
                    ))?
                }
            }
        }

        Ok(())
    }

    /// Parse opening element and its attributes into a XIR [`Token`]
    ///   stream.
    ///
    /// The opening element is returned rather than being added to the token
    ///   buffer,
    ///     since the intent is to provide that token immediately.
    fn parse_element_open(
        escaper: &'s S,
        tokbuf: &mut VecDeque<Token>,
        ele: BytesStart,
        pos: usize,
        ctx: Context,
        empty_tag: bool,
    ) -> Result<Token> {
        // Starts after the opening tag `<`, so adjust.
        let addr = ele.as_ptr() as usize - 1;
        let len = ele.name().len();

        match ele.name().last() {
            None => {
                // TODO: QName should be self-validating.  Move this.
                return Err(Error::InvalidQName(
                    WS_EMPTY,
                    // <>
                    //  |  where QName should be
                    ctx.span_or_zz(pos + 1, 0),
                ));
            }

            // Quick-and-dirty guess as to whether they may have missed the
            //   element name and included an attribute instead,
            //     which quick-xml does not check for.
            Some(b'"' | b'\'') => {
                return Err({
                    // <foo="bar" ...>
                    //  [-------]
                    let span = ctx.span_or_zz(pos + 1, len);

                    Error::InvalidQName(
                        ele.name()
                            .intern_utf8()
                            .map_err(Error::from_with_span(span))?,
                        span,
                    )
                });
            }

            _ => (),
        };

        // `ele` contains every byte up to the [self-]closing tag.
        ele.name()
            .try_into()
            .map_err(Error::from_with_span(ctx.span_or_zz(pos + 1, len)))
            .and_then(|qname| {
                // The whitespace check is to handle input like this:
                //   <foo />
                //       ^ whitespace making `attributes_raw().len` > 0
                let has_attrs = ele
                    .attributes_raw()
                    .iter()
                    .find(|b| !Self::is_whitespace(**b))
                    .is_some();

                let attr_len = ele.attributes_raw().len();

                // The tail is anything following the last byte of the QName
                //   in a non-empty tag with no attributes.
                // For example:
                //   <foo   >             <foo>          <foo bar="baz">
                //       ~~~~ tail            ~ tail         (no tail)
                let tail = if has_attrs {
                    let found = Self::parse_attrs(
                        escaper,
                        tokbuf,
                        ele.attributes(),
                        addr - pos, // offset relative to _beginning_ of buf
                        pos,
                        ctx,
                    )?;

                    // Given this input, quick-xml ignores the bytes entirely:
                    //   <foo bar>
                    //        [--] missing `="value"`
                    if !found {
                        return Err(Error::AttrValueExpected(
                            None,
                            ctx.span_or_zz(pos + ele.len() + 1, 0),
                        ));
                    }

                    //      (empty)               (open)
                    // <foo bar="baz" />     <foo bar="baz"></foo>
                    //                |                    |
                    //            zero-length             '>'
                    let attr_end_span = ctx.span_or_zz(
                        pos + 1 + len + attr_len,
                        if empty_tag { 0 } else { 1 },
                    );

                    tokbuf.push_front(Token::AttrEnd(attr_end_span));

                    // No tail because of attributes.
                    0
                } else {
                    match empty_tag {
                        // Empty tag cannot have a tail.
                        true => 0,
                        // The "attributes" buffer represents whitespace,
                        //   so the tail is the number of bytes of
                        //   whitespace plus the closing '>' tag delimiter.
                        false => attr_len + 1,
                    }
                };

                // <tag ... />                   <tag/>
                // [--] name + '<'               [--] name + '<'
                //
                // <tag  >...</tag>              <tag ...>...</tag>
                // [-----] name + '<' + "  >"    [--] name + '<'
                //     ~~~ tail
                let span = ctx.span_or_zz(pos, len + 1 + tail);

                // The first token will be immediately returned
                //   via the Iterator.
                Ok(Token::Open(
                    qname,
                    OpenSpan(span, len.try_into().unwrap_or(0)),
                ))
            })
    }

    /// Determine whether `b` is an XML whitespace byte.
    ///
    /// This mirrors quick-xml's whitespace predicate,
    ///   which corresponds to the
    ///     [nonterminal `S` in the XML specification][xmlspec-s].
    ///
    /// [xmlspec-s]: https://www.w3.org/TR/xml/#NT-S
    fn is_whitespace(b: u8) -> bool {
        matches!(b, b' ' | b'\r' | b'\n' | b'\t')
    }

    /// Parse attributes into a XIR [`Token`] stream.
    ///
    /// The order of attributes will be maintained.
    ///
    /// This does not yet handle whitespace between attributes,
    ///   or around `=`.
    ///
    /// For each attribute,
    ///   a [`Token::AttrName`] followed by a [`Token::AttrValue`] is added
    ///   to `tokbuf`,
    ///     with the value having been unescaped using `escaper`.
    /// `ele_pos` is the position of the element that the attributes belong
    ///   to,
    ///     and is used as the base for error spans.
    ///
    /// The return value is `Ok(true)` if at least one attribute was
    ///   parsed,
    ///     and `Ok(false)` if quick-xml yielded no attributes at all.
    /// The first attribute that fails to parse results in an error.
    ///
    /// Note About Pointer Arithmetic
    /// =============================
    /// `ele_ptr` is expected to be a pointer to the buffer containing the
    ///   bytes read from the source file.
    /// Attributes reference this buffer,
    ///   so we can use pointer arithmetic to determine the offset within
    ///   the buffer relative to the node.
    /// This works because the underlying buffer is a `Vec`,
    ///   which is contiguous in memory.
    ///
    /// The caller in `parse_element_open` provides the address of the
    ///   element's opening `<` _less_ the position of the element,
    ///     so subtracting `ele_ptr` from the address of a key or value
    ///     yields an offset that already includes the position of the
    ///     element.
    ///
    /// However, since this is a `Vec`,
    ///   it is important that the address be retrieved _after_ quick-xml
    ///   read events,
    ///     otherwise the buffer may be expanded and will be reallocated.
    fn parse_attrs<'a>(
        escaper: &'s S,
        tokbuf: &mut VecDeque<Token>,
        mut attrs: Attributes<'a>,
        ele_ptr: usize,
        ele_pos: usize,
        ctx: Context,
    ) -> Result<bool> {
        // Whether quick-xml yielded anything at all.
        let mut found = false;

        // Disable checks to allow duplicate attributes;
        //   XIR does not enforce this,
        //     because it needs to accommodate semantically invalid XML for
        //     later analysis.
        for result in attrs.with_checks(false) {
            found = true;

            let attr = result.map_err(|e| match e {
                QuickXmlError::NoEqAfterName(pos) => {
                    // TODO: quick-xml doesn't give us the name,
                    //   but we should discover it.
                    Error::AttrValueExpected(
                        None,
                        ctx.span_or_zz(ele_pos + pos, 0),
                    )
                }

                QuickXmlError::UnquotedValue(pos) => {
                    // TODO: name and span length
                    Error::AttrValueUnquoted(
                        None,
                        ctx.span_or_zz(ele_pos + pos, 0),
                    )
                }

                // fallback
                e => Error::from_with_span(ctx.span_or_zz(ele_pos, 0))(e),
            })?;

            // Address of the key within the read buffer,
            //   converted into an offset using `ele_ptr`.
            let keyoffset = attr.key.as_ptr() as usize;
            let name_offset = keyoffset - ele_ptr;

            // Accommodates zero-length values (e.g. `key=""`) with a
            //   zero-length span at the location the value _would_ be.
            let valoffset = match attr.value {
                Cow::Borrowed(b) => b.as_ptr() as usize,

                // This should never happen since we have a reference to the
                //   underlying buffer.
                Cow::Owned(_) => unreachable!(
                    "internal error: unexpected owned attribute value"
                ),
            };

            let value_offset = valoffset - ele_ptr;

            let span_name = ctx.span_or_zz(name_offset, attr.key.len());
            let span_value = ctx.span_or_zz(value_offset, attr.value.len());

            // The name must be parsed as a QName.
            let name = attr
                .key
                .try_into()
                .map_err(Error::from_with_span(span_name))?;

            // The attribute value,
            //   having just been read from XML,
            //   must have been escaped to be parsed properly.
            // If it parsed but it's not technically escaped according to
            //   the spec,
            //     that's okay as long as we can read it again,
            //       but we probably should still throw an error if we
            //       encounter such a situation.
            let value = escaper
                .unescape(
                    attr.value
                        .as_ref()
                        .intern_utf8()
                        .map_err(Error::from_with_span(span_value))?,
                )
                .map_err(Error::from_with_span(span_value))?
                .into();

            // Name precedes value;
            //   the buffer is consumed from the opposite end.
            tokbuf.push_front(Token::AttrName(name, span_name));
            tokbuf.push_front(Token::AttrValue(value, span_value));
        }

        Ok(found)
    }
}

impl<'s, B, S> Iterator for XmlXirReader<'s, B, S>
where
    B: BufRead,
    S: Escaper,
{
    type Item = ParsedResult<ParsedObject<Token, Error>>;

    /// Yield the next XIR [`Token`] parsed from the input.
    ///
    /// Buffered tokens are served first;
    ///   more input is parsed only once the buffer has been depleted.
    /// For more information on how this reader operates,
    ///   see [`XmlXirReader`].
    fn next(&mut self) -> Option<Self::Item> {
        // Anything in the buffer has already been parsed successfully.
        if let Some(tok) = self.tokbuf.pop_back() {
            return Some(Ok(Parsed::Object(tok)));
        }

        // `None` here means EOF,
        //   which ends iteration.
        let result = self.refill_buf()?;

        Some(match result {
            Ok(tok) => Ok(Parsed::Object(tok)),
            Err(e) => Err(ParseError::StateError(e)),
        })
    }
}

#[cfg(test)]
mod test;
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								// XIR reader
 								//
-												Copyright year update 2022

RSG (Ryan Specialty Group) recently announced a rename to Ryan Specialty (no
"Group"), but I'm not sure if the legal name has been changed yet or not, so
I'll wait on that.

											
										
										
											2022-05-03 14:14:29 -04:00
+								//  Copyright (C) 2014-2022 Ryan Specialty Group, LLC.
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								//
 								//  This file is part of TAME.
 								//
 								//  This program is free software: you can redistribute it and/or modify
 								//  it under the terms of the GNU General Public License as published by
 								//  the Free Software Foundation, either version 3 of the License, or
 								//  (at your option) any later version.
 								//
 								//  This program is distributed in the hope that it will be useful,
 								//  but WITHOUT ANY WARRANTY; without even the implied warranty of
 								//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 								//  GNU General Public License for more details.
 								//
 								//  You should have received a copy of the GNU General Public License
 								//  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 								//! Parse XML files into a XIR [`Token`] stream.
 								//!
 								//! This uses [`quick_xml`] as the parser.
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER has already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								use super::{
 								    error::SpanlessError, CloseSpan, DefaultEscaper, Error, Escaper, OpenSpan,
 								    Token,
 								};
-												tamer: tamec: Replace copy with XIR parsing/writing

When wip-frontends is on, this will parse the input file using XIR and then
immediately output it again.  This makes the necessary changes to be able to
read every source file we have in our largest project, such that the output
is identical after having been formatted with `xmllint --format -` (there
are differences because e.g. whitespace between attributes is not yet
maintained).

This is performant too, with times remaining essentially identical despite
the additional work.

DEV-10413

											
										
										
											2022-04-07 12:08:51 -04:00
+								use crate::{
-												tamer: Integrate xir::reader as a parser in the lowering pipeline

This allows `XmlXirReader` to be used in a `Lower` operation, just as
everything else, bringing me one step closer to a pipeline that can be
concisely represented; this is finally beginning to unify in a clear way,
though it is still a bit of a mess.

This causes `XmlXirReader` to _act_ like a `parse::Parser` in that it yields
a `ParsedResult`, but it does not use `parse::Parser` itself; that was the
_original_ plan: convert it into a `ParseState` where `XmlXirReader` became
a context, and force `Parser` to yield by feeding it a stream of tokens with
`repeat`, but that ended up performing poorly relative to this change.  I
did some investigation, which I might write about in the future, but for
now, this solution works just fine.

DEV-7145

											
										
										
											2022-06-02 10:30:44 -04:00
+								    parse::{ParseError, Parsed, ParsedObject, ParsedResult},
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								    span::Context,
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								    sym::{st::raw::WS_EMPTY, GlobalSymbolInternBytes},
-												tamer: tamec: Replace copy with XIR parsing/writing

When wip-frontends is on, this will parse the input file using XIR and then
immediately output it again.  This makes the necessary changes to be able to
read every source file we have in our largest project, such that the output
is identical after having been formatted with `xmllint --format -` (there
are differences because e.g. whitespace between attributes is not yet
maintained).

This is performant too, with times remaining essentially identical despite
the additional work.

DEV-10413

											
										
										
											2022-04-07 12:08:51 -04:00
+								};
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								use quick_xml::{
 								    self,
-												tamer: tamec: Replace copy with XIR parsing/writing

When wip-frontends is on, this will parse the input file using XIR and then
immediately output it again.  This makes the necessary changes to be able to
read every source file we have in our largest project, such that the output
is identical after having been formatted with `xmllint --format -` (there
are differences because e.g. whitespace between attributes is not yet
maintained).

This is performant too, with times remaining essentially identical despite
the additional work.

DEV-10413

											
										
										
											2022-04-07 12:08:51 -04:00
+								    events::{
 								        attributes::Attributes, BytesDecl, BytesStart, Event as QuickXmlEvent,
 								    },
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There are a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								    Error as QuickXmlError,
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								};
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								use std::{borrow::Cow, collections::VecDeque, io::BufRead, result};
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
 								pub type Result<T> = result::Result<T, Error>;
 								/// Parse XML into a XIR [`Token`] stream.
 								///
 								/// This reader is intended to be used as an [`Iterator`].
 								///
 								/// The underlying reader produces events in chunks that are far too
 								///   large for XIR,
 								///     so most [`Token`]s retrieved via this call are buffered.
 								/// Parsing takes place when that buffer is exhausted and the next event
 								///   is requested from the underlying reader
 								///     (see [`XmlXirReader::refill_buf`]).
 								/// Errors can only occur during parsing,
 								///   and will never occur on buffered tokens.
 								///
 								/// [`None`] is returned only on EOF,
 								///   not on error.
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								pub struct XmlXirReader<'s, B, S = DefaultEscaper>
 								where
 								    B: BufRead,
 								    S: Escaper,
 								{
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								    /// Inner parser.
 								    reader: quick_xml::Reader<B>,
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								    /// Parsing context for reader.
 								    ctx: Context,
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								    /// Buffer for [`quick_xml::Reader`].
 								    readbuf: Vec<u8>,
 								    /// [`Token`] buffer populated upon receiving a new event from
 								    ///   `reader`.
 								    ///
 								    /// This buffer serves [`Iterator::next`] requests until it is
 								    ///   depleted,
 								    ///     after which [`XmlXirReader::refill_buf`] requests another token
 								    ///     from `reader`.
 								    tokbuf: VecDeque<Token>,
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
 								    /// System for unescaping string data.
 								    escaper: &'s S,
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								}
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								    pub fn new(reader: B, escaper: &'s S, ctx: Context) -> Self {
-												tamer: ir::xir::reader: Disable quick-xml check_end_names

XIR must support tag mismatches; XIRT will validate them.

This is currently disabled in the linker's xmlo reader as well.

DEV-10863

											
										
										
											2021-10-25 10:58:19 -04:00
+								        let mut reader = quick_xml::Reader::from_reader(reader);
 								        // XIR must support mismatched tags so that we are able to represent
 								        //   and reconstruct malformed inputs.
 								        // XIRT will handle mismatch errors itself.
 								        reader.check_end_names(false);
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								        Self {
-												tamer: ir::xir::reader: Disable quick-xml check_end_names

XIR must support tag mismatches; XIRT will validate them.

This is currently disabled in the linker's xmlo reader as well.

DEV-10863

											
										
										
											2021-10-25 10:58:19 -04:00
+								            reader,
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								            ctx,
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								            readbuf: Vec::new(),
 								            // This capacity is largely arbitrary,
 								            //   but [`Token`]s are small enough that it likely does not
 								            //   matter much.
 								            tokbuf: VecDeque::with_capacity(32),
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
 								            escaper,
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								        }
 								    }
 								    /// Parse using the underlying [`quick_xml::Reader`] and populate the
 								    ///   [`Token`] buffer.
 								    ///
 								    /// This is intended to be invoked once the buffer has been depleted by
 								    ///   [`XmlXirReader::next`].
 								    pub fn refill_buf(&mut self) -> Option<Result<Token>> {
 								        // Clear any previous buffer to free unneeded data.
 								        self.tokbuf.clear();
-												tamer: xir::reader::XmlXirReader::refill_buf: Clear read buffer

This was done in the old reader many months ago, but I somehow forgot to do
it here.  The new reader was using substantially more memory.

Here's how this change affects the memory profile for one of our
systems (output from `ms_print`):

Before:

    MB
79.75^                                                             #
     |                                                             #
     |                                                             #       @
     |                                               @@@@          #       @
     |                                               @@@           #      @@
     |                                               @@@        @@@#@   @@@@@
     |                                               @@@        @@ #@@@@@@@@@@
     |                                            @@@@@@      @@@@ #@@@@@@@@@@
     |                                         @@ @@ @@@   @@ @ @@ #@@@@@@@@@@
     |                                         @@ @@ @@@  @@@@@ @@ #@@@@@@@@@@
     |                                         @@@@@ @@@ @@@@@@ @@ #@@@@@@@@@@
     |                                         @@@@@ @@@ @@@@@@ @@ #@@@@@@@@@@
     |   @@                                    @@@@@ @@@ @@@@@@ @@ #@@@@@@@@@@
     |   @        @@     @@          @        @@@@@@ @@@ @@@@@@ @@ #@@@@@@@@@@
     |   @        @     @@@         @@  @@@   @@@@@@ @@@ @@@@@@ @@ #@@@@@@@@@@
     |   @     @@@@ @@@@@@@@@@@@@@@@@@@@@ @@@@@@@@@@ @@@ @@@@@@ @@ #@@@@@@@@@@
     | @@@   @@@@@@ @@@@@@@@@ @@@@@ @@@@@ @@ @@@@@@@ @@@ @@@@@@ @@ #@@@@@@@@@@
     | @@@   @ @@@@ @@@@@@@@@ @@@@@ @@@@@ @@ @@@@@@@ @@@ @@@@@@ @@ #@@@@@@@@@@
     | @@@ @@@ @@@@ @@@@@@@@@ @@@@@ @@@@@ @@ @@@@@@@ @@@ @@@@@@ @@ #@@@@@@@@@@
     | @@@ @@@ @@@@ @@@@@@@@@ @@@@@ @@@@@ @@ @@@@@@@ @@@ @@@@@@ @@ #@@@@@@@@@@
   0 +----------------------------------------------------------------------->Gi
     0                                                                   15.20

After:

    MB
63.25^                                                                      #
     |                                                                      #
     |                                                             @@@@@@@@@#@
     |                                                             @@@@@@ @@#@
     |                                                             @@@@@@ @@#@
     |                                                             @@@@@@ @@#@
     |                                                             @@@@@@ @@#@
     |                                                       @@@@@@@@@@@@ @@#@
     |                                                @@@@@@@@@ @@ @@@@@@ @@#@
     |                                         @@@@@@@@ @@@ @@@ @@ @@@@@@ @@#@
     |                                         @@@@@  @ @@@ @@@ @@ @@@@@@ @@#@
     |                                         @@@@@  @ @@@ @@@ @@ @@@@@@ @@#@
     |                                        @@@@@@  @ @@@ @@@ @@ @@@@@@ @@#@
     |                                        @@@@@@  @ @@@ @@@ @@ @@@@@@ @@#@
     |           @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  @ @@@ @@@ @@ @@@@@@ @@#@
     |        @@@@@@@@@@@@@ @@@@@@@@ @@@@@@@@@@@@@@@  @ @@@ @@@ @@ @@@@@@ @@#@
     |      @@@@@@@@@@@@@@@ @@@@@@@@ @@@@@@@@@@@@@@@  @ @@@ @@@ @@ @@@@@@ @@#@
     |    @@@@@@@@@@@@@@@@@ @@@@@@@@ @@@@@@@@@@@@@@@  @ @@@ @@@ @@ @@@@@@ @@#@
     | @@@@@@@@@@@@@@@@@@@@ @@@@@@@@ @@@@@@@@@@@@@@@  @ @@@ @@@ @@ @@@@@@ @@#@
     | @@@@@@@@@@@@@@@@@@@@ @@@@@@@@ @@@@@@@@@@@@@@@  @ @@@ @@@ @@ @@@@@@ @@#@
   0 +----------------------------------------------------------------------->Gi
     0                                                                   15.20

The bottom graph is virtually identical to the memory profile of the old
reader, just with the exception that it's interning a bit more data than
before, because we're reading more comprehensively.

That's (potentially) the subject of future changes.

DEV-12038

											
										
										
											2022-04-06 11:50:07 -04:00
+								        self.readbuf.clear();
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								        let ctx = self.ctx;
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								        let prev_pos = self.reader.buffer_position();
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								        match self.reader.read_event(&mut self.readbuf) {
 								            // TODO: To provide better spans and error messages,
 								            //   we need to map specific types of errors.
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There are a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								            // But we don't encounter much of anything here with how we make
 								            //   use of quick-xml.
 								            Err(inner) => Some(Err({
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                let span = ctx.span_or_zz(prev_pos, 0);
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								                SpanlessError::from(inner).with_span(span)
 								            })),
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
 								            Ok(ev) => match ev {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                // This is the only time we'll consider the iterator to be
 								                //   done.
 								                QuickXmlEvent::Eof => None,
-												tamer: ir::xir::reader: Refactor common element open parsing

As mentioned in the previous commit, this is just minor cleanup.

											
										
										
											2021-10-21 16:51:47 -04:00
+								                QuickXmlEvent::Empty(ele) => Some(
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								                    Self::parse_element_open(
 								                        &self.escaper,
 								                        &mut self.tokbuf,
 								                        ele,
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                        prev_pos,
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                        ctx,
-												tamer: xir::reader: Correct empty element whitespace handling

This both adds clarifying tests and corrects the case of `<foo/>`, where the
offset was erroneously off by one---it saw that there were no attributes and
added a byte thinking it'd include `>`, as in `<foo>`.

DEV-7145

											
										
										
											2022-06-22 10:28:44 -04:00
+								                        true,
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								                    )
 								                    .and_then(|open| {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                        let new_pos = self.reader.buffer_position();
 								                        // `<tag ... />`
 								                        //           ||
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                        let span = ctx.span_or_zz(new_pos - 2, 2);
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								                        // Tag is self-closing, but this does not yet
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                        //   handle whitespace before the `/`
 								                        //     (as indicated in the span above).
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already taken on a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								                        self.tokbuf.push_front(Token::Close(
 								                            None,
 								                            CloseSpan::empty(span),
 								                        ));
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
 								                        Ok(open)
 								                    }),
-												tamer: ir::xir::reader: Refactor common element open parsing

As mentioned in the previous commit, this is just minor cleanup.

											
										
										
											2021-10-21 16:51:47 -04:00
+								                ),
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								                QuickXmlEvent::Start(ele) => Some(Self::parse_element_open(
 								                    &self.escaper,
 								                    &mut self.tokbuf,
 								                    ele,
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                    prev_pos,
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                    ctx,
-												tamer: xir::reader: Correct empty element whitespace handling

This both adds clarifying tests and corrects the case of `<foo/>`, where the
offset was erroneously off by one---it saw that there were no attributes and
added a byte thinking it'd include `>`, as in `<foo>`.

DEV-7145

											
										
										
											2022-06-22 10:28:44 -04:00
+								                    false,
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								                )),
-												tamer: ir::xir::reader: Parsing of child nodes

This is quick-and-dirty; refactoring can be done later on.  This is also
intended to demonstrate the ease with which additional events can be
added---the hard work is done.

											
										
										
											2021-10-21 16:32:19 -04:00
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                QuickXmlEvent::End(ele) => Some({
-												tamer: xir::reader: Opening and closing tag whitespace

Non-attribute and non-empty start/end tags will have their whitespace
as part of the produced span.  This sets us up for a following change that
will allow for deriving the name span from this span given a QName, which
gives us a span that both represents the entire XIR token and allows
deriving the element name.

An accurate token span is necessary for parsing errors where an element was
not expected, while an element name span is more appropriate for issues of
grammar and semantic errors that deal not with the fact that an element was
encountered, but _what_ element was encountered.

DEV-7145

											
										
										
											2022-06-22 15:10:49 -04:00
+								                    // Only whitespace is permitted following the element
 								                    //   name,
 								                    //     so we can simply take the delta of the buffer pos.
 								                    //
 								                    // </foo  >
 								                    // [------]  name + '<' + '/' + "  >"
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								                    let pos_delta = self.reader.buffer_position() - prev_pos;
 								                    let span = ctx.span_or_zz(prev_pos, pos_delta);
 								                    let name_len = ele.name().len();
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
 								                    ele.name()
 								                        .try_into()
 								                        .map_err(Error::from_with_span(span))
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								                        .and_then(|qname| {
 								                            Ok(Token::Close(
 								                                Some(qname),
 								                                CloseSpan(
 								                                    span,
 								                                    name_len.try_into().unwrap_or(0),
 								                                ),
 								                            ))
 								                        })
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                }),
-												tamer: ir::xir::reader: Parsing of child nodes

This is quick-and-dirty; refactoring can be done later on.  This is also
intended to demonstrate the ease with which additional events can be
added---the hard work is done.

											
										
										
											2021-10-21 16:32:19 -04:00
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								                // quick_xml emits a useless text event if the first byte is
 								                //   a '<'.
 								                QuickXmlEvent::Text(bytes) if bytes.escaped().is_empty() => {
 								                    self.refill_buf()
 								                }
-												tamer: xir: Remove Text enum

Like previous commits, this replaces the explicit escaping context with the
convention that all values retrieved from `xir` are unescaped on read and
escaped on write.

Comments are a notable TODO, since we must escape only `--`.

CData is also an issue.  I had _expected_ to use it as a means to avoid
unescaping fragments, but I had forgotten that quick_xml hard-codes escaping
on read, so that it can re-use BytesStart!  That is terribly unfortunate,
and may result in us having to re-implement our own read method in the
future to avoid this nonsense.  So I'm just leaving it as a TODO for now.

DEV-11081

											
										
										
											2021-11-15 23:47:14 -05:00
+								                // quick_xml _escapes_ the unescaped CData before handing it
 								                //   off to us,
 								                //     which is a complete waste since we'd just have to
 								                //     unescape it again.
 								                QuickXmlEvent::CData(bytes) => todo!("CData: {:?}", bytes),
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                QuickXmlEvent::Text(bytes) => Some({
 								                    // <text>foo bar</text>
-												tamer: Consistent span diagram representation

I'll document it more formally eventually, but this settles on a mix of the
two: square brackets and dashes for intervals, `+` for intersecting lines,
byte offsets below interval endpoints, and names below that.

The docblock for `Span` itself is still off; I'll probably just take one of
the test cases and paste it there at some point.

DEV-7145

											
										
										
											2022-06-06 11:29:17 -04:00
+								                    //       [-----]
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                    let span = ctx.span_or_zz(prev_pos, bytes.len());
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
-												tamer: xir: Remove Text enum

Like previous commits, this replaces the explicit escaping context with the
convention that all values retrieved from `xir` are unescaped on read and
escaped on write.

Comments are a notable TODO, since we must escape only `--`.

CData is also an issue.  I had _expected_ to use it as a means to avoid
unescaping fragments, but I had forgotten that quick_xml hard-codes escaping
on read, so that it can re-use BytesStart!  That is terribly unfortunate,
and may result in us having to re-implement our own read method in the
future to avoid this nonsense.  So I'm just leaving it as a TODO for now.

DEV-11081

											
										
										
											2021-11-15 23:47:14 -05:00
+								                    bytes
 								                        .intern_utf8()
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                        .map_err(Into::into)
-												tamer: xir: Remove Text enum

Like previous commits, this replaces the explicit escaping context with the
convention that all values retrieved from `xir` are unescaped on read and
escaped on write.

Comments are a notable TODO, since we must escape only `--`.

CData is also an issue.  I had _expected_ to use it as a means to avoid
unescaping fragments, but I had forgotten that quick_xml hard-codes escaping
on read, so that it can re-use BytesStart!  That is terribly unfortunate,
and may result in us having to re-implement our own read method in the
future to avoid this nonsense.  So I'm just leaving it as a TODO for now.

DEV-11081

											
										
										
											2021-11-15 23:47:14 -05:00
+								                        .and_then(|sym| self.escaper.unescape(sym))
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                        .map_err(Error::from_with_span(span))
 								                        .and_then(|unesc| Ok(Token::Text(unesc, span)))
 								                }),
-												tamer: ir::xir::reader: Text and mixed content

It's nice being able to breeze through changes, since that's been a pretty
rare thing so far, given all the foundational work that has been needed.

This should get us pretty damn close to being able to parse the `xmlo` files
for the reader linker, if we're not there already.

DEV-10863

											
										
										
											2021-10-21 21:42:39 -04:00
-												tamer: ir::xir::reader: Comment parsing

Comments re-use Text, but they are _not_ escaped, so we need to take care
with the type to ensure that, if the value were ever used with a
Token::Text, that we don't end up injecting XML.

											
										
										
											2021-10-21 22:04:45 -04:00
+								                // Comments are _not_ returned escaped.
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                QuickXmlEvent::Comment(bytes) => Some({
 								                    // <!-- foo -->
-												tamer: Consistent span diagram representation

I'll document it more formally eventually, but this settles on a mix of the
two: square brackets and dashes for intervals, `+` for intersecting lines,
byte offsets below interval endpoints, and names below that.

The docblock for `Span` itself is still off; I'll probably just take one of
the test cases and paste it there at some point.

DEV-7145

											
										
										
											2022-06-06 11:29:17 -04:00
+								                    // [----------]  " foo " + "<!--" + "-->"
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                    let span = ctx.span_or_zz(prev_pos, bytes.len() + 7);
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
-												tamer: xir: Remove Text enum

Like previous commits, this replaces the explicit escaping context with the
convention that all values retrieved from `xir` are unescaped on read and
escaped on write.

Comments are a notable TODO, since we must escape only `--`.

CData is also an issue.  I had _expected_ to use it as a means to avoid
unescaping fragments, but I had forgotten that quick_xml hard-codes escaping
on read, so that it can re-use BytesStart!  That is terribly unfortunate,
and may result in us having to re-implement our own read method in the
future to avoid this nonsense.  So I'm just leaving it as a TODO for now.

DEV-11081

											
										
										
											2021-11-15 23:47:14 -05:00
+								                    bytes
 								                        .intern_utf8()
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                        .map_err(Error::from_with_span(span))
 								                        .and_then(|comment| Ok(Token::Comment(comment, span)))
 								                }),
-												tamer: ir::xir::reader: Comment parsing

Comments re-use Text, but they are _not_ escaped, so we need to take care
with the type to ensure that, if the value were ever used with a
Token::Text, we don't end up injecting XML.

											
										
										
											2021-10-21 22:04:45 -04:00
-												tamer: asg::ident: {prolog=>prologue} typo fix

Somewhat humorous.

											
										
										
											2022-06-23 09:17:57 -04:00
+								                // TODO: This must appear in the prologue.
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                QuickXmlEvent::Decl(decl) => {
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                    match Self::validate_decl(&decl, prev_pos, ctx) {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                        Err(x) => Some(Err(x)),
 								                        Ok(()) => self.refill_buf(),
 								                    }
 								                }
-												tamer: tamec: Replace copy with XIR parsing/writing

When wip-frontends is on, this will parse the input file using XIR and then
immediately output it again.  This makes the necessary changes to be able to
read every source file we have in our largest project, such that the output
is identical after having been formatted with `xmllint --format -` (there
are differences because e.g. whitespace between attributes is not yet
maintained).

This is performant too, with times remaining essentially identical despite
the additional work.

DEV-10413

											
										
										
											2022-04-07 12:08:51 -04:00
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                // We do not support processor instructions or doctypes.
-												tamer: tamec: Replace copy with XIR parsing/writing

When wip-frontends is on, this will parse the input file using XIR and then
immediately output it again.  This makes the necessary changes to be able to
read every source file we have in our largest project, such that the output
is identical after having been formatted with `xmllint --format -` (there
are differences because e.g. whitespace between attributes is not yet
maintained).

This is performant too, with times remaining essentially identical despite
the additional work.

DEV-10413

											
										
										
											2022-04-07 12:08:51 -04:00
+								                // TODO: Convert this into an error/warning?
 								                // Previously `xml-stylesheet` was present in some older
 								                //   source files and may linger for a bit after cleanup.
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                QuickXmlEvent::PI(..) | QuickXmlEvent::DocType(..) => {
 								                    self.refill_buf()
 								                }
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								            },
 								        }
 								    }
-												tamer: tamec: Replace copy with XIR parsing/writing

When wip-frontends is on, this will parse the input file using XIR and then
immediately output it again.  This makes the necessary changes to be able to
read every source file we have in our largest project, such that the output
is identical after having been formatted with `xmllint --format -` (there
are differences because e.g. whitespace between attributes is not yet
maintained).

This is performant too, with times remaining essentially identical despite
the additional work.

DEV-10413

											
										
										
											2022-04-07 12:08:51 -04:00
+								    /// Validate that an XML declaration contains expected values.
 								    ///
 								    /// A declaration looks like `<?xml version="1.0" encoding="utf-8"?>`,
 								    ///   where `@encoding` is optional but `@version` is not.
 								    /// It may also contain `@standalone`,
 								    ///   but we do not check for that.
 								    ///
 								    /// We expect version 1.0 and UTF-8 encoding.
 								    /// Failing when these expectations are violated helps to ensure that
 								    ///   people unfamiliar with the system do not have expectations that
 								    ///   are going to be unmet,
 								    ///     which may result in subtle (or even serious) problems.
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								    fn validate_decl(decl: &BytesDecl, pos: usize, ctx: Context) -> Result<()> {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								        // Starts after `<?`, which we want to include.
 								        let decl_ptr = decl.as_ptr() as usize - 2 + pos;
 								        // Fallback span that covers the entire declaration.
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								        let decl_span = ctx.span_or_zz(pos, decl.len() + 4);
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
 								        let ver =
 								            &decl.version().map_err(Error::from_with_span(decl_span))?[..];
-												tamer: tamec: Replace copy with XIR parsing/writing

When wip-frontends is on, this will parse the input file using XIR and then
immediately output it again.  This makes the necessary changes to be able to
read every source file we have in our largest project, such that the output
is identical after having been formatted with `xmllint --format -` (there
are differences because e.g. whitespace between attributes is not yet
maintained).

This is performant too, with times remaining essentially identical despite
the additional work.

DEV-10413

											
										
										
											2022-04-07 12:08:51 -04:00
+								        // NB: `quick-xml` docs state that `version` returns the quotes,
 								        //   but it does not.
 								        if ver != b"1.0" {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								            // <?xml version="X.Y"?>
-												tamer: Consistent span diagram representation

I'll document it more formally eventually, but this settles on a mix of the
two: square brackets and dashes for intervals, `+` for intersecting lines,
byte offsets below interval endpoints, and names below that.

The docblock for `Span` itself is still off; I'll probably just take one of
the test cases and paste it there at some point.

DEV-7145

											
										
										
											2022-06-06 11:29:17 -04:00
+								            //                [-]
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								            let ver_pos = (ver.as_ptr() as usize) - decl_ptr;
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								            let span = ctx.span_or_zz(ver_pos, ver.len());
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There are no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
-												tamer: tamec: Replace copy with XIR parsing/writing

When wip-frontends is on, this will parse the input file using XIR and then
immediately output it again.  This makes the necessary changes to be able to
read every source file we have in our largest project, such that the output
is identical after having been formatted with `xmllint --format -` (there
are differences because e.g. whitespace between attributes is not yet
maintained).

This is performant too, with times remaining essentially identical despite
the additional work.

DEV-10413

											
										
										
											2022-04-07 12:08:51 -04:00
+								            Err(Error::UnsupportedXmlVersion(
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                ver.intern_utf8().map_err(Error::from_with_span(span))?,
 								                span,
-												tamer: tamec: Replace copy with XIR parsing/writing

When wip-frontends is on, this will parse the input file using XIR and then
immediately output it again.  This makes the necessary changes to be able to
read every source file we have in our largest project, such that the output
is identical after having been formatted with `xmllint --format -` (there
are differences because e.g. whitespace between attributes is not yet
maintained).

This is performant too, with times remaining essentially identical despite
the additional work.

DEV-10413

											
										
										
											2022-04-07 12:08:51 -04:00
+								            ))?
 								        }
 								        if let Some(enc) = decl.encoding() {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								            match &enc.map_err(Error::from_with_span(decl_span))?[..] {
-												tamer: tamec: Replace copy with XIR parsing/writing

When wip-frontends is on, this will parse the input file using XIR and then
immediately output it again.  This makes the necessary changes to be able to
read every source file we have in our largest project, such that the output
is identical after having been formatted with `xmllint --format -` (there
are differences because e.g. whitespace between attributes is not yet
maintained).

This is performant too, with times remaining essentially identical despite
the additional work.

DEV-10413

											
										
										
											2022-04-07 12:08:51 -04:00
+								                b"utf-8" | b"UTF-8" => (),
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								                invalid => {
 								                    let enc_pos = (invalid.as_ptr() as usize) - decl_ptr;
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                    let span = ctx.span_or_zz(enc_pos, invalid.len());
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
 								                    Err(Error::UnsupportedEncoding(
 								                        invalid
 								                            .intern_utf8()
 								                            .map_err(Error::from_with_span(span))?,
 								                        span,
 								                    ))?
 								                }
-												tamer: tamec: Replace copy with XIR parsing/writing

When wip-frontends is on, this will parse the input file using XIR and then
immediately output it again.  This makes the necessary changes to be able to
read every source file we have in our largest project, such that the output
is identical after having been formatted with `xmllint --format -` (there
are differences because e.g. whitespace between attributes is not yet
maintained).

This is performant too, with times remaining essentially identical despite
the additional work.

DEV-10413

											
										
										
											2022-04-07 12:08:51 -04:00
+								            }
 								        }
 								        Ok(())
 								    }
-												tamer: ir::xir::reader: Refactor common element open parsing

As mentioned in the previous commit, this is just minor cleanup.

											
										
										
											2021-10-21 16:51:47 -04:00
+								    /// Parse opening element and its attributes into a XIR [`Token`]
 								    ///   stream.
 								    ///
 								    /// The opening element is returned rather than being added to the token
 								    ///   buffer,
 								    ///     since the intent is to provide that token immediately.
 								    fn parse_element_open(
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								        escaper: &'s S,
-												tamer: ir::xir::reader: Refactor common element open parsing

As mentioned in the previous commit, this is just minor cleanup.

											
										
										
											2021-10-21 16:51:47 -04:00
+								        tokbuf: &mut VecDeque<Token>,
 								        ele: BytesStart,
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								        pos: usize,
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								        ctx: Context,
-												tamer: xir::reader: Correct empty element whitespace handling

This both adds clarifying tests and corrects the case of `<foo/>`, where the
offset was erroneously off by one---it saw that there were no attributes and
added a byte thinking it'd include `>`, as in `<foo>`.

DEV-7145

											
										
										
											2022-06-22 10:28:44 -04:00
+								        empty_tag: bool,
-												tamer: ir::xir::reader: Refactor common element open parsing

As mentioned in the previous commit, this is just minor cleanup.

											
										
										
											2021-10-21 16:51:47 -04:00
+								    ) -> Result<Token> {
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								        // Starts after the opening tag `<`, so adjust.
 								        let addr = ele.as_ptr() as usize - 1;
 								        let len = ele.name().len();
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								        match ele.name().last() {
 								            None => {
 								                // TODO: QName should be self-validating.  Move this.
 								                return Err(Error::InvalidQName(
 								                    WS_EMPTY,
 								                    // <>
 								                    //  |  where QName should be
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                    ctx.span_or_zz(pos + 1, 0),
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								                ));
 								            }
 								            // Quick-and-dirty guess as to whether they may have missed the
 								            //   element name and included an attribute instead,
 								            //     which quick-xml does not check for.
 								            Some(b'"' | b'\'') => {
 								                return Err({
 								                    // <foo="bar" ...>
-												tamer: Consistent span diagram representation

I'll document it more formally eventually, but this settles on a mix of the
two: square brackets and dashes for intervals, `+` for intersecting lines,
byte offsets below interval endpoints, and names below that.

The docblock for `Span` itself is still off; I'll probably just take one of
the test cases and paste it there at some point.

DEV-7145

											
										
										
											2022-06-06 11:29:17 -04:00
+								                    //  [-------]
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                    let span = ctx.span_or_zz(pos + 1, len);
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
 								                    Error::InvalidQName(
 								                        ele.name()
 								                            .intern_utf8()
 								                            .map_err(Error::from_with_span(span))?,
 								                        span,
 								                    )
 								                });
 								            }
 								            _ => (),
 								        };
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								        // `ele` contains every byte up to the [self-]closing tag.
-												tamer: ir::xir::reader: Refactor common element open parsing

As mentioned in the previous commit, this is just minor cleanup.

											
										
										
											2021-10-21 16:51:47 -04:00
+								        ele.name()
 								            .try_into()
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								            .map_err(Error::from_with_span(ctx.span_or_zz(pos + 1, len)))
-												tamer: ir::xir::reader: Refactor common element open parsing

As mentioned in the previous commit, this is just minor cleanup.

											
										
										
											2021-10-21 16:51:47 -04:00
+								            .and_then(|qname| {
-												tamer: xir::reader: Opening and closing tag whitespace

Non-attribute and non-empty start/end tags will have their whitespace
as part of the produced span.  This sets us up for a following change that
will allow for deriving the name span from this span given a QName, which
gives us a span that both represents the entire XIR token and allows
deriving the element name.

An accurate token span is necessary for parsing errors where an element was
not expected, while an element name span is more appropriate for issues of
grammar and semantic errors that deal not with the fact that an element was
encountered, but _what_ element was encountered.

DEV-7145

											
										
										
											2022-06-22 15:10:49 -04:00
+								                // The whitespace check is to handle input like this:
 								                //   <foo />
 								                //       ^ whitespace making `attributes_raw().len` > 0
 								                let has_attrs = ele
 								                    .attributes_raw()
 								                    .iter()
 								                    .find(|b| !Self::is_whitespace(**b))
 								                    .is_some();
-												tamer: xir: Initial re-introduction of AttrEnd

AttrEnd was initially removed in
0cc0bc9d5a92e666e4ec8319f6bd29c35cc331a8 (and the commit prior), because
there was not a compelling reason to use it over a lookahead
operation (returning a token via a dead state transition); `AttrEnd`
simply introduced inconsistencies between the XIR reader (which produced
AttrEnd) and internal XIR stream generators (e.g. the lowering operations
into XIR->XML, which do not).

But now that parsers are performing aggregation---in particular the
attribute parser-generator `xir::parse::attr`---this has become quite a
pain, because the dead state is an actionable token.  For example:

  1. Open
  2. Attr
  3. Attr
  4. Open
  5. ...

In the happy case, token #4 results in `Parsed::Incomplete`, and so can just
be transformed into the object representing the aggregated attributes.  But
even in this happy path, it's ugly, and it requires non-tail recursion on
the parser which requires a duplicate stack allocation for the
`ParserState`.  That violates a core principle of the system.

But if there is an error at #4---e.g. an unexpected element---then we no
longer have a `Parsed::Incomplete` to hijack for our own uses, and we'd have
to introduce the ability to return both an error and a token, or we'd have
to introduce the ability to keep a token of lookahead instead of reading
from the underlying token stream, but that's complicated with push parsers,
which are used for parser composition.  Yikes.

And furthermore, the aggregation has caused me to introduce the ability to
override the dead state type to introduce both a token of lookahead and
aggregation information.  This complicates the system and is going to be
confusing to others.

Given all of this, AttrEnd does now seem appropriate to reintroduce, since
it will allow processing of aggregate operations when encountering that
token without having to worry about the above scenario; without having to
duplicate a `ParseState` stack; without having to hijack dead state
transitions for producing our aggregate object; and everything else
mentioned above.

This commit does not modify those abstractions to use AttrEnd yet; it
re-introduces the token to the core system, not the parser-generators, and
it doesn't yet replace lookahead operations in the parsers that use
them.  That'll come next.  Unlike the commit that removed it, though, we are
now generating proper spans, so make note of that here.  This also does not
introduce the concept to XIRF yet, which did not exist at the time that it
was removed, so XIRF is filtering it out until a following commit.

DEV-7145

											
										
										
											2022-06-28 16:10:57 -04:00
+								                let attr_len = ele.attributes_raw().len();
-												tamer: xir::reader: Opening and closing tag whitespace

Non-attribute and non-empty start/end tags will have their whitespace
as part of the produced span.  This sets us up for a following change that
will allow for deriving the name span from this span given a QName, which
gives us a span that both represents the entire XIR token and allows
deriving the element name.

An accurate token span is necessary for parsing errors where an element was
not expected, while an element name span is more appropriate for issues of
grammar and semantic errors that deal not with the fact that an element was
encountered, but _what_ element was encountered.

DEV-7145

											
										
										
											2022-06-22 15:10:49 -04:00
+								                // The tail is anything following the last byte of the QName
 								                //   in a non-empty tag with no attributes.
 								                // For example:
 								                //   <foo   >             <foo>          <foo bar="baz">
 								                //       ~~~~ tail            ~ tail         (no tail)
 								                let tail = if has_attrs {
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								                    let found = Self::parse_attrs(
 								                        escaper,
 								                        tokbuf,
 								                        ele.attributes(),
 								                        addr - pos, // offset relative to _beginning_ of buf
 								                        pos,
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                        ctx,
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								                    )?;
 								                    // Given this input, quick-xml ignores the bytes entirely:
 								                    //   <foo bar>
-												tamer: Consistent span diagram representation

I'll document it more formally eventually, but this settles on a mix of the
two: square brackets and dashes for intervals, `+` for intersecting lines,
byte offsets below interval endpoints, and names below that.

The docblock for `Span` itself is still off; I'll probably just take one of
the test cases and paste it there at some point.

DEV-7145

											
										
										
											2022-06-06 11:29:17 -04:00
+								                    //        [--] missing `="value"`
-												tamer: xir::reader: Opening and closing tag whitespace

Non-attribute and non-empty start/end tags will have their whitespace
as part of the produced span.  This sets us up for a following change that
will allow for deriving the name span from this span given a QName, which
gives us a span that both represents the entire XIR token and allows
deriving the element name.

An accurate token span is necessary for parsing errors where an element was
not expected, while an element name span is more appropriate for issues of
grammar and semantic errors that deal not with the fact that an element was
encountered, but _what_ element was encountered.

DEV-7145

											
										
										
											2022-06-22 15:10:49 -04:00
+								                    if !found {
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								                        return Err(Error::AttrValueExpected(
 								                            None,
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                            ctx.span_or_zz(pos + ele.len() + 1, 0),
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
oughtright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								                        ));
 								                    }
-												tamer: xir::reader: Opening and closing tag whitespace

Non-attribute and non-empty start/end tags will have their whitespace
as part of the produced span.  This sets us up for a following change that
will allow for deriving the name span from this span given a QName, which
gives us a span that both represents the entire XIR token and allows
deriving the element name.

An accurate token span is necessary for parsing errors where an element was
not expected, while an element name span is more appropriate for issues of
grammar and semantic errors that deal not with the fact that an element was
encountered, but _what_ element was encountered.

DEV-7145

											
										
										
											2022-06-22 15:10:49 -04:00
-												tamer: xir: Initial re-introduction of AttrEnd

AttrEnd was initially removed in
0cc0bc9d5a92e666e4ec8319f6bd29c35cc331a8 (and the commit prior), because
there was not a compelling reason to use it over a lookahead
operation (returning a token via a dead state transition); `AttrEnd`
simply introduced inconsistencies between the XIR reader (which produced
AttrEnd) and internal XIR stream generators (e.g. the lowering operations
into XIR->XML, which do not).

But now that parsers are performing aggregation---in particular the
attribute parser-generator `xir::parse::attr`---this has become quite a
pain, because the dead state is an actionable token.  For example:

  1. Open
  2. Attr
  3. Attr
  4. Open
  5. ...

In the happy case, token #4 results in `Parsed::Incomplete`, and so can just
be transformed into the object representing the aggregated attributes.  But
even in this happy path, it's ugly, and it requires non-tail recursion on
the parser which requires a duplicate stack allocation for the
`ParserState`.  That violates a core principle of the system.

But if there is an error at #4---e.g. an unexpected element---then we no
longer have a `Parsed::Incomplete` to hijack for our own uses, and we'd have
to introduce the ability to return both an error and a token, or we'd have
to introduce the ability to keep a token of lookahead instead of reading
from the underlying token stream, but that's complicated with push parsers,
which are used for parser composition.  Yikes.

And furthermore, the aggregation has caused me to introduce the ability to
override the dead state type to introduce both a token of lookahead and
aggregation information.  This complicates the system and is going to be
confusing to others.

Given all of this, AttrEnd does now seem appropriate to reintroduce, since
it will allow processing of aggregate operations when encountering that
token without having to worry about the above scenario; without having to
duplicate a `ParseState` stack; without having to hijack dead state
transitions for producing our aggregate object; and everything else
mentioned above.

This commit does not modify those abstractions to use AttrEnd yet; it
re-introduces the token to the core system, not the parser-generators, and
it doesn't yet replace lookahead operations in the parsers that use
them.  That'll come next.  Unlike the commit that removed it, though, we are
now generating proper spans, so make note of that here.  This also does not
introduce the concept to XIRF yet, which did not exist at the time that it
was removed, so XIRF is filtering it out until a following commit.

DEV-7145

											
										
										
											2022-06-28 16:10:57 -04:00
+								                    //      (empty)               (open)
 								                    // <foo bar="baz" />     <foo bar="baz"></foo>
 								                    //                |                    |
 								                    //            zero-length             '>'
 								                    let attr_end_span = ctx.span_or_zz(
 								                        pos + 1 + len + attr_len,
 								                        if empty_tag { 0 } else { 1 },
 								                    );
 								                    tokbuf.push_front(Token::AttrEnd(attr_end_span));
-												tamer: xir::reader: Opening and closing tag whitespace

Non-attribute and non-empty start/end tags will have their whitespace
as part of the produced span.  This sets us up for a following change that
will allow for deriving the name span from this span given a QName, which
gives us a span that both represents the entire XIR token and allows
deriving the element name.

An accurate token span is necessary for parsing errors where an element was
not expected, while an element name span is more appropriate for issues of
grammar and semantic errors that deal not with the fact that an element was
encountered, but _what_ element was encountered.

DEV-7145

											
										
										
											2022-06-22 15:10:49 -04:00
+								                    // No tail because of attributes.
 
 								                } else {
 								                    match empty_tag {
 								                        // Empty tag cannot have a tail.
 								                        true => 0,
 								                        // The "attributes" buffer represents whitespace,
 								                        //   so the tail is the number of bytes of
 								                        //   whitespace plus the closing '>' tag delimiter.
-												tamer: xir: Initial re-introduction of AttrEnd

AttrEnd was initially removed in
0cc0bc9d5a92e666e4ec8319f6bd29c35cc331a8 (and the commit prior), because
there was not a compelling reason to use it over a lookahead
operation (returning a token via a dead state transition); `AttrEnd`
simply introduced inconsistencies between the XIR reader (which produced
AttrEnd) and internal XIR stream generators (e.g. the lowering operations
into XIR->XML, which do not).

But now that parsers are performing aggregation---in particular the
attribute parser-generator `xir::parse::attr`---this has become quite a
pain, because the dead state is an actionable token.  For example:

  1. Open
  2. Attr
  3. Attr
  4. Open
  5. ...

In the happy case, token #4 results in `Parsed::Incomplete`, and so can just
be transformed into the object representing the aggregated attributes.  But
even in this happy path, it's ugly, and it requires non-tail recursion on
the parser which requires a duplicate stack allocation for the
`ParserState`.  That violates a core principle of the system.

But if there is an error at #4---e.g. an unexpected element---then we no
longer have a `Parsed::Incomplete` to hijack for our own uses, and we'd have
to introduce the ability to return both an error and a token, or we'd have
to introduce the ability to keep a token of lookahead instead of reading
from the underlying token stream, but that's complicated with push parsers,
which are used for parser composition.  Yikes.

And furthermore, the aggregation has caused me to introduce the ability to
override the dead state type to introduce both a token of lookahead and
aggregation information.  This complicates the system and is going to be
confusing to others.

Given all of this, AttrEnd does now seem appropriate to reintroduce, since
it will allow processing of aggregate operations when encountering that
token without having to worry about the above scenario; without having to
duplicate a `ParseState` stack; without having to hijack dead state
transitions for producing our aggregate object; and everything else
mentioned above.

This commit does not modify those abstractions to use AttrEnd yet; it
re-introduces the token to the core system, not the parser-generators, and
it doesn't yet replace lookahead operations in the parsers that use
them.  That'll come next.  Unlike the commit that removed it, though, we are
now generating proper spans, so make note of that here.  This also does not
introduce the concept to XIRF yet, which did not exist at the time that it
was removed, so XIRF is filtering it out until a following commit.

DEV-7145

											
										
										
											2022-06-28 16:10:57 -04:00
+								                        false => attr_len + 1,
-												tamer: xir::reader: Opening and closing tag whitespace

Non-attribute and non-empty start/end tags will have their whitespace
as part of the produced span.  This sets us up for a following change that
will allow for deriving the name span from this span given a QName, which
gives us a span that both represents the entire XIR token and allows
deriving the element name.

An accurate token span is necessary for parsing errors where an element was
not expected, while an element name span is more appropriate for issues of
grammar and semantic errors that deal not with the fact that an element was
encountered, but _what_ element was encountered.

DEV-7145

											
										
										
											2022-06-22 15:10:49 -04:00
+								                    }
 								                };
 								                // <tag ... />                   <tag/>
 								                // [--] name + '<'               [--] name + '<'
 								                //
 								                // <tag  >...</tag>              <tag ...>...</tag>
 								                // [-----] name + '<' + "  >"    [--] name + '<'
 								                //     ~~~ tail
 								                let span = ctx.span_or_zz(pos, len + 1 + tail);
-												tamer: ir::xir::reader: Refactor common element open parsing

As mentioned in the previous commit, this is just minor cleanup.

											
										
										
											2021-10-21 16:51:47 -04:00
 								                // The first token will be immediately returned
 								                //   via the Iterator.
-												tamer: xir: Introduce {Ele,Open,Close}Span

This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working.  I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.

This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token.  This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error.  However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.

This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information.  For that, we need to
encode additional context, and this is an attempt at that.

I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency.  But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project.  TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).

DEV-7145

											
										
										
											2022-06-24 13:51:49 -04:00
+								                Ok(Token::Open(
 								                    qname,
 								                    OpenSpan(span, len.try_into().unwrap_or(0)),
 								                ))
-												tamer: ir::xir::reader: Refactor common element open parsing

As mentioned in the previous commit, this is just minor cleanup.

											
										
										
											2021-10-21 16:51:47 -04:00
+								            })
 								    }
-												tamer: xir::reader: Opening and closing tag whitespace

Non-attribute and non-empty start/end tags will have their whitespace
as part of the produced span.  This sets us up for a following change that
will allow for deriving the name span from this span given a QName, which
gives us a span that both represents the entire XIR token and allows
deriving the element name.

An accurate token span is necessary for parsing errors where an element was
not expected, while an element name span is more appropriate for issues of
grammar and semantic errors that deal not with the fact that an element was
encountered, but _what_ element was encountered.

DEV-7145

											
										
										
											2022-06-22 15:10:49 -04:00
+								    /// Whether the byte represents XML whitespace.
 								    ///
 								    /// This is quick-xml's whitespace predicate,
 								    ///   and corresponds to the
 								    ///     [nonterminal `S` in the XML specification][xmlspec-s].
 								    ///
 								    /// [xmlspec-s]: https://www.w3.org/TR/xml/#NT-S
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								    fn is_whitespace(b: u8) -> bool {
 								        match b {
 								            b' ' | b'\r' | b'\n' | b'\t' => true,
 								            _ => false,
 								        }
 								    }
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								    /// Parse attributes into a XIR [`Token`] stream.
 								    ///
 								    /// The order of attributes will be maintained.
 								    ///
 								    /// This does not yet handle whitespace between attributes,
 								    ///   or around `=`.
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								    ///
 								    /// Note About Pointer Arithmetic
 								    /// =============================
 								    /// `ele_ptr` is expected to be a pointer to the buffer containing the
 								    ///   bytes read from the source file.
 								    /// Attributes reference this buffer,
 								    ///   so we can use pointer arithmetic to determine the offset within
 								    ///   the buffer relative to the node.
 								    /// This works because the underlying buffer is a `Vec`,
 								    ///   which is contiguous in memory.
 								    ///
 								    /// However, since this is a `Vec`,
 								    ///   it is important that the address be retrieved _after_ quick-xml
 								    ///   read events,
 								    ///     otherwise the buffer may be expanded and will be reallocated.
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								    fn parse_attrs<'a>(
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								        escaper: &'s S,
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								        tokbuf: &mut VecDeque<Token>,
 								        mut attrs: Attributes<'a>,
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								        ele_ptr: usize,
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								        ele_pos: usize,
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								        ctx: Context,
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								    ) -> Result<bool> {
 								        let mut found = false;
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								        // Disable checks to allow duplicate attributes;
 								        //   XIR does not enforce this,
 								        //     because it needs to accommodate semantically invalid XML for
 								        //     later analysis.
 								        for result in attrs.with_checks(false) {
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								            found = true;
 								            let attr = result.map_err(|e| match e {
 								                QuickXmlError::NoEqAfterName(pos) => {
 								                    // TODO: quick-xml doesn't give us the name,
 								                    //   but we should discover it.
 								                    Error::AttrValueExpected(
 								                        None,
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                        ctx.span_or_zz(ele_pos + pos, 0),
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								                    )
 								                }
 								                QuickXmlError::UnquotedValue(pos) => {
 								                    // TODO: name and span length
 								                    Error::AttrValueUnquoted(
 								                        None,
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                        ctx.span_or_zz(ele_pos + pos, 0),
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								                    )
 								                }
 								                // fallback
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								                e => Error::from_with_span(ctx.span_or_zz(ele_pos, 0))(e),
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								            })?;
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
 								            let keyoffset = attr.key.as_ptr() as usize;
 								            let name_offset = keyoffset - ele_ptr;
 								            // Accommodates zero-length values (e.g. `key=""`) with a
 								            //   zero-length span at the location the value _would_ be.
 								            let valoffset = match attr.value {
 								                Cow::Borrowed(b) => b.as_ptr() as usize,
 								                // This should never happen since we have a reference to the
 								                //   underlying buffer.
 								                Cow::Owned(_) => unreachable!(
 								                    "internal error: unexpected owned attribute value"
 								                ),
 								            };
 								            let value_offset = valoffset - ele_ptr;
-												tamer: Introduce context into XirReader

tamec and tameld will now both introduce a `Context` to XIR, which will use
it to create spans.

Here's an example of an error, now that it's all working well together:

  $ target/release/tameld --emit xmle -o /dev/null path/to/package.xmlo
  error: invalid preproc:sym/@dim `9` at [/../path/to/package.xmlo offset 1175451-1175452]

A future task will make this human-readable by producing line and column
numbers, and perhaps even a snippet (if not now, then eventually).

It's exciting to see this coming together finally.

DEV-10934

											
										
										
											2022-04-08 16:16:23 -04:00
+								            let span_name = ctx.span_or_zz(name_offset, attr.key.len());
 								            let span_value = ctx.span_or_zz(value_offset, attr.value.len());
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
-												tamer: xir::XirString: WIP implementation (likely going away)

I'm not fond of this implementation, which is why it's not fully
completed.  I wanted to commit this for future reference, and take the
opportunity to explain why I don't like it.

First: this task started as an idea to implement a third variant to
AttrValue and friends that indicates that a value is fixed, in the sense of
a fixed-point function: escaped or unescaped, its value is the same.  This
would allow us to skip wasteful escape/unescape operations.

In doing so, it became obvious that there's no need to leak this information
through the API, and indeed, no part of the system should care.  When we
read XML, it should be unescaped, and when we write, it should be
escaped.  The reason that this didn't quite happen to begin with was an
optimization: I'll be creating an echo writer in place of the current
filesystem-based copy in tamec shortly, and this would allow streaming XIR
directly from the reader to the writer without any unescaping or
re-escaping.

When we unescape, we know the value that it came from, so we could simply
store both symbols---they're 32-bit, so it results in a nicely compressed
64-bit value, so it's essentially cost-free, as long as we accept the
expense of internment.  This is `XirString`.  Then, when we want to escape
or unescape, we first check to see whether a symbol already exists and, if
so, use it.

While this works well for echoing streams, it won't work all that well in
practice: the unescaped SymbolId will be taken and the XirString discarded,
since nothing after XIR should be coupled with it.  Then, when we later
construct a XIR stream for writing, XirString will no longer be available
and our previously known escape is lost, so the writer will have to
re-escape.

Further, if we look at XirString's generic for the XirStringEscaper---it
uses phantom, which hints that maybe it's not in the best place.  Indeed,
I've already acknowledged that only a reader unescapes and only a writer
escapes, and that the rest of the system works with normal (unescaped)
values, so only readers and writers should be part of this process.  I also
already acknowledged that XirString would be lost and only the unescaped
SymbolId would be used.

So what's the point of XirString, then, if it won't be a useful optimization
beyond the temporary echo writer?

Instead, we can take the XirStringWriter and implement two caches on that:
mapping SymbolId from escaped->unescaped and vice-versa.  These can be
simple vectors, since SymbolId is a 32-bit value we will not have much
wasted space for symbols that never get read or written.  We could even
optimize for preinterned symbols using markers, though I'll probably not do
so, and I'll explain why later.

If we do _that_, we get even _better_ optimizations through caching that
_will_ apply in the general case (so, not just for echo), and we're able to
ditch XirString entirely and simply use a SymbolId.  This makes for a much
more friendly API that isn't leaking implementation details, though it
_does_ put an onus on the caller to pass the encoder to both the reader and
the writer, _if_ it wants to take advantage of a cache.  But that burden is
not significant (and is, again, optional if we don't want it).

So, that'll be the next step.

											
										
										
											2021-11-10 09:42:18 -05:00
+								            // The name must be parsed as a QName.
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								            let name = attr
 								                .key
 								                .try_into()
 								                .map_err(Error::from_with_span(span_name))?;
-												tamer: xir::XirString: WIP implementation (likely going away)

I'm not fond of this implementation, which is why it's not fully
completed.  I wanted to commit this for future reference, and take the
opportunity to explain why I don't like it.

First: this task started as an idea to implement a third variant to
AttrValue and friends that indicates that a value is fixed, in the sense of
a fixed-point function: escaped or unescaped, its value is the same.  This
would allow us to skip wasteful escape/unescape operations.

In doing so, it became obvious that there's no need to leak this information
through the API, and indeed, no part of the system should care.  When we
read XML, it should be unescaped, and when we write, it should be
escaped.  The reason that this didn't quite happen to begin with was an
optimization: I'll be creating an echo writer in place of the current
filesystem-based copy in tamec shortly, and this would allow streaming XIR
directly from the reader to the writer without any unescaping or
re-escaping.

When we unescape, we know the value that it came from, so we could simply
store both symbols---they're 32-bit, so it results in a nicely compressed
64-bit value, so it's essentially cost-free, as long as we accept the
expense of internment.  This is `XirString`.  Then, when we want to escape
or unescape, we first check to see whether a symbol already exists and, if
so, use it.

While this works well for echoing streams, it won't work all that well in
practice: the unescaped SymbolId will be taken and the XirString discarded,
since nothing after XIR should be coupled with it.  Then, when we later
construct a XIR stream for writing, XirString will no longer be available
and our previously known escape is lost, so the writer will have to
re-escape.

Further, if we look at XirString's generic for the XirStringEscaper---it
uses phantom, which hints that maybe it's not in the best place.  Indeed,
I've already acknowledged that only a reader unescapes and only a writer
escapes, and that the rest of the system works with normal (unescaped)
values, so only readers and writers should be part of this process.  I also
already acknowledged that XirString would be lost and only the unescaped
SymbolId would be used.

So what's the point of XirString, then, if it won't be a useful optimization
beyond the temporary echo writer?

Instead, we can take the XirStringWriter and implement two caches on that:
mapping SymbolId from escaped->unescaped and vice-versa.  These can be
simple vectors, since SymbolId is a 32-bit value we will not have much
wasted space for symbols that never get read or written.  We could even
optimize for preinterned symbols using markers, though I'll probably not do
so, and I'll explain why later.

If we do _that_, we get even _better_ optimizations through caching that
_will_ apply in the general case (so, not just for echo), and we're able to
ditch XirString entirely and simply use a SymbolId.  This makes for a much
more friendly API that isn't leaking implementation details, though it
_does_ put an onus on the caller to pass the encoder to both the reader and
the writer, _if_ it wants to take advantage of a cache.  But that burden is
not significant (and is, again, optional if we don't want it).

So, that'll be the next step.

											
										
										
											2021-11-10 09:42:18 -05:00
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								            // The attribute value,
 								            //   having just been read from XML,
 								            //   must have been escaped to be parsed properly.
 								            // If it parsed but it's not technically escaped according to
 								            //   the spec,
 								            //     that's okay as long as we can read it again,
 								            //       but we probably should still throw an error if we
 								            //       encounter such a situation.
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								            let value = escaper
 								                .unescape(
 								                    attr.value
 								                        .as_ref()
 								                        .intern_utf8()
 								                        .map_err(Error::from_with_span(span_value))?,
 								                )
 								                .map_err(Error::from_with_span(span_value))?
 								                .into();
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
-												tamer: xir::reader: Initial introduction of spans

This is a large change, and was a bit of a tedious one, given the
comprehensive tests.

This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping.  Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.

This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors.  This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).

Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information.  There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.

I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent.  Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases.  If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes.  Different such
spans may be useful in different situations when presenting information to
the user.

This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on.  These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now.  I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.

Anyway, more to come.

DEV-10934

											
										
										
											2022-04-08 11:03:46 -04:00
+								            tokbuf.push_front(Token::AttrName(name, span_name));
 								            tokbuf.push_front(Token::AttrValue(value, span_value));
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								        }
-												tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
outright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There are a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934

											
										
										
											2022-04-08 13:52:16 -04:00
+								        Ok(found)
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								    }
 								}
-												tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081

											
										
										
											2021-11-12 13:59:14 -05:00
+								impl<'s, B, S> Iterator for XmlXirReader<'s, B, S>
 								where
 								    B: BufRead,
 								    S: Escaper,
 								{
-												tamer: Integrate xir::reader as a parser in the lowering pipeline

This allows `XmlXirReader` to be used in a `Lower` operation, just as
everything else, bringing me one step closer to a pipeline that can be
concisely represented; this is finally beginning to unify in a clear way,
though it is still a bit of a mess.

This causes `XmlXirReader` to _act_ like a `parse::Parser` in that it yields
a `ParsedResult`, but it does not use `parse::Parser` itself; that was the
_original_ plan: convert it into a `ParseState` where `XmlXirReader` became
a context, and force `Parser` to yield by feeding it a stream of tokens with
`repeat`, but that ended up performing poorly relative to this change.  I
did some investigation, which I might write about in the future, but for
now, this solution works just fine.

DEV-7145

											
										
										
											2022-06-02 10:30:44 -04:00
+								    type Item = ParsedResult<ParsedObject<Token, Error>>;
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
 								    /// Produce the next XIR [`Token`] from the input.
 								    ///
 								    /// For more information on how this reader operates,
 								    ///   see [`XmlXirReader`].
 								    fn next(&mut self) -> Option<Self::Item> {
 								        self.tokbuf
 								            .pop_back()
-												tamer: Integrate xir::reader as a parser in the lowering pipeline

This allows `XmlXirReader` to be used in a `Lower` operation, just as
everything else, bringing me one step closer to a pipeline that can be
concisely represented; this is finally beginning to unify in a clear way,
though it is still a bit of a mess.

This causes `XmlXirReader` to _act_ like a `parse::Parser` in that it yields
a `ParsedResult`, but it does not use `parse::Parser` itself; that was the
_original_ plan: convert it into a `ParseState` where `XmlXirReader` became
a context, and force `Parser` to yield by feeding it a stream of tokens with
`repeat`, but that ended up performing poorly relative to this change.  I
did some investigation, which I might write about in the future, but for
now, this solution works just fine.

DEV-7145

											
										
										
											2022-06-02 10:30:44 -04:00
+								            .map(|tok| Ok(Parsed::Object(tok)))
 								            .or_else(|| {
 								                self.refill_buf().map(|result| {
 								                    result.map(Parsed::Object).map_err(ParseError::StateError)
 								                })
 								            })
-												tamer: ir::xir::reader: Initial concept

This is an initial working concept for the reader which handles, so far,
just a single attribute.  But extending it to completion will not be all
that much more work.

This does not have namespace support---that will be added later as part of
XIRT, which is responsible for semantic analysis.  This allows XIR to stay
wonderfully simple, and won't have any impact on the writer (which expects
that QNames are unresolved and contain the namespace prefix to be written).

											
										
										
											2021-10-21 16:17:17 -04:00
+								    }
 								}
 								#[cfg(test)]
 								mod test;