2021-08-20 10:09:55 -04:00
|
|
|
|
// XML IR (XIR)
|
|
|
|
|
//
|
2022-05-03 14:14:29 -04:00
|
|
|
|
// Copyright (C) 2014-2022 Ryan Specialty Group, LLC.
|
2021-08-20 10:09:55 -04:00
|
|
|
|
//
|
|
|
|
|
// This file is part of TAME.
|
|
|
|
|
//
|
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
|
|
|
// it under the terms of the GNU General Public License as published by
|
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
|
// (at your option) any later version.
|
|
|
|
|
//
|
|
|
|
|
// This program is distributed in the hope that it will be useful,
|
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
// GNU General Public License for more details.
|
|
|
|
|
//
|
|
|
|
|
// You should have received a copy of the GNU General Public License
|
|
|
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
|
|
//! Intermediate representation (IR) of an XML document.
|
|
|
|
|
//!
|
|
|
|
|
//! XIR serves not only as a TAMER-specific IR,
|
|
|
|
|
//! but also as an abstraction layer atop whatever XML library is
|
|
|
|
|
//! used (e.g. `quick_xml`).
|
|
|
|
|
//! XIR is _not_ intended to be comprehensive,
|
|
|
|
|
//! or even general-purpose---it
|
|
|
|
|
//! exists to solve concerns specific to TAMER's construction.
|
|
|
|
|
//!
|
2021-10-21 16:17:17 -04:00
|
|
|
|
//! To parse an entire XML document,
|
|
|
|
|
//! see [`reader`].
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
//!
|
|
|
|
|
//! _Note:_ XIR refers to "opening" and "closing" tags,
|
|
|
|
|
//! as opposed to "start" and "end" as used in the XML specification.
|
|
|
|
|
//! TAMER uses a uniform terminology for all delimited data.
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
2022-07-29 00:44:58 -04:00
|
|
|
|
use crate::fmt::DisplayWrapper;
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
use crate::span::{Span, SpanLenSize};
|
2021-09-28 14:52:31 -04:00
|
|
|
|
use crate::sym::{
|
2022-06-03 14:34:08 -04:00
|
|
|
|
st_as_sym, GlobalSymbolIntern, GlobalSymbolInternBytes, SymbolId,
|
2021-09-28 14:52:31 -04:00
|
|
|
|
};
|
2021-10-21 16:17:17 -04:00
|
|
|
|
use memchr::memchr;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
use std::convert::{TryFrom, TryInto};
|
2021-11-02 13:55:33 -04:00
|
|
|
|
use std::fmt::Display;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
use std::ops::Deref;
|
|
|
|
|
|
2021-10-21 16:17:17 -04:00
|
|
|
|
mod error;
|
|
|
|
|
pub use error::Error;
|
|
|
|
|
|
tamer: xir::escape: Remove XirString in favor of Escaper
This rewrites a good portion of the previous commit.
Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.
Given that, we need only unescape on read and escape on write. This is
customary, so why didn't I do that to begin with?
The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming. However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around. If we share the Escaper between _all_
readers and the writer, the result is that
1. Duplicate strings between source files and object files (many of which
are read by both the linker and compiler) avoid re-unescaping; and
2. Writers can use this cache to avoid re-escaping when we've already seen
the escaped variant of the string during read.
The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.
DEV-11081
2021-11-12 13:59:14 -05:00
|
|
|
|
mod escape;
|
2021-11-12 16:07:57 -05:00
|
|
|
|
pub use escape::{DefaultEscaper, Escaper};
|
tamer: xir::XirString: WIP implementation (likely going away)
I'm not fond of this implementation, which is why it's not fully
completed. I wanted to commit this for future reference, and take the
opportunity to explain why I don't like it.
First: this task started as an idea to implement a third variant to
AttrValue and friends that indicates that a value is fixed, in the sense of
a fixed-point function: escaped or unescaped, its value is the same. This
would allow us to skip wasteful escape/unescape operations.
In doing so, it became obvious that there's no need to leak this information
through the API, and indeed, no part of the system should care. When we
read XML, it should be unescaped, and when we write, it should be
escaped. The reason that this didn't quite happen to begin with was an
optimization: I'll be creating an echo writer in place of the current
filesystem-based copy in tamec shortly, and this would allow streaming XIR
directly from the reader to the writer without any unescaping or
re-escaping.
When we unescape, we know the value that it came from, so we could simply
store both symbols---they're 32-bit, so it results in a nicely compressed
64-bit value, so it's essentially cost-free, as long as we accept the
expense of internment. This is `XirString`. Then, when we want to escape
or unescape, we first check to see whether a symbol already exists and, if
so, use it.
While this works well for echoing streams, it won't work all that well in
practice: the unescaped SymbolId will be taken and the XirString discarded,
since nothing after XIR should be coupled with it. Then, when we later
construct a XIR stream for writing, XirString will no longer be available
and our previously known escape is lost, so the writer will have to
re-escape.
Further, if we look at XirString's generic for the XirStringEscaper---it
uses phantom, which hints that maybe it's not in the best place. Indeed,
I've already acknowledged that only a reader unescapes and only a writer
escapes, and that the rest of the system works with normal (unescaped)
values, so only readers and writers should be part of this process. I also
already acknowledged that XirString would be lost and only the unescaped
SymbolId would be used.
So what's the point of XirString, then, if it won't be a useful optimization
beyond the temporary echo writer?
Instead, we can take the XirStringWriter and implement two caches on that:
mapping SymbolId from escaped->unescaped and vice-versa. These can be
simple vectors, since SymbolId is a 32-bit value we will not have much
wasted space for symbols that never get read or written. We could even
optimize for preinterned symbols using markers, though I'll probably not do
so, and I'll explain why later.
If we do _that_, we get even _better_ optimizations through caching that
_will_ apply in the general case (so, not just for echo), and we're able to
ditch XirString entirely and simply use a SymbolId. This makes for a much
more friendly API that isn't leaking implementation details, though it
_does_ put an onus on the caller to pass the encoder to both the reader and
the writer, _if_ it wants to take advantage of a cache. But that burden is
not significant (and is, again, optional if we don't want it).
So, that'll be the next step.
2021-11-10 09:42:18 -05:00
|
|
|
|
|
2022-06-03 14:34:08 -04:00
|
|
|
|
use error::SpanlessError;
|
|
|
|
|
use st::qname::QNameCompatibleStaticSymbolId;
|
tamer: xir::reader: Initial introduction of spans
This is a large change, and was a bit of a tedious one, given the
comprehensive tests.
This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping. Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.
This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatible with the return value, ensuring that
spans will actually be available rather than forgotten for errors. This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).
Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information. There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.
I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent. Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases. If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes. Different such
spans may be useful in different situations when presenting information to
the user.
This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on. These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now. I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.
Anyway, more to come.
DEV-10934
2022-04-08 11:03:46 -04:00
|
|
|
|
|
2022-07-29 00:44:58 -04:00
|
|
|
|
use self::fmt::{CloseXmlEle, OpenXmlEle, XmlAttr, XmlAttrValueQuote};
|
|
|
|
|
|
2022-03-17 16:10:56 -04:00
|
|
|
|
pub mod attr;
|
2022-03-17 12:20:20 -04:00
|
|
|
|
pub mod flat;
|
2022-06-10 16:28:15 -04:00
|
|
|
|
pub mod fmt;
|
2021-10-11 09:34:17 -04:00
|
|
|
|
pub mod iter;
|
2021-09-28 14:52:31 -04:00
|
|
|
|
pub mod pred;
|
2021-10-21 16:17:17 -04:00
|
|
|
|
pub mod reader;
|
2022-06-03 14:34:08 -04:00
|
|
|
|
pub mod st;
|
2021-09-08 13:53:47 -04:00
|
|
|
|
pub mod tree;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
pub mod writer;
|
|
|
|
|
|
2022-06-13 11:17:21 -04:00
|
|
|
|
#[macro_use]
|
|
|
|
|
pub mod parse;
|
|
|
|
|
|
2021-10-28 21:21:30 -04:00
|
|
|
|
/// An infallible [`Token`] stream.
|
|
|
|
|
///
|
|
|
|
|
/// If the token stream originates from an operation that could potentially
|
|
|
|
|
/// fail and ought to be propagated,
|
|
|
|
|
/// use [`TokenResultStream`].
|
|
|
|
|
///
|
|
|
|
|
/// The name "stream" in place of "iterator" is intended to convey that this
|
|
|
|
|
/// type is expected to be processed in real-time as a stream,
|
|
|
|
|
/// not read into memory.
|
|
|
|
|
///
/// This is declared as a trait alias,
/// so it names the bound `Iterator<Item = Token>` rather than
/// introducing a new trait to implement.
// NOTE: trait alias syntax requires the unstable `trait_alias` feature.
pub trait TokenStream = Iterator<Item = Token>;
|
|
|
|
|
|
|
|
|
|
/// A [`Token`] stream that may encounter errors during parsing.
|
|
|
|
|
///
|
|
|
|
|
/// If the stream cannot fail,
|
|
|
|
|
/// consider using [`TokenStream`].
|
|
|
|
|
///
/// This is declared as a trait alias for an [`Iterator`] whose items are
/// `Result<Token, Error>`,
/// so each item is either a [`Token`] or an [`Error`].
// NOTE: trait alias syntax requires the unstable `trait_alias` feature.
pub trait TokenResultStream = Iterator<Item = Result<Token, Error>>;
|
|
|
|
|
|
2021-10-11 11:51:51 -04:00
|
|
|
|
/// XML Name minus `":"`.
|
|
|
|
|
///
|
|
|
|
|
/// The intent is to check a string for validity _before_ interning;
|
|
|
|
|
/// otherwise,
|
|
|
|
|
/// the string would have to be first retrieved from the intern pool
|
|
|
|
|
/// for comparison,
|
|
|
|
|
/// which is not an operation we want to do implicitly.
|
|
|
|
|
/// Those methods will be created as they are needed.
|
|
|
|
|
///
|
|
|
|
|
/// See <https://www.w3.org/TR/REC-xml-names/#NT-NCName>.
|
|
|
|
|
///
/// This is a newtype wrapping the [`SymbolId`] of the name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
|
|
|
pub struct NCName(SymbolId);
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
|
impl NCName {
|
2021-08-20 10:09:55 -04:00
|
|
|
|
/// Create a new NCName from a symbol without validating that the symbol
|
|
|
|
|
/// is a valid NCName.
|
|
|
|
|
///
|
|
|
|
|
/// Safety
|
|
|
|
|
/// ======
|
|
|
|
|
/// This is not unsafe in the traditional sense;
|
|
|
|
|
/// it's unsafe in a sense similar to non-UTF-8 `str` slices,
|
|
|
|
|
/// in that it is expected that an `NCName` means that you do not
|
|
|
|
|
/// have to worry about whether it's syntatically valid as XML.
|
2021-09-23 14:52:53 -04:00
|
|
|
|
pub unsafe fn new_unchecked(value: SymbolId) -> Self {
|
2021-08-20 10:09:55 -04:00
|
|
|
|
Self(value)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-10-21 16:17:17 -04:00
|
|
|
|
impl TryFrom<&[u8]> for NCName {
|
tamer: xir::reader: Initial introduction of spans
This is a large change, and was a bit of a tedious one, given the
comprehensive tests.
This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping. Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.
This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors. This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).
Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information. There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.
I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent. Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases. If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes. Different such
spans may be useful in different situations when presenting information to
the user.
This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on. These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now. I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.
Anyway, more to come.
DEV-10934
2022-04-08 11:03:46 -04:00
|
|
|
|
type Error = SpanlessError;
|
2021-10-21 16:17:17 -04:00
|
|
|
|
|
|
|
|
|
/// Attempt to parse a byte slice into an [`NCName`].
|
|
|
|
|
///
|
|
|
|
|
/// If the slice contains `b':'`,
|
|
|
|
|
/// an error will be produced.
|
|
|
|
|
/// No other checks are performed beyond checking that the byte sequence
|
|
|
|
|
/// represents a valid UTF-8 string.
|
|
|
|
|
/// The string will be interned for you.
|
|
|
|
|
fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
|
|
|
|
|
match value.contains(&b':') {
|
tamer: xir::reader: Initial introduction of spans
This is a large change, and was a bit of a tedious one, given the
comprehensive tests.
This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping. Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.
This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors. This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).
Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information. There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.
I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent. Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases. If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes. Different such
spans may be useful in different situations when presenting information to
the user.
This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on. These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now. I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.
Anyway, more to come.
DEV-10934
2022-04-08 11:03:46 -04:00
|
|
|
|
true => Err(SpanlessError::NCColon(value.intern_utf8()?)),
|
2021-10-21 16:17:17 -04:00
|
|
|
|
false => Ok(NCName(value.intern_utf8()?)),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-10-11 11:51:51 -04:00
|
|
|
|
impl Deref for NCName {
|
|
|
|
|
type Target = SymbolId;
|
|
|
|
|
|
|
|
|
|
fn deref(&self) -> &Self::Target {
|
|
|
|
|
&self.0
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl PartialEq<SymbolId> for NCName {
|
|
|
|
|
fn eq(&self, other: &SymbolId) -> bool {
|
|
|
|
|
self.0 == *other
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
|
impl TryFrom<&str> for NCName {
|
tamer: xir::reader: Initial introduction of spans
This is a large change, and was a bit of a tedious one, given the
comprehensive tests.
This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping. Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.
This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors. This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).
Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information. There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.
I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent. Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases. If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes. Different such
spans may be useful in different situations when presenting information to
the user.
This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on. These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now. I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.
Anyway, more to come.
DEV-10934
2022-04-08 11:03:46 -04:00
|
|
|
|
type Error = SpanlessError;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
|
|
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
|
|
|
|
if value.contains(':') {
|
tamer: xir::reader: Initial introduction of spans
This is a large change, and was a bit of a tedious one, given the
comprehensive tests.
This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping. Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.
This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors. This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).
Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information. There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.
I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent. Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases. If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes. Different such
spans may be useful in different situations when presenting information to
the user.
This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on. These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now. I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.
Anyway, more to come.
DEV-10934
2022-04-08 11:03:46 -04:00
|
|
|
|
return Err(SpanlessError::NCColon(value.into()));
|
2021-08-20 10:09:55 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(Self(value.intern()))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-10-11 11:56:03 -04:00
|
|
|
|
/// Namespace prefix of a [`QName`].
|
2021-08-20 10:09:55 -04:00
|
|
|
|
///
/// The prefix is stored as an [`NCName`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
2021-09-23 14:52:53 -04:00
|
|
|
|
pub struct Prefix(NCName);
|
2021-10-11 11:56:03 -04:00
|
|
|
|
|
|
|
|
|
/// Local name portion of a [`QName`].
|
2021-08-20 10:09:55 -04:00
|
|
|
|
///
/// The local part is stored as an [`NCName`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
2021-09-23 14:52:53 -04:00
|
|
|
|
pub struct LocalPart(NCName);
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
|
impl Deref for Prefix {
|
|
|
|
|
type Target = SymbolId;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
|
|
fn deref(&self) -> &Self::Target {
|
|
|
|
|
self.0.deref()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
|
impl Deref for LocalPart {
|
|
|
|
|
type Target = SymbolId;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
|
|
fn deref(&self) -> &Self::Target {
|
|
|
|
|
self.0.deref()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
|
impl From<NCName> for Prefix {
|
|
|
|
|
fn from(name: NCName) -> Self {
|
2021-08-20 10:09:55 -04:00
|
|
|
|
Self(name)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
|
impl From<NCName> for LocalPart {
|
|
|
|
|
fn from(name: NCName) -> Self {
|
2021-08-20 10:09:55 -04:00
|
|
|
|
Self(name)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
|
impl TryFrom<&str> for Prefix {
|
tamer: xir::reader: Initial introduction of spans
This is a large change, and was a bit of a tedious one, given the
comprehensive tests.
This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping. Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.
This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors. This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).
Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information. There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.
I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent. Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases. If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes. Different such
spans may be useful in different situations when presenting information to
the user.
This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on. These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now. I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.
Anyway, more to come.
DEV-10934
2022-04-08 11:03:46 -04:00
|
|
|
|
type Error = SpanlessError;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
|
|
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
|
|
|
|
Ok(Self(value.try_into()?))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
|
impl TryFrom<&str> for LocalPart {
|
tamer: xir::reader: Initial introduction of spans
This is a large change, and was a bit of a tedious one, given the
comprehensive tests.
This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping. Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.
This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors. This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).
Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information. There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.
I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent. Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases. If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes. Different such
spans may be useful in different situations when presenting information to
the user.
This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on. These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now. I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.
Anyway, more to come.
DEV-10934
2022-04-08 11:03:46 -04:00
|
|
|
|
type Error = SpanlessError;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
|
|
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
|
|
|
|
Ok(Self(value.try_into()?))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-11-02 13:55:33 -04:00
|
|
|
|
impl Display for Prefix {
|
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
|
self.0.fmt(f)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl Display for LocalPart {
|
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
|
self.0.fmt(f)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-20 10:09:55 -04:00
|
|
|
|
/// A qualified name (namespace prefix and local name).
|
|
|
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
2021-09-23 14:52:53 -04:00
|
|
|
|
pub struct QName(Option<Prefix>, LocalPart);
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
|
|
// Since we implement Copy, ensure size matches our expectations:
|
2021-09-23 14:52:53 -04:00
|
|
|
|
const_assert!(std::mem::size_of::<QName>() <= std::mem::size_of::<usize>());
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
|
impl QName {
|
2021-08-20 10:09:55 -04:00
|
|
|
|
/// Create a new fully-qualified name (including both a namespace URI
|
|
|
|
|
/// and local name).
|
2021-10-21 16:17:17 -04:00
|
|
|
|
pub fn new(prefix: Option<Prefix>, local_name: LocalPart) -> Self {
|
|
|
|
|
Self(prefix, local_name)
|
2021-08-20 10:09:55 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Create a new name from a local name only.
|
|
|
|
|
///
|
|
|
|
|
/// This should only be used for attributes in TAMER,
|
|
|
|
|
/// since all elements should have an associated namespace.
|
|
|
|
|
///
|
|
|
|
|
/// _(If this is ever not true (e.g. due to new targets),
|
|
|
|
|
/// please update this comment.)_
|
2021-09-23 14:52:53 -04:00
|
|
|
|
pub fn new_local(local_name: LocalPart) -> Self {
|
2021-08-20 10:09:55 -04:00
|
|
|
|
Self(None, local_name)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Fully qualified namespace associated with a name.
|
2021-09-23 14:52:53 -04:00
|
|
|
|
pub fn prefix(&self) -> Option<Prefix> {
|
2021-08-20 10:09:55 -04:00
|
|
|
|
self.0
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Local part of a name (name without namespace).
|
2021-09-23 14:52:53 -04:00
|
|
|
|
pub fn local_name(&self) -> LocalPart {
|
2021-08-20 10:09:55 -04:00
|
|
|
|
self.1
|
|
|
|
|
}
|
2021-09-28 14:52:31 -04:00
|
|
|
|
|
|
|
|
|
/// Construct a constant QName from static C-style symbols.
|
2021-10-05 16:13:47 -04:00
|
|
|
|
pub const fn st_cid<T, U>(prefix_sym: &T, local_sym: &U) -> Self
|
|
|
|
|
where
|
|
|
|
|
T: QNameCompatibleStaticSymbolId,
|
|
|
|
|
U: QNameCompatibleStaticSymbolId,
|
|
|
|
|
{
|
2021-09-28 14:52:31 -04:00
|
|
|
|
Self(
|
2021-10-05 16:13:47 -04:00
|
|
|
|
Some(Prefix(NCName(st_as_sym(prefix_sym)))),
|
|
|
|
|
LocalPart(NCName(st_as_sym(local_sym))),
|
2021-09-28 14:52:31 -04:00
|
|
|
|
)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Construct a constant QName with a local name only from a static
|
|
|
|
|
/// C-style symbol.
|
2021-10-05 16:13:47 -04:00
|
|
|
|
pub const fn st_cid_local<T: QNameCompatibleStaticSymbolId>(
|
|
|
|
|
local_sym: &T,
|
|
|
|
|
) -> Self {
|
|
|
|
|
Self(None, LocalPart(NCName(st_as_sym(local_sym))))
|
2021-09-28 14:52:31 -04:00
|
|
|
|
}
|
2021-08-20 10:09:55 -04:00
|
|
|
|
}
|
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
|
impl<P, L> TryFrom<(P, L)> for QName
|
2021-08-20 10:09:55 -04:00
|
|
|
|
where
|
2021-09-23 14:52:53 -04:00
|
|
|
|
P: TryInto<Prefix>,
|
|
|
|
|
L: TryInto<LocalPart, Error = P::Error>,
|
2021-08-20 10:09:55 -04:00
|
|
|
|
{
|
|
|
|
|
type Error = P::Error;
|
|
|
|
|
|
|
|
|
|
fn try_from(value: (P, L)) -> Result<Self, Self::Error> {
|
|
|
|
|
Ok(Self(Some(value.0.try_into()?), value.1.try_into()?))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
|
impl<P, L> TryFrom<(Option<P>, L)> for QName
|
2021-09-08 13:53:47 -04:00
|
|
|
|
where
|
2021-09-23 14:52:53 -04:00
|
|
|
|
P: TryInto<Prefix>,
|
|
|
|
|
L: TryInto<LocalPart, Error = P::Error>,
|
2021-09-08 13:53:47 -04:00
|
|
|
|
{
|
|
|
|
|
type Error = P::Error;
|
|
|
|
|
|
|
|
|
|
fn try_from(value: (Option<P>, L)) -> Result<Self, Self::Error> {
|
|
|
|
|
let ns = match value.0 {
|
|
|
|
|
None => None,
|
|
|
|
|
Some(ns) => Some(ns.try_into()?),
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Ok(Self(ns, value.1.try_into()?))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-09-23 14:52:53 -04:00
|
|
|
|
impl TryFrom<&str> for QName {
|
tamer: xir::reader: Initial introduction of spans
This is a large change, and was a bit of a tedious one, given the
comprehensive tests.
This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping. Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.
This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors. This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).
Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information. There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.
I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent. Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases. If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes. Different such
spans may be useful in different situations when presenting information to
the user.
This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on. These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now. I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.
Anyway, more to come.
DEV-10934
2022-04-08 11:03:46 -04:00
|
|
|
|
type Error = SpanlessError;
|
2021-09-08 13:53:47 -04:00
|
|
|
|
|
|
|
|
|
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
|
|
|
|
Ok(QName(None, value.try_into()?))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-10-21 16:17:17 -04:00
|
|
|
|
impl TryFrom<&[u8]> for QName {
|
tamer: xir::reader: Initial introduction of spans
This is a large change, and was a bit of a tedious one, given the
comprehensive tests.
This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping. Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.
This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors. This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).
Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information. There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.
I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent. Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases. If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes. Different such
spans may be useful in different situations when presenting information to
the user.
This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on. These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now. I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.
Anyway, more to come.
DEV-10934
2022-04-08 11:03:46 -04:00
|
|
|
|
type Error = SpanlessError;
|
2021-10-21 16:17:17 -04:00
|
|
|
|
|
|
|
|
|
/// Attempt to parse a byte slice into a [`QName`].
|
|
|
|
|
///
|
|
|
|
|
/// The byte slice must represent a valid QName in UTF-8.
|
|
|
|
|
/// If a colon is present,
|
|
|
|
|
/// it delimits the namespace [`Prefix`] and [`LocalPart`],
|
|
|
|
|
/// and therefore must not be in the first or last byte position.
|
|
|
|
|
fn try_from(name: &[u8]) -> Result<Self, Self::Error> {
|
|
|
|
|
match memchr(b':', name) {
|
|
|
|
|
// Leading colon means we're missing a prefix, trailing means
|
|
|
|
|
// that we have no local part.
|
|
|
|
|
Some(pos) if pos == 0 || pos == name.len() - 1 => {
|
tamer: xir::reader: Initial introduction of spans
This is a large change, and was a bit of a tedious one, given the
comprehensive tests.
This introduces proper offsets and lengths for spans, with the exception of
some quick-xml errors that still need proper mapping. Further, this still
uses `UNKNOWN_CONTEXT`, which will be resolved shortly.
This also introduces `SpanlessError`, which `Error` explicitly _does not_
implement `From<SpanlessError>` for---this forces the caller to provide a
span before the error is compatable with the return value, ensuring that
spans will actually be available rather than forgotten for errors. This is
important, given that errors are generally less tested than the happy path,
and errors are when users need us the most (so, need span information).
Further, I had to use pointer arithmetic in order to calculate many of the
spans, because quick-xml does not provide enough information. There's no
safety considerations here, and the comprehensive unit test will ensure
correct behavior if the implementation changes in the future.
I would like to introduce typed spans at some point---I made some
opinionated choices when it comes to what the spans ought to
represent. Specifically, whether to include the `<` or `>` with the open
span (depends), whether to include quotes with attribute values (no),
and some other details highlighted in the test cases. If we provide typed
spans, then we could, knowing the type of span, calculate other spans on
request, e.g. to include or omit quotes for attributes. Different such
spans may be useful in different situations when presenting information to
the user.
This also highlights gaps in the tokens emitted by XIR, such as whitespace
between attributes, the `=` between name and value, and so on. These are
important when it comes to code formatting, so that we can reliably
reconstruct the XML tree, but it's not important right now. I anticipate
future changes would allow the XIR reader to be configured (perhaps via
generics, like a strategy-type pattern) to optionally omit these tokens if
desired.
Anyway, more to come.
DEV-10934
2022-04-08 11:03:46 -04:00
|
|
|
|
Err(SpanlessError::InvalidQName(name.intern_utf8()?))
|
2021-10-21 16:17:17 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// There is _at least_ one colon in the string.
|
|
|
|
|
Some(pos) => {
|
|
|
|
|
// The prefix is before the first colon,
|
|
|
|
|
// and so itself must not contain a colon and is therefore
|
|
|
|
|
// a valid NCName.
|
|
|
|
|
let prefix = NCName(name[..pos].intern_utf8()?);
|
|
|
|
|
|
|
|
|
|
// But there could be a _second_ colon,
|
|
|
|
|
// so the local part requires validation.
|
|
|
|
|
let local = NCName::try_from(&name[(pos + 1)..])?;
|
|
|
|
|
|
|
|
|
|
Ok(Self::new(Some(prefix.into()), local.into()))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// There are no colons in the string, so the entire string is
|
|
|
|
|
// both a local part and a valid NCName.
|
|
|
|
|
None => Ok(Self::new(None, NCName(name.intern_utf8()?).into())),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-11-02 13:55:33 -04:00
|
|
|
|
impl Display for QName {
|
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
|
match self {
|
|
|
|
|
QName(Some(local), suffix) => write!(f, "{}:{}", local, suffix),
|
|
|
|
|
QName(None, suffix) => suffix.fmt(f),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
/// A span representing an opening (starting) element tag.
|
|
|
|
|
///
|
|
|
|
|
/// See [`EleSpan`] for more information.
|
|
|
|
|
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
|
|
|
|
pub struct OpenSpan(Span, EleNameLen);
|
|
|
|
|
|
|
|
|
|
impl OpenSpan {
|
|
|
|
|
pub fn without_name_span(span: Span) -> Self {
|
|
|
|
|
Self(span, 0)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// A span representing a closing (ending) element tag.
|
|
|
|
|
///
|
|
|
|
|
/// See [`EleSpan`] for more information.
|
|
|
|
|
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
|
|
|
|
pub struct CloseSpan(Span, EleNameLen);
|
|
|
|
|
|
|
|
|
|
impl CloseSpan {
|
|
|
|
|
/// A [`CloseSpan`] representing the closing of an empty tag.
|
|
|
|
|
///
|
|
|
|
|
/// This type of span has no element name.
|
|
|
|
|
pub fn empty(span: Span) -> Self {
|
|
|
|
|
Self::without_name_span(span)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn without_name_span(span: Span) -> Self {
|
|
|
|
|
Self(span, 0)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
tamer: Xirf::Text refinement
This teaches XIRF to optionally refine Text into RefinedText, which
determines whether the given SymbolId represents entirely whitespace.
This is something I've been putting off for some time, but now that I'm
parsing source language for NIR, it is necessary, in that we can only permit
whitespace Text nodes in certain contexts.
The idea is to capture the most common whitespace as preinterned
symbols. Note that this heuristic ought to be determined from scanning a
codebase, which I haven't done yet; this is just an initial list.
The fallback is to look up the string associated with the SymbolId and
perform a linear scan, aborting on the first non-whitespace character. This
combination of checks should be sufficiently performant for now considering
that this is only being run on source files, which really are not all that
large. (They become large when template-expanded.) I'll optimize further
if I notice it show up during profiling.
This also frees XIR itself from being concerned by Whitespace. Initially I
had used quick-xml's whitespace trimming, but it messed up my span
calculations, and those were a pain in the ass to implement to begin with,
since I had to resort to pointer arithmetic. I'd rather avoid tweaking it.
tameld will not check for whitespace, since it's not important---xmlo files,
if malformed, are the fault of the compiler; we can ignore text nodes except
in the context of code fragments, where they are never whitespace (unless
that's also a compiler bug).
Onward and yonward.
DEV-7145
2022-07-27 15:49:38 -04:00
|
|
|
|
/// Number of bytes representing the name of the element.
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
///
/// A length of `0` is what [`OpenSpan::without_name_span`] and
///   [`CloseSpan::without_name_span`] record when no element name is
///   available.
pub type EleNameLen = SpanLenSize;
|
|
|
|
|
|
|
|
|
|
/// Spans associated with an element opening or closing tag.
///
/// The diagram below illustrates the behavior of [`EleSpan`].
/// Spans are represented by `[---]` intervals,
///   with the byte offset at each end,
///   and the single-letter span name centered below the interval.
///
/// ```text
/// <open  >    <open  ...>    </close  >    <empty ' />
/// |[--]  |    |[--]          | [---]  |    |[---] ' []
/// |1  4  |    |1  4          | 2   6  |    |1   5 ' 9`10
/// | N    |    | N |          |   N    |    |  N | ' T
/// |      |    |   |          |        |    |    | '
/// [------]    [---]          [--------]    [----] '
/// 0      7    0   4          0        9    0    5 '
///    T          T                T           T    '
/// ```
///
/// Above we have
///
///   - `T` = [`EleSpan::span`]; and
///   - `N` = [`EleSpan::name_span`].
///
/// The purpose of the `T` span is to represent the entire token that has
///   been emitted by XIR.
/// If an opening tag does not contain any attributes,
///   then `T` represents the entire opening tag with both the opening and
///   closing angle brackets.
/// If an opening tag is expected to contain attributes,
///   then only the opening angle bracket is included.
/// A closing tag is entirely contained by `T`.
///
/// The empty tag is separated into two tokens in XIR---a
///   [`Token::Open`] and a [`Token::Close`] with a [`None`] for the name.
/// Unlike a typical closing tag,
///   there is no `N` span available for the closing token,
///   and so requesting one via [`EleSpan::name_span`] will simply
///     return the `T` span,
///       rather than complicating the API with an [`Option`].
/// It is generally assumed that reporting on element names will occur
///   within the context of the _opening_ tag.
///
/// The tag may contain whitespace following the element name,
///   as permitted by `STag` and `ETag` in the
///   [XML specification][xmlspec-tag].
///
/// [xmlspec-tag]: https://www.w3.org/TR/xml/#dt-stag
pub trait EleSpan {
    /// A [`Span`] encompassing the entire element tag token
    ///   (`T` in the diagram on [`EleSpan`]),
    ///   whether opening or closing.
    ///
    /// Note that what exactly this token represents varies.
    fn span(&self) -> Span;

    /// Span representing the relevant portion of the element tag.
    ///
    /// This is a more descriptive alias of [`EleSpan::span`] that may be
    ///   appropriate in certain contexts.
    fn tag_span(&self) -> Span {
        self.span()
    }

    /// A [`Span`] representing only the element name
    ///   (`N` in the diagram on [`EleSpan`]),
    ///   if available.
    ///
    /// An element name is _not_ available for empty tags.
    /// Rather than complicating the API with [`Option`],
    ///   [`EleSpan::span`] is returned instead.
    fn name_span(&self) -> Span;
}
|
|
|
|
|
|
|
|
|
|
impl EleSpan for OpenSpan {
|
|
|
|
|
fn span(&self) -> Span {
|
|
|
|
|
match self {
|
|
|
|
|
Self(t, _) => *t,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn name_span(&self) -> Span {
|
|
|
|
|
match self {
|
|
|
|
|
// <open ...>
|
|
|
|
|
// ^^^^ offset '<' and length of name
|
|
|
|
|
//
|
|
|
|
|
// If the length is 0,
|
|
|
|
|
// then this will result in a 0-length span at the location
|
|
|
|
|
// that the element name ought to be,
|
|
|
|
|
// and so the resulting span will still be useful.
|
|
|
|
|
// This should not happen for tokens read using XIR,
|
|
|
|
|
// but may happen for system-generated tokens.
|
|
|
|
|
Self(t, name_len) => {
|
|
|
|
|
t.context().span(t.offset().saturating_add(1), *name_len)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl EleSpan for CloseSpan {
|
|
|
|
|
fn span(&self) -> Span {
|
|
|
|
|
match self {
|
|
|
|
|
Self(t, _) => *t,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn name_span(&self) -> Span {
|
|
|
|
|
match self {
|
|
|
|
|
// If the length of the element name is 0,
|
|
|
|
|
// then this must be an empty tag,
|
|
|
|
|
// which contains no independent element name.
|
|
|
|
|
//
|
|
|
|
|
// <foo ' />
|
|
|
|
|
// ' ^^
|
|
|
|
|
Self(_t, 0) => self.span(),
|
|
|
|
|
|
|
|
|
|
// </close >
|
|
|
|
|
// ^^^^^ offset '</' and length of name
|
|
|
|
|
Self(t, name_len) => {
|
|
|
|
|
t.context().span(t.offset().saturating_add(2), *name_len)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-20 10:30:27 -04:00
|
|
|
|
/// Lightly-structured XML tokens with associated [`Span`]s.
|
|
|
|
|
///
|
|
|
|
|
/// This is a streamable IR for XML.
|
|
|
|
|
/// A writer requires knowledge only of a previous state,
|
|
|
|
|
/// such as whether a node is open,
|
|
|
|
|
/// and so this IR can be processed by a simple state machine
|
|
|
|
|
/// (see [`writer::WriterState`]).
|
2021-10-11 10:33:24 -04:00
|
|
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
2021-09-23 14:52:53 -04:00
|
|
|
|
pub enum Token {
|
2021-08-20 10:09:55 -04:00
|
|
|
|
/// Opening tag of an element.
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
Open(QName, OpenSpan),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
|
|
/// Closing tag of an element.
|
|
|
|
|
///
|
2021-09-13 12:58:54 -04:00
|
|
|
|
/// If the name is [`None`],
|
|
|
|
|
/// then the tag is self-closing.
|
2021-08-20 10:09:55 -04:00
|
|
|
|
/// This is intended primarily as a safety measure:
|
|
|
|
|
/// It allows writers to act as simple state machines without having
|
|
|
|
|
/// to ensure balancing by indicating that a node was intended to
|
|
|
|
|
/// self-close.
|
|
|
|
|
/// Otherwise,
|
|
|
|
|
/// we wouldn't know whether to self-close or to close and then
|
|
|
|
|
/// create a new closing tag;
|
|
|
|
|
/// if we blindly did the former,
|
|
|
|
|
/// we risk losing a closing tag when it wasn't intended.
|
|
|
|
|
/// Instead of losing tags,
|
|
|
|
|
/// writers can error,
|
|
|
|
|
/// indicating a bug in the stream.
|
2021-09-13 12:58:54 -04:00
|
|
|
|
///
|
|
|
|
|
/// The reason for using an option here rather than a variant is to
|
|
|
|
|
/// simplify pattern matching,
|
|
|
|
|
/// given especially that bindings after `@` in patterns have not
|
|
|
|
|
/// yet been stabilized at the time of writing (but are very
|
|
|
|
|
/// close!).
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
Close(Option<QName>, CloseSpan),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
2021-09-21 00:13:03 -04:00
|
|
|
|
/// Element attribute name.
|
2021-09-23 14:52:53 -04:00
|
|
|
|
AttrName(QName, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
2021-09-21 00:13:03 -04:00
|
|
|
|
/// Element attribute value.
|
tamer: xir::escape: Remove XirString in favor of Escaper
This rewrites a good portion of the previous commit.
Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.
Given that, we need only unescape on read and escape on write. This is
customary, so why didn't I do that to begin with?
The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming. However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around. If we share the Escaper between _all_
readers and the writer, the result is that
1. Duplicate strings between source files and object files (many of which
are read by both the linker and compiler) avoid re-unescaping; and
2. Writers can use this cache to avoid re-escaping when we've already seen
the escaped variant of the string during read.
The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.
DEV-11081
2021-11-12 13:59:14 -05:00
|
|
|
|
AttrValue(SymbolId, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
2021-09-21 00:13:03 -04:00
|
|
|
|
/// A portion of an element attribute value.
|
|
|
|
|
///
|
|
|
|
|
/// This allows for concatenating values into an attribute value without
|
|
|
|
|
/// having to copy values.
|
|
|
|
|
/// The last fragment must be a [`Token::AttrValue`].
|
|
|
|
|
///
|
2021-09-21 10:43:23 -04:00
|
|
|
|
/// Since each fragment contains a span,
|
|
|
|
|
/// this also potentially gives higher resolution for the origin of
|
|
|
|
|
/// components of generated attribute values.
|
2021-12-06 14:26:58 -05:00
|
|
|
|
///
|
|
|
|
|
/// _This should be used only for writing._
|
|
|
|
|
/// These will never be encountered during reading,
|
|
|
|
|
/// and so to keep the parsers and IRs simple,
|
|
|
|
|
/// there is no support for fragments beyond XIR.
|
|
|
|
|
/// (There was in the past,
|
|
|
|
|
/// but it was removed.)
|
tamer: xir::escape: Remove XirString in favor of Escaper
This rewrites a good portion of the previous commit.
Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.
Given that, we need only unescape on read and escape on write. This is
customary, so why didn't I do that to begin with?
The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming. However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around. If we share the Escaper between _all_
readers and the writer, the result is that
1. Duplicate strings between source files and object files (many of which
are read by both the linker and compiler) avoid re-unescaping; and
2. Writers can use this cache to avoid re-escaping when we've already seen
the escaped variant of the string during read.
The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.
DEV-11081
2021-11-12 13:59:14 -05:00
|
|
|
|
AttrValueFragment(SymbolId, Span),
|
2021-09-21 00:13:03 -04:00
|
|
|
|
|
2021-08-20 10:09:55 -04:00
|
|
|
|
/// Comment node.
|
2021-11-15 23:47:14 -05:00
|
|
|
|
Comment(SymbolId, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
|
|
/// Character data as part of an element.
|
|
|
|
|
///
|
2021-08-20 10:30:27 -04:00
|
|
|
|
/// See also [`CData`](Token::CData) variant.
|
2021-11-15 23:47:14 -05:00
|
|
|
|
Text(SymbolId, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
|
|
/// CData node (`<![CDATA[...]]>`).
|
|
|
|
|
///
|
|
|
|
|
/// _Warning: It is up to the caller to ensure that the string `]]>` is
|
|
|
|
|
/// not present in the text!_
|
|
|
|
|
/// This is intended for reading existing XML data where CData is
|
|
|
|
|
/// already present,
|
|
|
|
|
/// not for producing new CData safely!
|
2021-11-15 23:47:14 -05:00
|
|
|
|
CData(SymbolId, Span),
|
2021-08-20 10:09:55 -04:00
|
|
|
|
}
|
|
|
|
|
|
2021-11-03 14:54:37 -04:00
|
|
|
|
impl Display for Token {
|
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
2022-05-03 09:09:55 -04:00
|
|
|
|
// _Do not_ render large amounts of text here;
|
|
|
|
|
// this is not only a risk depending on what is output,
|
|
|
|
|
// but the diagnostic system also quote source lines to provide
|
|
|
|
|
// the necessary context.
|
2021-11-03 14:54:37 -04:00
|
|
|
|
match self {
|
2022-07-29 00:44:58 -04:00
|
|
|
|
Self::Open(qname, _) => OpenXmlEle::fmt(qname, f),
|
|
|
|
|
Self::Close(Some(qname), _) => CloseXmlEle::fmt(qname, f),
|
2021-11-03 14:54:37 -04:00
|
|
|
|
// Its context is contained within the Open,
|
|
|
|
|
// and hopefully any user-visible errors will display that instead.
|
2022-05-03 09:09:55 -04:00
|
|
|
|
Self::Close(None, _) => {
|
2022-07-29 00:44:58 -04:00
|
|
|
|
write!(f, "/>")
|
2021-11-03 14:54:37 -04:00
|
|
|
|
}
|
2022-07-29 00:44:58 -04:00
|
|
|
|
Self::AttrName(qname, _) => XmlAttr::fmt(qname, f),
|
|
|
|
|
Self::AttrValue(attr_val, _) => XmlAttrValueQuote::fmt(attr_val, f),
|
2022-05-03 09:09:55 -04:00
|
|
|
|
Self::AttrValueFragment(attr_val, _) => {
|
2022-07-29 00:44:58 -04:00
|
|
|
|
write!(
|
|
|
|
|
f,
|
|
|
|
|
"value fragment {}",
|
|
|
|
|
XmlAttrValueQuote::wrap(attr_val)
|
|
|
|
|
)
|
2021-11-03 14:54:37 -04:00
|
|
|
|
}
|
2022-05-03 09:09:55 -04:00
|
|
|
|
Self::Comment(..) => write!(f, "comment"),
|
|
|
|
|
Self::Text(..) => write!(f, "text"),
|
|
|
|
|
Self::CData(..) => write!(f, "CDATA"),
|
2021-11-03 14:54:37 -04:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-06-13 11:17:21 -04:00
|
|
|
|
impl crate::parse::Token for Token {
|
tamer: parser::Parser: cfg(test) tracing
This produces useful parse traces that are output as part of a failing test
case. The parser generator macros can be a bit confusing to deal with when
things go wrong, so this helps to clarify matters.
This is _not_ intended to be machine-readable, but it does show that it
would be possible to generate machine-readable output to visualize the
entire lowering pipeline. Perhaps something for the future.
I left these inline in Parser::feed_tok because they help to elucidate what
is going on, just by reading what the trace would output---that is, it helps
to make the method more self-documenting, albeit a tad bit more
verbose. But with that said, it should probably be extracted at some point;
I don't want this to set a precedent where composition is feasible.
Here's an example from test cases:
[Parser::feed_tok] (input IR: XIRF)
| ==> Parser before tok is parsing attributes for `package`.
| | Attrs_(SutAttrsState_ { ___ctx: (QName(None, LocalPart(NCName(SymbolId(46 "package")))), OpenSpan(Span { len: 0, offset: 0, ctx: Context(SymbolId(1 "#!DUMMY")) }, 10)), ___done: false })
|
| ==> XIRF tok: `<unexpected>`
| | Open(QName(None, LocalPart(NCName(SymbolId(82 "unexpected")))), OpenSpan(Span { len: 0, offset: 1, ctx: Context(SymbolId(1 "#!DUMMY")) }, 10), Depth(1))
|
| ==> Parser after tok is expecting opening tag `<classify>`.
| | ChildA(Expecting_)
| | Lookahead: Some(Lookahead(Open(QName(None, LocalPart(NCName(SymbolId(82 "unexpected")))), OpenSpan(Span { len: 0, offset: 1, ctx: Context(SymbolId(1 "#!DUMMY")) }, 10), Depth(1))))
= note: this trace was output as a debugging aid because `cfg(test)`.
[Parser::feed_tok] (input IR: XIRF)
| ==> Parser before tok is expecting opening tag `<classify>`.
| | ChildA(Expecting_)
|
| ==> XIRF tok: `<unexpected>`
| | Open(QName(None, LocalPart(NCName(SymbolId(82 "unexpected")))), OpenSpan(Span { len: 0, offset: 1, ctx: Context(SymbolId(1 "#!DUMMY")) }, 10), Depth(1))
|
| ==> Parser after tok is attempting to recover by ignoring element with unexpected name `unexpected` (expected `classify`).
| | ChildA(RecoverEleIgnore_(QName(None, LocalPart(NCName(SymbolId(82 "unexpected")))), OpenSpan(Span { len: 0, offset: 1, ctx: Context(SymbolId(1 "#!DUMMY")) }, 10), Depth(1)))
| | Lookahead: None
= note: this trace was output as a debugging aid because `cfg(test)`.
DEV-7145
2022-07-18 14:32:34 -04:00
|
|
|
|
/// Human-readable name of this IR.
fn ir_name() -> &'static str {
    const IR_NAME: &str = "XIR";

    IR_NAME
}
|
|
|
|
|
|
2021-12-06 14:48:55 -05:00
|
|
|
|
/// Retrieve the [`Span`] associated with a given [`Token`].
|
|
|
|
|
///
|
|
|
|
|
/// Every token has an associated span.
|
2022-03-18 15:26:05 -04:00
|
|
|
|
fn span(&self) -> Span {
|
2021-12-06 14:48:55 -05:00
|
|
|
|
use Token::*;
|
|
|
|
|
|
|
|
|
|
match self {
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
Open(_, OpenSpan(span, _))
|
|
|
|
|
| Close(_, CloseSpan(span, _))
|
2021-12-06 14:48:55 -05:00
|
|
|
|
| AttrName(_, span)
|
|
|
|
|
| AttrValue(_, span)
|
|
|
|
|
| AttrValueFragment(_, span)
|
|
|
|
|
| Comment(_, span)
|
|
|
|
|
| Text(_, span)
|
tamer: Xirf::Text refinement
This teaches XIRF to optionally refine Text into RefinedText, which
determines whether the given SymbolId represents entirely whitespace.
This is something I've been putting off for some time, but now that I'm
parsing source language for NIR, it is necessary, in that we can only permit
whitespace Text nodes in certain contexts.
The idea is to capture the most common whitespace as preinterned
symbols. Note that this heuristic ought to be determined from scanning a
codebase, which I haven't done yet; this is just an initial list.
The fallback is to look up the string associated with the SymbolId and
perform a linear scan, aborting on the first non-whitespace character. This
combination of checks should be sufficiently performant for now considering
that this is only being run on source files, which really are not all that
large. (They become large when template-expanded.) I'll optimize further
if I notice it show up during profiling.
This also frees XIR itself from being concerned by Whitespace. Initially I
had used quick-xml's whitespace trimming, but it messed up my span
calculations, and those were a pain in the ass to implement to begin with,
since I had to resort to pointer arithmetic. I'd rather avoid tweaking it.
tameld will not check for whitespace, since it's not important---xmlo files,
if malformed, are the fault of the compiler; we can ignore text nodes except
in the context of code fragments, where they are never whitespace (unless
that's also a compiler bug).
Onward and yonward.
DEV-7145
2022-07-27 15:49:38 -04:00
|
|
|
|
| CData(_, span) => *span,
|
2021-12-06 14:48:55 -05:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-06-13 11:17:21 -04:00
|
|
|
|
// XIR tokens also implement `crate::parse::Object`;
//   the impl is empty,
//   so no trait items are provided or overridden here.
// NOTE(review): presumably this lets a parser yield XIR tokens as its
//   output---confirm against `crate::parse`.
impl crate::parse::Object for Token {}
|
2022-06-02 10:30:44 -04:00
|
|
|
|
|
2021-08-20 10:09:55 -04:00
|
|
|
|
#[cfg(test)]
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
pub mod test {
|
2021-08-20 10:09:55 -04:00
|
|
|
|
use super::*;
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
use crate::convert::ExpectInto;
|
2021-09-23 14:52:53 -04:00
|
|
|
|
use crate::sym::GlobalSymbolIntern;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
use std::convert::TryInto;
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
use std::fmt::Debug;
|
2021-08-20 10:09:55 -04:00
|
|
|
|
|
|
|
|
|
// Return type for tests that use `?` on fallible conversions;
//   any error is boxed.
type TestResult = Result<(), Box<dyn std::error::Error>>;
|
|
|
|
|
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
// Prefer [`open`] below when possible.
|
|
|
|
|
impl From<Span> for OpenSpan {
|
|
|
|
|
fn from(span: Span) -> Self {
|
|
|
|
|
Self::without_name_span(span)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Prefer [`close`] below when possible.
|
|
|
|
|
impl From<Span> for CloseSpan {
|
|
|
|
|
fn from(span: Span) -> Self {
|
|
|
|
|
Self::without_name_span(span)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Hastily and lazily produce a [`XirfToken::Open`].
|
|
|
|
|
///
|
|
|
|
|
/// This function is not suitable for production use as it does not
|
|
|
|
|
/// produce a complete [`OpenSpan`].
|
|
|
|
|
pub fn open<Q: TryInto<QName>, S: Into<OpenSpan>>(
|
|
|
|
|
qname: Q,
|
|
|
|
|
span: S,
|
|
|
|
|
) -> Token
|
|
|
|
|
where
|
|
|
|
|
<Q as TryInto<QName>>::Error: Debug,
|
|
|
|
|
{
|
|
|
|
|
Token::Open(qname.unwrap_into(), span.into())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Hastily and lazily produce a [`XirfToken::Close`] for an empty tag.
|
|
|
|
|
///
|
|
|
|
|
/// This is [`close`] with the omission of the `qname` argument; the
|
|
|
|
|
/// type parameter `Q` cannot be inferred if the value is [`None`].
|
|
|
|
|
///
|
|
|
|
|
/// This function is not suitable for production use as it does not
|
|
|
|
|
/// produce a complete [`OpenSpan`].
|
|
|
|
|
pub fn close_empty<S: Into<CloseSpan>>(span: S) -> Token {
|
|
|
|
|
Token::Close(None, span.into())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Hastily and lazily produce a [`XirfToken::Close`].
|
|
|
|
|
///
|
|
|
|
|
/// See also [`close_empty`] if `Q` cannot be inferred.
|
|
|
|
|
///
|
|
|
|
|
/// This function is not suitable for production use as it does not
|
|
|
|
|
/// produce a complete [`OpenSpan`].
|
|
|
|
|
pub fn close<Q: TryInto<QName>, S: Into<CloseSpan>>(
|
|
|
|
|
qname: Option<Q>,
|
|
|
|
|
span: S,
|
|
|
|
|
) -> Token
|
|
|
|
|
where
|
|
|
|
|
<Q as TryInto<QName>>::Error: Debug,
|
|
|
|
|
{
|
|
|
|
|
Token::Close(qname.map(ExpectInto::unwrap_into), span.into())
|
2021-08-20 10:09:55 -04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Conversions into and comparisons of `NCName` and `QName`.
mod name {
    use super::*;

    // An `NCName` compares equal to the symbol that it wraps.
    #[test]
    fn ncname_comparable_to_sym() {
        let sym = "foo".intern();

        assert_eq!(NCName(sym), sym);
    }

    #[test]
    fn ncname_try_into_from_str_no_colon() -> TestResult {
        let given = "no-colon";
        let ncname: NCName = given.try_into()?;

        assert_eq!(ncname, given.intern());

        Ok(())
    }

    // A colon is rejected,
    //   and the error carries the offending string.
    #[test]
    fn ncname_try_into_from_str_fails_with_colon() {
        let result = NCName::try_from("look:a-colon");

        assert_eq!(
            result,
            Err(SpanlessError::NCColon("look:a-colon".into()))
        );
    }

    #[test]
    fn ncname_from_byte_slice() -> TestResult {
        let bytes = b"no-colon" as &[u8];
        let ncname: NCName = bytes.try_into()?;

        assert_eq!(ncname, "no-colon".intern());

        Ok(())
    }

    // Same as the string case above,
    //   but starting from raw bytes.
    #[test]
    fn ncname_from_byte_slice_fails_with_colon() {
        let result = NCName::try_from(b"a:colon" as &[u8]);

        assert_eq!(result, Err(SpanlessError::NCColon("a:colon".into())));
    }

    #[test]
    fn local_name_from_local_part_only() -> TestResult {
        let qname = QName::new_local("foo".try_into()?);

        assert_eq!(qname.local_name(), "foo".try_into()?);
        assert_eq!(None, qname.prefix());

        Ok(())
    }

    // A `None` prefix in the tuple yields a prefixless name.
    #[test]
    fn local_name_from_option_tuple() -> TestResult {
        let parts = (Option::<&str>::None, "foo");
        let qname: QName = parts.try_into()?;

        assert_eq!(qname.local_name(), "foo".try_into()?);
        assert_eq!(None, qname.prefix());

        Ok(())
    }

    #[test]
    fn fully_qualified_name() -> TestResult {
        let parts = ("foons", "foo");
        let qname: QName = parts.try_into()?;

        assert_eq!(qname.prefix(), Some("foons".try_into()?));
        assert_eq!(qname.local_name(), "foo".try_into()?);

        Ok(())
    }
}
|
|
|
|
|
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptually all that significant of a change, but there was a lot
to modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already taken a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
mod ele_span {
|
|
|
|
|
use super::*;
|
2022-07-29 10:27:46 -04:00
|
|
|
|
use crate::span::dummy::DUMMY_CONTEXT as DC;
|
tamer: xir: Introduce {Ele,Open,Close}Span
This isn't conceptally all that significant of a change, but there was a lot
of modify to get it working. I would generally separate this into a commit
for the implementation and another commit for the integration, but I decided
to keep things together.
This serves a role similar to AttrSpan---this allows deriving a span
representing the element name from a span representing the entire XIR
token. This will provide more useful context for errors---including the tag
delimiter(s) means that we care about the fact that an element is in that
position (as opposed to some other type of node) within the context of an
error. However, if we are expecting an element but take issue with the
element name itself, we want to place emphasis on that instead.
This also starts to consider the issue of span contexts---a blob of detached
data that is `Span` is useful for error context, but it's not useful for
manipulation or deriving additional information. For that, we need to
encode additional context, and this is an attempt at that.
I am interested in the concept of providing Spans that are guaranteed to
actually make sense---that are instantiated and manipulated with APIs that
ensure consistency. But such a thing buys us very little, practically
speaking, over what I have now for TAMER, and so I don't expect to actually
implement that for this project; I'll leave that for a personal
project. TAMER's already take a lot of my personal interests and it can
cause me a lot of grief sometimes (with regards to letting my aspirations
cause me more work).
DEV-7145
2022-06-24 13:51:49 -04:00
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn open_without_attrs() {
|
|
|
|
|
// See docblock for [`EleSpan`].
|
|
|
|
|
const T: Span = DC.span(0, 8); // Relevant portion of tag
|
|
|
|
|
const N: Span = DC.span(1, 4); // Element name
|
|
|
|
|
|
|
|
|
|
let sut = OpenSpan(T, N.len());
|
|
|
|
|
|
|
|
|
|
assert_eq!(sut.span(), T);
|
|
|
|
|
assert_eq!(sut.name_span(), N);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn open_with_attrs() {
|
|
|
|
|
// See docblock for [`EleSpan`].
|
|
|
|
|
const T: Span = DC.span(0, 5); // Relevant portion of tag
|
|
|
|
|
const N: Span = DC.span(1, 4); // Element name
|
|
|
|
|
|
|
|
|
|
let sut = OpenSpan(T, N.len());
|
|
|
|
|
|
|
|
|
|
assert_eq!(sut.span(), T);
|
|
|
|
|
assert_eq!(sut.name_span(), N);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn close() {
|
|
|
|
|
// See docblock for [`EleSpan`].
|
|
|
|
|
const T: Span = DC.span(0, 10); // Relevant portion of tag
|
|
|
|
|
const N: Span = DC.span(2, 5); // Element name
|
|
|
|
|
|
|
|
|
|
let sut = CloseSpan(T, N.len());
|
|
|
|
|
|
|
|
|
|
assert_eq!(sut.span(), T);
|
|
|
|
|
assert_eq!(sut.name_span(), N);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn close_empty() {
|
|
|
|
|
// See docblock for [`EleSpan`].
|
|
|
|
|
const T: Span = DC.span(9, 2); // Relevant portion of tag
|
|
|
|
|
|
|
|
|
|
let sut = CloseSpan(T, 0);
|
|
|
|
|
|
|
|
|
|
assert_eq!(sut.span(), T);
|
|
|
|
|
// There is no name,
|
|
|
|
|
// only Zuul.
|
|
|
|
|
assert_eq!(sut.name_span(), T);
|
|
|
|
|
}
|
|
|
|
|
}
|
2021-08-20 10:09:55 -04:00
|
|
|
|
}
|