tamer: ir::xir::tree: Introduce attribute fragment parsing

This is exactly what I said I was _not_ going to do in the previous commit,
but apparently hacking late at night had me forget the whole reason that
XIRT is being introduced now---unit tests.  I'll be emitting a XIR stream
and I need to parse it for convenience in the tests.

So, here's a good start.  Next will be some generalizations that are useful
for the tests as well.  This is pretty bare, but accomplishes the task.

See docs for more info.
main
Mike Gerwitz 2021-09-21 15:30:44 -04:00
parent a5afc76568
commit b348892276
3 changed files with 366 additions and 12 deletions

View File

@ -167,13 +167,37 @@
//! is, our stack is fully type-safe.
//!
//! [state machine]: https://en.wikipedia.org/wiki/Finite-state_machine
//!
//! High-Resolution Attributes
//! --------------------------
//! XIRT supports [`Token::AttrValueFragment`],
//! which can produce concatenated attribute values that retain the
//! [`Span`] of each of their constituent parts.
//! This could allow,
//! for example,
//! creating an LSP server that would expose all of the TAME templates and
//! source inputs used to generate an identifier.
//!
//! However,
//! note that the XIR token stream introduced [`Token::AttrValueFragment`]
//! primarily to eliminate the need for unnecessary [symbol
//! lookups](crate::sym), copying, and heap allocations.
//! XIRT must perform extra heap allocations to process these fragments.
//! Once processed,
//! an [`Attr::Extensible`] object is produced;
//! the value is _not_ concatenated and interned,
//! allowing it to be cheaply converted back into a [`Token`] stream
//! for writing without unnecessary overhead.
//!
//! For more information,
//! see [`AttrParts`].
use super::{AttrValue, QName, Token};
use crate::{span::Span, sym::SymbolIndexSize};
use std::{fmt::Display, mem::take};
mod attr;
pub use attr::{Attr, AttrList};
pub use attr::{Attr, AttrList, AttrParts, SimpleAttr};
/// A XIR tree (XIRT).
///
@ -388,6 +412,10 @@ pub enum Stack<Ix: SymbolIndexSize> {
/// An attribute is awaiting its value,
/// after which it will be attached to an element.
AttrName(ElementStack<Ix>, QName<Ix>, Span),
/// An attribute whose value is being constructed of value fragments,
/// after which it will be attached to an element.
AttrFragments(ElementStack<Ix>, AttrParts<Ix>),
}
impl<Ix: SymbolIndexSize> Default for Stack<Ix> {
@ -463,8 +491,47 @@ impl<Ix: SymbolIndexSize> Stack<Ix> {
})
}
/// Append a value fragment to the attribute currently being parsed.
///
/// On the first fragment,
///   the name and span held by [`Stack::AttrName`] are moved into a new
///   [`AttrParts`];
/// on every subsequent fragment,
///   the existing [`AttrParts`] held by [`Stack::AttrFragments`] is
///   extended.
/// Either way the resulting state is [`Stack::AttrFragments`],
///   and the attribute is later completed by [`Stack::close_attr`].
///
/// This will cause heap allocation.
fn push_attr_value(
    self,
    value: AttrValue<Ix>,
    span: Span,
) -> Result<Self, Ix> {
    // Resolve the element stack and the fragment buffer first so that the
    // push and the state transition are written only once.
    let (ele_stack, mut parts) = match self {
        Self::AttrName(ele_stack, name, open_span) => {
            // This initial capacity can be adjusted after we observe
            // empirically what we most often parse, or we can make it
            // configurable.
            (ele_stack, AttrParts::with_capacity(name, open_span, 2))
        }

        Self::AttrFragments(ele_stack, parts) => (ele_stack, parts),

        _ => todo! {},
    };

    parts.push_value(value, span);
    Ok(Self::AttrFragments(ele_stack, parts))
}
/// Assigns a value to an opened attribute and attaches to the parent
/// element.
///
/// If the attribute is composed of fragments ([`Stack::AttrFragments`]),
/// this serves as the final fragment and will yield an
/// [`Attr::Extensible`] with no further processing.
fn close_attr(self, value: AttrValue<Ix>, span: Span) -> Result<Self, Ix> {
Ok(match self {
Self::AttrName(ele_stack, name, open_span) => {
@ -474,6 +541,13 @@ impl<Ix: SymbolIndexSize> Stack<Ix> {
(open_span, span),
)))
}
Self::AttrFragments(ele_stack, mut parts) => {
parts.push_value(value, span);
Stack::BuddingElement(
ele_stack.consume_attr(Attr::Extensible(parts)),
)
}
_ => todo! {},
})
}
@ -547,6 +621,9 @@ impl<Ix: SymbolIndexSize> ParserState<Ix> {
Token::Open(name, span) => stack.open_element(name, span),
Token::Close(name, span) => stack.close_element(name, span),
Token::AttrName(name, span) => stack.open_attr(name, span),
Token::AttrValueFragment(value, span) => {
stack.push_attr_value(value, span)
}
Token::AttrValue(value, span) => stack.close_attr(value, span),
todo => Err(ParseError::Todo(todo, stack)),

View File

@ -19,24 +19,109 @@
//! XIRT attributes.
//!
//! See [parent module](super) for documentation.
//! Attributes are represented by [`Attr`].
//!
//! See [parent module](super) for additional documentation.
use super::{AttrValue, QName};
use crate::{span::Span, sym::SymbolIndexSize};
/// Element attribute.
/// An attribute.
///
/// TODO: This doesn't yet handle whitespace for alignment of attributes;
/// deferring this until it's actually needed.
/// Attributes come in two flavors:
/// attributes with simple atoms ([`SimpleAttr`]),
/// and extensible attributes composed of a list of fragments with
/// associated spans ([`AttrParts`]).
///
/// If you do not care about the distinction between the two types,
/// use the API provided by this enum for common functionality.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Attr<Ix: SymbolIndexSize> {
pub enum Attr<Ix: SymbolIndexSize> {
Simple(SimpleAttr<Ix>),
Extensible(AttrParts<Ix>),
}
impl<Ix: SymbolIndexSize> Attr<Ix> {
    /// Build an [`Attr::Simple`] from a name, a single value, and their
    /// respective [`Span`]s.
    ///
    /// The value of a simple attribute cannot be extended,
    ///   but the attribute can be converted into [`Attr::Extensible`]
    ///   form via [`Attr::parts`] or [`From`].
    #[inline]
    pub fn new(
        name: QName<Ix>,
        value: AttrValue<Ix>,
        span: (Span, Span),
    ) -> Self {
        let simple = SimpleAttr::new(name, value, span);
        Self::Simple(simple)
    }

    /// Build an [`Attr::Extensible`] that has no value fragments yet.
    ///
    /// The fragment buffer is created through
    ///   [`AttrParts::with_capacity`] using `capacity`.
    /// This is intended for use with
    ///   [`Token::AttrValueFragment`](super::Token::AttrValueFragment),
    ///   where the value is assembled incrementally while the [`Span`]
    ///   of every fragment is retained.
    #[inline]
    pub fn new_extensible_with_capacity(
        name: QName<Ix>,
        name_span: Span,
        capacity: usize,
    ) -> Self {
        let parts = AttrParts::with_capacity(name, name_span, capacity);
        Self::Extensible(parts)
    }

    /// Build an [`Attr::Extensible`] from an existing list of value
    ///   fragments and their spans.
    ///
    /// The provided vector is stored as-is,
    ///   which permits pre-allocating it or re-using one obtained from
    ///   [`AttrParts::into_fragments`].
    #[inline]
    pub fn from_fragments(
        name: QName<Ix>,
        name_span: Span,
        frags: Vec<(AttrValue<Ix>, Span)>,
    ) -> Self {
        let value_frags = frags;

        Self::Extensible(AttrParts {
            name,
            name_span,
            value_frags,
        })
    }

    /// Consume the attribute,
    ///   yielding the [`AttrParts`] needed to begin pushing value
    ///   fragments.
    ///
    /// An [`Attr::Extensible`] yields its inner [`AttrParts`] directly.
    /// An [`Attr::Simple`] is first converted into an [`AttrParts`]
    ///   holding its value as a single fragment.
    #[inline]
    pub fn parts(self) -> AttrParts<Ix> {
        match self {
            Self::Extensible(parts) => parts,
            Self::Simple(simple) => AttrParts::from(simple),
        }
    }
}
/// Element attribute with an atomic value.
///
/// This should be used in place of [`AttrParts`] whenever the attribute is
/// a simple [`QName`]/[`AttrValue`] pair.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct SimpleAttr<Ix: SymbolIndexSize> {
/// Name of the attribute.
name: QName<Ix>,
/// The attribute's single (unfragmented) value.
value: AttrValue<Ix>,
/// Spans for the attribute name and value respectively.
span: (Span, Span),
}
impl<Ix: SymbolIndexSize> Attr<Ix> {
impl<Ix: SymbolIndexSize> SimpleAttr<Ix> {
/// Construct a new simple attribute with a name, value, and respective
/// [`Span`]s.
#[inline]
@ -45,7 +130,96 @@ impl<Ix: SymbolIndexSize> Attr<Ix> {
value: AttrValue<Ix>,
span: (Span, Span),
) -> Self {
Attr { name, value, span }
Self { name, value, span }
}
}
/// Element attribute with a value composed of multiple fragments.
///
/// This should be used when one or more of these properties is desirable:
/// 1. Zero-copy concatenation with respect to [symbols](crate::sym);
/// 2. High-resolution [`Span`]s for each constituent fragment; and/or
/// 3. You need to parse a XIR stream with
/// [`Token::AttrValueFragment`](super::Token::AttrValueFragment).
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct AttrParts<Ix: SymbolIndexSize> {
/// Name of the attribute.
name: QName<Ix>,
/// [`Span`] associated with the attribute name.
name_span: Span,
/// Ordered value fragments and their associated [`Span`]s.
///
/// When writing,
/// fragments will be concatenated in order without any delimiters.
value_frags: Vec<(AttrValue<Ix>, Span)>,
}
impl<Ix: SymbolIndexSize> AttrParts<Ix> {
    /// Construct an attribute with a name,
    ///   the [`Span`] of that name,
    ///   and an empty list of value fragments.
    ///
    /// The fragment list is created with [`Vec::with_capacity`],
    ///   so `capacity` is the number of fragments that space is reserved
    ///   for up front.
    #[inline]
    pub fn with_capacity(
        name: QName<Ix>,
        name_span: Span,
        capacity: usize,
    ) -> Self {
        let value_frags = Vec::with_capacity(capacity);

        Self {
            name,
            name_span,
            value_frags,
        }
    }
}
impl<Ix: SymbolIndexSize> AttrParts<Ix> {
    /// Add a value fragment and its span to the end of the fragment list.
    ///
    /// Fragments are intended to be concatenated on write without a
    ///   delimiter,
    ///     and correspond to
    ///     [`Token::AttrValueFragment`](super::Token::AttrValueFragment).
    #[inline]
    pub fn push_value(&mut self, value: AttrValue<Ix>, span: Span) {
        let fragment = (value, span);
        self.value_frags.push(fragment);
    }

    /// Borrow the ordered value fragments along with their spans.
    ///
    /// To take ownership of the underlying vector instead,
    ///   for example to re-use it for another [`AttrParts`],
    ///   see [`into_fragments`](AttrParts::into_fragments).
    #[inline]
    pub fn value_fragments(&self) -> &Vec<(AttrValue<Ix>, Span)> {
        let Self { value_frags, .. } = self;
        value_frags
    }

    /// Consume [`AttrParts`],
    ///   yielding its internal fragment buffer.
    ///
    /// The returned vector can be handed to [`Attr::from_fragments`],
    ///   which stores it directly,
    ///   allowing the buffer to be re-used for future [`AttrParts`].
    #[inline]
    pub fn into_fragments(self) -> Vec<(AttrValue<Ix>, Span)> {
        let Self { value_frags, .. } = self;
        value_frags
    }
}
impl<Ix: SymbolIndexSize> From<SimpleAttr<Ix>> for AttrParts<Ix> {
fn from(attr: SimpleAttr<Ix>) -> Self {
Self {
name: attr.name,
name_span: attr.span.0,
value_frags: vec![(attr.value, attr.span.1)],
}
}
}
impl<Ix: SymbolIndexSize> From<Attr<Ix>> for AttrParts<Ix> {
    /// Convert either flavor of [`Attr`] into [`AttrParts`].
    ///
    /// An extensible attribute is unwrapped as-is;
    ///   a simple attribute goes through the [`SimpleAttr`] conversion.
    fn from(attr: Attr<Ix>) -> Self {
        match attr {
            Attr::Extensible(parts) => parts,
            Attr::Simple(simple) => Self::from(simple),
        }
    }
}
@ -88,4 +262,96 @@ impl<Ix: SymbolIndexSize, const N: usize> From<[Attr<Ix>; N]> for AttrList<Ix> {
}
}
// See [`super::test`].
// See also [`super::test`] for many more tests related to attributes.
#[cfg(test)]
mod test {
    use super::*;
    use crate::{convert::ExpectInto, sym::GlobalSymbolIntern};

    type Ix = crate::global::ProgSymSize;

    // Two spans that differ only in their context symbol,
    //   so that tests can tell them apart.
    lazy_static! {
        static ref S: Span =
            Span::from_byte_interval((0, 0), "test case, 1".intern());
        static ref S2: Span =
            Span::from_byte_interval((0, 0), "test case, 2".intern());
    }

    /// A simple attribute converts into parts holding a single fragment,
    ///   whether converted directly or through the `Attr` enum.
    #[test]
    fn attr_into_attr_parts() {
        let name = "attr".unwrap_into();
        let value = AttrValue::Escaped("value".intern());

        let simple = SimpleAttr::<Ix> {
            name,
            value,
            span: (*S, *S2),
        };

        let expected = AttrParts {
            name,
            name_span: *S,
            value_frags: vec![(value, *S2)],
        };

        let given: AttrParts<Ix> = simple.clone().into();
        assert_eq!(expected, given);

        // Enum should also be able to do it
        assert_eq!(given, AttrParts::from(Attr::Simple(simple.clone())));
        assert_eq!(given, Attr::Simple(simple).parts());
    }

    /// Pushed fragments are retained in the order they were pushed.
    #[test]
    fn push_attr_part() {
        let name = "pushattr".unwrap_into();
        let first = AttrValue::Escaped("first".intern());
        let second = AttrValue::Escaped("second".intern());

        let mut parts =
            Attr::<Ix>::new_extensible_with_capacity(name, *S, 2).parts();

        parts.push_value(first, *S);
        parts.push_value(second, *S2);

        let expected = vec![(first, *S), (second, *S2)];
        assert_eq!(&expected, parts.value_fragments());
    }

    /// Fragments provided up front are exposed unchanged.
    #[test]
    fn attr_from_parts() {
        let name = "pushattr".unwrap_into();
        let first = AttrValue::Escaped("first".intern());
        let second = AttrValue::Escaped("second".intern());

        let given = vec![(first, *S), (second, *S2)];
        let parts = Attr::<Ix>::from_fragments(name, *S, given).parts();

        let expected = vec![(first, *S), (second, *S2)];
        assert_eq!(&expected, parts.value_fragments());
    }

    /// The fragment buffer can be taken back out by value after use.
    #[test]
    fn into_fragments_to_reuse_buffer_for_parts() {
        let name = "partbuffer".unwrap_into();
        let first = AttrValue::Escaped("first".intern());
        let second = AttrValue::Escaped("second".intern());
        let third = AttrValue::Escaped("third".intern());

        let buffer = vec![(first, *S2), (second, *S)];

        let mut parts = Attr::<Ix>::from_fragments(name, *S, buffer).parts();
        parts.push_value(third, *S2);

        // The yielded vector is owned,
        //   and so it could be passed to `from_fragments` again to re-use
        //   the buffer.
        let expected = vec![(first, *S2), (second, *S), (third, *S2)];
        assert_eq!(expected, parts.into_fragments());
    }
}

View File

@ -137,14 +137,19 @@ fn empty_element_with_attrs_from_toks() {
let attr1 = "a".unwrap_into();
let attr2 = "b".unwrap_into();
let val1 = AttrValue::Escaped("val1".intern());
let val2 = AttrValue::Escaped("val2".intern());
let val2a = AttrValue::Escaped("val2a".intern());
let val2b = AttrValue::Escaped("val2b".intern());
let val2c = AttrValue::Escaped("val2b".intern());
let toks = std::array::IntoIter::new([
Token::<Ix>::Open(name, *S),
Token::AttrName(attr1, *S),
Token::AttrValue(val1, *S2),
Token::AttrName(attr2, *S),
Token::AttrValue(val2, *S2),
// More than one fragment to ensure we handle that state
Token::AttrValueFragment(val2a, *S),
Token::AttrValueFragment(val2b, *S2),
Token::AttrValue(val2c, *S3),
Token::Close(None, *S2),
]);
@ -152,7 +157,11 @@ fn empty_element_with_attrs_from_toks() {
name,
attrs: AttrList::from(vec![
Attr::new(attr1, val1, (*S, *S2)),
Attr::new(attr2, val2, (*S, *S2)),
Attr::from_fragments(
attr2,
*S,
vec![(val2a, *S), (val2b, *S2), (val2c, *S3)],
),
]),
children: vec![],
span: (*S, *S2),
@ -164,6 +173,8 @@ fn empty_element_with_attrs_from_toks() {
assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrName
assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrValue
assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrName
assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrValueFragment
assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrValueFragment
assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrValue
assert_eq!(
sut.next(),