// Test XIR tree representation
//
// Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

use super::*;
use crate::convert::ExpectInto;
use crate::sym::GlobalSymbolIntern;
lazy_static! {
|
|
|
|
static ref S: Span =
|
|
|
|
Span::from_byte_interval((0, 0), "test case, 1".intern());
|
|
|
|
static ref S2: Span =
|
|
|
|
Span::from_byte_interval((0, 0), "test case, 2".intern());
|
|
|
|
static ref S3: Span =
|
|
|
|
Span::from_byte_interval((0, 0), "test case, 3".intern());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests for the `Tree` enum itself: element/text accessors and
// conversions.
mod tree {
    use super::*;

    // A `Tree::Element` exposes its element but converts to no symbol.
    #[test]
    fn element_from_tree() {
        let ele = Element {
            name: "foo".unwrap_into(),
            attrs: None,
            children: vec![],
            span: (*S, *S2),
        };

        let tree = Tree::Element(ele.clone());

        assert_eq!(Some(&ele), tree.as_element());
        assert_eq!(None, Into::<Option<SymbolId>>::into(tree));
    }

    // A `Tree::Text` is not an element, and converts into its symbol.
    #[test]
    fn text_from_tree() {
        let text = "foo".intern();
        let tree = Tree::Text(text, *S);

        assert!(!tree.is_element());
        assert_eq!(None, tree.as_element());
        assert_eq!(None, tree.clone().into_element());

        assert_eq!(Some(text), tree.into());
    }
}

// Tests for `AttrList` lookup behavior.
mod attrs {
    use super::*;

    // `AttrList::find` performs a linear search by attribute name,
    // returning the first match or `None`.
    #[test]
    fn linear_search_for_attr_name_in_list() {
        let a = "a".unwrap_into();
        let b = "b".unwrap_into();

        let attra = Attr::new(a, "a value".intern(), (*S, *S2));
        let attrb = Attr::new(b, "b value".intern(), (*S, *S2));

        let attrs = AttrList::from([attra.clone(), attrb.clone()]);

        assert_eq!(attrs.find(a), Some(&attra));
        assert_eq!(attrs.find(b), Some(&attrb));

        assert_eq!(attrs.find("unknown".unwrap_into()), None);
    }
}

// A self-closing empty element should parse into an `Element` with no
// attributes or children, spanning from `Open` to `Close`.
#[test]
fn empty_element_self_close_from_toks() {
    let name = ("ns", "elem").unwrap_into();

    let toks = [Token::Open(name, *S), Token::Close(None, *S2)].into_iter();

    let expected = Element {
        name,
        attrs: None,
        children: vec![],
        span: (*S, *S2),
    };

    let mut sut = toks.scan(ParserState::new(), parse);

    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete)));
    assert_eq!(
        sut.next(),
        Some(Ok(Parsed::Object(Object::Tree(Tree::Element(expected)))))
    );
    assert_eq!(sut.next(), None);
}

// Same as above test, but with balanced closing instead of self
// closing.
#[test]
fn empty_element_balanced_close_from_toks() {
    let name = ("ns", "openclose").unwrap_into();

    let toks =
        [Token::Open(name, *S), Token::Close(Some(name), *S2)].into_iter();

    let expected = Element {
        name,
        attrs: None,
        children: vec![],
        span: (*S, *S2),
    };

    let mut sut = toks.scan(ParserState::new(), parse);

    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete)));
    assert_eq!(
        sut.next(),
        Some(Ok(Parsed::Object(Object::Tree(Tree::Element(expected)))))
    );
    assert_eq!(sut.next(), None);
}

// Unbalanced should result in error.  This does not test what happens
// _after_ the error.
#[test]
fn empty_element_unbalanced_close_from_toks() {
    let open_name = "open".unwrap_into();
    let close_name = "unbalanced_name".unwrap_into();

    let toks = [
        Token::Open(open_name, *S),
        Token::Close(Some(close_name), *S2),
    ]
    .into_iter();

    let mut sut = toks.scan(ParserState::new(), parse);

    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete)));
    assert_eq!(
        sut.next(),
        Some(Err(ParseError::UnbalancedTag {
            open: (open_name, *S),
            close: (close_name, *S2),
        }))
    );

    // TODO: We need to figure out how to best implement recovery before
    // continuing with this design.
}

// An element with attributes yields `Incomplete` for each attribute
// token and produces the full `Element` only once closed.
#[test]
fn empty_element_with_attrs_from_toks() {
    let name = ("ns", "elem").unwrap_into();
    let attr1 = "a".unwrap_into();
    let attr2 = "b".unwrap_into();
    let val1 = "val1".intern();
    let val2 = "val2".intern();

    let toks = [
        Token::Open(name, *S),
        Token::AttrName(attr1, *S),
        Token::AttrValue(val1, *S2),
        Token::AttrName(attr2, *S),
        Token::AttrValue(val2, *S3),
        Token::Close(None, *S2),
    ]
    .into_iter();

    let expected = Element {
        name,
        attrs: Some(AttrList::from(vec![
            Attr::new(attr1, val1, (*S, *S2)),
            Attr::new(attr2, val2, (*S, *S3)),
        ])),
        children: vec![],
        span: (*S, *S2),
    };

    let mut sut = toks.scan(ParserState::new(), parse);

    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // Open
    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrName
    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrValue
    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrName
    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrValue
    assert_eq!(
        sut.next(),
        Some(Ok(Parsed::Object(Object::Tree(Tree::Element(expected)))))
    );
    assert_eq!(sut.next(), None);
}

// We should accommodate missing AttrEnd in an element context so that we
// can parse generated XIR without having to emit AttrEnd if we know it
// will not be necessary.
// I may come to regret that accommodation after we have to go back and add
// AttrEnd to systems that weren't providing it.
#[test]
fn child_element_after_attrs() {
    let name = ("ns", "elem").unwrap_into();
    let child = "child".unwrap_into();
    let attr = "a".unwrap_into();
    let val = "val".intern();

    let toks = [
        Token::Open(name, *S),
        Token::AttrName(attr, *S),
        Token::AttrValue(val, *S2),
        // No AttrEnd
        Token::Open(child, *S),
        Token::Close(None, *S2),
        Token::Close(Some(name), *S3),
    ]
    .into_iter();

    let expected = Element {
        name,
        attrs: Some(AttrList::from(vec![Attr::new(attr, val, (*S, *S2))])),
        children: vec![Tree::Element(Element {
            name: child,
            attrs: None,
            children: vec![],
            span: (*S, *S2),
        })],
        span: (*S, *S3),
    };

    let mut sut = toks.scan(ParserState::new(), parse);

    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // Open
    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrName
    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrValue
    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // Open
    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // Close
    assert_eq!(
        sut.next(),
        Some(Ok(Parsed::Object(Object::Tree(Tree::Element(expected)))))
    );
    assert_eq!(sut.next(), None);
}

// Sibling empty children should be collected in document order under the
// parent element.
#[test]
fn element_with_empty_sibling_children() {
    let parent = "parent".unwrap_into();
    let childa = "childa".unwrap_into();
    let childb = "childb".unwrap_into();

    let toks = [
        Token::Open(parent, *S),
        Token::Open(childa, *S),
        Token::Close(None, *S2),
        Token::Open(childb, *S),
        Token::Close(None, *S2),
        Token::Close(Some(parent), *S2),
    ]
    .into_iter();

    let expected = Element {
        name: parent,
        attrs: None,
        children: vec![
            Tree::Element(Element {
                name: childa,
                attrs: None,
                children: vec![],
                span: (*S, *S2),
            }),
            Tree::Element(Element {
                name: childb,
                attrs: None,
                children: vec![],
                span: (*S, *S2),
            }),
        ],
        span: (*S, *S2),
    };

    let mut sut = parser_from(toks);

    assert_eq!(sut.next(), Some(Ok(Tree::Element(expected))));
    assert_eq!(sut.next(), None);
}

// Ensures that attributes do not cause the parent context to be lost.
#[test]
fn element_with_child_with_attributes() {
    let parent = "parent".unwrap_into();
    let child = "child".unwrap_into();
    let attr = "attr".unwrap_into();
    let value = "attr value".intern();

    let toks = [
        Token::Open(parent, *S),
        Token::Open(child, *S),
        Token::AttrName(attr, *S),
        Token::AttrValue(value, *S2),
        Token::Close(None, *S3),
        Token::Close(Some(parent), *S3),
    ]
    .into_iter();

    let expected = Element {
        name: parent,
        attrs: None,
        children: vec![Tree::Element(Element {
            name: child,
            attrs: Some(AttrList::from([Attr::new(attr, value, (*S, *S2))])),
            children: vec![],
            span: (*S, *S3),
        })],
        span: (*S, *S3),
    };

    let mut sut = parser_from(toks);

    assert_eq!(sut.next(), Some(Ok(Tree::Element(expected))));
    assert_eq!(sut.next(), None);
}

// Text content becomes a `Tree::Text` child of the enclosing element.
#[test]
fn element_with_text() {
    let parent = "parent".unwrap_into();
    let text = "inner text".into();

    let toks = [
        Token::Open(parent, *S),
        Token::Text(text, *S2),
        Token::Close(Some(parent), *S3),
    ]
    .into_iter();

    let expected = Element {
        name: parent,
        attrs: None,
        children: vec![Tree::Text(text, *S2)],
        span: (*S, *S3),
    };

    let mut sut = parser_from(toks);

    assert_eq!(sut.next(), Some(Ok(Tree::Element(expected))));
    assert_eq!(sut.next(), None);
}

// `parser_from` wraps the parser so that callers see only completed
// trees, never intermediate `Parsed::Incomplete` states.
#[test]
fn parser_from_filters_incomplete() {
    let name = ("ns", "elem").unwrap_into();
    let attr = "a".unwrap_into();
    let val = "val1".intern();

    let toks = [
        Token::Open(name, *S),
        Token::AttrName(attr, *S),
        Token::AttrValue(val, *S2),
        Token::Close(None, *S2),
    ]
    .into_iter();

    let expected = Element {
        name,
        attrs: Some(AttrList::from([Attr::new(attr, val, (*S, *S2))])),
        children: vec![],
        span: (*S, *S2),
    };

    let mut sut = parser_from(toks);

    // Unlike the previous tests, we should filter out all the
    // `Parsed::Incomplete` and yield only when we have a fully parsed
    // object.
    assert_eq!(sut.next(), Some(Ok(Tree::Element(expected))));
    assert_eq!(sut.next(), None);
}

// `parse_attrs` must reject a stream whose first token is not an
// attribute name, returning the offending token in the error.
#[test]
fn parse_attrs_fails_if_first_token_is_non_attr() {
    let tok = Token::Open("foo".unwrap_into(), *S);
    let mut toks = [tok.clone()].into_iter();

    assert_eq!(
        Err(ParseError::AttrNameExpected(tok)),
        parse_attrs(&mut toks, AttrList::new()),
    );

    // The token should have been consumed, not copied.
    assert_eq!(0, toks.len());
}

// Since the purpose of this function is to parse the complete attribute
// list, it must fail if it does not encounter `AttrEnd`.
#[test]
fn parse_attrs_fails_if_end_before_attr_end() {
    let mut toks = [
        Token::AttrName("foo".unwrap_into(), *S),
        Token::AttrValue("bar".intern(), *S),
        // No Token::AttrEnd
    ]
    .into_iter();

    assert_eq!(
        Err(ParseError::UnexpectedAttrEof),
        parse_attrs(&mut toks, AttrList::new()),
    );
}

// A non-attribute token in place of `AttrEnd` must also be an error,
// reporting the span where `AttrEnd` was expected.
#[test]
fn parse_attrs_fails_if_missing_attr_end() {
    // Let's also ensure we fail if some other token is available in place
    // of Token::AttrEnd.
    let mut toks = [
        Token::AttrName("foo".unwrap_into(), *S),
        Token::AttrValue("bar".intern(), *S2),
        // No Token::AttrEnd
        Token::Close(None, *S3),
    ]
    .into_iter();

    assert_eq!(
        Err(ParseError::MissingIsolatedAttrEnd(*S3)),
        parse_attrs(&mut toks, AttrList::new()),
    );
}

// `parse_attrs` consumes an isolated attribute list terminated by
// `AttrEnd`, producing the accumulated `AttrList`.
#[test]
fn parse_attrs_isolated() {
    let attr1 = "one".unwrap_into();
    let attr2 = "two".unwrap_into();
    let val1 = "val1".intern();
    let val2 = "val2".intern();

    let mut toks = [
        Token::AttrName(attr1, *S),
        Token::AttrValue(val1, *S2),
        Token::AttrName(attr2, *S2),
        Token::AttrValue(val2, *S3),
        Token::AttrEnd(*S3),
    ]
    .into_iter();

    let expected = AttrList::from([
        Attr::new(attr1, val1, (*S, *S2)),
        Attr::new(attr2, val2, (*S2, *S3)),
    ]);

    assert_eq!(expected, parse_attrs(&mut toks, AttrList::new()).unwrap());
}

// The streaming attribute parser must error on a non-attribute token,
// yielding the unexpected token in the error.
#[test]
fn attr_parser_with_non_attr_token() {
    let name = "unexpected".unwrap_into();
    let mut toks = [Token::Open(name, *S)].into_iter();

    let mut sut = attr_parser_from(&mut toks);

    assert_eq!(
        sut.next(),
        Some(Err(ParseError::AttrNameExpected(Token::Open(name, *S))))
    );
}

// The streaming attribute parser yields each attribute in turn and stops
// at `AttrEnd`, leaving the remaining token stream untouched.
#[test]
fn parser_attr_multiple() {
    let attr1 = "one".unwrap_into();
    let attr2 = "two".unwrap_into();
    let val1 = "val1".intern();
    let val2 = "val2".intern();

    let mut toks = [
        Token::AttrName(attr1, *S),
        Token::AttrValue(val1, *S2),
        Token::AttrName(attr2, *S2),
        Token::AttrValue(val2, *S3),
        Token::AttrEnd(*S3),
        // Token that we should _not_ hit.
        Token::Text("nohit".into(), *S),
    ]
    .into_iter();

    let mut sut = attr_parser_from(&mut toks);

    assert_eq!(sut.next(), Some(Ok(Attr::new(attr1, val1, (*S, *S2)))));
    assert_eq!(sut.next(), Some(Ok(Attr::new(attr2, val2, (*S2, *S3)))));
    assert_eq!(sut.next(), None);

    // Parsing must stop after AttrEnd,
    // after which some other parser can continue on the same token
    // stream.
    // Even if there _were_ more attributes,
    // this parser is spent and cannot continue.
    drop(sut);
    assert_eq!(toks.next(), Some(Token::Text("nohit".into(), *S)));
}