tamer: ir::xir::reader: Initial concept
This is an initial working concept for the reader which handles, so far, just a single attribute. But extending it to completion will not be all that much more work. This does not have namespace support---that will be added later as part of XIRT, which is responsible for semantic analysis. This allows XIR to stay wonderfully simple, and won't have any impact on the writer (which expects that QNames are unresolved and contain the namespace prefix to be written).
main
parent
fc3953e90e
commit
4c4d89f84f
|
@ -22,24 +22,64 @@
|
|||
//! XIR serves not only as a TAMER-specific IR,
|
||||
//! but also as an abstraction layer atop whatever XML library is
|
||||
//! used (e.g. `quick_xml`).
|
||||
//!
|
||||
//! XIR is _not_ intended to be comprehensive,
|
||||
//! or even general-purpose---it
|
||||
//! exists to solve concerns specific to TAMER's construction.
|
||||
//!
|
||||
//! _This is a work in progress!_
|
||||
//! Parsing and Safety
|
||||
//! ==================
|
||||
//! Many XIR elements know how to safely parse into themselves,
|
||||
//! exposing [`TryFrom`] traits that will largely do the right thing for
|
||||
//! you.
|
||||
//! For example,
|
||||
//! [`QName`] is able to construct itself from a byte slice and from a
|
||||
//! string tuple,
|
||||
//! among other things.
|
||||
//!
|
||||
//! ```
|
||||
//! use tamer::ir::xir::QName;
|
||||
//! use tamer::sym::GlobalSymbolIntern;
|
||||
//!
|
||||
//!# fn main() -> Result<(), tamer::ir::xir::Error> {
|
||||
//! let src = "foo:bar".as_bytes();
|
||||
//! let qname = QName::try_from(src)?;
|
||||
//!
|
||||
//! assert_eq!(qname, ("foo", "bar").try_into()?);
|
||||
//!
|
||||
//!# Ok(())
|
||||
//!# }
|
||||
//! ```
|
||||
//!
|
||||
//! However,
|
||||
//! certain elements cannot fully parse on their own because they require
|
||||
//! important contextual information,
|
||||
//! such as [`AttrValue`],
|
||||
//! which requires knowing whether the provided value is escaped.
|
||||
//! It is important that the caller is diligent in making the proper
|
||||
//! determination in these cases,
|
||||
//! otherwise it could result in situations ranging from invalid
|
||||
//! compiler output to security vulnerabilities
|
||||
//! (via XML injection).
|
||||
//!
|
||||
//! To parse an entire XML document,
|
||||
//! see [`reader`].
|
||||
|
||||
use crate::span::Span;
|
||||
use crate::sym::{
|
||||
st_as_sym, CIdentStaticSymbolId, GlobalSymbolIntern, StaticSymbolId,
|
||||
SymbolId, TameIdentStaticSymbolId, UriStaticSymbolId,
|
||||
st_as_sym, CIdentStaticSymbolId, GlobalSymbolIntern,
|
||||
GlobalSymbolInternBytes, StaticSymbolId, SymbolId, TameIdentStaticSymbolId,
|
||||
UriStaticSymbolId,
|
||||
};
|
||||
use memchr::memchr;
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::fmt::Display;
|
||||
use std::ops::Deref;
|
||||
|
||||
mod error;
|
||||
pub use error::Error;
|
||||
|
||||
pub mod iter;
|
||||
pub mod pred;
|
||||
pub mod reader;
|
||||
pub mod tree;
|
||||
pub mod writer;
|
||||
|
||||
|
@ -53,6 +93,7 @@ pub trait QNameCompatibleStaticSymbolId: StaticSymbolId {}
|
|||
impl QNameCompatibleStaticSymbolId for CIdentStaticSymbolId {}
|
||||
impl QNameCompatibleStaticSymbolId for TameIdentStaticSymbolId {}
|
||||
|
||||
#[doc(hidden)]
|
||||
macro_rules! qname_const_inner {
|
||||
($name:ident = :$local:ident) => {
|
||||
const $name: QName = QName::st_cid_local(&$local);
|
||||
|
@ -106,6 +147,24 @@ impl NCName {
|
|||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&[u8]> for NCName {
|
||||
type Error = Error;
|
||||
|
||||
/// Attempt to parse a byte slice into an [`NCName`].
|
||||
///
|
||||
/// If the slice contains `b':'`,
|
||||
/// an error will be produced.
|
||||
/// No other checks are performed beyond checking that the byte sequence
|
||||
/// represents a valid UTF-8 string.
|
||||
/// The string will be interned for you.
|
||||
fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
|
||||
match value.contains(&b':') {
|
||||
true => Err(Error::NCColon(value.to_owned())),
|
||||
false => Ok(NCName(value.intern_utf8()?)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for NCName {
|
||||
type Target = SymbolId;
|
||||
|
||||
|
@ -120,34 +179,6 @@ impl PartialEq<SymbolId> for NCName {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub enum Error {
|
||||
/// Provided name contains a `':'`.
|
||||
NCColon(String),
|
||||
|
||||
/// Provided string contains non-ASCII-whitespace characters.
|
||||
NotWhitespace(String),
|
||||
}
|
||||
|
||||
impl Display for Error {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::NCColon(name) => {
|
||||
write!(f, "NCName must not contain a colon: `{}`", name)
|
||||
}
|
||||
Self::NotWhitespace(s) => {
|
||||
write!(f, "String contains non-ASCII-whitespace: `{}`", s)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for Error {
|
||||
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&str> for NCName {
|
||||
type Error = Error;
|
||||
|
||||
|
@ -260,8 +291,8 @@ const_assert!(std::mem::size_of::<QName>() <= std::mem::size_of::<usize>());
|
|||
impl QName {
|
||||
/// Create a new fully-qualified name (including both a namespace URI
|
||||
/// and local name).
|
||||
pub fn new(prefix: Prefix, local_name: LocalPart) -> Self {
|
||||
Self(Some(prefix), local_name)
|
||||
pub fn new(prefix: Option<Prefix>, local_name: LocalPart) -> Self {
|
||||
Self(prefix, local_name)
|
||||
}
|
||||
|
||||
/// Create a new name from a local name only.
|
||||
|
@ -343,6 +374,44 @@ impl TryFrom<&str> for QName {
|
|||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&[u8]> for QName {
|
||||
type Error = Error;
|
||||
|
||||
/// Attempt to parse a byte slice into a [`QName`].
|
||||
///
|
||||
/// The byte slice must represent a valid QName in UTF-8.
|
||||
/// If a colon is present,
|
||||
/// it delimits the namespace [`Prefix`] and [`LocalPart`],
|
||||
/// and therefore must not be in the first or last byte position.
|
||||
fn try_from(name: &[u8]) -> Result<Self, Self::Error> {
|
||||
match memchr(b':', name) {
|
||||
// Leading colon means we're missing a prefix, trailing means
|
||||
// that we have no local part.
|
||||
Some(pos) if pos == 0 || pos == name.len() - 1 => {
|
||||
Err(Error::InvalidQName(name.to_owned()))
|
||||
}
|
||||
|
||||
// There is _at least_ one colon in the string.
|
||||
Some(pos) => {
|
||||
// The prefix is before the first colon,
|
||||
// and so itself must not contain a colon and is therefore
|
||||
// a valid NCName.
|
||||
let prefix = NCName(name[..pos].intern_utf8()?);
|
||||
|
||||
// But there could be a _second_ colon,
|
||||
// so the local part requires validation.
|
||||
let local = NCName::try_from(&name[(pos + 1)..])?;
|
||||
|
||||
Ok(Self::new(Some(prefix.into()), local.into()))
|
||||
}
|
||||
|
||||
// There are no colons in the string, so the entire string is
|
||||
// both a local part and a valid NCName.
|
||||
None => Ok(Self::new(None, NCName(name.intern_utf8()?).into())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents text and its escaped state.
|
||||
///
|
||||
/// Being explicit about the state of escaping allows us to skip checks when
|
||||
|
@ -523,6 +592,21 @@ mod test {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ncname_from_byte_slice() -> TestResult {
|
||||
let name: NCName = (b"no-colon" as &[u8]).try_into()?;
|
||||
assert_eq!(name, "no-colon".intern());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ncname_from_byte_slice_fails_with_colon() {
|
||||
assert_eq!(
|
||||
NCName::try_from(b"a:colon" as &[u8]),
|
||||
Err(Error::NCColon("a:colon".into()))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_name_from_local_part_only() -> TestResult {
|
||||
let name = QName::new_local("foo".try_into()?);
|
||||
|
|
|
@ -0,0 +1,89 @@
|
|||
// XIR errors
|
||||
//
|
||||
// Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
|
||||
//
|
||||
// This file is part of TAME.
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! XIR error information.
|
||||
|
||||
use std::{fmt::Display, str::Utf8Error};
|
||||
|
||||
/// Error attempting to produce a XIR object.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub enum Error {
|
||||
/// Provided name contains a `':'`.
|
||||
NCColon(Vec<u8>),
|
||||
/// Provided string contains non-ASCII-whitespace characters.
|
||||
NotWhitespace(String),
|
||||
/// Provided QName is not valid.
|
||||
InvalidQName(Vec<u8>),
|
||||
/// A UTF-8 error together with the byte slice that caused it.
|
||||
///
|
||||
/// By storing the raw bytes instead of a string,
|
||||
/// we allow the displayer to determine how to handle invalid UTF-8
|
||||
/// encodings.
|
||||
InvalidUtf8(Utf8Error, Vec<u8>),
|
||||
}
|
||||
|
||||
impl Display for Error {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::NCColon(bytes) => {
|
||||
write!(
|
||||
f,
|
||||
"NCName `{}` cannot contain ':'",
|
||||
String::from_utf8_lossy(bytes)
|
||||
)
|
||||
}
|
||||
Self::NotWhitespace(s) => {
|
||||
write!(f, "string contains non-ASCII-whitespace: `{}`", s)
|
||||
}
|
||||
Self::InvalidQName(bytes) => {
|
||||
write!(f, "invalid QName `{}`", String::from_utf8_lossy(bytes))
|
||||
}
|
||||
Self::InvalidUtf8(inner, bytes) => {
|
||||
write!(
|
||||
f,
|
||||
"{} for string `{}`",
|
||||
inner,
|
||||
String::from_utf8_lossy(bytes)
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for Error {
|
||||
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
|
||||
match self {
|
||||
Self::InvalidUtf8(err, ..) => Some(err),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<(Utf8Error, &[u8])> for Error {
|
||||
fn from((err, bytes): (Utf8Error, &[u8])) -> Self {
|
||||
Self::InvalidUtf8(err, bytes.to_owned())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<quick_xml::Error> for Error {
|
||||
fn from(err: quick_xml::Error) -> Self {
|
||||
// These need to be translated to provide our own errors.
|
||||
todo!("quick_xml::Error: {:?}", err)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,179 @@
|
|||
// XIR reader
|
||||
//
|
||||
// Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
|
||||
//
|
||||
// This file is part of TAME.
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
//! Parse XML files into a XIR [`Token`] stream.
|
||||
//!
|
||||
//! This uses [`quick_xml`] as the parser.
|
||||
|
||||
use super::{AttrValue, Error, Token};
|
||||
use crate::{span::DUMMY_SPAN, sym::GlobalSymbolInternBytes};
|
||||
use quick_xml::{
|
||||
self,
|
||||
events::{attributes::Attributes, Event as QuickXmlEvent},
|
||||
};
|
||||
use std::{collections::VecDeque, io::BufRead, result};
|
||||
|
||||
pub type Result<T> = result::Result<T, Error>;
|
||||
|
||||
/// Parse XML into a XIR [`Token`] stream.
|
||||
///
|
||||
/// This reader is intended to be used as an [`Iterator`].
|
||||
///
|
||||
/// The underlying reader produces events in chunks that are far too
|
||||
/// large for XIR,
|
||||
/// so most [`Token`]s retrieved via this call are buffered.
|
||||
/// Parsing takes place when that buffer is exhausted and the next event
|
||||
/// is requested from the underlying reader
|
||||
/// (see [`XmlXirReader::refill_buf`]).
|
||||
/// Errors can only occur during parsing,
|
||||
/// and will never occur on buffered tokens.
|
||||
///
|
||||
/// [`None`] is returned only on EOF,
|
||||
/// not on error.
|
||||
pub struct XmlXirReader<B: BufRead> {
|
||||
/// Inner parser.
|
||||
reader: quick_xml::Reader<B>,
|
||||
|
||||
/// Buffer for [`quick_xml::Reader`].
|
||||
readbuf: Vec<u8>,
|
||||
|
||||
/// [`Token`] buffer populated upon receiving a new event from
|
||||
/// `reader`.
|
||||
///
|
||||
/// This buffer serves [`Iterator::next`] requests until it is
|
||||
/// depleted,
|
||||
/// after which [`XmlXirReader::refill_buf`] requests another event
|
||||
/// from `reader`.
|
||||
tokbuf: VecDeque<Token>,
|
||||
}
|
||||
|
||||
impl<B: BufRead> XmlXirReader<B> {
|
||||
pub fn new(reader: B) -> Self {
|
||||
Self {
|
||||
reader: quick_xml::Reader::from_reader(reader),
|
||||
readbuf: Vec::new(),
|
||||
// This capacity is largely arbitrary,
|
||||
// but [`Token`]s are small enough that it likely does not
|
||||
// matter much.
|
||||
tokbuf: VecDeque::with_capacity(32),
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse using the underlying [`quick_xml::Reader`] and populate the
|
||||
/// [`Token`] buffer.
|
||||
///
|
||||
/// This is intended to be invoked once the buffer has been depleted by
|
||||
/// [`XmlXirReader::next`].
|
||||
pub fn refill_buf(&mut self) -> Option<Result<Token>> {
|
||||
// Clear any previous buffer to free unneeded data.
|
||||
self.tokbuf.clear();
|
||||
|
||||
// TODO: need an option to ignore namespaces, since it's a waste of
|
||||
// time for the linker
|
||||
match self.reader.read_event(&mut self.readbuf) {
|
||||
// This is the only time we'll consider the iterator to be done.
|
||||
Ok(QuickXmlEvent::Eof) => None,
|
||||
|
||||
Err(inner) => Some(Err(inner.into())),
|
||||
|
||||
Ok(ev) => match ev {
|
||||
QuickXmlEvent::Empty(ele) => {
|
||||
Some(ele.name().try_into().map_err(Error::from).and_then(
|
||||
|qname| {
|
||||
Self::parse_attrs(
|
||||
&mut self.tokbuf,
|
||||
ele.attributes(),
|
||||
)?;
|
||||
|
||||
self.tokbuf
|
||||
.push_front(Token::Close(None, DUMMY_SPAN));
|
||||
|
||||
// The first token will be immediately returned
|
||||
// via the Iterator.
|
||||
Ok(Token::Open(qname, DUMMY_SPAN))
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
// quick_xml emits a useless text event if the first byte is
|
||||
// a '<'.
|
||||
QuickXmlEvent::Text(bytes) if bytes.escaped().is_empty() => {
|
||||
self.refill_buf()
|
||||
}
|
||||
|
||||
x => todo!("event: {:?}", x),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse attributes into a XIR [`Token`] stream.
|
||||
///
|
||||
/// The order of attributes will be maintained.
|
||||
///
|
||||
/// This does not yet handle whitespace between attributes,
|
||||
/// or around `=`.
|
||||
fn parse_attrs<'a>(
|
||||
tokbuf: &mut VecDeque<Token>,
|
||||
mut attrs: Attributes<'a>,
|
||||
) -> Result<()> {
|
||||
// Disable checks to allow duplicate attributes;
|
||||
// XIR does not enforce this,
|
||||
// because it needs to accommodate semantically invalid XML for
|
||||
// later analysis.
|
||||
for result in attrs.with_checks(false) {
|
||||
let attr = result?;
|
||||
|
||||
// The attribute value,
|
||||
// having just been read from XML,
|
||||
// must have been escaped to be parsed properly.
|
||||
// If it parsed but it's not technically escaped according to
|
||||
// the spec,
|
||||
// that's okay as long as we can read it again,
|
||||
// but we probably should still throw an error if we
|
||||
// encounter such a situation.
|
||||
let value = AttrValue::Escaped(attr.value.as_ref().intern_utf8()?);
|
||||
|
||||
// The name must be parsed as a QName.
|
||||
let name = attr.key.try_into()?;
|
||||
|
||||
tokbuf.push_front(Token::AttrName(name, DUMMY_SPAN));
|
||||
tokbuf.push_front(Token::AttrValue(value, DUMMY_SPAN));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<B: BufRead> Iterator for XmlXirReader<B> {
|
||||
type Item = Result<Token>;
|
||||
|
||||
/// Produce the next XIR [`Token`] from the input.
|
||||
///
|
||||
/// For more information on how this reader operates,
|
||||
/// see [`XmlXirReader`].
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.tokbuf
|
||||
.pop_back()
|
||||
.map(|tok| Ok(tok))
|
||||
.or_else(|| self.refill_buf())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test;
|
|
@ -0,0 +1,230 @@
|
|||
// XIR reader tests
|
||||
//
|
||||
// Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
|
||||
//
|
||||
// This file is part of TAME.
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use super::*;
|
||||
use crate::{
|
||||
convert::ExpectInto,
|
||||
ir::xir::{AttrValue, Token},
|
||||
span::DUMMY_SPAN,
|
||||
};
|
||||
|
||||
/// These tests use [`quick_xml`] directly,
|
||||
/// rather than mocking it,
|
||||
/// because parsing XML isn't a simple matter and we want to be sure that
|
||||
/// our assumptions of how `quick_xml` performs its parsing are accurate.
|
||||
/// Consequently,
|
||||
/// these act more like integration tests than unit tests.
|
||||
///
|
||||
/// This means that `quick_xml` breakages will break these tests,
|
||||
/// and that is (unlike with unit tests) exactly what we want to happen
|
||||
/// here;
|
||||
/// we _complement_ the behavior of quick-xml,
|
||||
/// both by reimplementing certain functionality
|
||||
/// (like namespace management)
|
||||
/// and by relying on certain parsing behavior to eliminate
|
||||
/// redundant checks.
|
||||
|
||||
type Sut<B> = XmlXirReader<B>;
|
||||
|
||||
/// A byte that will be invalid provided that there is either no following
|
||||
/// UTF-8 byte,
|
||||
/// or if it's followed by another byte that is invalid in that
|
||||
/// position.
|
||||
const INVALID_UTF8_BYTE: u8 = 0b11000000u8;
|
||||
|
||||
// SAFETY: We want an invalid UTF-8 str for tests.
|
||||
// (We can use raw bytes and avoid `unsafe`,
|
||||
// but this is more convenient.)
|
||||
const INVALID_STR: &str =
|
||||
unsafe { std::str::from_utf8_unchecked(&[INVALID_UTF8_BYTE]) };
|
||||
|
||||
#[test]
|
||||
fn empty_node_without_prefix_or_attributes() {
|
||||
let sut = Sut::new("<empty-node />".as_bytes());
|
||||
|
||||
let result = sut.collect::<Result<Vec<_>>>();
|
||||
|
||||
assert_eq!(
|
||||
result.expect("parsing failed"),
|
||||
vec![
|
||||
Token::Open("empty-node".unwrap_into(), DUMMY_SPAN),
|
||||
Token::Close(None, DUMMY_SPAN),
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
// Resolving namespaces is not the concern of XIR.
|
||||
#[test]
|
||||
fn does_not_resolve_xmlns() {
|
||||
let sut = Sut::new(r#"<no-ns xmlns="noresolve" />"#.as_bytes());
|
||||
|
||||
let result = sut.collect::<Result<Vec<_>>>();
|
||||
|
||||
assert_eq!(
|
||||
result.expect("parsing failed"),
|
||||
vec![
|
||||
Token::Open("no-ns".unwrap_into(), DUMMY_SPAN),
|
||||
// Since we didn't parse @xmlns, it's still an attribute.
|
||||
Token::AttrName("xmlns".unwrap_into(), DUMMY_SPAN),
|
||||
Token::AttrValue(
|
||||
AttrValue::Escaped("noresolve".into()),
|
||||
DUMMY_SPAN
|
||||
),
|
||||
Token::Close(None, DUMMY_SPAN),
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
// Resolving namespaces is not the concern of XIR.
|
||||
#[test]
|
||||
fn empty_node_with_prefix_without_attributes_unresolved() {
|
||||
let sut = Sut::new(r#"<x:empty-node xmlns:x="noresolve" />"#.as_bytes());
|
||||
|
||||
let result = sut.collect::<Result<Vec<_>>>();
|
||||
|
||||
// Should be the QName, _unresolved_.
|
||||
assert_eq!(
|
||||
result.expect("parsing failed"),
|
||||
vec![
|
||||
Token::Open(("x", "empty-node").unwrap_into(), DUMMY_SPAN),
|
||||
Token::AttrName(("xmlns", "x").unwrap_into(), DUMMY_SPAN),
|
||||
Token::AttrValue(
|
||||
AttrValue::Escaped("noresolve".into()),
|
||||
DUMMY_SPAN
|
||||
),
|
||||
Token::Close(None, DUMMY_SPAN),
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
// TODO: Enough information for error recovery and reporting.
|
||||
#[test]
|
||||
fn prefix_with_empty_local_name_invalid_qname() {
|
||||
// No local name (trailing colon).
|
||||
let sut = Sut::new(r#"<x: xmlns:x="testns" />"#.as_bytes());
|
||||
|
||||
let result = sut.collect::<Result<Vec<_>>>();
|
||||
|
||||
match result {
|
||||
Ok(_) => panic!("expected failure"),
|
||||
Err(given) => {
|
||||
assert_eq!(Error::InvalidQName("x:".into()), given);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The order of attributes must be retained.
|
||||
#[test]
|
||||
fn multiple_attrs_ordered() {
|
||||
let sut = Sut::new(r#"<ele foo="a" bar="b" b:baz="c" />"#.as_bytes());
|
||||
|
||||
let result = sut.collect::<Result<Vec<_>>>();
|
||||
|
||||
assert_eq!(
|
||||
result.expect("parsing failed"),
|
||||
vec![
|
||||
Token::Open("ele".unwrap_into(), DUMMY_SPAN),
|
||||
Token::AttrName("foo".unwrap_into(), DUMMY_SPAN),
|
||||
Token::AttrValue(AttrValue::Escaped("a".into()), DUMMY_SPAN),
|
||||
Token::AttrName("bar".unwrap_into(), DUMMY_SPAN),
|
||||
Token::AttrValue(AttrValue::Escaped("b".into()), DUMMY_SPAN),
|
||||
Token::AttrName(("b", "baz").unwrap_into(), DUMMY_SPAN),
|
||||
Token::AttrValue(AttrValue::Escaped("c".into()), DUMMY_SPAN),
|
||||
Token::Close(None, DUMMY_SPAN),
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
// Contrary to the specification, but this is the responsibility of XIRT; we
|
||||
// need to allow it to support e.g. recovery, code formatting, and LSPs.
|
||||
#[test]
|
||||
fn permits_duplicate_attrs() {
|
||||
let sut = Sut::new(r#"<dup attr="a" attr="b" />"#.as_bytes());
|
||||
|
||||
let result = sut.collect::<Result<Vec<_>>>();
|
||||
|
||||
assert_eq!(
|
||||
result.expect("parsing failed"),
|
||||
vec![
|
||||
Token::Open("dup".unwrap_into(), DUMMY_SPAN),
|
||||
Token::AttrName("attr".unwrap_into(), DUMMY_SPAN),
|
||||
Token::AttrValue(AttrValue::Escaped("a".into()), DUMMY_SPAN),
|
||||
Token::AttrName("attr".unwrap_into(), DUMMY_SPAN),
|
||||
Token::AttrValue(AttrValue::Escaped("b".into()), DUMMY_SPAN),
|
||||
Token::Close(None, DUMMY_SPAN),
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
// TODO: Enough information for error recovery and reporting.
|
||||
#[test]
|
||||
fn node_name_invalid_utf8() {
|
||||
let bytes: &[u8] = &[b'<', INVALID_UTF8_BYTE, b'/', b'>'];
|
||||
let sut = Sut::new(bytes);
|
||||
|
||||
let result = sut.collect::<Result<Vec<_>>>();
|
||||
|
||||
match result {
|
||||
Ok(_) => panic!("expected failure"),
|
||||
Err(Error::InvalidUtf8(_, bytes)) => {
|
||||
assert_eq!(bytes, &[INVALID_UTF8_BYTE]);
|
||||
}
|
||||
_ => panic!("unexpected failure"),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Enough information for error recovery and reporting.
|
||||
#[test]
|
||||
fn attr_name_invalid_utf8() {
|
||||
let mut s = String::from("<a ");
|
||||
s.push_str(INVALID_STR);
|
||||
s.push_str(r#"="value"/>"#);
|
||||
|
||||
let sut = Sut::new(s.as_bytes());
|
||||
|
||||
let result = sut.collect::<Result<Vec<_>>>();
|
||||
|
||||
match result {
|
||||
Ok(_) => panic!("expected failure"),
|
||||
Err(Error::InvalidUtf8(_, bytes)) => {
|
||||
assert_eq!(bytes, &[INVALID_UTF8_BYTE]);
|
||||
}
|
||||
_ => panic!("unexpected failure"),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Enough information for error recovery and reporting.
|
||||
#[test]
|
||||
fn attr_value_invalid_utf8() {
|
||||
let mut s = String::from(r#"<a attr="bad"#);
|
||||
s.push_str(INVALID_STR);
|
||||
s.push_str(r#""/>"#);
|
||||
|
||||
let sut = Sut::new(s.as_bytes());
|
||||
|
||||
let result = sut.collect::<Result<Vec<_>>>();
|
||||
|
||||
match result {
|
||||
Ok(_) => panic!("expected failure"),
|
||||
Err(Error::InvalidUtf8(_, bytes)) => {
|
||||
assert_eq!(bytes, &[b'b', b'a', b'd', INVALID_UTF8_BYTE]);
|
||||
}
|
||||
_ => panic!("unexpected failure"),
|
||||
}
|
||||
}
|
|
@ -176,8 +176,14 @@ pub trait Interner<'i, Ix: SymbolIndexSize> {
|
|||
/// see [`Interner::intern_utf8_unchecked`].
|
||||
///
|
||||
/// If the byte slice does not represent a valid UTF-8 string,
|
||||
/// a [`Utf8Error`] will be returned.
|
||||
fn intern_utf8(&self, value: &[u8]) -> Result<SymbolId<Ix>, Utf8Error>;
|
||||
/// a [`Utf8Error`] will be returned along with a reference to the
|
||||
/// provided byte string.
|
||||
/// The purpose of this pair is to simplify error conversions
|
||||
/// using `?` so that errors can contain additional context.
|
||||
fn intern_utf8<'a>(
|
||||
&self,
|
||||
value: &'a [u8],
|
||||
) -> Result<SymbolId<Ix>, (Utf8Error, &'a [u8])>;
|
||||
|
||||
/// Intern an assumed-UTF-8 slice of bytes or return an existing
|
||||
/// [`SymbolId`].
|
||||
|
@ -367,7 +373,10 @@ where
|
|||
id
|
||||
}
|
||||
|
||||
fn intern_utf8(&self, value: &[u8]) -> Result<SymbolId<Ix>, Utf8Error> {
|
||||
fn intern_utf8<'a>(
|
||||
&self,
|
||||
value: &'a [u8],
|
||||
) -> Result<SymbolId<Ix>, (Utf8Error, &'a [u8])> {
|
||||
// Check the raw byte slice _before_ performing a UTF-8 check.
|
||||
// Note that `from_utf8_unchecked` is simply a transmute,
|
||||
// so this check incurs only a hashing cost.
|
||||
|
@ -384,7 +393,9 @@ where
|
|||
// and can then proceed to intern as we normally would.
|
||||
// This does incur a double hashing cost,
|
||||
// just like `intern`.
|
||||
Ok(self.intern_without_lookup(from_utf8(value)?))
|
||||
Ok(self.intern_without_lookup(
|
||||
from_utf8(value).map_err(|err| (err, value))?,
|
||||
))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
@ -610,14 +621,15 @@ mod test {
|
|||
let sut = Sut::new();
|
||||
|
||||
// Invalid two-byte encoding.
|
||||
let bytes = &vec![0b11000000u8];
|
||||
let bytes = &[0b11000000u8];
|
||||
let result = sut.intern_utf8(bytes);
|
||||
|
||||
match (result, from_utf8(bytes)) {
|
||||
(_, Ok(_)) => panic!("test string is valid UTF-8"),
|
||||
(Ok(_), _) => panic!("expected error"),
|
||||
(Err(given), Err(expected)) => {
|
||||
assert_eq!(given, expected);
|
||||
(Err((given_err, given_u8)), Err(expected)) => {
|
||||
assert_eq!(given_u8, bytes);
|
||||
assert_eq!(given_err, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -354,6 +354,6 @@ pub use interner::{
|
|||
Interner,
|
||||
};
|
||||
pub use symbol::{
|
||||
GlobalSymbolIntern, GlobalSymbolInternUnchecked, GlobalSymbolResolve,
|
||||
SymbolId, SymbolIndexSize,
|
||||
GlobalSymbolIntern, GlobalSymbolInternBytes, GlobalSymbolInternUnchecked,
|
||||
GlobalSymbolResolve, SymbolId, SymbolIndexSize,
|
||||
};
|
||||
|
|
|
@ -28,6 +28,7 @@ use std::fmt::{Debug, Display};
|
|||
use std::hash::Hash;
|
||||
use std::num::{NonZeroU16, NonZeroU32};
|
||||
use std::ops::Deref;
|
||||
use std::str::Utf8Error;
|
||||
use std::thread::LocalKey;
|
||||
|
||||
/// Unique symbol identifier produced by an [`Interner`].
|
||||
|
@ -326,6 +327,26 @@ pub trait GlobalSymbolIntern<Ix: SymbolIndexSize> {
|
|||
fn clone_uninterned(self) -> SymbolId<Ix>;
|
||||
}
|
||||
|
||||
/// Intern a byte slice using a global interner.
|
||||
pub trait GlobalSymbolInternBytes<Ix: SymbolIndexSize>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
/// Intern a byte slice using a global interner.
|
||||
///
|
||||
/// This first checks to see if the provided slice has already been
|
||||
/// interned.
|
||||
/// If so,
|
||||
/// we are able to save time by not checking for UTF-8 validity.
|
||||
/// Otherwise,
|
||||
/// we intern the slice in the usual way,
|
||||
/// failing if it does not represent a valid UTF-8 string.
|
||||
///
|
||||
/// For further explanation,
|
||||
/// see [`Interner::intern_utf8`].
|
||||
fn intern_utf8(self) -> Result<SymbolId<Ix>, (Utf8Error, Self)>;
|
||||
}
|
||||
|
||||
/// Intern a byte slice using a global interner.
|
||||
///
|
||||
/// See also [`GlobalSymbolIntern`].
|
||||
|
@ -374,6 +395,12 @@ impl<Ix: SymbolIndexSize> GlobalSymbolIntern<Ix> for &str {
|
|||
}
|
||||
}
|
||||
|
||||
impl<Ix: SymbolIndexSize> GlobalSymbolInternBytes<Ix> for &[u8] {
|
||||
fn intern_utf8(self) -> Result<SymbolId<Ix>, (Utf8Error, Self)> {
|
||||
Ix::with_static_interner(|interner| interner.intern_utf8(self))
|
||||
}
|
||||
}
|
||||
|
||||
impl<Ix: SymbolIndexSize> GlobalSymbolInternUnchecked<Ix> for &[u8] {
|
||||
unsafe fn intern_utf8_unchecked(self) -> SymbolId<Ix> {
|
||||
Ix::with_static_interner(|interner| {
|
||||
|
|
Loading…
Reference in New Issue