tamer: xir::reader: Additional quick-xml error spans
There's a bit to unpack here. Some of the spans originate from quick-xml's error handling, but in coming up with test cases to try to trigger errors, I found that quick-xml is far too permissive in what it accepts, and oughtright dangerous in some situations. I feel like the writing is on the wall for quick-xml, but I'll probably wait until replacing `xmlo` with a more efficient format before deciding whether to use a different library or implement parsing ourselves. There's a lot of factors to consider, and a library would have to not only be correct and performant, but provide useful information for span generation. But for now, I have other more important things to work on, like a functioning compiler. So while quick-xml is around, I'll just have to do the best I can to provide a correct parser with useful errors. DEV-10934main
parent
ab181670b5
commit
68223cb7d3
|
@ -19,6 +19,7 @@
|
|||
|
||||
//! XIR error information.
|
||||
|
||||
use super::QName;
|
||||
use crate::{span::Span, sym::SymbolId, tpwrap::quick_xml};
|
||||
use std::{fmt::Display, str::Utf8Error};
|
||||
|
||||
|
@ -50,6 +51,16 @@ pub enum Error {
|
|||
/// TAMER expects UTF-8 encoding for everything,
|
||||
/// which should not be an unreasonable expectation.
|
||||
UnsupportedEncoding(SymbolId, Span),
|
||||
/// The named attribute is missing a value.
|
||||
///
|
||||
/// The span is expected to placed at the offset where the value is
|
||||
/// expected.
|
||||
/// The character `=` may or may not be present.
|
||||
AttrValueExpected(Option<QName>, Span),
|
||||
/// An attribute value was found but was not quoted.
|
||||
///
|
||||
/// The symbol here should be the name of the attribute.
|
||||
AttrValueUnquoted(Option<QName>, Span),
|
||||
|
||||
// TODO: Better error translation.
|
||||
QuickXmlError(quick_xml::Error, Span),
|
||||
|
@ -65,31 +76,33 @@ impl Error {
|
|||
|
||||
impl Display for Error {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use Error::*;
|
||||
|
||||
match self {
|
||||
Self::NCColon(sym, span) => {
|
||||
NCColon(sym, span) => {
|
||||
write!(f, "NCName `{sym}` cannot contain ':' at {span}",)
|
||||
}
|
||||
Self::NotWhitespace(s) => {
|
||||
NotWhitespace(s) => {
|
||||
write!(f, "string contains non-ASCII-whitespace: `{}`", s)
|
||||
}
|
||||
Self::InvalidQName(qname, span) => {
|
||||
InvalidQName(qname, span) => {
|
||||
write!(f, "invalid QName `{qname}` at {span}")
|
||||
}
|
||||
Self::InvalidUtf8(inner, bytes, span) => {
|
||||
InvalidUtf8(inner, bytes, span) => {
|
||||
write!(
|
||||
f,
|
||||
"{inner} for string `{}` with bytes `{bytes:?}` at {span}",
|
||||
String::from_utf8_lossy(bytes)
|
||||
)
|
||||
}
|
||||
Self::UnsupportedXmlVersion(ver, span) => {
|
||||
UnsupportedXmlVersion(ver, span) => {
|
||||
write!(
|
||||
f,
|
||||
"expected XML version `1.0` at {span}, \
|
||||
but found unsupported version `{ver}`"
|
||||
)
|
||||
}
|
||||
Self::UnsupportedEncoding(enc, span) => {
|
||||
UnsupportedEncoding(enc, span) => {
|
||||
// TODO: when we have hints,
|
||||
// indicate that they can also entirely remove this
|
||||
// attribute to resolve the error
|
||||
|
@ -99,8 +112,25 @@ impl Display for Error {
|
|||
but found unsupported encoding `{enc}`"
|
||||
)
|
||||
}
|
||||
AttrValueExpected(Some(name), span) => {
|
||||
write!(f, "value expected for attribute `{name}` at {span}")
|
||||
}
|
||||
// TODO: Parsers should provide the name.
|
||||
AttrValueExpected(None, span) => {
|
||||
write!(f, "value expected for attribute at {span}")
|
||||
}
|
||||
AttrValueUnquoted(Some(name), span) => {
|
||||
write!(
|
||||
f,
|
||||
"value for attribute `{name}` is missing quotes at {span}"
|
||||
)
|
||||
}
|
||||
// TODO: Parsers should provide the name.
|
||||
AttrValueUnquoted(None, span) => {
|
||||
write!(f, "value for attribute is missing quotes at {span}")
|
||||
}
|
||||
// TODO: Translate error messages
|
||||
Self::QuickXmlError(inner, span) => {
|
||||
QuickXmlError(inner, span) => {
|
||||
write!(f, "internal parser error: {inner} at {span}")
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,14 +23,15 @@
|
|||
|
||||
use super::{error::SpanlessError, DefaultEscaper, Error, Escaper, Token};
|
||||
use crate::{
|
||||
span::{UNKNOWN_CONTEXT as UC, UNKNOWN_SPAN},
|
||||
sym::GlobalSymbolInternBytes,
|
||||
span::UNKNOWN_CONTEXT as UC,
|
||||
sym::{st::raw::WS_EMPTY, GlobalSymbolInternBytes},
|
||||
};
|
||||
use quick_xml::{
|
||||
self,
|
||||
events::{
|
||||
attributes::Attributes, BytesDecl, BytesStart, Event as QuickXmlEvent,
|
||||
},
|
||||
Error as QuickXmlError,
|
||||
};
|
||||
use std::{borrow::Cow, collections::VecDeque, io::BufRead, result};
|
||||
|
||||
|
@ -111,10 +112,12 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
|||
match self.reader.read_event(&mut self.readbuf) {
|
||||
// TODO: To provide better spans and error messages,
|
||||
// we need to map specific types of errors.
|
||||
Err(inner) => {
|
||||
// But we don't encounter much of anything here with how we make
|
||||
// use of quick-xml.
|
||||
Err(inner) => Some(Err({
|
||||
let span = UC.span_or_zz(prev_pos, 0);
|
||||
Some(Err(SpanlessError::from(inner).with_span(span)))
|
||||
}
|
||||
SpanlessError::from(inner).with_span(span)
|
||||
})),
|
||||
|
||||
Ok(ev) => match ev {
|
||||
// This is the only time we'll consider the iterator to be
|
||||
|
@ -290,13 +293,45 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
|||
let addr = ele.as_ptr() as usize - 1;
|
||||
let len = ele.name().len();
|
||||
|
||||
match ele.name().last() {
|
||||
None => {
|
||||
// TODO: QName should be self-validating. Move this.
|
||||
return Err(Error::InvalidQName(
|
||||
WS_EMPTY,
|
||||
// <>
|
||||
// | where QName should be
|
||||
UC.span_or_zz(pos + 1, 0),
|
||||
));
|
||||
}
|
||||
|
||||
// Quick-and-dirty guess as to whether they may have missed the
|
||||
// element name and included an attribute instead,
|
||||
// which quick-xml does not check for.
|
||||
Some(b'"' | b'\'') => {
|
||||
return Err({
|
||||
// <foo="bar" ...>
|
||||
// |-------|
|
||||
let span = UC.span_or_zz(pos + 1, len);
|
||||
|
||||
Error::InvalidQName(
|
||||
ele.name()
|
||||
.intern_utf8()
|
||||
.map_err(Error::from_with_span(span))?,
|
||||
span,
|
||||
)
|
||||
});
|
||||
}
|
||||
|
||||
_ => (),
|
||||
};
|
||||
|
||||
// `ele` contains every byte up to the [self-]closing tag.
|
||||
ele.name()
|
||||
.try_into()
|
||||
.map_err(Error::from_with_span(UC.span_or_zz(pos + 1, len)))
|
||||
.and_then(|qname| {
|
||||
let noattr_add: usize =
|
||||
(ele.attributes_raw().len() == 0).into();
|
||||
let has_attrs = ele.attributes_raw().len() > 0;
|
||||
let noattr_add: usize = (!has_attrs).into();
|
||||
|
||||
// <tag ... />
|
||||
// |--| name + '<'
|
||||
|
@ -305,12 +340,35 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
|||
// |---| name + '<' + '>'
|
||||
let span = UC.span_or_zz(pos, len + 1 + noattr_add);
|
||||
|
||||
Self::parse_attrs(
|
||||
escaper,
|
||||
tokbuf,
|
||||
ele.attributes(),
|
||||
addr - pos, // offset relative to _beginning_ of buffer
|
||||
)?;
|
||||
if has_attrs {
|
||||
let found = Self::parse_attrs(
|
||||
escaper,
|
||||
tokbuf,
|
||||
ele.attributes(),
|
||||
addr - pos, // offset relative to _beginning_ of buf
|
||||
pos,
|
||||
)?;
|
||||
|
||||
// Given this input, quick-xml ignores the bytes entirely:
|
||||
// <foo bar>
|
||||
// ^^^| missing `="value"`
|
||||
//
|
||||
// The whitespace check is to handle input like this:
|
||||
// <foo />
|
||||
// ^ whitespace making `attributes_raw().len` > 0
|
||||
if !found
|
||||
&& ele
|
||||
.attributes_raw()
|
||||
.iter()
|
||||
.find(|b| !Self::is_whitespace(**b))
|
||||
.is_some()
|
||||
{
|
||||
return Err(Error::AttrValueExpected(
|
||||
None,
|
||||
UC.span_or_zz(pos + ele.len() + 1, 0),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// The first token will be immediately returned
|
||||
// via the Iterator.
|
||||
|
@ -318,6 +376,14 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
|||
})
|
||||
}
|
||||
|
||||
/// quick-xml's whitespace predicate.
|
||||
fn is_whitespace(b: u8) -> bool {
|
||||
match b {
|
||||
b' ' | b'\r' | b'\n' | b'\t' => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse attributes into a XIR [`Token`] stream.
|
||||
///
|
||||
/// The order of attributes will be maintained.
|
||||
|
@ -344,15 +410,38 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
|||
tokbuf: &mut VecDeque<Token>,
|
||||
mut attrs: Attributes<'a>,
|
||||
ele_ptr: usize,
|
||||
) -> Result<()> {
|
||||
ele_pos: usize,
|
||||
) -> Result<bool> {
|
||||
let mut found = false;
|
||||
|
||||
// Disable checks to allow duplicate attributes;
|
||||
// XIR does not enforce this,
|
||||
// because it needs to accommodate semantically invalid XML for
|
||||
// later analysis.
|
||||
for result in attrs.with_checks(false) {
|
||||
// TODO: We'll need to map this quick-xml error to provide more
|
||||
// detailed messages and spans.
|
||||
let attr = result.map_err(Error::from_with_span(UNKNOWN_SPAN))?;
|
||||
found = true;
|
||||
|
||||
let attr = result.map_err(|e| match e {
|
||||
QuickXmlError::NoEqAfterName(pos) => {
|
||||
// TODO: quick-xml doesn't give us the name,
|
||||
// but we should discover it.
|
||||
Error::AttrValueExpected(
|
||||
None,
|
||||
UC.span_or_zz(ele_pos + pos, 0),
|
||||
)
|
||||
}
|
||||
|
||||
QuickXmlError::UnquotedValue(pos) => {
|
||||
// TODO: name and span length
|
||||
Error::AttrValueUnquoted(
|
||||
None,
|
||||
UC.span_or_zz(ele_pos + pos, 0),
|
||||
)
|
||||
}
|
||||
|
||||
// fallback
|
||||
e => Error::from_with_span(UC.span_or_zz(ele_pos, 0))(e),
|
||||
})?;
|
||||
|
||||
let keyoffset = attr.key.as_ptr() as usize;
|
||||
let name_offset = keyoffset - ele_ptr;
|
||||
|
@ -402,7 +491,7 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
|||
tokbuf.push_front(Token::AttrValue(value, span_value));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(found)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -641,3 +641,164 @@ fn invalid_xml_encoding() {
|
|||
sut.collect::<Result<Vec<_>>>()
|
||||
);
|
||||
}
|
||||
|
||||
//
|
||||
// quick-xml parser shortcomings
|
||||
//
|
||||
// There are certain problems quick-xml does not catch that we will catch
|
||||
// ourselves.
|
||||
// This is not intended to be comprehensive,
|
||||
// but is intended to catch what might be common-enough errors that they
|
||||
// deserve friendly output from the compiler,
|
||||
// rather than resulting in more obscure errors down the line after
|
||||
// we've left the context of XML.
|
||||
//
|
||||
// Ultimately,
|
||||
// the writing may be on the wall for quick-xml,
|
||||
// but at the time of writing there are more important things to focus on,
|
||||
// so we will make do for now.
|
||||
//
|
||||
|
||||
// quick-xml's behavior here is alarming---it simply doesn't see it as an
|
||||
// attribute at all and skips the tokens.
|
||||
// We must detect this on our own.
|
||||
#[test]
|
||||
fn attr_single_no_value_no_eq() {
|
||||
new_sut!(sut = r#" <foo attr>"#);
|
||||
// ____/ |
|
||||
// / 10
|
||||
// / where the `="value"` should be
|
||||
// |
|
||||
// WS to make sure we add the file-relative pos
|
||||
// and not just have the span relative to the element
|
||||
|
||||
let span = UC.span(10, 0);
|
||||
|
||||
assert_eq!(
|
||||
Err(Error::AttrValueExpected(None, span)),
|
||||
sut.collect::<Result<Vec<_>>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn attr_single_no_value_with_eq() {
|
||||
new_sut!(sut = r#" <foo attr=>"#);
|
||||
// |
|
||||
// 11
|
||||
// where the `"value"` should be
|
||||
|
||||
let span = UC.span(11, 0);
|
||||
|
||||
assert_eq!(
|
||||
Err(Error::AttrValueExpected(None, span)),
|
||||
sut.collect::<Result<Vec<_>>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn attr_multi_no_value_no_eq() {
|
||||
new_sut!(sut = r#" <foo attr another>"#);
|
||||
// |
|
||||
// 10
|
||||
// where the `="value"` should be
|
||||
|
||||
let span = UC.span(10, 0);
|
||||
|
||||
assert_eq!(
|
||||
// quick-xml doesn't provide the name
|
||||
Err(Error::AttrValueExpected(None, span)),
|
||||
sut.collect::<Result<Vec<_>>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn attr_multi_no_value_with_eq() {
|
||||
new_sut!(sut = r#" <foo attr= another=>"#);
|
||||
// |
|
||||
// 11
|
||||
// quick-xml interprets this as an unquoted value
|
||||
|
||||
// TODO: quick-xml does not give us the length so we'll have to figure
|
||||
// it out ourselves.
|
||||
let span = UC.span(11, 0);
|
||||
|
||||
assert_eq!(
|
||||
Err(Error::AttrValueUnquoted(None, span)),
|
||||
sut.collect::<Result<Vec<_>>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn attr_multiple_no_value_no_eq_then_good() {
|
||||
new_sut!(sut = r#" <foo attr good="value">"#);
|
||||
// |
|
||||
// 10
|
||||
// where the `="value"` should be
|
||||
|
||||
let span = UC.span(10, 0);
|
||||
|
||||
assert_eq!(
|
||||
// quick-xml doesn't provide the name
|
||||
Err(Error::AttrValueExpected(None, span)),
|
||||
sut.collect::<Result<Vec<_>>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_element_qname_no_attrs() {
|
||||
new_sut!(sut = r#"<>"#);
|
||||
// |
|
||||
// 1
|
||||
// where the QName should be
|
||||
|
||||
let span = UC.span(1, 0);
|
||||
|
||||
assert_eq!(
|
||||
Err(Error::InvalidQName("".intern(), span)),
|
||||
sut.collect::<Result<Vec<_>>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_element_qname_with_space_no_attrs() {
|
||||
new_sut!(sut = r#"< >"#);
|
||||
// |
|
||||
// 1
|
||||
// where the QName should be
|
||||
|
||||
let span = UC.span(1, 0);
|
||||
|
||||
assert_eq!(
|
||||
Err(Error::InvalidQName("".intern(), span)),
|
||||
sut.collect::<Result<Vec<_>>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_element_qname_with_attr() {
|
||||
new_sut!(sut = r#"<foo="bar">"#);
|
||||
// |-------|
|
||||
// 1 10
|
||||
|
||||
let span = UC.span(1, 9);
|
||||
|
||||
assert_eq!(
|
||||
Err(Error::InvalidQName("foo=\"bar\"".intern(), span)),
|
||||
sut.collect::<Result<Vec<_>>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_element_qname_with_space_with_attr() {
|
||||
new_sut!(sut = r#"< foo="bar">"#);
|
||||
// |
|
||||
// 1
|
||||
// quick-xml interprets the space as a "" QName
|
||||
|
||||
let span = UC.span(1, 0);
|
||||
|
||||
assert_eq!(
|
||||
Err(Error::InvalidQName("".intern(), span)),
|
||||
sut.collect::<Result<Vec<_>>>()
|
||||
);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue