tamer: xir::reader: Additional quick-xml error spans
There's a bit to unpack here. Some of the spans originate from quick-xml's error handling, but in coming up with test cases to try to trigger errors, I found that quick-xml is far too permissive in what it accepts, and oughtright dangerous in some situations. I feel like the writing is on the wall for quick-xml, but I'll probably wait until replacing `xmlo` with a more efficient format before deciding whether to use a different library or implement parsing ourselves. There's a lot of factors to consider, and a library would have to not only be correct and performant, but provide useful information for span generation. But for now, I have other more important things to work on, like a functioning compiler. So while quick-xml is around, I'll just have to do the best I can to provide a correct parser with useful errors. DEV-10934main
parent
ab181670b5
commit
68223cb7d3
|
@ -19,6 +19,7 @@
|
||||||
|
|
||||||
//! XIR error information.
|
//! XIR error information.
|
||||||
|
|
||||||
|
use super::QName;
|
||||||
use crate::{span::Span, sym::SymbolId, tpwrap::quick_xml};
|
use crate::{span::Span, sym::SymbolId, tpwrap::quick_xml};
|
||||||
use std::{fmt::Display, str::Utf8Error};
|
use std::{fmt::Display, str::Utf8Error};
|
||||||
|
|
||||||
|
@ -50,6 +51,16 @@ pub enum Error {
|
||||||
/// TAMER expects UTF-8 encoding for everything,
|
/// TAMER expects UTF-8 encoding for everything,
|
||||||
/// which should not be an unreasonable expectation.
|
/// which should not be an unreasonable expectation.
|
||||||
UnsupportedEncoding(SymbolId, Span),
|
UnsupportedEncoding(SymbolId, Span),
|
||||||
|
/// The named attribute is missing a value.
|
||||||
|
///
|
||||||
|
/// The span is expected to placed at the offset where the value is
|
||||||
|
/// expected.
|
||||||
|
/// The character `=` may or may not be present.
|
||||||
|
AttrValueExpected(Option<QName>, Span),
|
||||||
|
/// An attribute value was found but was not quoted.
|
||||||
|
///
|
||||||
|
/// The symbol here should be the name of the attribute.
|
||||||
|
AttrValueUnquoted(Option<QName>, Span),
|
||||||
|
|
||||||
// TODO: Better error translation.
|
// TODO: Better error translation.
|
||||||
QuickXmlError(quick_xml::Error, Span),
|
QuickXmlError(quick_xml::Error, Span),
|
||||||
|
@ -65,31 +76,33 @@ impl Error {
|
||||||
|
|
||||||
impl Display for Error {
|
impl Display for Error {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
use Error::*;
|
||||||
|
|
||||||
match self {
|
match self {
|
||||||
Self::NCColon(sym, span) => {
|
NCColon(sym, span) => {
|
||||||
write!(f, "NCName `{sym}` cannot contain ':' at {span}",)
|
write!(f, "NCName `{sym}` cannot contain ':' at {span}",)
|
||||||
}
|
}
|
||||||
Self::NotWhitespace(s) => {
|
NotWhitespace(s) => {
|
||||||
write!(f, "string contains non-ASCII-whitespace: `{}`", s)
|
write!(f, "string contains non-ASCII-whitespace: `{}`", s)
|
||||||
}
|
}
|
||||||
Self::InvalidQName(qname, span) => {
|
InvalidQName(qname, span) => {
|
||||||
write!(f, "invalid QName `{qname}` at {span}")
|
write!(f, "invalid QName `{qname}` at {span}")
|
||||||
}
|
}
|
||||||
Self::InvalidUtf8(inner, bytes, span) => {
|
InvalidUtf8(inner, bytes, span) => {
|
||||||
write!(
|
write!(
|
||||||
f,
|
f,
|
||||||
"{inner} for string `{}` with bytes `{bytes:?}` at {span}",
|
"{inner} for string `{}` with bytes `{bytes:?}` at {span}",
|
||||||
String::from_utf8_lossy(bytes)
|
String::from_utf8_lossy(bytes)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
Self::UnsupportedXmlVersion(ver, span) => {
|
UnsupportedXmlVersion(ver, span) => {
|
||||||
write!(
|
write!(
|
||||||
f,
|
f,
|
||||||
"expected XML version `1.0` at {span}, \
|
"expected XML version `1.0` at {span}, \
|
||||||
but found unsupported version `{ver}`"
|
but found unsupported version `{ver}`"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
Self::UnsupportedEncoding(enc, span) => {
|
UnsupportedEncoding(enc, span) => {
|
||||||
// TODO: when we have hints,
|
// TODO: when we have hints,
|
||||||
// indicate that they can also entirely remove this
|
// indicate that they can also entirely remove this
|
||||||
// attribute to resolve the error
|
// attribute to resolve the error
|
||||||
|
@ -99,8 +112,25 @@ impl Display for Error {
|
||||||
but found unsupported encoding `{enc}`"
|
but found unsupported encoding `{enc}`"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
AttrValueExpected(Some(name), span) => {
|
||||||
|
write!(f, "value expected for attribute `{name}` at {span}")
|
||||||
|
}
|
||||||
|
// TODO: Parsers should provide the name.
|
||||||
|
AttrValueExpected(None, span) => {
|
||||||
|
write!(f, "value expected for attribute at {span}")
|
||||||
|
}
|
||||||
|
AttrValueUnquoted(Some(name), span) => {
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"value for attribute `{name}` is missing quotes at {span}"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
// TODO: Parsers should provide the name.
|
||||||
|
AttrValueUnquoted(None, span) => {
|
||||||
|
write!(f, "value for attribute is missing quotes at {span}")
|
||||||
|
}
|
||||||
// TODO: Translate error messages
|
// TODO: Translate error messages
|
||||||
Self::QuickXmlError(inner, span) => {
|
QuickXmlError(inner, span) => {
|
||||||
write!(f, "internal parser error: {inner} at {span}")
|
write!(f, "internal parser error: {inner} at {span}")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,14 +23,15 @@
|
||||||
|
|
||||||
use super::{error::SpanlessError, DefaultEscaper, Error, Escaper, Token};
|
use super::{error::SpanlessError, DefaultEscaper, Error, Escaper, Token};
|
||||||
use crate::{
|
use crate::{
|
||||||
span::{UNKNOWN_CONTEXT as UC, UNKNOWN_SPAN},
|
span::UNKNOWN_CONTEXT as UC,
|
||||||
sym::GlobalSymbolInternBytes,
|
sym::{st::raw::WS_EMPTY, GlobalSymbolInternBytes},
|
||||||
};
|
};
|
||||||
use quick_xml::{
|
use quick_xml::{
|
||||||
self,
|
self,
|
||||||
events::{
|
events::{
|
||||||
attributes::Attributes, BytesDecl, BytesStart, Event as QuickXmlEvent,
|
attributes::Attributes, BytesDecl, BytesStart, Event as QuickXmlEvent,
|
||||||
},
|
},
|
||||||
|
Error as QuickXmlError,
|
||||||
};
|
};
|
||||||
use std::{borrow::Cow, collections::VecDeque, io::BufRead, result};
|
use std::{borrow::Cow, collections::VecDeque, io::BufRead, result};
|
||||||
|
|
||||||
|
@ -111,10 +112,12 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
||||||
match self.reader.read_event(&mut self.readbuf) {
|
match self.reader.read_event(&mut self.readbuf) {
|
||||||
// TODO: To provide better spans and error messages,
|
// TODO: To provide better spans and error messages,
|
||||||
// we need to map specific types of errors.
|
// we need to map specific types of errors.
|
||||||
Err(inner) => {
|
// But we don't encounter much of anything here with how we make
|
||||||
|
// use of quick-xml.
|
||||||
|
Err(inner) => Some(Err({
|
||||||
let span = UC.span_or_zz(prev_pos, 0);
|
let span = UC.span_or_zz(prev_pos, 0);
|
||||||
Some(Err(SpanlessError::from(inner).with_span(span)))
|
SpanlessError::from(inner).with_span(span)
|
||||||
}
|
})),
|
||||||
|
|
||||||
Ok(ev) => match ev {
|
Ok(ev) => match ev {
|
||||||
// This is the only time we'll consider the iterator to be
|
// This is the only time we'll consider the iterator to be
|
||||||
|
@ -290,13 +293,45 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
||||||
let addr = ele.as_ptr() as usize - 1;
|
let addr = ele.as_ptr() as usize - 1;
|
||||||
let len = ele.name().len();
|
let len = ele.name().len();
|
||||||
|
|
||||||
|
match ele.name().last() {
|
||||||
|
None => {
|
||||||
|
// TODO: QName should be self-validating. Move this.
|
||||||
|
return Err(Error::InvalidQName(
|
||||||
|
WS_EMPTY,
|
||||||
|
// <>
|
||||||
|
// | where QName should be
|
||||||
|
UC.span_or_zz(pos + 1, 0),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Quick-and-dirty guess as to whether they may have missed the
|
||||||
|
// element name and included an attribute instead,
|
||||||
|
// which quick-xml does not check for.
|
||||||
|
Some(b'"' | b'\'') => {
|
||||||
|
return Err({
|
||||||
|
// <foo="bar" ...>
|
||||||
|
// |-------|
|
||||||
|
let span = UC.span_or_zz(pos + 1, len);
|
||||||
|
|
||||||
|
Error::InvalidQName(
|
||||||
|
ele.name()
|
||||||
|
.intern_utf8()
|
||||||
|
.map_err(Error::from_with_span(span))?,
|
||||||
|
span,
|
||||||
|
)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
_ => (),
|
||||||
|
};
|
||||||
|
|
||||||
// `ele` contains every byte up to the [self-]closing tag.
|
// `ele` contains every byte up to the [self-]closing tag.
|
||||||
ele.name()
|
ele.name()
|
||||||
.try_into()
|
.try_into()
|
||||||
.map_err(Error::from_with_span(UC.span_or_zz(pos + 1, len)))
|
.map_err(Error::from_with_span(UC.span_or_zz(pos + 1, len)))
|
||||||
.and_then(|qname| {
|
.and_then(|qname| {
|
||||||
let noattr_add: usize =
|
let has_attrs = ele.attributes_raw().len() > 0;
|
||||||
(ele.attributes_raw().len() == 0).into();
|
let noattr_add: usize = (!has_attrs).into();
|
||||||
|
|
||||||
// <tag ... />
|
// <tag ... />
|
||||||
// |--| name + '<'
|
// |--| name + '<'
|
||||||
|
@ -305,12 +340,35 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
||||||
// |---| name + '<' + '>'
|
// |---| name + '<' + '>'
|
||||||
let span = UC.span_or_zz(pos, len + 1 + noattr_add);
|
let span = UC.span_or_zz(pos, len + 1 + noattr_add);
|
||||||
|
|
||||||
Self::parse_attrs(
|
if has_attrs {
|
||||||
escaper,
|
let found = Self::parse_attrs(
|
||||||
tokbuf,
|
escaper,
|
||||||
ele.attributes(),
|
tokbuf,
|
||||||
addr - pos, // offset relative to _beginning_ of buffer
|
ele.attributes(),
|
||||||
)?;
|
addr - pos, // offset relative to _beginning_ of buf
|
||||||
|
pos,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
// Given this input, quick-xml ignores the bytes entirely:
|
||||||
|
// <foo bar>
|
||||||
|
// ^^^| missing `="value"`
|
||||||
|
//
|
||||||
|
// The whitespace check is to handle input like this:
|
||||||
|
// <foo />
|
||||||
|
// ^ whitespace making `attributes_raw().len` > 0
|
||||||
|
if !found
|
||||||
|
&& ele
|
||||||
|
.attributes_raw()
|
||||||
|
.iter()
|
||||||
|
.find(|b| !Self::is_whitespace(**b))
|
||||||
|
.is_some()
|
||||||
|
{
|
||||||
|
return Err(Error::AttrValueExpected(
|
||||||
|
None,
|
||||||
|
UC.span_or_zz(pos + ele.len() + 1, 0),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// The first token will be immediately returned
|
// The first token will be immediately returned
|
||||||
// via the Iterator.
|
// via the Iterator.
|
||||||
|
@ -318,6 +376,14 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// quick-xml's whitespace predicate.
|
||||||
|
fn is_whitespace(b: u8) -> bool {
|
||||||
|
match b {
|
||||||
|
b' ' | b'\r' | b'\n' | b'\t' => true,
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Parse attributes into a XIR [`Token`] stream.
|
/// Parse attributes into a XIR [`Token`] stream.
|
||||||
///
|
///
|
||||||
/// The order of attributes will be maintained.
|
/// The order of attributes will be maintained.
|
||||||
|
@ -344,15 +410,38 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
||||||
tokbuf: &mut VecDeque<Token>,
|
tokbuf: &mut VecDeque<Token>,
|
||||||
mut attrs: Attributes<'a>,
|
mut attrs: Attributes<'a>,
|
||||||
ele_ptr: usize,
|
ele_ptr: usize,
|
||||||
) -> Result<()> {
|
ele_pos: usize,
|
||||||
|
) -> Result<bool> {
|
||||||
|
let mut found = false;
|
||||||
|
|
||||||
// Disable checks to allow duplicate attributes;
|
// Disable checks to allow duplicate attributes;
|
||||||
// XIR does not enforce this,
|
// XIR does not enforce this,
|
||||||
// because it needs to accommodate semantically invalid XML for
|
// because it needs to accommodate semantically invalid XML for
|
||||||
// later analysis.
|
// later analysis.
|
||||||
for result in attrs.with_checks(false) {
|
for result in attrs.with_checks(false) {
|
||||||
// TODO: We'll need to map this quick-xml error to provide more
|
found = true;
|
||||||
// detailed messages and spans.
|
|
||||||
let attr = result.map_err(Error::from_with_span(UNKNOWN_SPAN))?;
|
let attr = result.map_err(|e| match e {
|
||||||
|
QuickXmlError::NoEqAfterName(pos) => {
|
||||||
|
// TODO: quick-xml doesn't give us the name,
|
||||||
|
// but we should discover it.
|
||||||
|
Error::AttrValueExpected(
|
||||||
|
None,
|
||||||
|
UC.span_or_zz(ele_pos + pos, 0),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
QuickXmlError::UnquotedValue(pos) => {
|
||||||
|
// TODO: name and span length
|
||||||
|
Error::AttrValueUnquoted(
|
||||||
|
None,
|
||||||
|
UC.span_or_zz(ele_pos + pos, 0),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// fallback
|
||||||
|
e => Error::from_with_span(UC.span_or_zz(ele_pos, 0))(e),
|
||||||
|
})?;
|
||||||
|
|
||||||
let keyoffset = attr.key.as_ptr() as usize;
|
let keyoffset = attr.key.as_ptr() as usize;
|
||||||
let name_offset = keyoffset - ele_ptr;
|
let name_offset = keyoffset - ele_ptr;
|
||||||
|
@ -402,7 +491,7 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
||||||
tokbuf.push_front(Token::AttrValue(value, span_value));
|
tokbuf.push_front(Token::AttrValue(value, span_value));
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(found)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -641,3 +641,164 @@ fn invalid_xml_encoding() {
|
||||||
sut.collect::<Result<Vec<_>>>()
|
sut.collect::<Result<Vec<_>>>()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// quick-xml parser shortcomings
|
||||||
|
//
|
||||||
|
// There are certain problems quick-xml does not catch that we will catch
|
||||||
|
// ourselves.
|
||||||
|
// This is not intended to be comprehensive,
|
||||||
|
// but is intended to catch what might be common-enough errors that they
|
||||||
|
// deserve friendly output from the compiler,
|
||||||
|
// rather than resulting in more obscure errors down the line after
|
||||||
|
// we've left the context of XML.
|
||||||
|
//
|
||||||
|
// Ultimately,
|
||||||
|
// the writing may be on the wall for quick-xml,
|
||||||
|
// but at the time of writing there are more important things to focus on,
|
||||||
|
// so we will make do for now.
|
||||||
|
//
|
||||||
|
|
||||||
|
// quick-xml's behavior here is alarming---it simply doesn't see it as an
|
||||||
|
// attribute at all and skips the tokens.
|
||||||
|
// We must detect this on our own.
|
||||||
|
#[test]
|
||||||
|
fn attr_single_no_value_no_eq() {
|
||||||
|
new_sut!(sut = r#" <foo attr>"#);
|
||||||
|
// ____/ |
|
||||||
|
// / 10
|
||||||
|
// / where the `="value"` should be
|
||||||
|
// |
|
||||||
|
// WS to make sure we add the file-relative pos
|
||||||
|
// and not just have the span relative to the element
|
||||||
|
|
||||||
|
let span = UC.span(10, 0);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
Err(Error::AttrValueExpected(None, span)),
|
||||||
|
sut.collect::<Result<Vec<_>>>()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn attr_single_no_value_with_eq() {
|
||||||
|
new_sut!(sut = r#" <foo attr=>"#);
|
||||||
|
// |
|
||||||
|
// 11
|
||||||
|
// where the `"value"` should be
|
||||||
|
|
||||||
|
let span = UC.span(11, 0);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
Err(Error::AttrValueExpected(None, span)),
|
||||||
|
sut.collect::<Result<Vec<_>>>()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn attr_multi_no_value_no_eq() {
|
||||||
|
new_sut!(sut = r#" <foo attr another>"#);
|
||||||
|
// |
|
||||||
|
// 10
|
||||||
|
// where the `="value"` should be
|
||||||
|
|
||||||
|
let span = UC.span(10, 0);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
// quick-xml doesn't provide the name
|
||||||
|
Err(Error::AttrValueExpected(None, span)),
|
||||||
|
sut.collect::<Result<Vec<_>>>()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn attr_multi_no_value_with_eq() {
|
||||||
|
new_sut!(sut = r#" <foo attr= another=>"#);
|
||||||
|
// |
|
||||||
|
// 11
|
||||||
|
// quick-xml interprets this as an unquoted value
|
||||||
|
|
||||||
|
// TODO: quick-xml does not give us the length so we'll have to figure
|
||||||
|
// it out ourselves.
|
||||||
|
let span = UC.span(11, 0);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
Err(Error::AttrValueUnquoted(None, span)),
|
||||||
|
sut.collect::<Result<Vec<_>>>()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn attr_multiple_no_value_no_eq_then_good() {
|
||||||
|
new_sut!(sut = r#" <foo attr good="value">"#);
|
||||||
|
// |
|
||||||
|
// 10
|
||||||
|
// where the `="value"` should be
|
||||||
|
|
||||||
|
let span = UC.span(10, 0);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
// quick-xml doesn't provide the name
|
||||||
|
Err(Error::AttrValueExpected(None, span)),
|
||||||
|
sut.collect::<Result<Vec<_>>>()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn empty_element_qname_no_attrs() {
|
||||||
|
new_sut!(sut = r#"<>"#);
|
||||||
|
// |
|
||||||
|
// 1
|
||||||
|
// where the QName should be
|
||||||
|
|
||||||
|
let span = UC.span(1, 0);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
Err(Error::InvalidQName("".intern(), span)),
|
||||||
|
sut.collect::<Result<Vec<_>>>()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn empty_element_qname_with_space_no_attrs() {
|
||||||
|
new_sut!(sut = r#"< >"#);
|
||||||
|
// |
|
||||||
|
// 1
|
||||||
|
// where the QName should be
|
||||||
|
|
||||||
|
let span = UC.span(1, 0);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
Err(Error::InvalidQName("".intern(), span)),
|
||||||
|
sut.collect::<Result<Vec<_>>>()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn empty_element_qname_with_attr() {
|
||||||
|
new_sut!(sut = r#"<foo="bar">"#);
|
||||||
|
// |-------|
|
||||||
|
// 1 10
|
||||||
|
|
||||||
|
let span = UC.span(1, 9);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
Err(Error::InvalidQName("foo=\"bar\"".intern(), span)),
|
||||||
|
sut.collect::<Result<Vec<_>>>()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn empty_element_qname_with_space_with_attr() {
|
||||||
|
new_sut!(sut = r#"< foo="bar">"#);
|
||||||
|
// |
|
||||||
|
// 1
|
||||||
|
// quick-xml interprets the space as a "" QName
|
||||||
|
|
||||||
|
let span = UC.span(1, 0);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
Err(Error::InvalidQName("".intern(), span)),
|
||||||
|
sut.collect::<Result<Vec<_>>>()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue