tamer: xir::reader: Additional quick-xml error spans

There's a bit to unpack here.  Some of the spans originate from quick-xml's
error handling, but in coming up with test cases to try to trigger errors, I
found that quick-xml is far too permissive in what it accepts, and
oughtright dangerous in some situations.

I feel like the writing is on the wall for quick-xml, but I'll probably wait
until replacing `xmlo` with a more efficient format before deciding whether
to use a different library or implement parsing ourselves.  There's a lot of
factors to consider, and a library would have to not only be correct and
performant, but provide useful information for span generation.

But for now, I have other more important things to work on, like a
functioning compiler.  So while quick-xml is around, I'll just have to do
the best I can to provide a correct parser with useful errors.

DEV-10934
main
Mike Gerwitz 2022-04-08 13:52:16 -04:00
parent ab181670b5
commit 68223cb7d3
3 changed files with 305 additions and 25 deletions

View File

@ -19,6 +19,7 @@
//! XIR error information.
use super::QName;
use crate::{span::Span, sym::SymbolId, tpwrap::quick_xml};
use std::{fmt::Display, str::Utf8Error};
@ -50,6 +51,16 @@ pub enum Error {
/// TAMER expects UTF-8 encoding for everything,
/// which should not be an unreasonable expectation.
UnsupportedEncoding(SymbolId, Span),
/// The named attribute is missing a value.
///
/// The span is expected to placed at the offset where the value is
/// expected.
/// The character `=` may or may not be present.
AttrValueExpected(Option<QName>, Span),
/// An attribute value was found but was not quoted.
///
/// The symbol here should be the name of the attribute.
AttrValueUnquoted(Option<QName>, Span),
// TODO: Better error translation.
QuickXmlError(quick_xml::Error, Span),
@ -65,31 +76,33 @@ impl Error {
impl Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use Error::*;
match self {
Self::NCColon(sym, span) => {
NCColon(sym, span) => {
write!(f, "NCName `{sym}` cannot contain ':' at {span}",)
}
Self::NotWhitespace(s) => {
NotWhitespace(s) => {
write!(f, "string contains non-ASCII-whitespace: `{}`", s)
}
Self::InvalidQName(qname, span) => {
InvalidQName(qname, span) => {
write!(f, "invalid QName `{qname}` at {span}")
}
Self::InvalidUtf8(inner, bytes, span) => {
InvalidUtf8(inner, bytes, span) => {
write!(
f,
"{inner} for string `{}` with bytes `{bytes:?}` at {span}",
String::from_utf8_lossy(bytes)
)
}
Self::UnsupportedXmlVersion(ver, span) => {
UnsupportedXmlVersion(ver, span) => {
write!(
f,
"expected XML version `1.0` at {span}, \
but found unsupported version `{ver}`"
)
}
Self::UnsupportedEncoding(enc, span) => {
UnsupportedEncoding(enc, span) => {
// TODO: when we have hints,
// indicate that they can also entirely remove this
// attribute to resolve the error
@ -99,8 +112,25 @@ impl Display for Error {
but found unsupported encoding `{enc}`"
)
}
AttrValueExpected(Some(name), span) => {
write!(f, "value expected for attribute `{name}` at {span}")
}
// TODO: Parsers should provide the name.
AttrValueExpected(None, span) => {
write!(f, "value expected for attribute at {span}")
}
AttrValueUnquoted(Some(name), span) => {
write!(
f,
"value for attribute `{name}` is missing quotes at {span}"
)
}
// TODO: Parsers should provide the name.
AttrValueUnquoted(None, span) => {
write!(f, "value for attribute is missing quotes at {span}")
}
// TODO: Translate error messages
Self::QuickXmlError(inner, span) => {
QuickXmlError(inner, span) => {
write!(f, "internal parser error: {inner} at {span}")
}
}

View File

@ -23,14 +23,15 @@
use super::{error::SpanlessError, DefaultEscaper, Error, Escaper, Token};
use crate::{
span::{UNKNOWN_CONTEXT as UC, UNKNOWN_SPAN},
sym::GlobalSymbolInternBytes,
span::UNKNOWN_CONTEXT as UC,
sym::{st::raw::WS_EMPTY, GlobalSymbolInternBytes},
};
use quick_xml::{
self,
events::{
attributes::Attributes, BytesDecl, BytesStart, Event as QuickXmlEvent,
},
Error as QuickXmlError,
};
use std::{borrow::Cow, collections::VecDeque, io::BufRead, result};
@ -111,10 +112,12 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
match self.reader.read_event(&mut self.readbuf) {
// TODO: To provide better spans and error messages,
// we need to map specific types of errors.
Err(inner) => {
// But we don't encounter much of anything here with how we make
// use of quick-xml.
Err(inner) => Some(Err({
let span = UC.span_or_zz(prev_pos, 0);
Some(Err(SpanlessError::from(inner).with_span(span)))
}
SpanlessError::from(inner).with_span(span)
})),
Ok(ev) => match ev {
// This is the only time we'll consider the iterator to be
@ -290,13 +293,45 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
let addr = ele.as_ptr() as usize - 1;
let len = ele.name().len();
match ele.name().last() {
None => {
// TODO: QName should be self-validating. Move this.
return Err(Error::InvalidQName(
WS_EMPTY,
// <>
// | where QName should be
UC.span_or_zz(pos + 1, 0),
));
}
// Quick-and-dirty guess as to whether they may have missed the
// element name and included an attribute instead,
// which quick-xml does not check for.
Some(b'"' | b'\'') => {
return Err({
// <foo="bar" ...>
// |-------|
let span = UC.span_or_zz(pos + 1, len);
Error::InvalidQName(
ele.name()
.intern_utf8()
.map_err(Error::from_with_span(span))?,
span,
)
});
}
_ => (),
};
// `ele` contains every byte up to the [self-]closing tag.
ele.name()
.try_into()
.map_err(Error::from_with_span(UC.span_or_zz(pos + 1, len)))
.and_then(|qname| {
let noattr_add: usize =
(ele.attributes_raw().len() == 0).into();
let has_attrs = ele.attributes_raw().len() > 0;
let noattr_add: usize = (!has_attrs).into();
// <tag ... />
// |--| name + '<'
@ -305,12 +340,35 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
// |---| name + '<' + '>'
let span = UC.span_or_zz(pos, len + 1 + noattr_add);
Self::parse_attrs(
escaper,
tokbuf,
ele.attributes(),
addr - pos, // offset relative to _beginning_ of buffer
)?;
if has_attrs {
let found = Self::parse_attrs(
escaper,
tokbuf,
ele.attributes(),
addr - pos, // offset relative to _beginning_ of buf
pos,
)?;
// Given this input, quick-xml ignores the bytes entirely:
// <foo bar>
// ^^^| missing `="value"`
//
// The whitespace check is to handle input like this:
// <foo />
// ^ whitespace making `attributes_raw().len` > 0
if !found
&& ele
.attributes_raw()
.iter()
.find(|b| !Self::is_whitespace(**b))
.is_some()
{
return Err(Error::AttrValueExpected(
None,
UC.span_or_zz(pos + ele.len() + 1, 0),
));
}
}
// The first token will be immediately returned
// via the Iterator.
@ -318,6 +376,14 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
})
}
/// quick-xml's whitespace predicate.
fn is_whitespace(b: u8) -> bool {
match b {
b' ' | b'\r' | b'\n' | b'\t' => true,
_ => false,
}
}
/// Parse attributes into a XIR [`Token`] stream.
///
/// The order of attributes will be maintained.
@ -344,15 +410,38 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
tokbuf: &mut VecDeque<Token>,
mut attrs: Attributes<'a>,
ele_ptr: usize,
) -> Result<()> {
ele_pos: usize,
) -> Result<bool> {
let mut found = false;
// Disable checks to allow duplicate attributes;
// XIR does not enforce this,
// because it needs to accommodate semantically invalid XML for
// later analysis.
for result in attrs.with_checks(false) {
// TODO: We'll need to map this quick-xml error to provide more
// detailed messages and spans.
let attr = result.map_err(Error::from_with_span(UNKNOWN_SPAN))?;
found = true;
let attr = result.map_err(|e| match e {
QuickXmlError::NoEqAfterName(pos) => {
// TODO: quick-xml doesn't give us the name,
// but we should discover it.
Error::AttrValueExpected(
None,
UC.span_or_zz(ele_pos + pos, 0),
)
}
QuickXmlError::UnquotedValue(pos) => {
// TODO: name and span length
Error::AttrValueUnquoted(
None,
UC.span_or_zz(ele_pos + pos, 0),
)
}
// fallback
e => Error::from_with_span(UC.span_or_zz(ele_pos, 0))(e),
})?;
let keyoffset = attr.key.as_ptr() as usize;
let name_offset = keyoffset - ele_ptr;
@ -402,7 +491,7 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
tokbuf.push_front(Token::AttrValue(value, span_value));
}
Ok(())
Ok(found)
}
}

View File

@ -641,3 +641,164 @@ fn invalid_xml_encoding() {
sut.collect::<Result<Vec<_>>>()
);
}
//
// quick-xml parser shortcomings
//
// There are certain problems quick-xml does not catch that we will catch
// ourselves.
// This is not intended to be comprehensive,
// but is intended to catch what might be common-enough errors that they
// deserve friendly output from the compiler,
// rather than resulting in more obscure errors down the line after
// we've left the context of XML.
//
// Ultimately,
// the writing may be on the wall for quick-xml,
// but at the time of writing there are more important things to focus on,
// so we will make do for now.
//
// quick-xml's behavior here is alarming---it simply doesn't see it as an
// attribute at all and skips the tokens.
// We must detect this on our own.
#[test]
fn attr_single_no_value_no_eq() {
new_sut!(sut = r#" <foo attr>"#);
// ____/ |
// / 10
// / where the `="value"` should be
// |
// WS to make sure we add the file-relative pos
// and not just have the span relative to the element
let span = UC.span(10, 0);
assert_eq!(
Err(Error::AttrValueExpected(None, span)),
sut.collect::<Result<Vec<_>>>()
);
}
#[test]
fn attr_single_no_value_with_eq() {
new_sut!(sut = r#" <foo attr=>"#);
// |
// 11
// where the `"value"` should be
let span = UC.span(11, 0);
assert_eq!(
Err(Error::AttrValueExpected(None, span)),
sut.collect::<Result<Vec<_>>>()
);
}
#[test]
fn attr_multi_no_value_no_eq() {
new_sut!(sut = r#" <foo attr another>"#);
// |
// 10
// where the `="value"` should be
let span = UC.span(10, 0);
assert_eq!(
// quick-xml doesn't provide the name
Err(Error::AttrValueExpected(None, span)),
sut.collect::<Result<Vec<_>>>()
);
}
#[test]
fn attr_multi_no_value_with_eq() {
new_sut!(sut = r#" <foo attr= another=>"#);
// |
// 11
// quick-xml interprets this as an unquoted value
// TODO: quick-xml does not give us the length so we'll have to figure
// it out ourselves.
let span = UC.span(11, 0);
assert_eq!(
Err(Error::AttrValueUnquoted(None, span)),
sut.collect::<Result<Vec<_>>>()
);
}
#[test]
fn attr_multiple_no_value_no_eq_then_good() {
new_sut!(sut = r#" <foo attr good="value">"#);
// |
// 10
// where the `="value"` should be
let span = UC.span(10, 0);
assert_eq!(
// quick-xml doesn't provide the name
Err(Error::AttrValueExpected(None, span)),
sut.collect::<Result<Vec<_>>>()
);
}
#[test]
fn empty_element_qname_no_attrs() {
new_sut!(sut = r#"<>"#);
// |
// 1
// where the QName should be
let span = UC.span(1, 0);
assert_eq!(
Err(Error::InvalidQName("".intern(), span)),
sut.collect::<Result<Vec<_>>>()
);
}
#[test]
fn empty_element_qname_with_space_no_attrs() {
new_sut!(sut = r#"< >"#);
// |
// 1
// where the QName should be
let span = UC.span(1, 0);
assert_eq!(
Err(Error::InvalidQName("".intern(), span)),
sut.collect::<Result<Vec<_>>>()
);
}
#[test]
fn empty_element_qname_with_attr() {
new_sut!(sut = r#"<foo="bar">"#);
// |-------|
// 1 10
let span = UC.span(1, 9);
assert_eq!(
Err(Error::InvalidQName("foo=\"bar\"".intern(), span)),
sut.collect::<Result<Vec<_>>>()
);
}
#[test]
fn empty_element_qname_with_space_with_attr() {
new_sut!(sut = r#"< foo="bar">"#);
// |
// 1
// quick-xml interprets the space as a "" QName
let span = UC.span(1, 0);
assert_eq!(
Err(Error::InvalidQName("".intern(), span)),
sut.collect::<Result<Vec<_>>>()
);
}