// XIR reader // // Copyright (C) 2014-2022 Ryan Specialty Group, LLC. // // This file is part of TAME. // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see . //! Parse XML files into a XIR [`Token`] stream. //! //! This uses [`quick_xml`] as the parser. use super::{ error::SpanlessError, CloseSpan, DefaultEscaper, Error, Escaper, OpenSpan, Token, }; use crate::{ parse::{ParseError, Parsed, ParsedObject, ParsedResult}, span::Context, sym::{st::raw::WS_EMPTY, GlobalSymbolInternBytes}, }; use quick_xml::{ self, events::{ attributes::Attributes, BytesDecl, BytesStart, Event as QuickXmlEvent, }, Error as QuickXmlError, }; use std::{borrow::Cow, collections::VecDeque, io::BufRead, result}; pub type Result = result::Result; /// Parse XML into a XIR [`Token`] stream. /// /// This reader is intended to be used as an [`Iterator`]. /// /// The underlying reader produces events in chunks that are far too /// large for XIR, /// so most [`Token`]s retrieved via this call are buffered. /// Parsing takes place when that buffer is exhausted and the next event /// is requested from the underlying reader /// (see [`XmlXirReader::refill_buf`]). /// Errors can only occur during parsing, /// and will never occur on buffered tokens. /// /// [`None`] is returned only on EOF, /// not on error. pub struct XmlXirReader<'s, B, S = DefaultEscaper> where B: BufRead, S: Escaper, { /// Inner parser. reader: quick_xml::Reader, /// Parsing context for reader. ctx: Context, /// Buffer for [`quick_xml::Reader`]. readbuf: Vec, /// [`Token`] buffer populated upon receiving a new event from /// `reader`. /// /// This buffer serves [`Iterator::next`] requests until it is /// depleted, /// after which [`XmlXirReader::refill_buf`] requests another token /// from `reader`. tokbuf: VecDeque, /// System for unescaping string data. escaper: &'s S, } impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> { pub fn new(reader: B, escaper: &'s S, ctx: Context) -> Self { let mut reader = quick_xml::Reader::from_reader(reader); // XIR must support mismatched tags so that we are able to represent // and reconstruct malformed inputs. // XIRT will handle mismatch errors itself. reader.check_end_names(false); Self { reader, ctx, readbuf: Vec::new(), // This capacity is largely arbitrary, // but [`Token`]s are small enough that it likely does not // matter much. tokbuf: VecDeque::with_capacity(32), escaper, } } /// Parse using the underlying [`quick_xml::Reader`] and populate the /// [`Token`] buffer. /// /// This is intended to be invoked once the buffer has been depleted by /// [`XmlXirReader::next`]. pub fn refill_buf(&mut self) -> Option> { // Clear any previous buffer to free unneeded data. self.tokbuf.clear(); self.readbuf.clear(); let ctx = self.ctx; let prev_pos = self.reader.buffer_position(); match self.reader.read_event(&mut self.readbuf) { // TODO: To provide better spans and error messages, // we need to map specific types of errors. // But we don't encounter much of anything here with how we make // use of quick-xml. Err(inner) => Some(Err({ let span = ctx.span_or_zz(prev_pos, 0); SpanlessError::from(inner).with_span(span) })), Ok(ev) => match ev { // This is the only time we'll consider the iterator to be // done. QuickXmlEvent::Eof => None, QuickXmlEvent::Empty(ele) => Some( Self::parse_element_open( &self.escaper, &mut self.tokbuf, ele, prev_pos, ctx, true, ) .and_then(|open| { let new_pos = self.reader.buffer_position(); // `` // || let span = ctx.span_or_zz(new_pos - 2, 2); // Tag is self-closing, but this does not yet // handle whitespace before the `/` // (as indicated in the span above). self.tokbuf.push_front(Token::Close( None, CloseSpan::empty(span), )); Ok(open) }), ), QuickXmlEvent::Start(ele) => Some(Self::parse_element_open( &self.escaper, &mut self.tokbuf, ele, prev_pos, ctx, false, )), QuickXmlEvent::End(ele) => Some({ // Only whitespace is permitted following the element // name, // so we can simply take the delta of the buffer pos. // // // [------] name + '<' + '/' + " >" let pos_delta = self.reader.buffer_position() - prev_pos; let span = ctx.span_or_zz(prev_pos, pos_delta); let name_len = ele.name().len(); ele.name() .try_into() .map_err(Error::from_with_span(span)) .and_then(|qname| { Ok(Token::Close( Some(qname), CloseSpan( span, name_len.try_into().unwrap_or(0), ), )) }) }), // quick_xml emits a useless text event if the first byte is // a '<'. QuickXmlEvent::Text(bytes) if bytes.escaped().is_empty() => { self.refill_buf() } // quick_xml _escapes_ the unescaped CData before handing it // off to us, // which is a complete waste since we'd just have to // unescape it again. QuickXmlEvent::CData(bytes) => todo!("CData: {:?}", bytes), QuickXmlEvent::Text(bytes) => Some({ // foo bar // [-----] let span = ctx.span_or_zz(prev_pos, bytes.len()); bytes .intern_utf8() .map_err(Into::into) .and_then(|sym| self.escaper.unescape(sym)) .map_err(Error::from_with_span(span)) .and_then(|unesc| Ok(Token::Text(unesc, span))) }), // Comments are _not_ returned escaped. QuickXmlEvent::Comment(bytes) => Some({ // // [----------] " foo " + "" let span = ctx.span_or_zz(prev_pos, bytes.len() + 7); bytes .intern_utf8() .map_err(Error::from_with_span(span)) .and_then(|comment| Ok(Token::Comment(comment, span))) }), // TODO: This must appear in the prologue. QuickXmlEvent::Decl(decl) => { match Self::validate_decl(&decl, prev_pos, ctx) { Err(x) => Some(Err(x)), Ok(()) => self.refill_buf(), } } // We do not support processor instructions or doctypes. // TODO: Convert this into an error/warning? // Previously `xml-stylesheet` was present in some older // source files and may linger for a bit after cleanup. QuickXmlEvent::PI(..) | QuickXmlEvent::DocType(..) => { self.refill_buf() } }, } } /// Validate an that an XML declaration contains expected values. /// /// A declaration looks like ``, /// where `@encoding` is optional but `@version` is not. /// It may also contain `@standalone`, /// but we do not check for that. /// /// We expect version 1.0 and UTF-8 encoding. /// Failing when these expectations are voilated helps to ensure that /// people unfamiliar with the system do not have expectations that /// are going to be unmet, /// which may result in subtle (or even serious) problems. fn validate_decl(decl: &BytesDecl, pos: usize, ctx: Context) -> Result<()> { // Starts after ` // [-] let ver_pos = (ver.as_ptr() as usize) - decl_ptr; let span = ctx.span_or_zz(ver_pos, ver.len()); Err(Error::UnsupportedXmlVersion( ver.intern_utf8().map_err(Error::from_with_span(span))?, span, ))? } if let Some(enc) = decl.encoding() { match &enc.map_err(Error::from_with_span(decl_span))?[..] { b"utf-8" | b"UTF-8" => (), invalid => { let enc_pos = (invalid.as_ptr() as usize) - decl_ptr; let span = ctx.span_or_zz(enc_pos, invalid.len()); Err(Error::UnsupportedEncoding( invalid .intern_utf8() .map_err(Error::from_with_span(span))?, span, ))? } } } Ok(()) } /// Parse opening element and its attributes into a XIR [`Token`] /// stream. /// /// The opening element is returned rather than being added to the token /// buffer, /// since the intent is to provide that token immediately. fn parse_element_open( escaper: &'s S, tokbuf: &mut VecDeque, ele: BytesStart, pos: usize, ctx: Context, empty_tag: bool, ) -> Result { // Starts after the opening tag `<`, so adjust. let addr = ele.as_ptr() as usize - 1; let len = ele.name().len(); match ele.name().last() { None => { // TODO: QName should be self-validating. Move this. return Err(Error::InvalidQName( WS_EMPTY, // <> // | where QName should be ctx.span_or_zz(pos + 1, 0), )); } // Quick-and-dirty guess as to whether they may have missed the // element name and included an attribute instead, // which quick-xml does not check for. Some(b'"' | b'\'') => { return Err({ // // [-------] let span = ctx.span_or_zz(pos + 1, len); Error::InvalidQName( ele.name() .intern_utf8() .map_err(Error::from_with_span(span))?, span, ) }); } _ => (), }; // `ele` contains every byte up to the [self-]closing tag. ele.name() .try_into() .map_err(Error::from_with_span(ctx.span_or_zz(pos + 1, len))) .and_then(|qname| { // The whitespace check is to handle input like this: // // ^ whitespace making `attributes_raw().len` > 0 let has_attrs = ele .attributes_raw() .iter() .find(|b| !is_xml_whitespace_u8(**b)) .is_some(); // The tail is anything following the last byte of the QName // in a non-empty tag with no attributes. // For example: // // ~~~~ tail ~ tail (no tail) let tail = if has_attrs { let found = Self::parse_attrs( escaper, tokbuf, ele.attributes(), addr - pos, // offset relative to _beginning_ of buf pos, ctx, )?; // Given this input, quick-xml ignores the bytes entirely: // // [--] missing `="value"` if !found { return Err(Error::AttrValueExpected( None, ctx.span_or_zz(pos + ele.len() + 1, 0), )); } // No tail because of attributes. 0 } else { match empty_tag { // Empty tag cannot have a tail. true => 0, // The "attributes" buffer represents whitespace, // so the tail is the number of bytes of // whitespace plus the closing '>' tag delimiter. false => ele.attributes_raw().len() + 1, } }; // // [--] name + '<' [--] name + '<' // // ... ... // [-----] name + '<' + " >" [--] name + '<' // ~~~ tail let span = ctx.span_or_zz(pos, len + 1 + tail); // The first token will be immediately returned // via the Iterator. Ok(Token::Open( qname, OpenSpan(span, len.try_into().unwrap_or(0)), )) }) } /// Parse attributes into a XIR [`Token`] stream. /// /// The order of attributes will be maintained. /// /// This does not yet handle whitespace between attributes, /// or around `=`. /// /// Note About Pointer Arithmetic /// ============================= /// `ele_ptr` is expected to be a pointer to the buffer containing the /// bytes read from the source file. /// Attributes reference this buffer, /// so we can use pointer arithmetic to determine the offset within /// the buffer relative to the node. /// This works because the underlying buffer is a `Vec`, /// which is contiguous in memory. /// /// However, since this is a `Vec`, /// it is important that the address be retrieved _after_ quick-xml /// read events, /// otherwise the buffer may be expanded and will be reallocated. fn parse_attrs<'a>( escaper: &'s S, tokbuf: &mut VecDeque, mut attrs: Attributes<'a>, ele_ptr: usize, ele_pos: usize, ctx: Context, ) -> Result { let mut found = false; // Disable checks to allow duplicate attributes; // XIR does not enforce this, // because it needs to accommodate semantically invalid XML for // later analysis. for result in attrs.with_checks(false) { found = true; let attr = result.map_err(|e| match e { QuickXmlError::NoEqAfterName(pos) => { // TODO: quick-xml doesn't give us the name, // but we should discover it. Error::AttrValueExpected( None, ctx.span_or_zz(ele_pos + pos, 0), ) } QuickXmlError::UnquotedValue(pos) => { // TODO: name and span length Error::AttrValueUnquoted( None, ctx.span_or_zz(ele_pos + pos, 0), ) } // fallback e => Error::from_with_span(ctx.span_or_zz(ele_pos, 0))(e), })?; let keyoffset = attr.key.as_ptr() as usize; let name_offset = keyoffset - ele_ptr; // Accommodates zero-length values (e.g. `key=""`) with a // zero-length span at the location the value _would_ be. let valoffset = match attr.value { Cow::Borrowed(b) => b.as_ptr() as usize, // This should never happen since we have a reference to the // underlying buffer. Cow::Owned(_) => unreachable!( "internal error: unexpected owned attribute value" ), }; let value_offset = valoffset - ele_ptr; let span_name = ctx.span_or_zz(name_offset, attr.key.len()); let span_value = ctx.span_or_zz(value_offset, attr.value.len()); // The name must be parsed as a QName. let name = attr .key .try_into() .map_err(Error::from_with_span(span_name))?; // The attribute value, // having just been read from XML, // must have been escaped to be parsed properly. // If it parsed but it's not technically escaped according to // the spec, // that's okay as long as we can read it again, // but we probably should still throw an error if we // encounter such a situation. let value = escaper .unescape( attr.value .as_ref() .intern_utf8() .map_err(Error::from_with_span(span_value))?, ) .map_err(Error::from_with_span(span_value))? .into(); tokbuf.push_front(Token::AttrName(name, span_name)); tokbuf.push_front(Token::AttrValue(value, span_value)); } Ok(found) } } /// Whether the byte represents XML whitespace. /// /// This is quick-xml's whitespace predicate, /// and corresponds to the /// [nonterminal `S` in the XML specification][xmlspec-s]. /// /// [xmlspec-s]: https://www.w3.org/TR/xml/#NT-S pub fn is_xml_whitespace_u8(b: u8) -> bool { match b { b' ' | b'\r' | b'\n' | b'\t' => true, _ => false, } } /// Whether the character represents XML whitespace. /// /// See [`is_xml_whitespace_u8`]. pub fn is_xml_whitespace_char(c: char) -> bool { match c { ' ' | '\r' | '\n' | '\t' => true, _ => false, } } impl<'s, B, S> Iterator for XmlXirReader<'s, B, S> where B: BufRead, S: Escaper, { type Item = ParsedResult>; /// Produce the next XIR [`Token`] from the input. /// /// For more information on how this reader operates, /// see [`XmlXirReader`]. fn next(&mut self) -> Option { self.tokbuf .pop_back() .map(|tok| Ok(Parsed::Object(tok))) .or_else(|| { self.refill_buf().map(|result| { result.map(Parsed::Object).map_err(ParseError::StateError) }) }) } } #[cfg(test)] mod test;