tamer: tamec: Replace copy with XIR parsing/writing

When wip-frontends is on, this will parse the input file using XIR and then
immediately output it again.  This makes the necessary changes to be able to
read every source file we have in our largest project, such that the output
is identical after having been formatted with `xmllint --format -` (without
that normalization there are differences because, e.g., whitespace between
attributes is not yet maintained).

This is performant too, with times remaining essentially identical despite
the additional work.

DEV-10413
main
Mike Gerwitz 2022-04-07 12:08:51 -04:00
parent b90bf9d8a8
commit 99aacaf7ca
4 changed files with 159 additions and 27 deletions

View File

@ -23,18 +23,18 @@
extern crate tamer;
use getopts::{Fail, Options};
use std::env;
use std::error::Error;
use std::ffi::OsStr;
use std::fs;
use std::path::Path;
use std::{env, io::BufWriter};
use tamer::{
iter::into_iter_while_ok,
xir::{reader::XmlXirReader, DefaultEscaper},
};
#[cfg(feature = "wip-frontends")]
use {
std::io::BufReader,
tamer::frontend::{FrontendEvent, FrontendParser, XmlFrontendParser},
tamer::fs::File,
};
use {std::io::BufReader, tamer::fs::File};
/// Types of commands
enum Command {
@ -58,24 +58,26 @@ pub fn main() -> Result<(), Box<dyn Error>> {
let dest = Path::new(&output);
// This will eventually replace `fs::copy` below.
#[cfg(not(feature = "wip-frontends"))]
fs::copy(source, dest)?;
#[cfg(feature = "wip-frontends")]
{
use tamer::xir::writer::XmlWriter;
let escaper = DefaultEscaper::default();
let file: BufReader<fs::File> = File::open(source)?;
let mut parser = XmlFrontendParser::new(file);
let mut fout = BufWriter::new(fs::File::create(dest)?);
// Parse all the way through, but don't do anything with it
// yet.
loop {
match parser.parse_next()? {
FrontendEvent::Eof => break,
_ => continue,
}
}
// Parse into XIR and re-lower into XML,
// which is similar to a copy but proves that we're able
// to parse source files.
into_iter_while_ok(
XmlXirReader::new(file, &escaper),
|toks| toks.write(&mut fout, Default::default(), &escaper),
)??;
}
fs::copy(source, dest)?;
Ok(())
}
Ok(Command::Usage) => {

View File

@ -19,7 +19,7 @@
//! XIR error information.
use crate::tpwrap::quick_xml;
use crate::{span::Span, sym::SymbolId, tpwrap::quick_xml};
use std::{fmt::Display, str::Utf8Error};
/// Error attempting to produce a XIR object.
@ -31,12 +31,23 @@ pub enum Error {
NotWhitespace(String),
/// Provided QName is not valid.
InvalidQName(Vec<u8>),
// A UTF-8 error together with the byte slice that caused it.
//
// By storing the raw bytes instead of a string,
// we allow the displayer to determine how to handle invalid UTF-8
// encodings.
/// A UTF-8 error together with the byte slice that caused it.
///
/// By storing the raw bytes instead of a string,
/// we allow the displayer to determine how to handle invalid UTF-8
/// encodings.
InvalidUtf8(Utf8Error, Vec<u8>),
/// XML 1.0 only.
///
/// Other versions are not widely in use
/// (only 1.1 exists at the time of writing)
/// and providing that is either in error,
/// copy/paste,
/// or the user is expecting something they're not going to get.
UnsupportedXmlVersion(SymbolId, Span),
/// TAMER expects UTF-8 encoding for everything,
/// which should not be an unreasonable expectation.
UnsupportedEncoding(SymbolId, Span),
// TODO: Better error translation and spans.
QuickXmlError(quick_xml::Error),
@ -66,6 +77,23 @@ impl Display for Error {
String::from_utf8_lossy(bytes)
)
}
Self::UnsupportedXmlVersion(ver, span) => {
write!(
f,
"expected XML version `1.0` at {span}, \
but found unsupported version `{ver}`"
)
}
Self::UnsupportedEncoding(enc, span) => {
// TODO: when we have hints,
// indicate that they can also entirely remove this
// attribute to resolve the error
write!(
f,
"expected `utf-8` or `UTF-8` encoding at {span}, \
but found unsupported encoding `{enc}`"
)
}
// TODO: See Error TODO
Self::QuickXmlError(inner) => {
write!(f, "internal parser error: {:?}", inner)
@ -91,6 +119,6 @@ impl From<(Utf8Error, &[u8])> for Error {
impl<E: Into<quick_xml::Error>> From<E> for Error {
fn from(err: E) -> Self {
Self::QuickXmlError(err.into().into())
Self::QuickXmlError(err.into())
}
}

View File

@ -22,10 +22,15 @@
//! This uses [`quick_xml`] as the parser.
use super::{DefaultEscaper, Error, Escaper, Token};
use crate::{span::DUMMY_SPAN, sym::GlobalSymbolInternBytes};
use crate::{
span::{DUMMY_SPAN, UNKNOWN_SPAN},
sym::GlobalSymbolInternBytes,
};
use quick_xml::{
self,
events::{attributes::Attributes, BytesStart, Event as QuickXmlEvent},
events::{
attributes::Attributes, BytesDecl, BytesStart, Event as QuickXmlEvent,
},
};
use std::{collections::VecDeque, io::BufRead, result};
@ -163,11 +168,59 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
.map(|text| Token::Comment(text, DUMMY_SPAN)),
),
// TODO: This must appear in the Prolog.
QuickXmlEvent::Decl(decl) => match Self::validate_decl(&decl) {
Err(x) => Some(Err(x)),
Ok(()) => self.refill_buf(),
},
// We do not support processing instructions.
// TODO: Convert this into an error/warning?
// Previously `xml-stylesheet` was present in some older
// source files and may linger for a bit after cleanup.
QuickXmlEvent::PI(..) => self.refill_buf(),
x => todo!("event: {:?}", x),
},
}
}
/// Validate an that an XML declaration contains expected values.
///
/// A declaration looks like `<?xml version="1.0" encoding="utf-8"?>`,
/// where `@encoding` is optional but `@version` is not.
/// It may also contain `@standalone`,
/// but we do not check for that.
///
/// We expect version 1.0 and UTF-8 encoding.
/// Failing when these expectations are voilated helps to ensure that
/// people unfamiliar with the system do not have expectations that
/// are going to be unmet,
/// which may result in subtle (or even serious) problems.
fn validate_decl(decl: &BytesDecl) -> Result<()> {
// NB: `quick-xml` docs state that `version` returns the quotes,
// but it does not.
let ver = &decl.version()?[..];
if ver != b"1.0" {
Err(Error::UnsupportedXmlVersion(
ver.intern_utf8()?,
UNKNOWN_SPAN,
))?
}
if let Some(enc) = decl.encoding() {
match &enc?[..] {
b"utf-8" | b"UTF-8" => (),
invalid => Err(Error::UnsupportedEncoding(
invalid.intern_utf8()?,
UNKNOWN_SPAN,
))?,
}
}
Ok(())
}
/// Parse opening element and its attributes into a XIR [`Token`]
/// stream.
///

View File

@ -431,3 +431,52 @@ fn attr_value_invalid_utf8() {
_ => panic!("unexpected failure"),
}
}
/// A declaration that omits `@encoding` is accepted and contributes no
/// tokens of its own;
/// only the tokens for the element that follows are produced.
#[test]
fn valid_xml_decl_no_encoding() {
    new_sut!(sut = r#"<?xml version="1.0"?><root />"#);

    let expected = vec![
        Token::Open("root".unwrap_into(), DUMMY_SPAN),
        Token::Close(None, DUMMY_SPAN),
    ];
    let given: Result<Vec<_>> = sut.collect();

    assert_eq!(Ok(expected), given);
}
/// The lowercase spelling `utf-8` is an accepted encoding;
/// a lone declaration produces an empty token stream.
#[test]
fn valid_xml_decl_with_encoding_lower() {
    new_sut!(sut = r#"<?xml version="1.0" encoding="utf-8"?>"#);

    let given: Result<Vec<_>> = sut.collect();

    assert_eq!(Ok(vec![]), given);
}
/// The uppercase spelling `UTF-8` is an accepted encoding;
/// a lone declaration produces an empty token stream.
#[test]
fn valid_xml_decl_with_encoding_upper() {
    new_sut!(sut = r#"<?xml version="1.0" encoding="UTF-8"?>"#);

    let given: Result<Vec<_>> = sut.collect();

    assert_eq!(Ok(vec![]), given);
}
// XML 1.0 is the only supported version,
// so a `1.1` declaration must be rejected.
#[test]
fn invalid_xml_decl_version() {
    new_sut!(sut = r#"<?xml version="1.1"?>"#);

    let expected = Error::UnsupportedXmlVersion("1.1".intern(), UNKNOWN_SPAN);
    let given: Result<Vec<_>> = sut.collect();

    assert_eq!(Err(expected), given);
}
// UTF-8 is the only supported encoding,
// so a `latin-1` declaration must be rejected.
#[test]
fn invalid_xml_encoding() {
    new_sut!(sut = r#"<?xml version="1.0" encoding="latin-1"?>"#);

    let expected = Error::UnsupportedEncoding("latin-1".intern(), UNKNOWN_SPAN);
    let given: Result<Vec<_>> = sut.collect();

    assert_eq!(Err(expected), given);
}