tamer: tamec: Replace copy with XIR parsing/writing
When wip-frontends is on, this will parse the input file using XIR and then immediately output it again. This makes the necessary changes to be able to read every source file we have in our largest project, such that the output is identical after having been formatted with `xmllint --format -` (there are differences because e.g. whitespace between attributes is not yet maintained). This is performant too, with times remaining essentially identical despite the additional work. DEV-10413
main
parent
b90bf9d8a8
commit
99aacaf7ca
|
@ -23,18 +23,18 @@
|
|||
extern crate tamer;
|
||||
|
||||
use getopts::{Fail, Options};
|
||||
use std::env;
|
||||
use std::error::Error;
|
||||
use std::ffi::OsStr;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::{env, io::BufWriter};
|
||||
use tamer::{
|
||||
iter::into_iter_while_ok,
|
||||
xir::{reader::XmlXirReader, DefaultEscaper},
|
||||
};
|
||||
|
||||
#[cfg(feature = "wip-frontends")]
|
||||
use {
|
||||
std::io::BufReader,
|
||||
tamer::frontend::{FrontendEvent, FrontendParser, XmlFrontendParser},
|
||||
tamer::fs::File,
|
||||
};
|
||||
use {std::io::BufReader, tamer::fs::File};
|
||||
|
||||
/// Types of commands
|
||||
enum Command {
|
||||
|
@ -58,24 +58,26 @@ pub fn main() -> Result<(), Box<dyn Error>> {
|
|||
|
||||
let dest = Path::new(&output);
|
||||
|
||||
// This will eventually replace `fs::copy` below.
|
||||
#[cfg(not(feature = "wip-frontends"))]
|
||||
fs::copy(source, dest)?;
|
||||
|
||||
#[cfg(feature = "wip-frontends")]
|
||||
{
|
||||
use tamer::xir::writer::XmlWriter;
|
||||
|
||||
let escaper = DefaultEscaper::default();
|
||||
let file: BufReader<fs::File> = File::open(source)?;
|
||||
let mut parser = XmlFrontendParser::new(file);
|
||||
let mut fout = BufWriter::new(fs::File::create(dest)?);
|
||||
|
||||
// Parse all the way through, but don't do anything with it
|
||||
// yet.
|
||||
loop {
|
||||
match parser.parse_next()? {
|
||||
FrontendEvent::Eof => break,
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
// Parse into XIR and re-lower into XML,
|
||||
// which is similar to a copy but proves that we're able
|
||||
// to parse source files.
|
||||
into_iter_while_ok(
|
||||
XmlXirReader::new(file, &escaper),
|
||||
|toks| toks.write(&mut fout, Default::default(), &escaper),
|
||||
)??;
|
||||
}
|
||||
|
||||
fs::copy(source, dest)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Ok(Command::Usage) => {
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
|
||||
//! XIR error information.
|
||||
|
||||
use crate::tpwrap::quick_xml;
|
||||
use crate::{span::Span, sym::SymbolId, tpwrap::quick_xml};
|
||||
use std::{fmt::Display, str::Utf8Error};
|
||||
|
||||
/// Error attempting to produce a XIR object.
|
||||
|
@ -31,12 +31,23 @@ pub enum Error {
|
|||
NotWhitespace(String),
|
||||
/// Provided QName is not valid.
|
||||
InvalidQName(Vec<u8>),
|
||||
// A UTF-8 error together with the byte slice that caused it.
|
||||
//
|
||||
// By storing the raw bytes instead of a string,
|
||||
// we allow the displayer to determine how to handle invalid UTF-8
|
||||
// encodings.
|
||||
/// A UTF-8 error together with the byte slice that caused it.
|
||||
///
|
||||
/// By storing the raw bytes instead of a string,
|
||||
/// we allow the displayer to determine how to handle invalid UTF-8
|
||||
/// encodings.
|
||||
InvalidUtf8(Utf8Error, Vec<u8>),
|
||||
/// XML 1.0 only.
|
||||
///
|
||||
/// Other versions are not widely in use
|
||||
/// (only 1.1 exists at the time of writing)
|
||||
/// and providing that is either in error,
|
||||
/// copy/paste,
|
||||
/// or the user is expecting something they're not going to get.
|
||||
UnsupportedXmlVersion(SymbolId, Span),
|
||||
/// TAMER expects UTF-8 encoding for everything,
|
||||
/// which should not be an unreasonable expectation.
|
||||
UnsupportedEncoding(SymbolId, Span),
|
||||
|
||||
// TODO: Better error translation and spans.
|
||||
QuickXmlError(quick_xml::Error),
|
||||
|
@ -66,6 +77,23 @@ impl Display for Error {
|
|||
String::from_utf8_lossy(bytes)
|
||||
)
|
||||
}
|
||||
Self::UnsupportedXmlVersion(ver, span) => {
|
||||
write!(
|
||||
f,
|
||||
"expected XML version `1.0` at {span}, \
|
||||
but found unsupported version `{ver}`"
|
||||
)
|
||||
}
|
||||
Self::UnsupportedEncoding(enc, span) => {
|
||||
// TODO: when we have hints,
|
||||
// indicate that they can also entirely remove this
|
||||
// attribute to resolve the error
|
||||
write!(
|
||||
f,
|
||||
"expected `utf-8` or `UTF-8` encoding at {span}, \
|
||||
but found unsupported encoding `{enc}`"
|
||||
)
|
||||
}
|
||||
// TODO: See Error TODO
|
||||
Self::QuickXmlError(inner) => {
|
||||
write!(f, "internal parser error: {:?}", inner)
|
||||
|
@ -91,6 +119,6 @@ impl From<(Utf8Error, &[u8])> for Error {
|
|||
|
||||
// Blanket conversion: any error type `E` that can be converted into a
// `quick_xml::Error` is wrapped in `Error::QuickXmlError`,
// which lets `?` be used directly on such results.
impl<E: Into<quick_xml::Error>> From<E> for Error {
|
||||
fn from(err: E) -> Self {
|
||||
Self::QuickXmlError(err.into().into())
|
||||
Self::QuickXmlError(err.into())
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,10 +22,15 @@
|
|||
//! This uses [`quick_xml`] as the parser.
|
||||
|
||||
use super::{DefaultEscaper, Error, Escaper, Token};
|
||||
use crate::{span::DUMMY_SPAN, sym::GlobalSymbolInternBytes};
|
||||
use crate::{
|
||||
span::{DUMMY_SPAN, UNKNOWN_SPAN},
|
||||
sym::GlobalSymbolInternBytes,
|
||||
};
|
||||
use quick_xml::{
|
||||
self,
|
||||
events::{attributes::Attributes, BytesStart, Event as QuickXmlEvent},
|
||||
events::{
|
||||
attributes::Attributes, BytesDecl, BytesStart, Event as QuickXmlEvent,
|
||||
},
|
||||
};
|
||||
use std::{collections::VecDeque, io::BufRead, result};
|
||||
|
||||
|
@ -163,11 +168,59 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
|
|||
.map(|text| Token::Comment(text, DUMMY_SPAN)),
|
||||
),
|
||||
|
||||
// TODO: This must appear in the Prolog.
|
||||
QuickXmlEvent::Decl(decl) => match Self::validate_decl(&decl) {
|
||||
Err(x) => Some(Err(x)),
|
||||
Ok(()) => self.refill_buf(),
|
||||
},
|
||||
|
||||
// We do not support processing instructions.
|
||||
// TODO: Convert this into an error/warning?
|
||||
// Previously `xml-stylesheet` was present in some older
|
||||
// source files and may linger for a bit after cleanup.
|
||||
QuickXmlEvent::PI(..) => self.refill_buf(),
|
||||
|
||||
x => todo!("event: {:?}", x),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Validate that an XML declaration contains expected values.
|
||||
///
|
||||
/// A declaration looks like `<?xml version="1.0" encoding="utf-8"?>`,
|
||||
/// where `@encoding` is optional but `@version` is not.
|
||||
/// It may also contain `@standalone`,
|
||||
/// but we do not check for that.
|
||||
///
|
||||
/// We expect version 1.0 and UTF-8 encoding.
|
||||
/// Failing when these expectations are violated helps to ensure that
|
||||
/// people unfamiliar with the system do not have expectations that
|
||||
/// are going to be unmet,
|
||||
/// which may result in subtle (or even serious) problems.
///
/// # Errors
///
/// Returns [`Error::UnsupportedXmlVersion`] if the version is anything
/// other than `1.0`,
/// and [`Error::UnsupportedEncoding`] if an encoding is present and is
/// neither `utf-8` nor `UTF-8`.
/// Failures reading either value or interning the offending bytes are
/// propagated via `?`.
/// Both of the above errors currently carry `UNKNOWN_SPAN`.
|
||||
fn validate_decl(decl: &BytesDecl) -> Result<()> {
|
||||
// NB: `quick-xml` docs state that `version` returns the quotes,
|
||||
// but it does not.
|
||||
let ver = &decl.version()?[..];
|
||||
// Byte-exact comparison; `Err(..)?` returns early from the function.
if ver != b"1.0" {
|
||||
Err(Error::UnsupportedXmlVersion(
|
||||
ver.intern_utf8()?,
|
||||
UNKNOWN_SPAN,
|
||||
))?
|
||||
}
|
||||
|
||||
// `@encoding` is optional, so it is only checked when present.
if let Some(enc) = decl.encoding() {
|
||||
match &enc?[..] {
|
||||
// Only these two exact spellings are accepted.
b"utf-8" | b"UTF-8" => (),
|
||||
invalid => Err(Error::UnsupportedEncoding(
|
||||
invalid.intern_utf8()?,
|
||||
UNKNOWN_SPAN,
|
||||
))?,
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Parse opening element and its attributes into a XIR [`Token`]
|
||||
/// stream.
|
||||
///
|
||||
|
|
|
@ -431,3 +431,52 @@ fn attr_value_invalid_utf8() {
|
|||
_ => panic!("unexpected failure"),
|
||||
}
|
||||
}
|
||||
|
||||
// A declaration with only `@version="1.0"` is accepted and contributes no
// tokens of its own;
// only the tokens for the following `<root />` element are expected.
#[test]
|
||||
fn valid_xml_decl_no_encoding() {
|
||||
new_sut!(sut = r#"<?xml version="1.0"?><root />"#);
|
||||
|
||||
assert_eq!(
|
||||
Ok(vec![
|
||||
Token::Open("root".unwrap_into(), DUMMY_SPAN),
|
||||
Token::Close(None, DUMMY_SPAN),
|
||||
]),
|
||||
sut.collect()
|
||||
);
|
||||
}
|
||||
|
||||
// Lowercase `utf-8` is an accepted encoding;
// a lone declaration is expected to yield an empty token stream.
#[test]
|
||||
fn valid_xml_decl_with_encoding_lower() {
|
||||
new_sut!(sut = r#"<?xml version="1.0" encoding="utf-8"?>"#);
|
||||
|
||||
assert_eq!(Ok(vec![]), sut.collect());
|
||||
}
|
||||
|
||||
// Uppercase `UTF-8` is an accepted encoding as well;
// a lone declaration is expected to yield an empty token stream.
#[test]
|
||||
fn valid_xml_decl_with_encoding_upper() {
|
||||
new_sut!(sut = r#"<?xml version="1.0" encoding="UTF-8"?>"#);
|
||||
|
||||
assert_eq!(Ok(vec![]), sut.collect());
|
||||
}
|
||||
|
||||
// Only 1.0 supported.
// A `1.1` declaration is expected to fail with `UnsupportedXmlVersion`
// carrying the interned version string and `UNKNOWN_SPAN`.
|
||||
#[test]
|
||||
fn invalid_xml_decl_version() {
|
||||
new_sut!(sut = r#"<?xml version="1.1"?>"#);
|
||||
|
||||
assert_eq!(
|
||||
Err(Error::UnsupportedXmlVersion("1.1".intern(), UNKNOWN_SPAN)),
|
||||
sut.collect::<Result<Vec<_>>>()
|
||||
);
|
||||
}
|
||||
|
||||
// Only UTF-8 supported.
// A `latin-1` encoding is expected to fail with `UnsupportedEncoding`
// carrying the interned encoding name and `UNKNOWN_SPAN`.
|
||||
#[test]
|
||||
fn invalid_xml_encoding() {
|
||||
new_sut!(sut = r#"<?xml version="1.0" encoding="latin-1"?>"#);
|
||||
|
||||
assert_eq!(
|
||||
Err(Error::UnsupportedEncoding("latin-1".intern(), UNKNOWN_SPAN)),
|
||||
sut.collect::<Result<Vec<_>>>()
|
||||
);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue