tamer: tameld: Skip fragment unescaping only to re-escape on write

Fragments' text were unescaped on reading, producing an owned String and
spending time parsing the text to unescape.  We were then copying that into
an internement pool (so, copying twice, effectively).

Further, we were then _re-escaping_ on write.

This was all wasteful, since we do not do any manipulation of the fragment
before outputting to the xmle file; we know that Saxon produced properly
escaped XML to begin with, and can trust to propagate it.

This also introduces a new global `clone_uninterned_utf8_unchecked` method.

In profiling this change, I tested (a) before this change, (b) after writing
without escaping, and (c) after both reading escaped and writing without
escaping.

     (a)              (b)              (c)
  sec   mem (B)    sec     B        sec     B
0:00.95 47896 -> 0:00.91 47988 -> 0:00.87 48288
0:00.40 30176 -> 0:00.37 25656 -> 0:00.36 25788
0:00.39 45672 -> 0:00.37 45756 -> 0:00.35 34952
0:00.39 20716 -> 0:00.38 19604 -> 0:00.36 19956
0:00.33 16836 -> 0:00.32 16988 -> 0:00.31 16892
0:00.23 15268 -> 0:00.23 15236 -> 0:00.22 15312
0:00.44 20780 -> 0:00.44 20048 -> 0:00.41 20148
0:00.54 44516 -> 0:00.50 36964 -> 0:00.49 36728
0:00.62 55976 -> 0:00.57 46204 -> 0:00.54 41468
0:00.31 28016 -> 0:00.30 27308 -> 0:00.28 23844
0:00.23 15388 -> 0:00.22 15316 -> 0:00.21 15304
0:00.05 4888  -> 0:00.05 4760  -> 0:00.05 4948
0:00.41 19756 -> 0:00.41 19852 -> 0:00.40 19992
0:00.47 20828 -> 0:00.46 20844 -> 0:00.44 20968
0:00.27 18152 -> 0:00.26 18184 -> 0:00.25 18312

Interestingly, the peak memory usage increases very slightly between the
second and third steps (though decreases from the first), likely because the
raw (encoded) is larger than the unencoded text (e.g. `>` takes more
space than `>`).
main
Mike Gerwitz 2021-08-18 11:39:06 -04:00
parent f97141f5c5
commit 1cdb3fbbc5
6 changed files with 88 additions and 28 deletions

View File

@ -29,6 +29,7 @@ use mock::MockXmlWriter as XmlWriter;
use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
#[cfg(not(test))]
use quick_xml::Writer as XmlWriter;
use std::borrow::Cow;
use std::io::Write;
/// Responsible for writing to the xmle files
@ -361,7 +362,9 @@ impl<W: Write> XmleWriter<W> {
match ident {
IdentObject::IdentFragment(_, _, _, frag) => {
self.writer.write_event(Event::Text(
BytesText::from_plain_str(&frag.lookup_str()),
BytesText::from_escaped_str(Cow::Borrowed(
&frag.lookup_str() as &str,
)),
))?;
}
// Cgen, Gen, and Lparam are not expected to be present, so we

View File

@ -138,8 +138,8 @@
use crate::ir::legacyir::{PackageAttrs, SymAttrs, SymType};
use crate::sym::{
GlobalSymbolIntern, GlobalSymbolInternUnchecked, GlobalSymbolResolve,
SymbolId, SymbolIndexSize, SymbolStr,
GlobalSymbolInternUnchecked, GlobalSymbolResolve, SymbolId,
SymbolIndexSize, SymbolStr,
};
#[cfg(test)]
use crate::test::quick_xml::MockBytesStart as BytesStart;
@ -624,8 +624,13 @@ where
/// Process `preproc:fragment` element.
///
/// This element represents the compiled code for the given symbol.
/// The result is a single [`XmloEvent::Fragment`] with an owned
/// [`String`] fragment and an interned `preproc:fragment/@id`.
/// The result is a single [`XmloEvent::Fragment`] with an interned
/// fragment and an interned `preproc:fragment/@id`.
/// The fragment is _left escaped_,
/// since it is assumed that it will be written back out verbatim
/// without further modification;
/// this save us from having to spend time re-escaping on output
/// down the line.
///
/// Errors
/// ======
@ -648,17 +653,18 @@ where
Ok(unsafe { attr.value.intern_utf8_unchecked() })
})?;
let text =
reader
.read_text(ele.name(), buffer)
.map_err(|err| match err {
InnerXmlError::TextNotFound => {
XmloError::MissingFragmentText(id.lookup_str())
}
_ => err.into(),
})?;
let text = match reader.read_event(buffer)? {
XmlEvent::Text(ev) => {
// It is wasteful to unescape only to have to re-escape
// again on write, so keep the text raw (escaped), and also
// trust that it's valid UTF-8, having come from the
// compiler.
Ok(unsafe { ev.escaped().clone_uninterned_utf8_unchecked() })
}
_ => Err(XmloError::MissingFragmentText(id.lookup_str())),
}?;
Ok(XmloEvent::Fragment(id, text.clone_uninterned()))
Ok(XmloEvent::Fragment(id, text))
}
/// Convert single-character `@dim` to a [`u8`].

View File

@ -314,16 +314,21 @@ xmlo_tests! {
}
fn fragment_event(sut) {
let expected = "fragment text".to_string();
sut.reader.next_text = Some(Ok(expected.clone()));
let expected = "fragment text";
sut.reader.next_event = Some(Box::new(|_, _| {
Ok(XmlEvent::Start(MockBytesStart::new(
sut.reader.next_event = Some(Box::new(|_, event_i| match event_i {
0 => Ok(XmlEvent::Start(MockBytesStart::new(
b"preproc:fragment",
Some(MockAttributes::new(vec![MockAttribute::new(
b"id", b"fragsym",
)])),
)))
))),
1 => Ok(XmlEvent::Text(MockBytesText::new(
b"fragment text"
))),
_ => Err(InnerXmlError::UnexpectedEof(
format!("MockXmlReader out of events: {}", event_i).into(),
)),
}));
let result = sut.read_event()?;
@ -333,12 +338,6 @@ xmlo_tests! {
XmloEvent::Fragment(sym, given)
if sym == "fragsym".intern() && given.lookup_str() == expected
));
// argument provided to underlying read_text
assert_eq!(
Some("preproc:fragment".to_string()),
sut.reader.given_text_ele
);
}
fn fragment_fails_with_missing_id(sut) {

View File

@ -147,7 +147,7 @@ pub trait Interner<'i, Ix: SymbolIndexSize> {
/// the result is [`None`].
fn index_lookup(&'i self, index: SymbolId<Ix>) -> Option<SymbolStr<'i>>;
/// Intern an assumed-UTF8 slice of bytes or return an existing
/// Intern an assumed-UTF-8 slice of bytes or return an existing
/// [`SymbolId`].
///
/// Safety
@ -162,6 +162,29 @@ pub trait Interner<'i, Ix: SymbolIndexSize> {
unsafe fn intern_utf8_unchecked(&self, value: &[u8]) -> SymbolId<Ix> {
self.intern(std::str::from_utf8_unchecked(value))
}
/// Copy the provided assumed-UTF-8 slice of bytes into the intern pool
/// and produce a symbol,
/// but do not intern the symbol.
///
/// See [`clone_uninterned`](Interner::clone_uninterned) for more
/// information.
///
/// Safety
/// ======
/// This function is unsafe because it uses
/// [`std::str::from_utf8_unchecked`].
/// It is provided for convenience when interning from trusted binary
/// data
/// (such as [object files][]).
///
/// [object files]: crate::obj
unsafe fn clone_uninterned_utf8_unchecked(
&self,
value: &[u8],
) -> SymbolId<Ix> {
self.clone_uninterned(std::str::from_utf8_unchecked(value))
}
}
/// An interner backed by an [arena](bumpalo).

View File

@ -354,7 +354,9 @@ pub trait GlobalSymbolIntern<Ix: SymbolIndexSize> {
/// See also [`GlobalSymbolIntern`].
/// This uses [`Interner::intern_utf8_unchecked`].
pub trait GlobalSymbolInternUnchecked<Ix: SymbolIndexSize> {
/// Intern a bye slice using a global interner.
/// Intern a byte slice using a global interner.
///
/// See also [`GlobalSymbolIntern::intern`].
///
/// Safety
/// ======
@ -366,6 +368,23 @@ pub trait GlobalSymbolInternUnchecked<Ix: SymbolIndexSize> {
///
/// [object files]: crate::obj
unsafe fn intern_utf8_unchecked(self) -> SymbolId<Ix>;
/// Copy the provided assumed-UTF-8 byte slice into the intern pool and
/// produce a symbol using a global interner,
/// but do not intern the symbol.
///
/// See also [`GlobalSymbolIntern::clone_uninterned`].
///
/// Safety
/// ======
/// This function is unsafe because it uses
/// [`Interner::intern_utf8_unchecked`].
/// It is provided for convenience when interning from trusted binary
/// data
/// (such as [object files][]).
///
/// [object files]: crate::obj
unsafe fn clone_uninterned_utf8_unchecked(self) -> SymbolId<Ix>;
}
impl<Ix: SymbolIndexSize> GlobalSymbolIntern<Ix> for &str {
@ -384,6 +403,12 @@ impl<Ix: SymbolIndexSize> GlobalSymbolInternUnchecked<Ix> for &[u8] {
interner.intern_utf8_unchecked(self)
})
}
unsafe fn clone_uninterned_utf8_unchecked(self) -> SymbolId<Ix> {
Ix::with_static_interner(|interner| {
interner.clone_uninterned_utf8_unchecked(self)
})
}
}
impl<T, Ix> From<T> for SymbolId<Ix>

View File

@ -79,6 +79,10 @@ impl<'a> MockBytesText<'a> {
content: Cow::Borrowed(content),
}
}
pub fn escaped(&self) -> &[u8] {
self.content.as_ref()
}
}
pub struct MockAttributes<'a> {