tamer: tameld: Skip fragment unescaping only to re-escape on write
Fragments' text were unescaped on reading, producing an owned String and spending time parsing the text to unescape. We were then copying that into an internement pool (so, copying twice, effectively). Further, we were then _re-escaping_ on write. This was all wasteful, since we do not do any manipulation of the fragment before outputting to the xmle file; we know that Saxon produced properly escaped XML to begin with, and can trust to propagate it. This also introduces a new global `clone_uninterned_utf8_unchecked` method. In profiling this change, I tested (a) before this change, (b) after writing without escaping, and (c) after both reading escaped and writing without escaping. (a) (b) (c) sec mem (B) sec B sec B 0:00.95 47896 -> 0:00.91 47988 -> 0:00.87 48288 0:00.40 30176 -> 0:00.37 25656 -> 0:00.36 25788 0:00.39 45672 -> 0:00.37 45756 -> 0:00.35 34952 0:00.39 20716 -> 0:00.38 19604 -> 0:00.36 19956 0:00.33 16836 -> 0:00.32 16988 -> 0:00.31 16892 0:00.23 15268 -> 0:00.23 15236 -> 0:00.22 15312 0:00.44 20780 -> 0:00.44 20048 -> 0:00.41 20148 0:00.54 44516 -> 0:00.50 36964 -> 0:00.49 36728 0:00.62 55976 -> 0:00.57 46204 -> 0:00.54 41468 0:00.31 28016 -> 0:00.30 27308 -> 0:00.28 23844 0:00.23 15388 -> 0:00.22 15316 -> 0:00.21 15304 0:00.05 4888 -> 0:00.05 4760 -> 0:00.05 4948 0:00.41 19756 -> 0:00.41 19852 -> 0:00.40 19992 0:00.47 20828 -> 0:00.46 20844 -> 0:00.44 20968 0:00.27 18152 -> 0:00.26 18184 -> 0:00.25 18312 Interestingly, the peak memory usage increases very slightly between the second and third steps (though decreases from the first), likely because the raw (encoded) is larger than the unencoded text (e.g. `>` takes more space than `>`).main
parent
f97141f5c5
commit
1cdb3fbbc5
|
@ -29,6 +29,7 @@ use mock::MockXmlWriter as XmlWriter;
|
|||
use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
|
||||
#[cfg(not(test))]
|
||||
use quick_xml::Writer as XmlWriter;
|
||||
use std::borrow::Cow;
|
||||
use std::io::Write;
|
||||
|
||||
/// Responsible for writing to the xmle files
|
||||
|
@ -361,7 +362,9 @@ impl<W: Write> XmleWriter<W> {
|
|||
match ident {
|
||||
IdentObject::IdentFragment(_, _, _, frag) => {
|
||||
self.writer.write_event(Event::Text(
|
||||
BytesText::from_plain_str(&frag.lookup_str()),
|
||||
BytesText::from_escaped_str(Cow::Borrowed(
|
||||
&frag.lookup_str() as &str,
|
||||
)),
|
||||
))?;
|
||||
}
|
||||
// Cgen, Gen, and Lparam are not expected to be present, so we
|
||||
|
|
|
@ -138,8 +138,8 @@
|
|||
|
||||
use crate::ir::legacyir::{PackageAttrs, SymAttrs, SymType};
|
||||
use crate::sym::{
|
||||
GlobalSymbolIntern, GlobalSymbolInternUnchecked, GlobalSymbolResolve,
|
||||
SymbolId, SymbolIndexSize, SymbolStr,
|
||||
GlobalSymbolInternUnchecked, GlobalSymbolResolve, SymbolId,
|
||||
SymbolIndexSize, SymbolStr,
|
||||
};
|
||||
#[cfg(test)]
|
||||
use crate::test::quick_xml::MockBytesStart as BytesStart;
|
||||
|
@ -624,8 +624,13 @@ where
|
|||
/// Process `preproc:fragment` element.
|
||||
///
|
||||
/// This element represents the compiled code for the given symbol.
|
||||
/// The result is a single [`XmloEvent::Fragment`] with an owned
|
||||
/// [`String`] fragment and an interned `preproc:fragment/@id`.
|
||||
/// The result is a single [`XmloEvent::Fragment`] with an interned
|
||||
/// fragment and an interned `preproc:fragment/@id`.
|
||||
/// The fragment is _left escaped_,
|
||||
/// since it is assumed that it will be written back out verbatim
|
||||
/// without further modification;
|
||||
/// this save us from having to spend time re-escaping on output
|
||||
/// down the line.
|
||||
///
|
||||
/// Errors
|
||||
/// ======
|
||||
|
@ -648,17 +653,18 @@ where
|
|||
Ok(unsafe { attr.value.intern_utf8_unchecked() })
|
||||
})?;
|
||||
|
||||
let text =
|
||||
reader
|
||||
.read_text(ele.name(), buffer)
|
||||
.map_err(|err| match err {
|
||||
InnerXmlError::TextNotFound => {
|
||||
XmloError::MissingFragmentText(id.lookup_str())
|
||||
}
|
||||
_ => err.into(),
|
||||
})?;
|
||||
let text = match reader.read_event(buffer)? {
|
||||
XmlEvent::Text(ev) => {
|
||||
// It is wasteful to unescape only to have to re-escape
|
||||
// again on write, so keep the text raw (escaped), and also
|
||||
// trust that it's valid UTF-8, having come from the
|
||||
// compiler.
|
||||
Ok(unsafe { ev.escaped().clone_uninterned_utf8_unchecked() })
|
||||
}
|
||||
_ => Err(XmloError::MissingFragmentText(id.lookup_str())),
|
||||
}?;
|
||||
|
||||
Ok(XmloEvent::Fragment(id, text.clone_uninterned()))
|
||||
Ok(XmloEvent::Fragment(id, text))
|
||||
}
|
||||
|
||||
/// Convert single-character `@dim` to a [`u8`].
|
||||
|
|
|
@ -314,16 +314,21 @@ xmlo_tests! {
|
|||
}
|
||||
|
||||
fn fragment_event(sut) {
|
||||
let expected = "fragment text".to_string();
|
||||
sut.reader.next_text = Some(Ok(expected.clone()));
|
||||
let expected = "fragment text";
|
||||
|
||||
sut.reader.next_event = Some(Box::new(|_, _| {
|
||||
Ok(XmlEvent::Start(MockBytesStart::new(
|
||||
sut.reader.next_event = Some(Box::new(|_, event_i| match event_i {
|
||||
0 => Ok(XmlEvent::Start(MockBytesStart::new(
|
||||
b"preproc:fragment",
|
||||
Some(MockAttributes::new(vec![MockAttribute::new(
|
||||
b"id", b"fragsym",
|
||||
)])),
|
||||
)))
|
||||
))),
|
||||
1 => Ok(XmlEvent::Text(MockBytesText::new(
|
||||
b"fragment text"
|
||||
))),
|
||||
_ => Err(InnerXmlError::UnexpectedEof(
|
||||
format!("MockXmlReader out of events: {}", event_i).into(),
|
||||
)),
|
||||
}));
|
||||
|
||||
let result = sut.read_event()?;
|
||||
|
@ -333,12 +338,6 @@ xmlo_tests! {
|
|||
XmloEvent::Fragment(sym, given)
|
||||
if sym == "fragsym".intern() && given.lookup_str() == expected
|
||||
));
|
||||
|
||||
// argument provided to underlying read_text
|
||||
assert_eq!(
|
||||
Some("preproc:fragment".to_string()),
|
||||
sut.reader.given_text_ele
|
||||
);
|
||||
}
|
||||
|
||||
fn fragment_fails_with_missing_id(sut) {
|
||||
|
|
|
@ -147,7 +147,7 @@ pub trait Interner<'i, Ix: SymbolIndexSize> {
|
|||
/// the result is [`None`].
|
||||
fn index_lookup(&'i self, index: SymbolId<Ix>) -> Option<SymbolStr<'i>>;
|
||||
|
||||
/// Intern an assumed-UTF8 slice of bytes or return an existing
|
||||
/// Intern an assumed-UTF-8 slice of bytes or return an existing
|
||||
/// [`SymbolId`].
|
||||
///
|
||||
/// Safety
|
||||
|
@ -162,6 +162,29 @@ pub trait Interner<'i, Ix: SymbolIndexSize> {
|
|||
unsafe fn intern_utf8_unchecked(&self, value: &[u8]) -> SymbolId<Ix> {
|
||||
self.intern(std::str::from_utf8_unchecked(value))
|
||||
}
|
||||
|
||||
/// Copy the provided assumed-UTF-8 slice of bytes into the intern pool
|
||||
/// and produce a symbol,
|
||||
/// but do not intern the symbol.
|
||||
///
|
||||
/// See [`clone_uninterned`](Interner::clone_uninterned) for more
|
||||
/// information.
|
||||
///
|
||||
/// Safety
|
||||
/// ======
|
||||
/// This function is unsafe because it uses
|
||||
/// [`std::str::from_utf8_unchecked`].
|
||||
/// It is provided for convenience when interning from trusted binary
|
||||
/// data
|
||||
/// (such as [object files][]).
|
||||
///
|
||||
/// [object files]: crate::obj
|
||||
unsafe fn clone_uninterned_utf8_unchecked(
|
||||
&self,
|
||||
value: &[u8],
|
||||
) -> SymbolId<Ix> {
|
||||
self.clone_uninterned(std::str::from_utf8_unchecked(value))
|
||||
}
|
||||
}
|
||||
|
||||
/// An interner backed by an [arena](bumpalo).
|
||||
|
|
|
@ -354,7 +354,9 @@ pub trait GlobalSymbolIntern<Ix: SymbolIndexSize> {
|
|||
/// See also [`GlobalSymbolIntern`].
|
||||
/// This uses [`Interner::intern_utf8_unchecked`].
|
||||
pub trait GlobalSymbolInternUnchecked<Ix: SymbolIndexSize> {
|
||||
/// Intern a bye slice using a global interner.
|
||||
/// Intern a byte slice using a global interner.
|
||||
///
|
||||
/// See also [`GlobalSymbolIntern::intern`].
|
||||
///
|
||||
/// Safety
|
||||
/// ======
|
||||
|
@ -366,6 +368,23 @@ pub trait GlobalSymbolInternUnchecked<Ix: SymbolIndexSize> {
|
|||
///
|
||||
/// [object files]: crate::obj
|
||||
unsafe fn intern_utf8_unchecked(self) -> SymbolId<Ix>;
|
||||
|
||||
/// Copy the provided assumed-UTF-8 byte slice into the intern pool and
|
||||
/// produce a symbol using a global interner,
|
||||
/// but do not intern the symbol.
|
||||
///
|
||||
/// See also [`GlobalSymbolIntern::clone_uninterned`].
|
||||
///
|
||||
/// Safety
|
||||
/// ======
|
||||
/// This function is unsafe because it uses
|
||||
/// [`Interner::intern_utf8_unchecked`].
|
||||
/// It is provided for convenience when interning from trusted binary
|
||||
/// data
|
||||
/// (such as [object files][]).
|
||||
///
|
||||
/// [object files]: crate::obj
|
||||
unsafe fn clone_uninterned_utf8_unchecked(self) -> SymbolId<Ix>;
|
||||
}
|
||||
|
||||
impl<Ix: SymbolIndexSize> GlobalSymbolIntern<Ix> for &str {
|
||||
|
@ -384,6 +403,12 @@ impl<Ix: SymbolIndexSize> GlobalSymbolInternUnchecked<Ix> for &[u8] {
|
|||
interner.intern_utf8_unchecked(self)
|
||||
})
|
||||
}
|
||||
|
||||
unsafe fn clone_uninterned_utf8_unchecked(self) -> SymbolId<Ix> {
|
||||
Ix::with_static_interner(|interner| {
|
||||
interner.clone_uninterned_utf8_unchecked(self)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, Ix> From<T> for SymbolId<Ix>
|
||||
|
|
|
@ -79,6 +79,10 @@ impl<'a> MockBytesText<'a> {
|
|||
content: Cow::Borrowed(content),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn escaped(&self) -> &[u8] {
|
||||
self.content.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MockAttributes<'a> {
|
||||
|
|
Loading…
Reference in New Issue