tame/tamer/benches/xir.rs

259 lines
9.1 KiB
Rust
Raw Normal View History

// Benchmarks assessing the overhead of XIR relative to baselines.
//
// Copyright (C) 2014-2023 Ryan Specialty, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
#![feature(test)]
//! Assessment of overhead of Xir compared to baselines.
//!
//! A lot of time in TAMER is spent parsing and writing XML files, so it's
//! important that these operations be efficient.
//! Xir is intended to be a very lightweight IR,
//! able to provide convenient abstractions and validations only when
//! both necessary and desired.
//!
//! Rust touts "zero-cost abstractions",
//! which is a generally true statement (with some exceptions) that allows
//! us to create dense newtype abstractions that represent validated and
//! structured data,
//! at a compile-time but not runtime cost.
//! These tests serve to demonstrate that such a claim is true for Xir,
//! and help to obviate any potential future regressions.
extern crate quick_xml;
extern crate tamer;
extern crate test;
use std::convert::{TryFrom, TryInto};
use tamer::sym::{GlobalSymbolIntern, GlobalSymbolResolve, SymbolId};
use tamer::xir::{NCName, QName, Token};
use test::Bencher;
// Produce `n` strings for use as benchmark input.
//
// The string at index `i` is the decimal representation of `i` immediately
// followed by `suffix`
//   (e.g. `gen_strs(2, "foo")` yields `["0foo", "1foo"]`).
fn gen_strs(n: usize, suffix: &str) -> Vec<String> {
    let mut strs = Vec::with_capacity(n);

    for i in 0..n {
        strs.push(format!("{i}{suffix}"));
    }

    strs
}
// Benchmarks for constructing XIR names (`NCName`, `QName`) from strings,
// alongside local baselines for the operations that they are expected to
// be composed of.
//
// Every benchmark passes each per-item result through `black_box` before
// discarding it.
// Without that barrier the closure given to `Bencher::iter` returns `()`
// and the optimizer is free to treat side-effect-free work
//   (such as the `contains` check)
// as dead code,
//   which would make the comparisons between these tests meaningless.
mod name {
    use super::*;
    use std::hint::black_box;

    // Essentially duplicates sym::interner::global::with_all_new_1000, but
    // provides a local baseline that we can be sure will be available to
    // compare against, at a glance.
    #[bench]
    fn baseline_global_intern_str_1000(bench: &mut Bencher) {
        let strs = gen_strs(1000, "foobar");

        bench.iter(|| {
            strs.iter()
                .map(|s| s.as_str().intern() as SymbolId)
                .map(black_box)
                .for_each(drop);
        });
    }

    // This should be cost-free relative to the previous test.
    #[bench]
    fn ncname_new_unchecked_str_intern_1000(bench: &mut Bencher) {
        let strs = gen_strs(1000, "foobar");

        bench.iter(|| {
            strs.iter()
                // SAFETY: The generated inputs are decimal digits followed
                // by "foobar" and so never contain a ':'.
                // NOTE(review): This presumes that the absence of ':' is
                // what `NCName::new_unchecked` requires of its caller
                // (cf. the `contains(':')` baseline below); confirm
                // against its documented contract.
                .map(|s| unsafe { NCName::new_unchecked(s.as_str().intern()) })
                .map(black_box)
                .for_each(drop);
        });
    }

    // This duplicates a memchr test, but allows us to have a comparable
    // baseline at a glance.
    #[bench]
    fn baseline_str_contains_1000(bench: &mut Bencher) {
        let strs = gen_strs(1000, "foobar");

        bench.iter(|| {
            strs.iter()
                .map(|s| s.as_str().contains(':'))
                .map(black_box)
                .for_each(drop);
        });
    }

    // This should be approximately as expensive as the two baselines added
    // together.
    #[bench]
    fn ncname_try_from_str_1000(bench: &mut Bencher) {
        let strs = gen_strs(1000, "foobar");

        bench.iter(|| {
            strs.iter()
                .map(|s| NCName::try_from(s.as_str()))
                .map(black_box)
                .for_each(drop);
        });
    }

    // Should be ~2x previous test, since it contains two `NCName`s.
    #[bench]
    fn qname_try_from_str_pair_1000(bench: &mut Bencher) {
        let prefixes = gen_strs(1000, "prefix");
        let names = gen_strs(1000, "name");

        bench.iter(|| {
            prefixes
                .iter()
                .zip(names.iter())
                .map(|(p, s)| QName::try_from((p.as_str(), s.as_str())))
                .map(black_box)
                .for_each(drop);
        });
    }
}
mod writer {
use super::*;
use quick_xml::{
events::{BytesStart, BytesText, Event as XmlEvent},
Writer as QuickXmlWriter,
};
use std::borrow::Cow;
tamer: xir: Introduce {Ele,Open,Close}Span This isn't conceptally all that significant of a change, but there was a lot of modify to get it working. I would generally separate this into a commit for the implementation and another commit for the integration, but I decided to keep things together. This serves a role similar to AttrSpan---this allows deriving a span representing the element name from a span representing the entire XIR token. This will provide more useful context for errors---including the tag delimiter(s) means that we care about the fact that an element is in that position (as opposed to some other type of node) within the context of an error. However, if we are expecting an element but take issue with the element name itself, we want to place emphasis on that instead. This also starts to consider the issue of span contexts---a blob of detached data that is `Span` is useful for error context, but it's not useful for manipulation or deriving additional information. For that, we need to encode additional context, and this is an attempt at that. I am interested in the concept of providing Spans that are guaranteed to actually make sense---that are instantiated and manipulated with APIs that ensure consistency. But such a thing buys us very little, practically speaking, over what I have now for TAMER, and so I don't expect to actually implement that for this project; I'll leave that for a personal project. TAMER's already take a lot of my personal interests and it can cause me a lot of grief sometimes (with regards to letting my aspirations cause me more work). DEV-7145
2022-06-24 13:51:49 -04:00
use tamer::xir::{writer::XmlWriter, CloseSpan, Escaper, OpenSpan};
use tamer::{span::Span, xir::DefaultEscaper};
// Sample text payload for the text-writing benchmarks
// (`baseline_quick_xml_text_50` and `xir_text_50`),
//   each of which writes it 50 times per benchmark iteration and sizes
//   its initial output buffer from this string's length.
const FRAGMENT: &str = r#"<fragment>
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.
This is pretend fragment text. We need a lot of it.</fragment>
"#;
// Baseline for attribute-heavy output using quick-xml directly.
//
// TAME makes heavy use of attributes,
//   which unfortunately requires copies in quick-xml.
// We want XIR to perform _at least_ as well as this
//   (and it does end up performing much better,
//     despite the global symbol lookups).
//
// Each benchmark iteration writes 1000 empty `test:foo` elements,
//   each with two attributes,
//   into a single writer that lives for the whole benchmark.
#[bench]
fn baseline_quick_xml_empty_with_attrs_1000(bench: &mut Bencher) {
    let mut xml_writer = QuickXmlWriter::new(Vec::<u8>::new());

    bench.iter(|| {
        for _ in 0..1000 {
            let ele = BytesStart::borrowed_name(b"test:foo")
                .with_attributes(vec![("first", "value"), ("second", "value2")]);

            xml_writer.write_event(XmlEvent::Empty(ele)).unwrap();
        }
    });
}
// Produces the same output as above.
#[bench]
fn xir_empty_with_attrs_preinterned_1000(bench: &mut Bencher) {
let mut buf = Vec::<u8>::new();
// Perform all interning beforehand, since in practice, values will
// have been interned well before we get to the writer. Further,
// common values such as these (QNames) will be pre-defined and
// reused.
let span = Span::from_byte_interval((0, 0), "path".intern());
let name = QName::try_from(("test", "foo")).unwrap();
let attr1 = QName::new_local("first".try_into().unwrap());
let attr2 = QName::new_local("second".try_into().unwrap());
let val1 = "value".intern();
let val2 = "value2".intern();
// Prime the cache, since BytesStart is already assumed to be
// escaped. We will have cached on read in a real-world scenario.
let escaper = DefaultEscaper::default();
escaper.escape(val1);
escaper.escape(val2);
bench.iter(|| {
(0..1000).for_each(|_| {
vec![
tamer: xir: Introduce {Ele,Open,Close}Span This isn't conceptally all that significant of a change, but there was a lot of modify to get it working. I would generally separate this into a commit for the implementation and another commit for the integration, but I decided to keep things together. This serves a role similar to AttrSpan---this allows deriving a span representing the element name from a span representing the entire XIR token. This will provide more useful context for errors---including the tag delimiter(s) means that we care about the fact that an element is in that position (as opposed to some other type of node) within the context of an error. However, if we are expecting an element but take issue with the element name itself, we want to place emphasis on that instead. This also starts to consider the issue of span contexts---a blob of detached data that is `Span` is useful for error context, but it's not useful for manipulation or deriving additional information. For that, we need to encode additional context, and this is an attempt at that. I am interested in the concept of providing Spans that are guaranteed to actually make sense---that are instantiated and manipulated with APIs that ensure consistency. But such a thing buys us very little, practically speaking, over what I have now for TAMER, and so I don't expect to actually implement that for this project; I'll leave that for a personal project. TAMER's already take a lot of my personal interests and it can cause me a lot of grief sometimes (with regards to letting my aspirations cause me more work). DEV-7145
2022-06-24 13:51:49 -04:00
Token::Open(name, OpenSpan::without_name_span(span)),
Token::AttrName(attr1, span),
tamer: xir::XirString: WIP implementation (likely going away) I'm not fond of this implementation, which is why it's not fully completed. I wanted to commit this for future reference, and take the opportunity to explain why I don't like it. First: this task started as an idea to implement a third variant to AttrValue and friends that indicates that a value is fixed, in the sense of a fixed-point function: escaped or unescaped, its value is the same. This would allow us to skip wasteful escape/unescape operations. In doing so, it became obvious that there's no need to leak this information through the API, and indeed, no part of the system should care. When we read XML, it should be unescaped, and when we write, it should be escaped. The reason that this didn't quite happen to begin with was an optimization: I'll be creating an echo writer in place of the current filesystem-based copy in tamec shortly, and this would allow streaming XIR directly from the reader to the writer without any unescaping or re-escaping. When we unescape, we know the value that it came from, so we could simply store both symbols---they're 32-bit, so it results in a nicely compressed 64-bit value, so it's essentially cost-free, as long as we accept the expense of internment. This is `XirString`. Then, when we want to escape or unescape, we first check to see whether a symbol already exists and, if so, use it. While this works well for echoing streams, it won't work all that well in practice: the unescaped SymbolId will be taken and the XirString discarded, since nothing after XIR should be coupled with it. Then, when we later construct a XIR stream for writting, XirString will no longer be available and our previously known escape is lost, so the writer will have to re-escape. Further, if we look at XirString's generic for the XirStringEscaper---it uses phantom, which hints that maybe it's not in the best place. 
Indeed, I've already acknowledged that only a reader unescapes and only a writer escapes, and that the rest of the system works with normal (unescaped) values, so only readers and writers should be part of this process. I also already acknowledged that XirString would be lost and only the unescaped SymbolId would be used. So what's the point of XirString, then, if it won't be a useful optimization beyond the temporary echo writer? Instead, we can take the XirStringWriter and implement two caches on that: mapping SymbolId from escaped->unescaped and vice-versa. These can be simple vectors, since SymbolId is a 32-bit value we will not have much wasted space for symbols that never get read or written. We could even optimize for preinterned symbols using markers, though I'll probably not do so, and I'll explain why later. If we do _that_, we get even _better_ optimizations through caching that _will_ apply in the general case (so, not just for echo), and we're able to ditch XirString entirely and simply use a SymbolId. This makes for a much more friendly API that isn't leaking implementation details, though it _does_ put an onus on the caller to pass the encoder to both the reader and the writer, _if_ it wants to take advantage of a cache. But that burden is not significant (and is, again, optional if we don't want it). So, that'll be the next step.
2021-11-10 09:42:18 -05:00
Token::AttrValue(val1.into(), span),
Token::AttrName(attr2, span),
tamer: xir::XirString: WIP implementation (likely going away) I'm not fond of this implementation, which is why it's not fully completed. I wanted to commit this for future reference, and take the opportunity to explain why I don't like it. First: this task started as an idea to implement a third variant to AttrValue and friends that indicates that a value is fixed, in the sense of a fixed-point function: escaped or unescaped, its value is the same. This would allow us to skip wasteful escape/unescape operations. In doing so, it became obvious that there's no need to leak this information through the API, and indeed, no part of the system should care. When we read XML, it should be unescaped, and when we write, it should be escaped. The reason that this didn't quite happen to begin with was an optimization: I'll be creating an echo writer in place of the current filesystem-based copy in tamec shortly, and this would allow streaming XIR directly from the reader to the writer without any unescaping or re-escaping. When we unescape, we know the value that it came from, so we could simply store both symbols---they're 32-bit, so it results in a nicely compressed 64-bit value, so it's essentially cost-free, as long as we accept the expense of internment. This is `XirString`. Then, when we want to escape or unescape, we first check to see whether a symbol already exists and, if so, use it. While this works well for echoing streams, it won't work all that well in practice: the unescaped SymbolId will be taken and the XirString discarded, since nothing after XIR should be coupled with it. Then, when we later construct a XIR stream for writting, XirString will no longer be available and our previously known escape is lost, so the writer will have to re-escape. Further, if we look at XirString's generic for the XirStringEscaper---it uses phantom, which hints that maybe it's not in the best place. 
Indeed, I've already acknowledged that only a reader unescapes and only a writer escapes, and that the rest of the system works with normal (unescaped) values, so only readers and writers should be part of this process. I also already acknowledged that XirString would be lost and only the unescaped SymbolId would be used. So what's the point of XirString, then, if it won't be a useful optimization beyond the temporary echo writer? Instead, we can take the XirStringWriter and implement two caches on that: mapping SymbolId from escaped->unescaped and vice-versa. These can be simple vectors, since SymbolId is a 32-bit value we will not have much wasted space for symbols that never get read or written. We could even optimize for preinterned symbols using markers, though I'll probably not do so, and I'll explain why later. If we do _that_, we get even _better_ optimizations through caching that _will_ apply in the general case (so, not just for echo), and we're able to ditch XirString entirely and simply use a SymbolId. This makes for a much more friendly API that isn't leaking implementation details, though it _does_ put an onus on the caller to pass the encoder to both the reader and the writer, _if_ it wants to take advantage of a cache. But that burden is not significant (and is, again, optional if we don't want it). So, that'll be the next step.
2021-11-10 09:42:18 -05:00
Token::AttrValue(val2.into(), span),
tamer: xir: Introduce {Ele,Open,Close}Span This isn't conceptally all that significant of a change, but there was a lot of modify to get it working. I would generally separate this into a commit for the implementation and another commit for the integration, but I decided to keep things together. This serves a role similar to AttrSpan---this allows deriving a span representing the element name from a span representing the entire XIR token. This will provide more useful context for errors---including the tag delimiter(s) means that we care about the fact that an element is in that position (as opposed to some other type of node) within the context of an error. However, if we are expecting an element but take issue with the element name itself, we want to place emphasis on that instead. This also starts to consider the issue of span contexts---a blob of detached data that is `Span` is useful for error context, but it's not useful for manipulation or deriving additional information. For that, we need to encode additional context, and this is an attempt at that. I am interested in the concept of providing Spans that are guaranteed to actually make sense---that are instantiated and manipulated with APIs that ensure consistency. But such a thing buys us very little, practically speaking, over what I have now for TAMER, and so I don't expect to actually implement that for this project; I'll leave that for a personal project. TAMER's already take a lot of my personal interests and it can cause me a lot of grief sometimes (with regards to letting my aspirations cause me more work). DEV-7145
2022-06-24 13:51:49 -04:00
Token::Close(None, CloseSpan::empty(span)),
]
.into_iter()
.write(&mut buf, Default::default(), &escaper)
.unwrap();
});
});
}
// Baseline for bulk text output using quick-xml directly.
//
// Aside from attributes,
//   the other major thing that we do is output large amounts of text
//   (the linked fragments).
//
// Each benchmark iteration resolves the interned `FRAGMENT` symbol and
//   writes it as pre-escaped text 50 times.
#[bench]
fn baseline_quick_xml_text_50(bench: &mut Bencher) {
    let mut xml_writer =
        QuickXmlWriter::new(Vec::<u8>::with_capacity(FRAGMENT.len() * 50));
    let frag: SymbolId = FRAGMENT.intern();

    bench.iter(|| {
        for _ in 0..50 {
            // The symbol lookup is kept within this single statement so
            // that whatever it yields outlives the borrowed `Cow`.
            xml_writer
                .write_event(XmlEvent::Text(BytesText::from_escaped_str(
                    Cow::Borrowed(&frag.lookup_str() as &str),
                )))
                .unwrap();
        }
    });
}
// XIR counterpart of `baseline_quick_xml_text_50`.
//
// The two are expected to perform similarly,
//   though results can vary wildly from run to run.
//
// Each benchmark iteration writes the interned `FRAGMENT` as a text token
//   50 times.
#[bench]
fn xir_text_50(bench: &mut Bencher) {
    let mut out = Vec::<u8>::with_capacity(FRAGMENT.len() * 50);
    let frag: SymbolId = FRAGMENT.intern();
    let span = Span::from_byte_interval((0, 0), "path".intern());

    // Prime the escaper's cache,
    //   since the baseline hands quick-xml text that is already assumed to
    //   be escaped (`BytesText::from_escaped_str`).
    let escaper = DefaultEscaper::default();
    escaper.escape(frag);

    bench.iter(|| {
        for _ in 0..50 {
            Token::Text(frag, span)
                .write(&mut out, Default::default(), &escaper)
                .unwrap();
        }
    });
}
}