tamer: benches/sym.rs: Add additional UTF-8-related tests

The intent is to demonstrate how significant an impact checking byte arrays
for UTF-8 validity has on interning, since the existing benchmarks do not
make that clear (a static string in Rust is always valid UTF-8).

These benchmarks show that the UTF-8 check increases the cost of
re-interning an already-existing value by 50%.

This is important, because the new reader will be interning a _lot_ of
duplicate strings, whereas the existing reader operates on byte arrays
without interning unless necessary.  And, when it does, it does so
unchecked.  But we'd rather not do that, since we cannot guarantee that
those XML files are valid (and not modified in some way).

Upcoming commits will have what I think is a reasonable compromise to this,
based on the fact that we'll be encountering _many_ duplicate strings in
parsing XML files.

DEV-10920
main
Mike Gerwitz 2021-10-18 21:32:21 -04:00
parent 2715f3e845
commit 63e5a0d441
1 changed files with 40 additions and 0 deletions

View File

@ -164,6 +164,34 @@ mod interner {
});
}
// The benchmark above hands the interner values that are already `&str`.
// This one starts from raw `&[u8]`, so each value goes through
// `std::str::from_utf8` (a validity check) before it is interned.
#[bench]
fn with_all_new_1000_utf8_checked(bench: &mut Bencher) {
    let strs = gen_strs(1000);
    // Borrow the generated strings as byte slices once, outside the timed
    // closure, so only the check and the intern are measured.
    let byte_slices: Vec<&[u8]> = strs.iter().map(|s| s.as_bytes()).collect();

    bench.iter(|| {
        // A fresh interner is created for every timed iteration.
        let sut = ArenaInterner::<FxBuildHasher, u32>::new();

        for bytes in &byte_slices {
            let checked = std::str::from_utf8(bytes).unwrap();
            drop(sut.intern(checked));
        }
    });
}
// Counterpart to the `_utf8_checked` benchmark: interns the same kind of
// `&[u8]` input, but through `intern_utf8_unchecked`, so the two timings
// can be compared to see what the UTF-8 check costs.
#[bench]
fn with_all_new_1000_utf8_unchecked(bench: &mut Bencher) {
    let strs = gen_strs(1000);
    // Borrow the generated strings as byte slices once, outside the timed
    // closure.
    let bs: Vec<&[u8]> = strs.iter().map(|s| s.as_bytes()).collect();

    bench.iter(|| {
        // A fresh interner is created for every timed iteration.
        let sut = ArenaInterner::<FxBuildHasher, u32>::new();

        for b in bs.iter() {
            // SAFETY: every slice in `bs` was obtained via `as_bytes()` on a
            // value returned by `gen_strs`; assuming those are `String`/`&str`
            // values, the bytes are valid UTF-8 by construction, which is
            // presumably what `intern_utf8_unchecked` requires of its caller.
            drop(unsafe { sut.intern_utf8_unchecked(&b) });
        }
    });
}
#[bench]
fn with_all_new_uninterned_1000(bench: &mut Bencher) {
let strs = gen_strs(1000);
@ -193,6 +221,18 @@ mod interner {
});
}
// The benchmark above interns a value that is already a `&str`.  This one
// starts from the byte literal `b"first"` instead, so each of the 1000
// interning attempts first runs it through `std::str::from_utf8`.
#[bench]
fn with_one_new_1000_utf8_checked(bench: &mut Bencher) {
    bench.iter(|| {
        // A fresh interner is created for every timed iteration.
        let sut = ArenaInterner::<FxBuildHasher, u32>::new();

        for _ in 0..1000 {
            let checked = std::str::from_utf8(b"first").unwrap();
            drop(sut.intern(checked));
        }
    });
}
#[bench]
fn with_one_new_1000_utf8_unchecked(bench: &mut Bencher) {
bench.iter(|| {