From 63e5a0d441948f4e1dfc66a1766142516c13e7ef Mon Sep 17 00:00:00 2001 From: Mike Gerwitz Date: Mon, 18 Oct 2021 21:32:21 -0400 Subject: [PATCH] tamer: benches/sym.rs: Add additional UTF-8-related tests The intent of this is to demonstrate how significant an impact checking byte arrays for UTF-8 validity will have, since the existing tests do not make that clear (a static string in Rust is always valid UTF-8). These benchmarks show that the cost when re-interning an already existing value is +50%. This is important, because the new reader will be interning a _lot_ of duplicate strings, whereas the existing reader operates on byte arrays without interning unless necessary. And, when it does, it does so unchecked. But we'd rather not do that, since we cannot guarantee that those XML files are valid (and have not been modified in some way). Upcoming commits will have what I think is a reasonable compromise to this, based on the fact that we'll be encountering _many_ duplicate strings in parsing XML files. DEV-10920 --- tamer/benches/sym.rs | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tamer/benches/sym.rs b/tamer/benches/sym.rs index 786f97a4..1bab589d 100644 --- a/tamer/benches/sym.rs +++ b/tamer/benches/sym.rs @@ -164,6 +164,34 @@ mod interner { }); } + // Unlike the above, which already has a UTF-8 string, this actually + // performs a conversion and check from `&[u8]`.
+ #[bench] + fn with_all_new_1000_utf8_checked(bench: &mut Bencher) { + let strs = gen_strs(1000); + let bs: Vec<&[u8]> = strs.iter().map(|s| s.as_bytes()).collect(); + + bench.iter(|| { + let sut = ArenaInterner::::new(); + bs.iter() + .map(|b| sut.intern(std::str::from_utf8(&b).unwrap())) + .for_each(drop); + }); + } + + #[bench] + fn with_all_new_1000_utf8_unchecked(bench: &mut Bencher) { + let strs = gen_strs(1000); + let bs: Vec<&[u8]> = strs.iter().map(|s| s.as_bytes()).collect(); + + bench.iter(|| { + let sut = ArenaInterner::::new(); + bs.iter() + .map(|b| unsafe { sut.intern_utf8_unchecked(&b) }) + .for_each(drop); + }); + } + #[bench] fn with_all_new_uninterned_1000(bench: &mut Bencher) { let strs = gen_strs(1000); @@ -193,6 +221,18 @@ mod interner { }); } + // Unlike the above, which already has a UTF-8 string, this actually + // performs a conversion and check from `&[u8]`. + #[bench] + fn with_one_new_1000_utf8_checked(bench: &mut Bencher) { + bench.iter(|| { + let sut = ArenaInterner::::new(); + (0..1000) + .map(|_| sut.intern(std::str::from_utf8(b"first").unwrap())) + .for_each(drop); + }); + } + #[bench] fn with_one_new_1000_utf8_unchecked(bench: &mut Bencher) { bench.iter(|| {