tamer: memchr benches

This adds benchmarking for the memchr crate.  It is used primarily by
quick-xml at the moment, but the question is whether to rely on it for
certain operations for XIR.

The benchmarking on an Intel Xeon system shows that memchr and Rust's
contains() perform very similarly on small inputs, matching against a single
character, and so Rust's built-in should be preferred in that case so that
we're using APIs that are familiar to most people.

When searching larger inputs, memchr shows a greater benefit (a little
under 2x).

When comparing against two characters on small inputs, they are again very
close.  But look at when we compare two characters against _large_ inputs:

  running 24 tests
  test large_str::one::memchr_early_match                 ... bench:       4,938 ns/iter (+/- 124)
  test large_str::one::memchr_late_match                  ... bench:      81,807 ns/iter (+/- 1,153)
  test large_str::one::memchr_non_match                   ... bench:      82,074 ns/iter (+/- 1,062)
  test large_str::one::rust_contains_one_byte_early_match ... bench:       9,425 ns/iter (+/- 167)
  test large_str::one::rust_contains_one_byte_late_match  ... bench:     123,685 ns/iter (+/- 3,728)
  test large_str::one::rust_contains_one_byte_non_match   ... bench:     123,117 ns/iter (+/- 2,200)
  test large_str::one::rust_contains_one_char_early_match ... bench:       9,561 ns/iter (+/- 507)
  test large_str::one::rust_contains_one_char_late_match  ... bench:     123,929 ns/iter (+/- 2,377)
  test large_str::one::rust_contains_one_char_non_match   ... bench:     122,989 ns/iter (+/- 2,788)
  test large_str::two::memchr2_early_match                ... bench:       5,704 ns/iter (+/- 91)
  test large_str::two::memchr2_late_match                 ... bench:      89,194 ns/iter (+/- 8,546)
  test large_str::two::memchr2_non_match                  ... bench:      85,649 ns/iter (+/- 3,879)
  test large_str::two::rust_contains_two_char_early_match ... bench:      66,785 ns/iter (+/- 3,385)
  test large_str::two::rust_contains_two_char_late_match  ... bench:   2,148,064 ns/iter (+/- 21,812)
  test large_str::two::rust_contains_two_char_non_match   ... bench:   2,322,082 ns/iter (+/- 22,947)
  test small_str::one::memchr_mid_match                   ... bench:       4,737 ns/iter (+/- 842)
  test small_str::one::memchr_non_match                   ... bench:       5,160 ns/iter (+/- 62)
  test small_str::one::rust_contains_one_byte_non_match   ... bench:       3,930 ns/iter (+/- 35)
  test small_str::one::rust_contains_one_char_mid_match   ... bench:       3,677 ns/iter (+/- 618)
  test small_str::one::rust_contains_one_char_non_match   ... bench:       5,415 ns/iter (+/- 221)
  test small_str::two::memchr2_mid_match                  ... bench:       5,488 ns/iter (+/- 888)
  test small_str::two::memchr2_non_match                  ... bench:       6,788 ns/iter (+/- 134)
  test small_str::two::rust_contains_two_char_mid_match   ... bench:       6,203 ns/iter (+/- 170)
  test small_str::two::rust_contains_two_char_non_match   ... bench:       7,853 ns/iter (+/- 713)

Yikes.

With that said, we won't be comparing against such large inputs
short-term.  The larger strings (fragments) are copied verbatim, and not
compared against---but they _were_ prior to the previous commit that stopped
unencoding and re-encoding.

So: Rust built-ins for inputs that are expected to be small.
main
Mike Gerwitz 2021-08-18 14:18:24 -04:00
parent 1cdb3fbbc5
commit fc235b7ecc
3 changed files with 400 additions and 0 deletions

1
tamer/Cargo.lock generated
View File

@ -302,6 +302,7 @@ dependencies = [
"fxhash",
"getopts",
"lazy_static",
"memchr",
"petgraph",
"petgraph-graphml",
"predicates",

View File

@ -36,6 +36,7 @@ exitcode = "1.1.2"
lazy_static = ">= 1.4.0"
petgraph-graphml = ">= 2.0.1"
static_assertions = ">= 1.1.0"
memchr = ">= 2.3.4" # quick-xml expects =2.3.4 at the time
# Feature flags can be specified using `./configure FEATURES=foo,bar,baz`.
#

View File

@ -0,0 +1,398 @@
// Comparisons between Rust built-ins and memchr.
//
// Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
#![feature(test)]
//! Comparisons between Rust built-ins and memchr.
//!
//! The intent of these benchmarks is to determine how significant a
//! benefit the more traditional approach in C (`memchr`) provides over
//! Rust's built-ins in various situations.
//!
//! See the [`memchr`] crate for more information.
extern crate memchr;
extern crate tamer;
extern crate test;
use test::Bencher;
/// Build `n` benchmark inputs, each formed by prefixing `suffix` with the
/// decimal index of the string (`0`, `1`, ..., `n - 1`).
///
/// The numeric prefix makes every generated string distinct while keeping
/// the interesting part of the haystack (`suffix`) the same for all of them.
fn gen_strs(n: usize, suffix: &str) -> Vec<String> {
    let mut generated = Vec::with_capacity(n);

    for idx in 0..n {
        generated.push(format!("{}{}", idx, suffix));
    }

    generated
}
mod small_str {
use super::*;
/// Single-needle searches over 1000 short strings.
///
/// Each benchmark compares one way of asking "does this string contain
/// `:`?": `str::contains` with a `char`, `<[u8]>::contains` with a byte,
/// and `memchr::memchr`.
mod one {
    use super::*;

    /// `str::contains` with a `char` needle; no input contains `:`.
    #[bench]
    fn rust_contains_one_char_non_match(bench: &mut Bencher) {
        let strs = gen_strs(1000, "foobar");

        bench.iter(|| {
            for s in &strs {
                assert!(!s.contains(':'));
            }
        });
    }

    /// Byte-slice `contains` with a `u8` needle; no input contains `:`.
    #[bench]
    fn rust_contains_one_byte_non_match(bench: &mut Bencher) {
        let strs = gen_strs(1000, "foobar");

        bench.iter(|| {
            for s in &strs {
                assert!(!s.as_bytes().contains(&b':'));
            }
        });
    }

    /// `memchr::memchr` with a `u8` needle; no input contains `:`.
    #[bench]
    fn memchr_non_match(bench: &mut Bencher) {
        let strs = gen_strs(1000, "foobar");

        bench.iter(|| {
            for s in &strs {
                assert!(memchr::memchr(b':', s.as_bytes()).is_none());
            }
        });
    }

    /// `str::contains` with a `char` needle; every input has `:` mid-string.
    #[bench]
    fn rust_contains_one_char_mid_match(bench: &mut Bencher) {
        let strs = gen_strs(1000, "foo:bar");

        bench.iter(|| {
            for s in &strs {
                assert!(s.contains(':'));
            }
        });
    }

    /// Byte-slice `contains` with a `u8` needle; every input has `:`
    /// mid-string.
    ///
    /// This function previously lacked `#[bench]`, so it was never run and
    /// the byte-wise mid-match case was missing from the results.
    #[bench]
    fn rust_contains_one_byte_mid_match(bench: &mut Bencher) {
        let strs = gen_strs(1000, "foo:bar");

        bench.iter(|| {
            for s in &strs {
                assert!(s.as_bytes().contains(&b':'));
            }
        });
    }

    /// `memchr::memchr` with a `u8` needle; every input has `:` mid-string.
    #[bench]
    fn memchr_mid_match(bench: &mut Bencher) {
        let strs = gen_strs(1000, "foo:bar");

        bench.iter(|| {
            for s in &strs {
                assert!(memchr::memchr(b':', s.as_bytes()).is_some());
            }
        });
    }
}
/// Two-needle searches over 1000 short strings.
///
/// Compares `str::contains` with a `&[char]` pattern against
/// `memchr::memchr2` when looking for either `:` or `>`.
mod two {
    use super::*;

    /// `str::contains` with a two-`char` slice; no input has `:` or `>`.
    #[bench]
    fn rust_contains_two_char_non_match(bench: &mut Bencher) {
        let inputs = gen_strs(1000, "foobar");

        bench.iter(|| {
            for input in &inputs {
                assert!(!input.contains(&[':', '>'][..]));
            }
        });
    }

    /// `memchr::memchr2` with two byte needles; no input has `:` or `>`.
    #[bench]
    fn memchr2_non_match(bench: &mut Bencher) {
        let inputs = gen_strs(1000, "foobar");

        bench.iter(|| {
            for input in &inputs {
                let found = memchr::memchr2(b':', b'>', input.as_bytes());
                assert!(found.is_none());
            }
        });
    }

    /// `str::contains` with a two-`char` slice; every input has `>`
    /// mid-string.
    #[bench]
    fn rust_contains_two_char_mid_match(bench: &mut Bencher) {
        let inputs = gen_strs(1000, "foo>bar");

        bench.iter(|| {
            for input in &inputs {
                assert!(input.contains(&[':', '>'][..]));
            }
        });
    }

    /// `memchr::memchr2` with two byte needles; every input has `>`
    /// mid-string.
    #[bench]
    fn memchr2_mid_match(bench: &mut Bencher) {
        let inputs = gen_strs(1000, "foo>bar");

        bench.iter(|| {
            for input in &inputs {
                let found = memchr::memchr2(b':', b'>', input.as_bytes());
                assert!(found.is_some());
            }
        });
    }
}
}
mod large_str {
use super::*;
// Haystack for the "large string" benchmarks.
//
// This is admittedly small next to some of the strings the linker handles,
// but the linker does not search those strings either.
//
// The text holds exactly one `:` (near the start) and exactly one `@` (near
// the end), so the benchmarks can exercise early and late matches.
const LG_STR: &str = r#"
This is a line of a longer string to test efficiency of searches.
: It contains a unique char near the beginning.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
@ And a unique char near the end.
This is a line of a longer string to test efficiency of searches.
This is a line of a longer string to test efficiency of searches.
"#;
/// Single-needle searches over 1000 copies of `LG_STR`.
///
/// Needles used: `_` (absent from `LG_STR`), `:` (near the beginning) and
/// `@` (near the end).
mod one {
    use super::*;

    /// `str::contains` with a `char` needle that never occurs.
    #[bench]
    fn rust_contains_one_char_non_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                assert!(!hay.contains('_'));
            }
        });
    }

    /// Byte-slice `contains` with a `u8` needle that never occurs.
    #[bench]
    fn rust_contains_one_byte_non_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                assert!(!hay.as_bytes().contains(&b'_'));
            }
        });
    }

    /// `memchr::memchr` with a `u8` needle that never occurs.
    #[bench]
    fn memchr_non_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                let found = memchr::memchr(b'_', hay.as_bytes());
                assert!(found.is_none());
            }
        });
    }

    /// `str::contains` with a `char` needle found near the beginning.
    #[bench]
    fn rust_contains_one_char_early_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                assert!(hay.contains(':'));
            }
        });
    }

    /// Byte-slice `contains` with a `u8` needle found near the beginning.
    #[bench]
    fn rust_contains_one_byte_early_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                assert!(hay.as_bytes().contains(&b':'));
            }
        });
    }

    /// `memchr::memchr` with a `u8` needle found near the beginning.
    #[bench]
    fn memchr_early_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                let found = memchr::memchr(b':', hay.as_bytes());
                assert!(found.is_some());
            }
        });
    }

    /// `str::contains` with a `char` needle found near the end.
    #[bench]
    fn rust_contains_one_char_late_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                assert!(hay.contains('@'));
            }
        });
    }

    /// Byte-slice `contains` with a `u8` needle found near the end.
    #[bench]
    fn rust_contains_one_byte_late_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                assert!(hay.as_bytes().contains(&b'@'));
            }
        });
    }

    /// `memchr::memchr` with a `u8` needle found near the end.
    #[bench]
    fn memchr_late_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                let found = memchr::memchr(b'@', hay.as_bytes());
                assert!(found.is_some());
            }
        });
    }
}
/// Two-needle searches over 1000 copies of `LG_STR`.
///
/// Compares `str::contains` with a `&[char]` pattern against
/// `memchr::memchr2`.  The first needle is always `_`, which is absent from
/// `LG_STR`; the second decides whether and where a match occurs.
mod two {
    use super::*;

    /// `str::contains` with a two-`char` slice; neither needle occurs.
    #[bench]
    fn rust_contains_two_char_non_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                assert!(!hay.contains(&['_', '!'][..]));
            }
        });
    }

    /// `memchr::memchr2` with two byte needles; neither needle occurs.
    #[bench]
    fn memchr2_non_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                let found = memchr::memchr2(b'_', b'!', hay.as_bytes());
                assert!(found.is_none());
            }
        });
    }

    /// `str::contains` with a two-`char` slice; `:` matches near the
    /// beginning.
    #[bench]
    fn rust_contains_two_char_early_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                assert!(hay.contains(&['_', ':'][..]));
            }
        });
    }

    /// `memchr::memchr2` with two byte needles; `:` matches near the
    /// beginning.
    #[bench]
    fn memchr2_early_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                let found = memchr::memchr2(b'_', b':', hay.as_bytes());
                assert!(found.is_some());
            }
        });
    }

    /// `str::contains` with a two-`char` slice; `@` matches near the end.
    #[bench]
    fn rust_contains_two_char_late_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                assert!(hay.contains(&['_', '@'][..]));
            }
        });
    }

    /// `memchr::memchr2` with two byte needles; `@` matches near the end.
    #[bench]
    fn memchr2_late_match(bench: &mut Bencher) {
        let haystacks = gen_strs(1000, LG_STR);

        bench.iter(|| {
            for hay in &haystacks {
                let found = memchr::memchr2(b'_', b'@', hay.as_bytes());
                assert!(found.is_some());
            }
        });
    }
}
}