TAMER: Arena-based string interner
Contrary to what I said previously, this replaces the previous implementation with an arena-backed internment system. The motivation for this change was investigating how Rustc performed its string interning, and why they chose to associate integer identifiers with symbols. The intent was originally to use Rustc's arena allocator directly, but that create pulled in far too many dependencies and depended on nightly Rust. Bumpalo provides a very similar implementation to Rustc's DroplessArena, so I went with that instead. Rustc also relies on a global, singleton interner. I do not do that here. Instead, the returned Symbol carries a lifetime of the underlying arena, as well as a pointer to the interned string. Now that this is put to rest, it's time to move on.master
parent
176d099fb6
commit
1f4db84f24
|
@ -1,5 +1,10 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "2.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.3.2"
|
||||
|
@ -49,6 +54,7 @@ dependencies = [
|
|||
name = "tamer"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"bumpalo 2.6.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"fixedbitset 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"fxhash 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"petgraph 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
|
@ -56,6 +62,7 @@ dependencies = [
|
|||
]
|
||||
|
||||
[metadata]
|
||||
"checksum bumpalo 2.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ad807f2fc2bf185eeb98ff3a901bd46dc5ad58163d0fa4577ba0d25674d71708"
|
||||
"checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5"
|
||||
"checksum fixedbitset 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "86d4de0081402f5e88cdac65c8dcdcc73118c1a7a465e2a05f0da05843a8ea33"
|
||||
"checksum fxhash 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
|
||||
|
|
|
@ -23,8 +23,9 @@ lto = true
|
|||
lto = true
|
||||
|
||||
[dependencies]
|
||||
bumpalo = ">= 2.6.0"
|
||||
# used by petgraph
|
||||
fixedbitset = ">= 0.1"
|
||||
fxhash = ">= 0.2.1"
|
||||
petgraph = ">= 0.4.13"
|
||||
quick-xml = ">= 0.17.0"
|
||||
# used by petgraph
|
||||
fixedbitset = ">= 0.1"
|
||||
|
|
|
@ -28,67 +28,13 @@ use std::rc::Rc;
|
|||
use tamer::sym::*;
|
||||
use test::Bencher;
|
||||
|
||||
mod symbol {
|
||||
use super::*;
|
||||
|
||||
/// This is our baseline. We should never be any slower than this.
|
||||
#[bench]
|
||||
fn new_raw_rc_1000_baseline(bench: &mut Bencher) {
|
||||
let s = "foo bar baz";
|
||||
|
||||
bench.iter(|| {
|
||||
(0..1000)
|
||||
.map(|_| {
|
||||
let _: Rc<str> = s.into();
|
||||
})
|
||||
.for_each(drop);
|
||||
});
|
||||
}
|
||||
|
||||
/// Using the `SymbolRc` wrapper should perform no differently than the
|
||||
/// above test with `Rc<str>`.
|
||||
#[bench]
|
||||
fn new_symbol_rc_1000(bench: &mut Bencher) {
|
||||
let s = "foo bar baz";
|
||||
|
||||
bench.iter(|| {
|
||||
(0..1000).map(|_| SymbolRc::new(s)).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
||||
/// Rc uses pointer comparisons when possible. We want to match that.
|
||||
#[bench]
|
||||
fn eq_check_rc_baseline(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let a: Rc<str> = "foobarbazquux".into();
|
||||
let b: Rc<str> = "foobarbazquux".into();
|
||||
let c: Rc<str> = "foobarbazquuxx".into();
|
||||
|
||||
let _ = a == b;
|
||||
let _ = a == c;
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn eq_check_symbol_rc_baseline(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let a: SymbolRc = "foobarbazquux".into();
|
||||
let b: SymbolRc = "foobarbazquux".into();
|
||||
let c: SymbolRc = "foobarbazquuxx".into();
|
||||
|
||||
let _ = a == b;
|
||||
let _ = a == c;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
fn gen_strs(n: usize) -> Vec<String> {
|
||||
(0..n)
|
||||
.map(|n| n.to_string() + "foobarbazquuxlongsymbol")
|
||||
.collect()
|
||||
}
|
||||
|
||||
mod hash_set {
|
||||
mod interner {
|
||||
use super::*;
|
||||
use std::collections::hash_map::RandomState;
|
||||
use std::collections::HashSet;
|
||||
|
@ -137,7 +83,7 @@ mod hash_set {
|
|||
let strs = gen_strs(1000);
|
||||
|
||||
bench.iter(|| {
|
||||
let mut sut = HashSetInterner::<SymbolRc>::new();
|
||||
let sut = ArenaInterner::<RandomState>::new();
|
||||
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
@ -154,7 +100,7 @@ mod hash_set {
|
|||
#[bench]
|
||||
fn with_one_new_1000(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut sut = HashSetInterner::<SymbolRc>::new();
|
||||
let sut = ArenaInterner::<RandomState>::new();
|
||||
(0..1000).map(|_| sut.intern("first")).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
@ -178,7 +124,7 @@ mod hash_set {
|
|||
let strs = gen_strs(1000);
|
||||
|
||||
bench.iter(|| {
|
||||
let mut sut = HashSetInterner::<SymbolRc, FxBuildHasher>::new();
|
||||
let sut = ArenaInterner::<FxBuildHasher>::new();
|
||||
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
@ -197,7 +143,7 @@ mod hash_set {
|
|||
#[bench]
|
||||
fn with_one_new_1000(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut sut = HashSetInterner::<SymbolRc, FxBuildHasher>::new();
|
||||
let sut = ArenaInterner::<FxBuildHasher>::new();
|
||||
(0..1000).map(|_| sut.intern("first")).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
@ -210,176 +156,9 @@ mod hash_set {
|
|||
let strs = gen_strs(n);
|
||||
|
||||
bench.iter(|| {
|
||||
let mut sut =
|
||||
HashSetInterner::<SymbolRc, FxBuildHasher>::with_capacity(
|
||||
n,
|
||||
);
|
||||
let sut = ArenaInterner::<FxBuildHasher>::with_capacity(n);
|
||||
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mod hash_map {
|
||||
use super::*;
|
||||
use std::collections::hash_map::{Entry, RandomState};
|
||||
use std::collections::HashMap;
|
||||
use std::hash::BuildHasher;
|
||||
|
||||
pub struct HashMapSut<M, S = RandomState>
|
||||
where
|
||||
S: BuildHasher,
|
||||
{
|
||||
pub map: HashMap<Rc<str>, M, S>,
|
||||
}
|
||||
|
||||
impl<M, S> HashMapSut<M, S>
|
||||
where
|
||||
M: Default,
|
||||
S: BuildHasher + Default,
|
||||
{
|
||||
#[inline]
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
map: HashMap::with_hasher(Default::default()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn intern(&mut self, value: &str) -> Rc<str> {
|
||||
match self.map.entry(value.into()) {
|
||||
Entry::Vacant(v) => {
|
||||
let intern = v.key().clone();
|
||||
|
||||
v.insert(Default::default());
|
||||
intern
|
||||
}
|
||||
Entry::Occupied(o) => o.key().clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This is our baseline with a raw Rc<str>.
|
||||
#[bench]
|
||||
fn with_all_new_rc_str_1000_baseline(bench: &mut Bencher) {
|
||||
let strs = gen_strs(1000);
|
||||
|
||||
bench.iter(|| {
|
||||
let mut sut = HashMapSut::<(), RandomState>::new();
|
||||
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn with_all_new_1000(bench: &mut Bencher) {
|
||||
let strs = gen_strs(1000);
|
||||
|
||||
bench.iter(|| {
|
||||
let mut sut = HashMapInterner::<SymbolRc, ()>::new();
|
||||
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
/// This is our baseline with a raw Rc<str>.
|
||||
fn with_one_new_rc_str_1000_baseline(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut sut = HashMapSut::<(), RandomState>::new();
|
||||
(0..1000).map(|_| sut.intern("first")).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn with_one_new_1000(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut sut = HashMapInterner::<SymbolRc, ()>::new();
|
||||
(0..1000).map(|_| sut.intern("first")).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
||||
mod fx {
|
||||
use super::*;
|
||||
use fxhash::FxBuildHasher;
|
||||
|
||||
/// This is our baseline with a raw Rc<str>.
|
||||
#[bench]
|
||||
fn with_all_new_rc_str_1000_baseline(bench: &mut Bencher) {
|
||||
let strs = gen_strs(1000);
|
||||
bench.iter(|| {
|
||||
let mut sut = HashMapSut::<(), FxBuildHasher>::new();
|
||||
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn with_all_new_1000(bench: &mut Bencher) {
|
||||
let strs = gen_strs(1000);
|
||||
|
||||
bench.iter(|| {
|
||||
let mut sut =
|
||||
HashMapInterner::<SymbolRc, (), FxBuildHasher>::new();
|
||||
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
/// This is our baseline with a raw Rc<str>.
|
||||
fn with_one_new_rc_str_1000_baseline(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut sut: HashMapSut<(), FxBuildHasher> = HashMapSut {
|
||||
map: HashMap::with_hasher(Default::default()),
|
||||
};
|
||||
(0..1000).map(|_| sut.intern("first")).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn with_one_new_1000(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut sut =
|
||||
HashMapInterner::<SymbolRc, (), FxBuildHasher>::new();
|
||||
(0..1000).map(|_| sut.intern("first")).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
||||
/// Since FNV is the best-performing, let's build upon it to demonstrate
|
||||
/// the benefits of with_capacity
|
||||
#[bench]
|
||||
fn with_all_new_1000_with_capacity(bench: &mut Bencher) {
|
||||
let n = 1000;
|
||||
let strs = gen_strs(n);
|
||||
|
||||
bench.iter(|| {
|
||||
let mut sut =
|
||||
HashMapInterner::<SymbolRc, (), FxBuildHasher>::with_capacity(n);
|
||||
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn with_all_new_meta_1000(bench: &mut Bencher) {
|
||||
let strs = gen_strs(1000);
|
||||
|
||||
bench.iter(|| {
|
||||
let mut sut =
|
||||
HashMapInterner::<SymbolRc, u8, FxBuildHasher>::new();
|
||||
strs.iter().map(|s| sut.intern_meta(&s, 0)).for_each(drop);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn with_all_new_then_set_meta_1000(bench: &mut Bencher) {
|
||||
let strs = gen_strs(1000);
|
||||
|
||||
bench.iter(|| {
|
||||
let mut sut =
|
||||
HashMapInterner::<SymbolRc, u8, FxBuildHasher>::new();
|
||||
strs.iter()
|
||||
.map(|s| {
|
||||
sut.intern(&s);
|
||||
sut.intern_meta(&s, 0);
|
||||
})
|
||||
.for_each(drop);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
1008
tamer/src/sym.rs
1008
tamer/src/sym.rs
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue