tamer: sym::prefill: Introduce static symbols

This is the beginning of static symbols, which are becoming increasingly
necessary as it's quite a pain to have to deal with interning static strings
any place they're used.

It's _more_ of a pain to do that in conjunction with newtypes (e.g. `QName`,
`AttValue`, etc) that make use of `SymbolId`; this will allow us to
construct _those_ statically as well, and additional work to support that
will be coming up.

DEV-10701
main
Mike Gerwitz 2021-09-22 16:04:56 -04:00
parent e0a209d417
commit 366fef714b
4 changed files with 229 additions and 4 deletions

View File

@ -45,6 +45,9 @@ pub type NonZeroPkgSymSize = num::NonZeroU16;
/// A size capable of representing every interned string in a program.
pub type ProgSymSize = u32;
/// The initial capacity for global interners.
///
/// This value is handed to the interner's `with_capacity` constructor when
/// the thread-local global interners are created.
pub const INIT_GLOBAL_INTERNER_CAPACITY: usize = 1024;
/// A non-zero equivalent of [`ProgSymSize`].
pub type NonZeroProgSymSize = num::NonZeroU32;

View File

@ -218,6 +218,23 @@
//! if you utilize interners for any other purpose,
//! it is advised that you create newtypes for their [`SymbolId`]s.
//!
//! Static Symbols
//! --------------
//! Since nearly every string in the system is represented by a symbol,
//! comparing against static string slices would require awkward interning
//! of a static string at each relevant point in the program.
//! Instead,
//! common static strings are pre-interned when the global interner is
//! first initialized.
//!
//! These symbols are allocated statically,
//! so they can be used in `const` expressions and include additional
//! metadata allowing for safe type conversions in circumstances that
//! aren't typically permitted.
//! This further allows constructing symbol newtypes at compile-time.
//!
//! These symbol constants can be found in the [`st`] module.
//!
//! Uninterned Symbols
//! ------------------
//! Interners are able to allocate a [`SymbolId`] without interning,
@ -308,8 +325,7 @@
//! - Rustc's [`newtype_index!` macro][rustc-nt] uses
//! [`NonZeroU32`] so that [`Option`] uses no
//! additional space (see [pull request `53315`][rustc-nt-pr]).
//! - Differences between TAMER and Rustc's implementations are outlined
//! above.
//! - Rustc also [prefills interners][rustc-intern] with common symbols.
//!
//! [flyweight pattern]: https://en.wikipedia.org/wiki/Flyweight_pattern
//! [rust-string-cache]: https://github.com/servo/string-cache
@ -338,8 +354,11 @@
//! [hash-rs]: https://github.com/Gankra/hash-rs
mod interner;
mod prefill;
mod symbol;
pub use prefill::st;
pub use interner::{
ArenaInterner, DefaultInterner, DefaultPkgInterner, DefaultProgInterner,
FxArenaInterner, Interner,

View File

@ -0,0 +1,201 @@
// Pre-interned strings
//
// Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//! Pre-interned strings.
//!
//! These strings are expected to be encountered nearly every run,
//! and substitute static strings that would otherwise appear hard-coded
//! in the system and have to be interned to be compared against other
//! values.
//!
//! See the [parent module](super) for more information.
use super::{Interner, SymbolId, SymbolIndexSize};
use crate::global;
use std::array;
// Shorthand for the `NonZero` associated type that `SymbolIndexSize`
// assigns to `global::ProgSymSize`.
// `static_symbol_consts!` uses it to build the value wrapped by each
// statically generated `SymbolId`.
type NonZero = <global::ProgSymSize as SymbolIndexSize>::NonZero;
/// Generate symbols of size [`global::ProgSymSize`] for preinterned strings.
///
/// These symbols,
/// rather than being generated by the global internment system,
/// are generated statically.
/// Once the global interner is initialized
/// (see [parent module](`super`)),
/// which is on first access,
/// these symbols will reference valid values.
///
/// Input has the form `@i <index>; name: "string", ...`,
/// where every pair is terminated by a comma.
/// The first pair is assigned `<index>` and each subsequent pair is
/// assigned the previous index plus one.
macro_rules! static_symbol_consts {
// Recursive case: emit a constant for the head pair, then recurse on the
// tail with the index advanced by one.
(@i $i:expr; $name:ident: $str:expr, $($ti:ident: $ts:expr,)*) => {
#[doc=concat!("Interned string `\"", $str, "\"`.")]
// Symbol names are intentionally not all-uppercase.
#[allow(non_upper_case_globals)]
// SAFETY: `new_unchecked` requires a non-zero argument.
// `static_symbols!` seeds `$i` with 1 and every recursive step passes
// `$i + 1`, so the expression expands to `1 + 1 + ...` and is never
// zero for that caller.
// NOTE(review): any other caller of this macro must likewise seed a
// non-zero index.
pub const $name: SymbolId<global::ProgSymSize> = unsafe {
SymbolId(NonZero::new_unchecked($i))
};
// Recurse until no tail is left (terminating condition below).
static_symbol_consts!{
// This will result in 1 + 1 + 1 + 1 ... and will eventually hit
// the recursion limit if we have too many static symbols, after
// which time we may have to switch methodology.
@i $i + 1;
$($ti: $ts,)*
}
};
// Terminating condition.
// No pairs remain, so nothing is emitted; the final index is discarded.
(@i $i:expr;) => {}
}
/// Statically allocate [`SymbolId`]s for the provided symbols,
///   and schedule their static strings to be interned upon initialization
///   of the global interner.
///
/// Input is a comma-separated list of `Name: "string"` pairs.
/// A trailing comma after the final pair is permitted so that entries can
///   be appended or reordered without touching neighbouring lines.
/// The position of each pair in the list determines the index of its
///   generated constant,
///     beginning at 1.
///
/// This generates [`fill`],
///   which the global interners call by default.
/// Any interner may optionally invoke this,
///   immediately after initialization,
///   /before/ any internment requests.
macro_rules! static_symbols {
    ($($name:ident : $str:expr),* $(,)?) => {
        /// Static symbols (pre-allocated).
        ///
        /// Each of the constants in this module represents a [`SymbolId`]
        ///   statically allocated at compile-time.
        /// The strings that they represent are automatically populated into
        ///   the global interners when the interner is first accessed.
        ///
        /// See [`crate::sym`] for more information on static symbols.
        ///
        /// `static` is a keyword in Rust,
        ///   so we shorten the module name to `st`.
        pub mod st {
            use super::*;

            static_symbol_consts! {
                // Index 0 is not valid, so begin at 1
                @i 1;

                $(
                    $name: $str,
                )*
            }
        }

        /// Fill a new interner with static symbols.
        ///
        /// The strings are interned in the same order in which their
        ///   constants were generated,
        ///     and the interner is then handed back to the caller.
        ///
        /// Panics
        /// ======
        /// This function will panic if the interner has any symbols,
        ///   which would cause misalignment with the generated constants.
        pub(super) fn fill<'a, I, Ix>(interner: I) -> I
        where
            I: Interner<'a, Ix>,
            Ix: SymbolIndexSize
        {
            assert!(
                interner.len() == 0,
                "cannot fill non-empty Interner with static symbols"
            );

            // This array does not exist as a constant, because that would
            // require that we count the number of items first for the
            // sake of the type definition.
            // This is more convenient.
            array::IntoIter::new([
                $(
                    $str,
                )*
            ]).for_each(|sym| { interner.intern(sym); });

            interner
        }
    }
}
// Static symbols that will have their strings interned upon global
// interner initialization.
//
// Each of these generates a constant of the same name with a [`SymbolId`].
// This symbol is constant,
// generated at compile-time,
// and is intended to be used with a global interner.
// Since a global interner is initialized on first use,
// which in turn populates the interner using [`fill`] above,
// this constant will always represent a valid global symbol within the
// context of reads.
//
// The constants are not all-uppercase,
// which creates the illusion that the symbols were dynamically generated;
// this isn't entirely false,
// given that internment _is_ a runtime operation even for these
// symbols.
//
// Certain symbols are Rust identifiers,
// and therefore begin with a capital letter;
// this is also done by rustc
// (see https://doc.rust-lang.org/nightly/nightly-rustc/src/rustc_span/symbol.rs.html).
//
// See parent documentation for more information.
//
// These end up in the `st` module,
// which is re-exported by the parent module.
static_symbols! {
// Index begins at 1, since 0 is reserved during interner initialization
// Entries are numbered consecutively in the order listed here,
// and `fill` interns their strings in that same order.
True: "true",
False: "false",
// [Symbols will be added here as they are needed.]
// Marker indicating the end of the static symbols
// Keep this entry last:
// the sanity test in this file expects any newly interned symbol to
// compare greater than it.
END_STATIC: "{{end static}}"
}
#[cfg(test)]
mod test {
    use super::st;
    use crate::{
        global,
        sym::{GlobalSymbolIntern, SymbolId},
    };

    type Ix = global::ProgSymSize;

    // Global interners are expected to come up already holding the
    // prefilled static symbols.
    #[test]
    fn global_sanity_check() {
        // Intern something that is not a static symbol.
        // Without prefill this would be handed the very first index,
        // which the comparison below would expose.
        let fresh: SymbolId<Ix> = "force offset".intern();
        let fresh_index = fresh.as_usize();
        let last_static_index = st::END_STATIC.as_usize();

        assert!(
            fresh_index > last_static_index,
            "a new global symbol allocation was not > END_STATIC, \
             indicating that prefill is not working!"
        );

        // Spot-check that the constants line up with what the interner
        // yields for the same strings;
        // a mismatch here points directly at index misalignment.
        let interned_true: SymbolId<Ix> = "true".intern();
        assert_eq!(st::True, interned_true);

        let interned_false: SymbolId<Ix> = "false".intern();
        assert_eq!(st::False, interned_false);
    }
}

View File

@ -55,7 +55,7 @@ use std::thread::LocalKey;
/// see either [`GlobalSymbolResolve::lookup_str`] or
/// [`Interner::index_lookup`].
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct SymbolId<Ix: SymbolIndexSize>(Ix::NonZero);
pub struct SymbolId<Ix: SymbolIndexSize>(pub(super) Ix::NonZero);
assert_eq_size!(Option<SymbolId<u16>>, SymbolId<u16>);
/// Identifier of a symbol within a single package.
@ -153,7 +153,9 @@ pub trait SymbolIndexSize:
macro_rules! supported_symbol_index {
($prim:ty, $nonzero:ty, $interner:ty, $global:ident) => {
thread_local! {
pub(super) static $global: $interner = <$interner>::new();
pub(super) static $global: $interner = super::prefill::fill(
<$interner>::with_capacity(global::INIT_GLOBAL_INTERNER_CAPACITY)
);
}
impl SymbolIndexSize for $prim {