tame/tamer/src/sym/symbol.rs

405 lines
12 KiB
Rust
Raw Blame History

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

// String internment symbol objects
//
// Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//! Symbol objects for string internment system.
//!
//! See the [parent module](super) for more information.
use crate::global;
use std::convert::{TryFrom, TryInto};
use std::fmt::{self, Debug};
use std::num::{NonZeroU16, NonZeroU32, NonZeroU8};
use std::ops::Deref;
/// Unique symbol identifier.
///
/// _Do not construct this value yourself;_
/// use an [`Interner`].
///
/// This newtype helps to prevent other indexes from being used where a
/// symbol index is expected.
/// Note, however, that it provides no defense against mixing symbol indexes
/// between multiple [`Interner`]s.
///
/// The index `0` is never valid because of
/// [`SymbolIndexSize::NonZero`],
/// which allows us to have `Option<SymbolId>` at no space cost.
///
/// [`Interner`]: super::Interner
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct SymbolId<Ix: SymbolIndexSize>(Ix::NonZero);

// Compile-time guard for the guarantee documented above:
// wrapping an identifier in `Option` must not make it any larger.
assert_eq_size!(Option<SymbolId<u16>>, SymbolId<u16>);

// The same must hold for the symbol that embeds the identifier.
assert_eq_size!(Option<Symbol<u16>>, Symbol<u16>);
impl<Ix: SymbolIndexSize> SymbolId<Ix> {
/// Construct index from a non-zero `u16` value.
///
/// Panics
/// ------
/// Will panic if `n == 0`.
pub fn from_int(n: Ix) -> SymbolId<Ix> {
SymbolId(Ix::new(n).unwrap())
}
/// Construct index from an unchecked non-zero `u16` value.
///
/// This does not verify that `n > 0` and so must only be used in
/// contexts where this invariant is guaranteed to hold.
/// Unlike [`from_int`](SymbolId::from_int),
/// this never panics.
pub unsafe fn from_int_unchecked(n: Ix) -> SymbolId<Ix> {
SymbolId(Ix::new_unchecked(n))
}
/// Construct index from a non-zero `u16` value.
///
/// Panics
/// ------
/// Will panic if `n == 0`.
pub fn from_u16(n: u16) -> SymbolId<u16> {
SymbolId::from_int(n)
}
/// Construct index from a non-zero `u32` value.
///
/// Panics
/// ------
/// Will panic if `n == 0`.
pub fn from_u32(n: u32) -> SymbolId<u32> {
SymbolId::from_int(n)
}
pub fn as_usize(self) -> usize {
self.0.into().as_usize()
}
}
impl<Ix: SymbolIndexSize> From<SymbolId<Ix>> for usize
where
<Ix as TryInto<usize>>::Error: Debug,
{
fn from(value: SymbolId<Ix>) -> usize {
value.0.into().as_usize()
}
}
impl<'i, Ix: SymbolIndexSize> From<&Symbol<'i, Ix>> for SymbolId<Ix> {
fn from(sym: &Symbol<'i, Ix>) -> Self {
sym.index()
}
}
/// An integer type paired with its respective `NonZero` type that may be
/// used to index symbols.
///
/// The trait is named as such so that error messages make it clear that a
/// primitive type has to be explicitly accounted for.
///
/// This trait must be implemented on a primitive type like [`u16`].
pub trait SymbolIndexSize:
Sized
+ Copy
+ Debug
+ PartialEq
+ Eq
+ TryFrom<usize>
+ TryInto<usize>
+ 'static
{
/// The associated `NonZero*` type (e.g. [`NonZeroU16`]).
type NonZero: Copy + Into<Self> + Debug;
/// A symbol with a static lifetime suitable for placement at index 0 in
/// the string internment table,
/// which is not a valid [`SymbolId`] value.
fn dummy_sym() -> &'static Symbol<'static, Self>;
/// Construct a new non-zero value from the provided primitive value.
///
/// If the value is `0`, the result will be [`None`].
fn new(n: Self) -> Option<Self::NonZero>;
/// Construct a new non-zero value from the provided primitive value
/// without checking whether the value is non-zero.
///
/// Safety
/// ------
/// The caller must guarantee that `n` is not `0`.
unsafe fn new_unchecked(n: Self) -> Self::NonZero;
/// Convert primitive value into a [`usize`].
fn as_usize(self) -> usize;
}
// Implement `SymbolIndexSize` for a primitive integer type.
//
// Arguments are the primitive type,
// its `NonZero*` counterpart,
// and the name of the static holding the placeholder symbol returned by
// `dummy_sym`.
macro_rules! supported_symbol_index {
    ($int:ty, $nz:ty, $placeholder:ident) => {
        impl SymbolIndexSize for $int {
            type NonZero = $nz;

            fn dummy_sym() -> &'static Symbol<'static, Self> {
                &$placeholder
            }

            fn new(n: Self) -> Option<Self::NonZero> {
                <$nz>::new(n)
            }

            unsafe fn new_unchecked(n: Self) -> Self::NonZero {
                <$nz>::new_unchecked(n)
            }

            fn as_usize(self) -> usize {
                self as usize
            }
        }
    };
}

supported_symbol_index!(u8, NonZeroU8, DUMMY_SYM_8);
supported_symbol_index!(u16, NonZeroU16, DUMMY_SYM_16);
supported_symbol_index!(u32, NonZeroU32, DUMMY_SYM_32);
/// Interned string.
///
/// A reference to this symbol is returned each time the same string is
/// interned with the same [`Interner`];
/// as such,
/// symbols can be compared for equality by pointer;
/// the underlying symbol id need not be used.
///
/// Each symbol is identified by a unique integer
/// (see [`index`](Symbol::index)).
/// The use of integers creates a more dense range of values than pointers,
/// which allows callers to use a plain [`Vec`] as a map instead of
/// something far more expensive like
/// [`HashSet`](std::collections::HashSet);
/// this is especially beneficial for portions of the system that make
/// use of nearly all interned symbols,
/// like the ASG.
/// A [`SymbolId`] can be mapped back into its [`Symbol`] by calling
/// [`Interner::index_lookup`] on the same interner that produced it.
///
/// The symbol also stores a string slice referencing the interned string
/// itself,
/// whose lifetime is that of the [`Interner`]'s underlying data store.
/// Dereferencing the symbol will expose the underlying slice.
///
/// [`Interner`]: super::Interner
/// [`Interner::index_lookup`]: super::Interner::index_lookup
#[derive(Copy, Clone, Debug)]
pub struct Symbol<'i, Ix: SymbolIndexSize> {
/// Unique identifier of this symbol
/// (see [`Symbol::index`]).
index: SymbolId<Ix>,
/// Interned string slice,
/// exposed by dereferencing the symbol.
str: &'i str,
}
/// Interned string within a single package.
///
/// This type should be preferred to [`ProgSymbol`] when only a single
/// package's symbols are being processed.
pub type PkgSymbol<'i> = Symbol<'i, global::PkgSymSize>;
/// Interned string within an entire program.
///
/// This symbol type is preconfigured to accommodate a larger number of
/// symbols than [`PkgSymbol`] and is suitable for use in a linker.
/// Use this type only when necessary.
pub type ProgSymbol<'i> = Symbol<'i, global::ProgSymSize>;
impl<'i, Ix: SymbolIndexSize> Symbol<'i, Ix> {
/// Construct a new interned value.
///
/// _This must only be done by an [`Interner`]._
/// As such,
/// this function is not public.
///
/// For test builds (when `cfg(test)`),
/// `new_dummy` is available to create symbols for tests.
///
/// [`Interner`]: super::Interner
#[inline]
pub(super) fn new(index: SymbolId<Ix>, str: &'i str) -> Symbol<'i, Ix> {
Self { index, str }
}
/// Retrieve unique symbol index.
///
/// This is a densely-packed identifier that can be used as an index for
/// mapping.
/// See [`SymbolId`] for more information.
#[inline]
pub fn index(&self) -> SymbolId<Ix> {
self.index
}
/// Construct a new interned value _for testing_.
///
/// This is a public version of [`Symbol::new`] available for test
/// builds.
/// This separate name is meant to strongly imply that you should not be
/// doing this otherwise.
///
/// See also `dummy_symbol!`.
#[cfg(test)]
#[inline(always)]
pub fn new_dummy(index: SymbolId<Ix>, str: &'i str) -> Symbol<'i, Ix> {
Self::new(index, str)
}
}
/// Symbols are compared by the identity of their interned slice,
/// not by string content and not by [`SymbolId`].
///
/// Two symbols are equal when their slices refer to the same memory,
/// meaning both the same starting address _and_ the same length;
/// a slice and a proper prefix of it are therefore distinct.
/// A symbol trivially compares equal to itself and to its copies,
/// since those share the slice.
impl<'i, Ix: SymbolIndexSize> PartialEq for Symbol<'i, Ix> {
    fn eq(&self, other: &Self) -> bool {
        // Comparing `*const str` tests the address and the length.
        std::ptr::eq(self.str, other.str)
    }
}

impl<'i, Ix: SymbolIndexSize> Eq for Symbol<'i, Ix> {}
impl<'i, Ix: SymbolIndexSize> Deref for Symbol<'i, Ix> {
    type Target = str;

    /// Expose the interned string slice.
    ///
    /// This allows for symbols to be used where strings are expected.
    #[inline]
    fn deref(&self) -> &Self::Target {
        let interned: &'i str = self.str;

        interned
    }
}
impl<'i, Ix: SymbolIndexSize> fmt::Display for Symbol<'i, Ix> {
    /// Display name of underlying string.
    ///
    /// Since symbols contain pointers to their interned slices,
    /// we effectively get this for free.
    ///
    /// Formatting is delegated to the slice using the caller's formatter,
    /// so width, fill, alignment and precision
    /// (e.g. `{:>10}` or `{:.3}`)
    /// apply to a symbol exactly as they would to a `&str`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(self.str, f)
    }
}
// Placeholder symbols,
// one per supported index size,
// returned by the respective `SymbolIndexSize::dummy_sym`.
lazy_static! {
/// Dummy 8-bit [`Symbol`] for use at index `0`.
///
/// A symbol must never have an index of `0`,
/// so this can be used as a placeholder.
/// The chosen [`SymbolId`] here does not matter since this will
/// never be referenced;
/// it must merely be non-zero,
/// since [`SymbolId::from_int`] panics on `0`.
static ref DUMMY_SYM_8: Symbol<'static, u8> =
Symbol::new(SymbolId::from_int(1), "!BADSYMREF!");
/// Dummy 16-bit [`Symbol`] for use at index `0`.
///
/// A symbol must never have an index of `0`,
/// so this can be used as a placeholder.
/// The chosen [`SymbolId`] here does not matter since this will
/// never be referenced;
/// it must merely be non-zero,
/// since [`SymbolId::from_int`] panics on `0`.
static ref DUMMY_SYM_16: Symbol<'static, u16> =
Symbol::new(SymbolId::from_int(1), "!BADSYMREF!");
/// Dummy 32-bit [`Symbol`] for use at index `0`.
///
/// A symbol must never have an index of `0`,
/// so this can be used as a placeholder.
/// The chosen [`SymbolId`] here does not matter since this will
/// never be referenced;
/// it must merely be non-zero,
/// since [`SymbolId::from_int`] panics on `0`.
static ref DUMMY_SYM_32: Symbol<'static, u32> =
Symbol::new(SymbolId::from_int(1), "!BADSYMREF!");
}
// Unit tests for symbol identity, dereferencing, indexing and display.
#[cfg(test)]
mod test {
use super::*;
// A symbol compared against itself by reference is equal.
#[test]
fn self_compares_eq() {
let sym = Symbol::new(SymbolId::from_int(1u16), "str");
assert_eq!(&sym, &sym);
}
// A copy refers to the same slice and so compares equal.
#[test]
fn copy_compares_equal() {
let sym = Symbol::new(SymbolId::from_int(1u16), "str");
let cpy = sym;
assert_eq!(sym, cpy);
}
// Integer values are for convenience, not identity. They cannot be
// used as a unique identifier across different interners.
#[test]
fn same_index_different_slices_compare_unequal() {
let a = Symbol::new(SymbolId::from_int(1u16), "a");
let b = Symbol::new(SymbolId::from_int(1u16), "b");
assert_ne!(a, b);
}
// As mentioned above, ids are _not_ the identity of the symbol. If
// two values point to the same location in memory, they are assumed
// to have come from the same interner, and should therefore have
// the same index. Differing indexes for the same slice should never
// happen unless symbols are being created without the use of
// interners, which is unsupported.
//
// This test is a cautionary tale.
#[test]
fn different_index_same_slices_compare_equal() {
let slice = "str";
let a = Symbol::new(SymbolId::from_int(1u16), slice);
let b = Symbol::new(SymbolId::from_int(2u16), slice);
assert_eq!(a, b);
}
// An explicit clone behaves like a copy.
#[test]
fn cloned_symbols_compare_equal() {
let sym = Symbol::new(SymbolId::from_int(1u16), "foo");
assert_eq!(sym, sym.clone());
}
// &Symbol can be used where string slices are expected (this won't
// compile otherwise).
#[test]
fn ref_can_be_used_as_string_slice() {
let slice = "str";
let sym_slice: &str = &Symbol::new(SymbolId::from_int(1u16), slice);
assert_eq!(slice, sym_slice);
}
// For use when we can guarantee proper ids.
#[test]
fn can_create_index_unchecked() {
assert_eq!(SymbolId::from_int(1u32), unsafe {
SymbolId::from_int_unchecked(1)
});
}
// The index given at construction is the one reported back.
#[test]
fn can_retrieve_symbol_index() {
let index = SymbolId::from_int(1u16);
assert_eq!(index, Symbol::new(index, "").index());
}
// Displaying a symbol yields its interned string.
#[test]
fn displays_as_interned_value() {
let sym = Symbol::new(SymbolId::from_int(1u16), "foo");
assert_eq!(format!("{}", sym), sym.str);
}
}