tamer: Introduce span

This is an initial implementation optimized for expected use cases. Hopefully that pans out and doesn't come back to bite me. Regarding the context: it only allows for interned paths atm, which are strings (and so much be valid UTF-8, which is fine for us, but sucks for something more general-purpose). I'll be curious if the context needs extension later on, or if different contexts will be stored in IRs (e.g. to store a template application site as well as the location of the expansion within the template body).
2021-08-13 14:59:25 -04:00 · 2021-08-13 14:59:25 -04:00 · 0ff0f88e5f
parent 29ab4b9bfc
commit 0ff0f88e5f
3 changed files with 503 additions and 1 deletions
--- a/tamer/src/global.rs
+++ b/tamer/src/global.rs
@ -24,14 +24,20 @@
 //!   and risk incompatibilities or maintenance issues as requirements
 //!   change.
 //!
+//! This package contains static assertions to ensure that certain sizes
+//!   will play well with one-another and provide a clear direction if
+//!   something goes wrong
+//!     (beyond a slew of other type errors).
+//!
 //! By convention,
 //!   import this entire module rather than individual members and reference
 //!   them as `global::foo` to emphasize their nature and risk.

-use std::num;
+use std::{mem::size_of, num};

 /// A size capable of representing every interned string in a package.
 pub type PkgSymSize = u16;
+const_assert!(size_of::<PkgSymSize>() <= size_of::<ProgSymSize>());

 /// A non-zero equivalent of [`PkgSymSize`];
 pub type NonZeroPkgSymSize = num::NonZeroU16;
@ -42,6 +48,21 @@ pub type ProgSymSize = u32;
 /// A non-zero equivalent of [`ProgSymSize`];
 pub type NonZeroProgSymSize = num::NonZeroU32;

+/// Size large enough to accommodate the number of bytes in a source file.
+pub type SourceFileSize = u32;
+
+/// Length of tokens derived from source files.
+///
+/// This size should be kept small,
+///   since its principle use is as an alternative to a byte offset
+///   ([`SourceFileSize`]) for struct packing.
+/// It should, however, be large enough to handle comment and
+///   documentation tokens.
+/// Tokens that are too large can potentially be split into multiple
+///   tokens.
+pub type FrontendTokenLength = u16;
+const_assert!(size_of::<FrontendTokenLength>() <= size_of::<SourceFileSize>());
+
 /// A size capable of representing indexes of each individual identifier
 ///   within a single package.
 ///
@ -53,6 +74,7 @@ pub type NonZeroProgSymSize = num::NonZeroU32;
 ///
 /// This must be ≥ [`PkgSymSize`].
 pub type PkgIdentSize = u16;
+const_assert!(size_of::<PkgIdentSize>() >= size_of::<PkgSymSize>());

 /// A size capable of representing every individual identifier and
 ///   expression within a single package.
@ -63,12 +85,14 @@ pub type PkgIdentSize = u16;
 ///   this must accommodate far more than the user's expectations
 ///     working within the provided level of abstraction.
 pub type PkgIdentExprSize = u32;
+const_assert!(size_of::<PkgIdentExprSize>() <= size_of::<ProgIdentExprSize>());

 /// A size capable of representing the union of every identifier of every
 ///   package used by an entire program.
 ///
 /// This must be ≥ [`ProgSymSize`].
 pub type ProgIdentSize = u32;
+const_assert!(size_of::<ProgIdentSize>() >= size_of::<ProgSymSize>());

 /// A size capable of representing the union of every identifier and every
 ///   expression of every package used by an entire program.
@ -81,3 +105,4 @@ pub type ProgIdentSize = u32;
 ///
 /// This must be ≥ [`ProgSymSize`].
 pub type ProgIdentExprSize = u32;
+const_assert!(size_of::<ProgIdentExprSize>() >= size_of::<ProgSymSize>());
--- a/tamer/src/lib.rs
+++ b/tamer/src/lib.rs
@ -34,6 +34,7 @@ pub mod fs;
 pub mod ir;
 pub mod ld;
 pub mod obj;
+pub mod span;
 pub mod sym;
 pub mod tpwrap;

--- a/tamer/src/span.rs
+++ b/tamer/src/span.rs
@ -0,0 +1,476 @@
+// Source spans
+//
+//  Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
+//
+//  This file is part of TAME.
+//
+//  This program is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  This program is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+//! Mapping to source input byte intervals.
+//!
+//! A [`Span`] is a mapping to a byte interval within a source file,
+//!   representing primarily where some IR entity originated.
+//! This underpins the diagnostic system,
+//!   intended to:
+//!
+//!   1. Give the user specific information for debugging errors in their
+//!       programs; and
+//!   2. Provide high-resolution information for source code inquiries,
+//!        such as "where is this identifier?" and "what exists at my cursor
+//!        position within this file"?
+//!
+//! A span contains a [`Context`] representing the source location.
+//! A context's path is a [`PathIndex`],
+//!   which represents an interned string slice,
+//!     _not_ a [`PathBuf`](std::path::PathBuf) or
+//!     [`OsStr`](std::ffi::OsStr).
+//!
+//! ```
+//! use tamer::span::{Span, Context};
+//! use tamer::sym::GlobalSymbolIntern;
+//!
+//! // From raw parts
+//! let ctx: Context = "some/path/foo".intern().into();
+//! let span = Span::new(2, 6, ctx);
+//!
+//! assert_eq!(2, span.offset());
+//! assert_eq!(6, span.len());
+//! assert_eq!(ctx, span.ctx());
+//!
+//! // From a closed byte interval
+//! let spani = Span::from_byte_interval((10, 25), "some/path/bar".intern());
+//! assert_eq!(10, spani.offset());
+//! assert_eq!(15, spani.len());
+//!
+//! // Freely copyable
+//! let cp = span;
+//! assert_eq!(cp, span);
+//! ```
+//!
+//! Span is expected to be able to fit within a general-purpose CPU register
+//!   on a 64-bit system, and so does not exceed 8 bytes in length.
+//!
+//! Spans are one of the most common objects in TAMER,
+//!   competing only with [symbols](crate::sym).
+//! But unlike symbols,
+//!   [`Span`]s are designed to be meaningfully identifiable and copyable
+//!   without interning.
+//!
+//! A span is ordered as such:
+//!
+//!   1. Spans group by [`Context`],
+//!        though the relative ordering of each [`Context`] isn't
+//!        necessarily meaningful;
+//!   2. Spans are then ordered relative to their offset; and
+//!   3. Spans are finally ordered by their length.
+//!
+//! Note that this means that a span beginning after but ending before
+//!   another span will still order higher,
+//!     as shown in the example below.
+//!
+//! ```
+//! # use tamer::span::{Span, Context};
+//! # use tamer::sym::GlobalSymbolIntern;
+//! #
+//! # let ctx: Context = "some/path/foo".intern().into();
+//! #
+//! // Visualization of spans:
+//! // [..... ..... ..... .....]
+//! //    [A====]   [B==] |   
+//! //        | |   [C=]  |
+//! //        | |   [D====]
+//! //        | |    [E]  |
+//! //        [F=====]    |
+//! //        [G=======]  |
+//!
+//! let A = Span::new(2,  6, ctx);
+//! let B = Span::new(10, 5, ctx);
+//! let C = Span::new(10, 4, ctx);
+//! let D = Span::new(10, 6, ctx);
+//! let E = Span::new(11, 3, ctx);
+//! let F = Span::new(5,  7, ctx);
+//! let G = Span::new(5,  8, ctx);
+//!
+//! let mut spans = vec![A, B, C, D, E, F, G];
+//! spans.sort();
+//!
+//! assert_eq!(spans, vec![A, F, G, C, B, D, E]);
+//! ```
+//!
+//! Design Rationale
+//! ================
+//! It is expected that spans will be created and copied frequently,
+//!   as they are propagated to every IR in the system.
+//! It is further expected that the data within a span will only be
+//!   referenced for diagnostic purposes,
+//!     or for utilities operating on original source code
+//!       (such as code formatters).
+//!
+//! When a span is referenced,
+//!   it will either be to determine the exact location of a particular
+//!     entity,
+//!   or it will be to attempt to locate a similar entity in a higher-level
+//!     IR associated with the same region of code.
+//! The latter requires that spans be comparable in a meaningful way,
+//!   exhibiting at least partial ordering.
+//!
+//! Spans are therefore optimized for three primary use cases:
+//!   - copying;
+//!   - comparison; and
+//!   - ordering.
+//!
+//! Span Structure
+//! --------------
+//! Spans are packed into 64-bit values that can be readily converted into a
+//!   [`u64`] value that is totally ordered relative to a given [`Context`],
+//!   byte offset, and byte length.
+//! This means that sorting a collection of [`Span`]s will group spans by
+//!   their [`Context`];
+//!     will sort those spans relative to their starting offset within that
+//!       context; and
+//!     will sort again by the ending offset.
+//!
+//! This means that spans are [`Eq`] and [`Ord`],
+//!   and efficiently so by simply comparing the byte values of the entire
+//!   struct as a single [`u64`].
+//! This allows spans to be sorted relative to their positions within a
+//!   context;
+//!     be placed into a binary tree for mapping back to higher-level IRs;
+//!     gives spans a meaningful unique identifier;
+//!     be freely copied without cost;
+//!     and more,
+//!       all very efficiently and without having to access individual
+//!         struct members.
+//!
+//! To accomplish this,
+//!   [`Span`] uses `repr(packed)` and orders the fields for little endian
+//!   systems like `x86_64`,
+//!     which is what our team uses.
+//! The `packed` representation had to be used because the byte orderings
+//!   are [`u16`], [`u32`], [`u16`],
+//!     which makes the [`u32`] byte offset unaligned.
+//! Note that,
+//!   while this _is_ unaligned,
+//!   this is _not_ unaligned _memory_ access,
+//!   since the entire [`Span`] will be retrieved from (aligned) memory at
+//!     once;
+//!       the unaligned fields within the [`u64`] do not incur a measurable
+//!       performance cost.
+//!
+//! Related Work
+//! ============
+//! This span is motivated by [rustc's compressed `Span`](rustc-span).
+//! TAMER's span size relies on [`global::PkgSymSize`] being 16 bits in length,
+//!   which _should_ be a reasonable assumption.
+//! If ever that assumption becomes violated,
+//!   and it is deemed that packages containing so many symbols should be permitted,
+//!   TAMER's [`Span`] can accommodate in a similar with to rustc's by
+//!     interning the larger span data and tagging this span as such.
+//!
+//!
+//! [rustc-span]: https://doc.rust-lang.org/stable/nightly-rustc/rustc_span/struct.Span.html
+
+use std::convert::TryInto;
+
+use crate::global;
+use crate::sym::PkgSymbolId;
+
+/// Description of a source location and byte interval for some object.
+///
+/// Spans represent byte intervals within a given source context.
+/// A span should map to useful positions for helping users debug error
+///   messages.
+/// If code is generated, desugared, or otherwise manipulated,
+///   the span ought to reference the original location of the code that can
+///   be referenced and modified to correct any problems.
+///
+/// See the [module-level documentation](self) for more information.
+#[cfg(target_endian = "little")]
+#[repr(packed)]
+#[derive(Debug, Clone, Copy)]
+pub struct Span {
+    /// Token length (ending byte offset - `offset`).
+    len: global::FrontendTokenLength,
+
+    /// Starting 0-indexed byte position, inclusive.
+    offset: global::SourceFileSize,
+
+    /// Context onto which byte offsets are mapped,
+    ///   such as a source file.
+    ctx: Context,
+}
+
+impl Span {
+    /// Create a new span from its constituent parts.
+    pub fn new<C: Into<Context>>(
+        offset: global::SourceFileSize,
+        len: global::FrontendTokenLength,
+        ctx: C,
+    ) -> Self {
+        Self {
+            ctx: ctx.into(),
+            offset,
+            len,
+        }
+    }
+
+    /// Create a span from a byte interval and context.
+    ///
+    /// Panics
+    /// ======
+    /// This will panic in the unlikely case that the difference between the
+    ///   start and end of the interval exceeds the maximum of
+    ///   [`global::FrontendTokenLength`].
+    ///
+    /// If this error occurs,
+    ///   the parser should consider splitting large tokens up into multiple
+    ///   tokens;
+    ///     increasing [`global::FrontendTokenLength`] should be a last
+    ///     resort,
+    ///       since it has wide-reaching implications on the size of
+    ///       [`Span`].
+    ///
+    /// The user is not expected to know how to recover from this error
+    ///   without debugging the compiler.
+    /// It is not expected that this would occur on any valid inputs.
+    pub fn from_byte_interval<B, C>(interval: B, ctx: C) -> Self
+    where
+        B: Into<ClosedByteInterval>,
+        C: Into<Context>,
+    {
+        let binterval = interval.into();
+
+        Self {
+            offset: binterval.0,
+            len: (binterval.1 - binterval.0)
+                .try_into()
+                .expect("span length exceeds global::FrontendTokenLength"),
+            ctx: ctx.into(),
+        }
+    }
+
+    // A span represented uniquely as a totally ordered [`u64`].
+    //
+    // For more information on this important properly,
+    //   see the documentation for [`Span`] itself.
+    pub fn as_u64(self) -> u64 {
+        // We take a number of precautions to make this safe (in the sense
+        //   of correctness),
+        //     through struct packing and a `cfg` directive for endianness.
+        // In any case,
+        //   a `u64` isn't going to harm anyone.
+        unsafe { std::mem::transmute(self) }
+    }
+
+    /// Byte offset of the beginning of the span relative to its context.
+    pub fn offset(&self) -> global::SourceFileSize {
+        self.offset
+    }
+
+    /// Length of the span in bytes.
+    ///
+    /// The interval of the span is `[offset, offset+len]`.
+    pub fn len(&self) -> global::FrontendTokenLength {
+        self.len
+    }
+
+    /// The context to which the span applies.
+    ///
+    /// The context is, for example, a file.
+    pub fn ctx(&self) -> Context {
+        self.ctx
+    }
+}
+
+impl Into<u64> for Span {
+    fn into(self) -> u64 {
+        self.as_u64()
+    }
+}
+
+impl PartialEq for Span {
+    fn eq(&self, other: &Self) -> bool {
+        self.as_u64() == other.as_u64()
+    }
+}
+
+impl Eq for Span {}
+
+impl PartialOrd for Span {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for Span {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.as_u64().cmp(&other.as_u64())
+    }
+}
+
+// This assertion verifies our above expectations.
+// If this fails,
+//   then you have either modified [`global`] constants or you have modified
+//   the fields of [`Span`] itself,
+//     in which case you should read "Related Work" above to determine
+//     whether this was a good idea or if interned spans should be
+//     introduced.
+// In any case,
+//   hopefully this was planned for,
+//   because otherwise your week has just been ruined.
+assert_eq_size!(Span, u64);
+
+/// Context for byte offsets (e.g. a source file).
+///
+/// A context is lifetime-free and [`Copy`]-able,
+///   with the assumption that an interned [`PathIndex`] will only need to
+///   be resolved to its underlying value in a diagnostic context where the
+///   internment system is readily available.
+///
+/// Since this is used within [`Span`],
+///   it must be kept as small as possible.
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct Context(PathIndex);
+
+impl<P: Into<PathIndex>> From<P> for Context {
+    fn from(path: P) -> Self {
+        Self(path.into())
+    }
+}
+
+/// An interned path.
+///
+/// This is interned as a string slice ([`PkgSymbolId`]),
+///   not a `PathBuf`.
+/// Consequently,
+///   it is not an `OsStr`.
+///
+/// This newtype emphasizes that it differs from typical symbol usage,
+///   especially given that it'll always use the [`PkgSymbolId`] interner,
+///   _not_ necessarily whatever global interner is used for all other
+///   symbols.
+/// In the future,
+///   these may be interned separately.
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct PathIndex(PkgSymbolId);
+
+impl From<PkgSymbolId> for PathIndex {
+    fn from(sym: PkgSymbolId) -> Self {
+        Self(sym)
+    }
+}
+
+/// A closed interval (range of values including its endpoints) representing
+///   source bytes associated with a token.
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct ClosedByteInterval<T = global::SourceFileSize>(pub T, pub T)
+where
+    T: Copy + PartialOrd;
+
+impl<T: Copy + PartialOrd> From<(T, T)> for ClosedByteInterval<T> {
+    /// Convert a tuple into a closed byte interval where the first index
+    ///   represents the start of the interval and the second index the
+    ///   end.
+    ///
+    /// Panics
+    /// ======
+    /// The second value must be ≥ the first.
+    fn from(src: (T, T)) -> Self {
+        assert!(src.1 >= src.0);
+
+        Self(src.0, src.1)
+    }
+}
+
+assert_eq_size!(ClosedByteInterval, u64);
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::sym::GlobalSymbolIntern;
+
+    // Little endian check.
+    //
+    // This ensures that the byte ordering is as expected,
+    //   otherwise the resulting integer will not have the properties we
+    //   require for sorting and comparison.
+    #[cfg(target_endian = "little")]
+    #[test]
+    fn span_pack_le() {
+        let span =
+            Span::new(0xA3A2A1A0, 0xB1B0, PkgSymbolId::test_from_int(0xC1C0));
+
+        assert_eq!(
+            0xC1C0_A3A2A1A0_B1B0,
+            // ^       ^     ^
+            // ctx   offset  len
+            span.as_u64(),
+            "endianness check failed: {:X?}",
+            span.as_u64()
+        );
+    }
+
+    #[cfg(target_endian = "big")]
+    #[test]
+    fn span_pack_be_not_supported() {
+        panic!("Big-endian systems are not currently supported");
+    }
+
+    // The tests that follow are corollaries of the above, but the below
+    // tests do test that the implementations function as intended.
+
+    #[test]
+    fn span_at_later_offset_in_same_context_compares_greater() {
+        let ctx = "imaginary".intern();
+        let first = Span::new(10, 5, ctx);
+        let second = Span::new(20, 5, ctx);
+
+        // These two assertions must be identical.
+        assert!(second > first);
+        assert!(second.as_u64() > first.as_u64());
+    }
+
+    #[test]
+    fn spans_order_by_context_start_and_len() {
+        let ctxa = "context a".intern();
+        let ctxb = "context b".intern();
+
+        // Sanity check, otherwise this test won't work as expected.
+        assert!(ctxa < ctxb);
+
+        let sa1 = Span::new(10, 1, ctxa);
+        let sa2 = Span::new(22, 1, ctxa);
+        let sa3 = Span::new(35, 1, ctxa);
+
+        let sb1 = Span::new(11, 1, ctxb);
+        let sb2 = Span::new(20, 1, ctxb);
+        let sb3 = Span::new(33, 1, ctxb);
+
+        let mut spans = vec![sa3, sb2, sb1, sa2, sa1, sb3];
+        spans.sort();
+
+        assert_eq!(spans, vec![sa1, sa2, sa3, sb1, sb2, sb3]);
+    }
+
+    #[test]
+    pub fn retrieve_span_components() {
+        let ctx: Context = "foo".intern().into();
+        let offset = 100;
+        let len = 50;
+
+        let span = Span::new(offset, len, ctx);
+
+        assert_eq!((offset, len, ctx), (span.offset(), span.len(), span.ctx()));
+    }
+}