tame/tamer/src/sym/prefill.rs

// Pre-interned strings
//
// Copyright (C) 2014-2023 Ryan Specialty, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//! Pre-interned strings.
//!
//! These strings are expected to be encountered nearly every run,
//! and substitute static strings that would otherwise appear hard-coded
//! in the system and have to be interned to be compared against other
//! values.
//!
//! See the [parent module](super) for more information.
use super::{Interner, SymbolId, SymbolIndexSize};
use crate::global;
/// Static symbol identifier that is stable between runs of the same version
/// of TAMER.
///
/// This symbol id is allocated at compile-time.
///
/// Safety
/// ======
/// All objects implementing this trait must have the same byte
/// representation as its inner [`SymbolId`].
pub unsafe trait StaticSymbolId<Ix: SymbolIndexSize = global::ProgSymSize>:
private::Sealed
{
// Traits cannot contain constant functions.
// See [`st_as_sym`] below.
}
/// Convert any [`StaticSymbolId`] into its inner [`SymbolId`].
///
/// Static symbols are typed to convey useful information to newtypes that
/// wish to wrap or compose them.
/// This function peels back that type information to expose the inner
/// symbol.
///
/// Safety and Rationale
/// ====================
/// This function does its best to work around the limitation in Rust that
/// traits cannot contain constant functions
/// (at the time of writing).
///
/// To do this,
/// we require that every object of type [`StaticSymbolId`] have _the same
/// byte representation_ as [`SymbolId`].
/// Since Rust optimizes away simple newtype wrappers,
/// this means that we can simply cast the value to a symbol.
///
/// For example, if we have `StaticSymbolId<u32>`,
/// this would cast to a `SymbolId<u32>`.
/// The inner value of `SymbolId<u32>` is
/// `<u32 as SymbolIndexSize>::NonZero`,
/// which has the same byte representation as `u32`.
///
/// This would normally be done using [`std::mem::transmute`],
/// which ensures that the two types have compatible sizes.
/// Unfortunately,
/// the types here do not have fixed size and constant functions are
/// unable to verify that they are compatible at the time of writing.
/// We therefore must use [`std::mem::transmute_copy`] to circumvent this
/// size check.
///
/// Circumventing this check is safe given our trait bounds for all static
/// symbols in this module and its children.
/// However,
/// for this safety to hold,
/// we must ensure that no outside modules can implement
/// [`StaticSymbolId`] on their own objects.
/// For this reason,
/// [`StaticSymbolId`] implements [`private::Sealed`].
///
/// With that,
/// we get [`SymbolId`] polymorphism despite Rust's limitations.
///
/// A Note About Nightly
/// ====================
/// At the time of writing,
/// though,
/// this _does_ require two unstable features:
/// `const_fn_trait_bound` and `const_transmute_copy`.
/// We can get rid of the latter using raw pointer casts,
/// just as [`std::mem::transmute_copy`] does internally,
/// but since we're already relying on unstable flags,
/// we may as well use it while we require nightly for other things as
/// well.
///
/// `const_fn_trait_bound` cannot be removed in this situation without
/// another plan.
/// `const_panic` could be used with an enum,
/// but that still requires nightly.
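///
/// Example
/// ======
/// A brief sketch of the intended equivalence,
/// assuming the pre-interned symbols defined later in this module are
/// in scope
/// (marked `ignore` since this is illustrative rather than a doctest):
///
/// ```ignore
/// // Peeling back the static type yields the same `SymbolId` as
/// // calling `as_sym` on the newtype directly.
/// assert_eq!(st_as_sym(&st::L_TRUE), st::L_TRUE.as_sym());
/// ```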
pub const fn st_as_sym<T, Ix>(st: &T) -> SymbolId<Ix>
where
T: StaticSymbolId<Ix>,
Ix: SymbolIndexSize,
{
// SAFETY: A number of precautions are taken to make this a safe and
// sensible transformation; see function doc above.
SymbolId(unsafe { std::mem::transmute_copy(st) })
}
/// Generate a newtype containing a condensed [`SymbolId`].
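///
/// For example,
/// a hypothetical invocation
/// (`u32` is used here for concreteness;
/// the real invocations below use `global::ProgSymSize`)
/// roughly expands as follows:
///
/// ```ignore
/// static_symbol_newtype!(
///     /// A symbol suitable as a C-style identifier.
///     CIdentStaticSymbolId<u32>
/// );
///
/// // ...expands (roughly) into:
/// pub struct CIdentStaticSymbolId(<u32 as SymbolIndexSize>::NonZero);
/// // along with `new`, `as_sym`, `as_usize`,
/// // `From<CIdentStaticSymbolId> for SymbolId<u32>`,
/// // and the `StaticSymbolId`/`Sealed` impls.
/// ```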
macro_rules! static_symbol_newtype {
($(#[$attr:meta])* $name:ident<$size:ty>) => {
$(#[$attr])*
/// This is a statically-allocated symbol.
///
/// This symbol is generated at compile-time and expected to be
/// available in the 32-bit global interner once it has been
/// initialized.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct $name(<$size as SymbolIndexSize>::NonZero);
// Mark this as a static symbol type, ensuring that its size is fully
// compatible with the underlying `SymbolId` so as not to cause
// problems with `st_as_sym`.
impl private::Sealed for $name {}
unsafe impl StaticSymbolId<$size> for $name {}
assert_eq_size!($name, SymbolId<$size>);
impl $name {
const fn new(id: $size) -> Self {
Self(unsafe {
<$size as SymbolIndexSize>::NonZero::new_unchecked(id)
})
}
/// Cast static symbol into a [`SymbolId`] suitable for the global
/// program-level interner.
///
/// This is safe since the global interner will always contain this
/// symbol before it can be read.
pub const fn as_sym(self) -> SymbolId<$size> {
SymbolId(self.0)
}
pub const fn as_usize(self) -> usize {
self.0.get() as usize
}
}
impl From<$name> for SymbolId<$size> {
fn from(st: $name) -> Self {
st.as_sym()
}
}
};
}
/// Generate a series of newtypes and the macro `static_symbol_ty!` which
/// can be used to take a short identifier and convert it into its full
/// type identifier.
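///
/// For example,
/// given the invocation at the bottom of this module,
/// `static_symbol_ty!(cid)` expands to `CIdentStaticSymbolId`
/// and `static_symbol_ty!(uri)` to `UriStaticSymbolId`.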
macro_rules! static_symbol_newtypes {
($($(#[$attr:meta])* $short:ident: $ty:ident<$size:ty>,)*) => {
$(
static_symbol_newtype!($(#[$attr])* $ty<$size>);
)*
macro_rules! static_symbol_ty {
$(
($short) => {
$ty
};
)*
}
}
}
/// Generate symbols for preinterned strings.
///
/// These symbols,
/// rather than being generated by the global internment system,
/// are generated statically.
/// Once the global interner is initialized
/// (see [parent module](`super`)),
/// which is on first access,
/// these symbols will reference valid values.
macro_rules! static_symbol_consts {
(@i $i:expr; <$size:ty> $name:ident: $ty:ident $str:expr, $($tail:tt)*) => {
#[doc=concat!(
"Interned `",
stringify!($ty),
"` ",
static_symbol_consts!(@!str $ty $str),
"."
)]
#[doc=""]
#[doc=concat!(
"For the raw (untyped) version, see [`raw::",
stringify!($name),
"`]."
)]
pub const $name: static_symbol_ty!($ty) =
<static_symbol_ty!($ty)>::new($i);
// Recurse until no tail is left (terminating condition below).
static_symbol_consts!{
// This will result in 1 + 1 + 1 + 1 ... and will eventually hit
// the recursion limit if we have too many static symbols, after
// which time we may have to switch methodology.
@i $i + 1;
<$size>
$($tail)*
}
};
// Terminating condition.
(@i $i:expr; <$size:ty>) => {
/// Number of statically allocated symbols.
///
/// This can be used to help determine a base capacity for
/// collections holding [`SymbolId`]s.
pub const ST_COUNT: usize = $i - 1;
};
// Whitespace with newlines causes rustdoc parsing issues.
(@!str ws $str:expr) => {
"whitespace"
};
(@!str $ty:ident $str:expr) => {
concat!("string `\"", $str, "\"`")
};
}
/// Statically allocate [`SymbolId`]s for the provided symbols,
/// and schedule their static strings to be interned upon initialization
/// of the global interner.
///
/// This generates `fill`,
/// which the global interners call by default.
/// Any interner may optionally invoke this,
/// immediately after initialization,
/// /before/ any internment requests.
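///
/// For example,
/// a hypothetical invocation
/// (`L_FOO` and its string are made up for illustration)
///
/// ```ignore
/// static_symbols! {
///     <u32>;
///     L_FOO: cid "foo",
///     END_STATIC: mark "###END"
/// }
/// ```
///
/// generates the typed constants `L_FOO` and `END_STATIC`,
/// raw [`SymbolId`] constants of the same names under a `raw` module,
/// the count `ST_COUNT`,
/// and a `fill` function that interns `"foo"` and then `"###END"` in
/// that order.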
macro_rules! static_symbols {
(<$size:ty>; $($name:ident : $ty:ident $str:expr),*) => {
static_symbol_consts! {
// Index 0 is not valid, so begin at 1
@i 1;
<$size>
$(
$name: $ty $str,
)*
}
/// Expose each of the [typed static symbols](super) as raw
/// [`SymbolId`] values.
///
/// These constants are useful for `match` expressions and any other
/// contexts where the type of the symbol must match,
/// and where the static type metadata is unimportant.
///
/// This is equivalent to calling `as_sym` on the static newtype,
/// or using [`st_as_sym`](super::super::st_as_sym).
pub mod raw {
use super::SymbolId;
$(
#[doc=concat!(
"Raw (untyped) interned `",
stringify!($ty),
"` ",
static_symbols!(@!str $ty $str),
"."
)]
#[doc=""]
#[doc=concat!(
"For the typed version, see [`super::",
stringify!($name),
"`]."
)]
pub const $name: SymbolId<$size> = super::$name.as_sym();
)*
}
/// Fill a new interner with static symbols.
///
/// Panics
/// ======
/// This function will panic if the interner has any symbols,
/// which would cause misalignment with the generated constants.
pub(in super::super) fn fill<'a, I, Ix>(interner: I) -> I
where
I: Interner<'a, Ix>,
Ix: SymbolIndexSize
{
assert!(
interner.len() == 0,
"cannot fill non-empty Interner with static symbols"
);
// This array does not exist as a constant, because that would
// require that we count the number of items first for the
// sake of the type definition.
// This is more convenient.
[
$(
$str,
)*
].into_iter().for_each(|sym| { interner.intern(sym); });
interner
}
};
// Whitespace with newlines causes rustdoc parsing issues.
(@!str ws $str:expr) => {
"whitespace"
};
(@!str $ty:ident $str:expr) => {
concat!("string `\"", $str, "\"`")
};
}
static_symbol_newtypes! {
/// A symbol suitable as a C-style identifier.
///
/// This is the traditional `[a-zA-Z_][a-zA-Z0-9_]*`,
/// common in many programming languages.
cid: CIdentStaticSymbolId<global::ProgSymSize>,
/// Base-10 (decimal) integer value as a string.
dec: DecStaticSymbolId<global::ProgSymSize>,
/// A symbol resembling a QName of the form `prefix:local`.
///
/// A symbol of this type does _not_ mean that the symbol is intended to
/// be a QName;
/// this is merely a way to describe it.
/// For example,
/// `map:head` is intended as an identifier type,
/// not a QName.
qname: QnameIdentStaticSymbolId<global::ProgSymSize>,
/// This symbol serves only as a marker in the internment pool to
/// delimit symbol ranges;
/// its string value is incidental and should not be relied upon.
mark: MarkStaticSymbolId<global::ProgSymSize>,
/// A symbol suitable as a TAME identifier.
///
/// This is [`CIdentStaticSymbolId`] with `-` added:
/// `[a-zA-Z_-][a-zA-Z0-9_-]*`.
/// This is also suitable as an XML node or attribute name.
tid: TameIdentStaticSymbolId<global::ProgSymSize>,
/// Symbol representing a URI.
///
/// This is intended for use primarily as an XML namespace.
/// URIs are expected to _not_ contain quotes and other characters that
/// may need escaping in XML attributes.
uri: UriStaticSymbolId<global::ProgSymSize>,
/// Any other generic string that does not fit into any particular type.
str: GenericStaticSymbolId<global::ProgSymSize>,
/// Common strings of whitespace
/// (where a character of whitespace is `[ \n]`).
///
/// There are certainly other whitespace characters,
/// but this is intended to be conservative to address only the most
/// common cases.
ws: WhitespaceStaticSymbolId<global::ProgSymSize>,
/// Static 16-bit [`Span`](crate::span::Span) context.
///
/// These contexts are intended for use in generated code where a better
/// context cannot be derived.
ctx: ContextStaticSymbolId<u16>,
/// This symbol serves only as a marker in the internment pool to
/// delimit symbol ranges;
/// its string value is incidental and should not be relied upon.
mark16: Mark16StaticSymbolId<u16>,
}
/// Static symbols (pre-allocated).
///
/// Each of the constants in this module represent a [`SymbolId`] statically
/// allocated at compile-time.
/// The strings that they represent are automatically populated into the
/// global interners when the interner is first accessed.
///
/// _You should always use the generated constant to reference these
/// symbols!_
/// Do not rely upon their integer value,
/// as it _will_ change over time.
/// The sole exception is to use marker symbols to identify ranges
/// of symbols;
/// see [`MarkStaticSymbolId`].
///
/// See [`crate::sym`] for more information on static symbols.
///
/// `static` is a keyword in Rust,
/// so we shorten the module name to `st`.
///
/// The constants follow a naming convention:
/// - `L_` indicates that the identifier is all-lowercase.
pub mod st {
use super::*;
/// Convert `0 ≤ n ≤ 9` into a static symbol representing a single
/// decimal digit.
///
/// Panics
/// ======
/// This will panic if `n > 9`.
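///
/// Example
/// ======
/// A small sketch,
/// mirroring the `decimal1_0_to_9` test at the bottom of this file
/// (`lookup_str` requires `crate::sym::GlobalSymbolResolve`):
///
/// ```ignore
/// assert_eq!(decimal1(3), DecStaticSymbolId::from(3));
/// assert_eq!(decimal1(3).as_sym().lookup_str(), "3");
/// ```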
pub fn decimal1(n: u8) -> DecStaticSymbolId {
assert!(n < 10);
// The symbols are expected to be in a very specific position in the
// pool (n+1).
// This is verified by tests at the bottom of this file.
DecStaticSymbolId(unsafe {
<global::ProgSymSize as SymbolIndexSize>::NonZero::new_unchecked(
(n as global::ProgSymSize) + 1,
)
})
}
impl From<u8> for DecStaticSymbolId {
/// Convert `0 ≤ n ≤ 9` into a static symbol representing a single
/// decimal digit.
///
/// See [`decimal1`].
fn from(n: u8) -> Self {
decimal1(n)
}
}
/// Whether the provided symbol is part of the static symbol list that
/// is pre-interned.
#[inline]
pub fn is_pre_interned(sym: SymbolId) -> bool {
let symid = sym.as_usize();
symid <= END_STATIC.as_usize()
}
/// Whether the given [`SymbolId`] is within a group of symbols
/// delimited by markers `a` and `b`.
///
/// This provides a _reasonably_ efficient way to compare a [`SymbolId`]
/// against a large set of [`SymbolId`]s.
/// There are more efficient ways to accomplish this,
/// though,
/// if performance ever does become a concern;
/// the current implementation is kept simple until then.
#[inline]
pub fn is_between_markers(
a: MarkStaticSymbolId,
b: MarkStaticSymbolId,
sym: SymbolId,
) -> bool {
let symid = sym.as_usize();
symid > a.as_usize() && symid < b.as_usize()
}
/// Whether the provided [`SymbolId`] is recognized as a common
/// whitespace symbol in the preinterned symbol list.
///
/// If this returns `true`,
/// then this is a quick way to determine that the provided
/// [`SymbolId`] does contain only whitespace.
/// However,
/// this is _not_ comprehensive and never will be,
/// so an answer of `false` means "it may or may not be whitespace";
/// you should fall back to other methods of checking for
/// whitespace if this fails.
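///
/// For example
/// (a sketch; `intern` requires `crate::sym::GlobalSymbolIntern`):
///
/// ```ignore
/// // "\n" is within the pre-interned whitespace range.
/// assert!(is_common_whitespace(raw::WS_LF1));
///
/// // "\t" is whitespace,
/// // but it is not pre-interned,
/// // so the caller must fall back to scanning the resolved string.
/// assert!(!is_common_whitespace("\t".intern()));
/// ```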
#[inline]
pub fn is_common_whitespace(sym: SymbolId) -> bool {
is_between_markers(WS_SYM_START, WS_SYM_END, sym)
}
/// Attempt to make a quick determination without a memory lookup
/// (symbol resolution) whether the given [`SymbolId`]'s string
/// representation definitely contains the given byte value.
///
/// A value of [`None`] means "maybe, maybe not",
/// indicating that the caller ought to fall back to a slower check
/// that utilizes the symbol's resolved string.
/// A value of [`Some`] indicates that `sym`,
/// were it to be resolved,
/// definitely does or does not contain the byte `ch`.
///
/// This is intended to encapsulate special,
/// loosely-defined cases where we can test that the interned symbols
/// actually properly adhere to the implementation of this function.
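///
/// For example
/// (a sketch mirroring the `quick_contains_byte_verify` test at the
/// bottom of this file):
///
/// ```ignore
/// // No pre-interned symbol contains a control character.
/// assert_eq!(quick_contains_byte(raw::L_TRUE, 0x01), Some(false));
///
/// // Nothing can be said statically about dynamically interned
/// // symbols.
/// assert_eq!(quick_contains_byte("runtime value".intern(), 0x01), None);
/// ```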
#[inline]
pub fn quick_contains_byte(sym: SymbolId, ch: u8) -> Option<bool> {
match (is_pre_interned(sym), ch) {
// No control characters or null bytes.
(true, 0..=0x1F) => Some(false),
// No characters outside the 7-bit ASCII range.
(true, 0x80..) => Some(false),
// Or the character range immediately preceding it,
// where 7F == DEL.
// They are explicitly listed here so that readers do not have
// to consult an ASCII table to avoid unintentional bugs.
(true, b'{' | b'|' | b'}' | b'~' | 0x7F) => Some(false),
// We don't check for anything else (yet).
(true, _) => None,
// We cannot possibly know statically whether dynamically
// interned symbols contain any particular byte.
(false, _) => None,
}
}
static_symbols! {
<crate::global::ProgSymSize>;
// Decimal strings are expected to be at index (n+1).
// See `decimal1`.
N0: dec "0",
N1: dec "1",
N2: dec "2",
N3: dec "3",
N4: dec "4",
N5: dec "5",
N6: dec "6",
N7: dec "7",
N8: dec "8",
N9: dec "9",
L_ALL: cid "all",
L_ANY: cid "any",
L_APPLY: cid "apply",
L_APPLY_TEMPLATE: tid "apply-template",
L_ARG: cid "arg",
L_AS: cid "as",
L_BASE_TYPE: tid "base-type",
L_BOOLEAN: cid "boolean",
L_C: cid "c",
L_CAR: cid "car",
L_CASE: cid "case",
L_CASES: cid "cases",
L_CDR: cid "cdr",
L_CEIL: cid "ceil",
L_CGEN: cid "cgen",
L_CLASS: cid "class",
L_CLASSIFY: cid "classify",
L_CONS: cid "cons",
L_CONST: cid "const",
L_CORE: cid "core",
L_DASH: cid "dash",
L_DEFAULT: cid "default",
L_DEP: cid "dep",
L_DESC: cid "desc",
L_DIM: cid "dim",
L_DISPLAY: cid "display",
L_DOT: cid "dot",
L_DTYPE: cid "dtype",
L_DYN_NODE: tid "dyn-node",
L_ELIG_CLASS_YIELDS: tid "elig-class-yields",
L_EMPTY: cid "empty",
L_ENUM: cid "enum",
L_EQ: cid "eq",
L_ERROR: cid "error",
L_EXEC: cid "exec",
L_EXPAND_BARRIER: tid "expand-barrier",
L_EXPAND_FUNCTION: tid "expand-function",
L_EXPAND_GROUP: tid "expand-group",
L_EXPAND_SEQUENCE: tid "expand-sequence",
L_EXPORT: cid "export",
L_EXPT: cid "expt",
L_EXTERN: cid "extern",
L_FALSE: cid "false",
L_FLOAT: cid "float",
L_FLOOR: cid "floor",
L_FOR_EACH: tid "for-each",
L_FRAGMENT: cid "fragment",
L_FRAGMENTS: cid "fragments",
L_FROM: cid "from",
L_FUNC: cid "func",
L_FUNCTION: cid "function",
L_GEN: cid "gen",
L_GENERATED: cid "generated",
L_GENERATES: cid "generates",
L_GENSYM: cid "gensym",
L_GENTLE_NO: tid "gentle-no",
L_GT: cid "gt",
L_GTE: cid "gte",
L_ID: cid "id",
L_IDENTIFIER: cid "identifier",
L_IF: cid "if",
L_IGNORE_MISSING: tid "ignore-missing",
L_IMPORT: cid "import",
L_INDEX: cid "index",
L_INLINE_TEMPLATE: tid "inline-template",
L_INTEGER: cid "integer",
L_ISOVERRIDE: cid "isoverride",
L_ITEM: cid "item",
L_KEY: cid "key",
L_L: cid "l",
L_LABEL: cid "label",
L_LENGTH_OF: tid "length-of",
L_LET: cid "let",
L_LOCAL: cid "local",
L_LOWER: cid "lower",
L_LPARAM: cid "lparam",
L_LT: cid "lt",
L_LTE: cid "lte",
L_LV: cid "lv",
L_MAP: cid "map",
L_MAP_EXEC: tid "map-exec",
L_MAP_FROM: tid "map-from",
L_MAP_HEAD: qname "map:head",
L_MAP_TAIL: qname "map:tail",
L_MATCH: cid "match",
L_META: cid "meta",
L_METHOD: cid "method",
L_NAME: cid "name",
L_NAME_PREFIX: tid "name-prefix",
L_NE: cid "ne",
L_NO: cid "no",
L_NOVALIDATE: cid "novalidate",
L_OF: cid "of",
L_ON: cid "on",
L_OTHERWISE: cid "otherwise",
L_OVERRIDE: cid "override",
L_PACKAGE: cid "package",
L_PARAM: cid "param",
L_PARAM_ADD: tid "param-add",
L_PARAM_CLASS_TO_YIELDS: tid "param-class-to-yields",
L_PARAM_COPY: tid "param-copy",
L_PARAM_INHERIT: tid "param-inherit",
L_PARAM_META: tid "param-meta",
L_PARAM_SYM_VALUE: tid "param-sym-value",
L_PARAM_TYPEDEF_LOOKUP: tid "param-typedef-lookup",
L_PARAM_VALUE: tid "param-value",
L_PARENT: cid "parent",
L_PASS: cid "pass",
L_PATH: cid "path",
L_PREFIX: cid "prefix",
L_PREPROC: cid "preproc",
L_PRODUCT: cid "product",
L_PROGRAM: cid "program",
L_PROGRAM_MAP: tid "program-map",
L_QUOTIENT: cid "quotient",
L_RATE: cid "rate",
L_RATER: cid "rater",
L_RATE_EACH: cid "rate-each",
L_RECURSE: cid "recurse",
L_RETMAP: cid "retmap",
L_RETMAP_EXEC: tid "retmap-exec",
L_RETMAP_HEAD: qname "retmap:head",
L_RETMAP_TAIL: qname "retmap:tail",
L_RETURN_MAP: tid "return-map",
L_RMDASH: cid "rmdash",
L_RMUNDERSCORE: cid "rmunderscore",
L_SCALAR: cid "scalar",
L_SECTION: cid "section",
L_SET: cid "set",
L_SNAKE: cid "snake",
L_SRC: cid "src",
L_STATIC: cid "static",
L_SUFFIX: cid "suffix",
L_SUM: cid "sum",
L_SYM: cid "sym",
L_SYMTABLE: cid "symtable",
L_SYM_DEP: cid "sym-dep",
L_SYM_DEPS: cid "sym-deps",
L_SYM_REF: cid "sym-ref",
L_SYM_SET: tid "sym-set",
L_T: cid "t",
L_TEMPLATE: cid "template",
L_TERMINATE: cid "terminate",
L_TEXT: cid "text",
L_TITLE: cid "title",
L_TO: cid "to",
L_TPL: cid "tpl",
L_TRANSFORM: cid "transform",
L_TRANSLATE: cid "translate",
L_TRUE: cid "true",
L_TYPE: cid "type",
L_TYPEDEF: cid "typedef",
L_UCFIRST: cid "ucfirst",
L_UNION: cid "union",
L_UNIQUE: cid "unique",
L_UNLESS: cid "unless",
L_UPPER: cid "upper",
L_UUROOTPATH: cid "__rootpath",
L_VALUE: cid "value",
L_VALUES: cid "values",
L_VALUE_OF: cid "value-of",
L_VECTOR: cid "vector",
L_VIRTUAL: cid "virtual",
L_WARNING: cid "warning",
L_WHEN: cid "when",
L_WITH_PARAM: tid "with-param",
L_WORKSHEET: cid "worksheet",
L_XMLNS: cid "xmlns",
L_YIELD: cid "yield",
L_YIELDS: cid "yields",
L_TPLP_VALUES: str "@values@",
FW_SLASH: str "/",
FW_SLASH_DOT: str "/.",
CC_ANY_OF: cid "anyOf",
U_TRUE: cid "TRUE",
URI_LV_CALC: uri "http://www.lovullo.com/calc",
URI_LV_LINKER: uri "http://www.lovullo.com/rater/linker",
URI_LV_PREPROC: uri "http://www.lovullo.com/rater/preproc",
URI_LV_PROGRAM_MAP: uri "http://www.lovullo.com/rater/map",
URI_LV_RATER: uri "http://www.lovullo.com/rater",
URI_LV_TPL: uri "http://www.lovullo.com/rater/apply-template",
URI_LV_WORKSHEET: uri "http://www.lovullo.com/rater/worksheet",
// Common whitespace.
//
// _This does not represent all forms of whitespace!_
// Clearly,
// but it is worth emphasizing.
//
// The intent of these whitespace symbols is to provide a means to
// determine whether that symbol represents a common form of
// whitespace,
// before falling back to a more expensive symbol dereference
// and (likely-)linear scan.
//
// This list is preliminary and ought to be measured by evaluating a
// real-world codebase;
// it ought not to bloat the symbol table,
// but ought to get the most common cases so as not to fall
// back to a more expensive dereferencing of a symbol and
// subsequent scanning.
//
// There are improvements that can be made here,
// such as aligning the symbol ids such that whitespace can be
// asserted with a bitmask.
WS_SYM_START: mark "###WS_START",
WS_EMPTY: ws "",
WS_SP1: ws " ",
WS_SP2: ws "  ",
WS_SP3: ws "   ",
WS_SP4: ws "    ",
WS_SP5: ws "     ",
WS_SP6: ws "      ",
WS_SP7: ws "       ",
WS_SP8: ws "        ",
WS_LF1: ws "\n",
WS_LF2: ws "\n\n",
WS_LF1_SP1: ws "\n ",
WS_LF1_SP2: ws "\n  ",
WS_LF1_SP3: ws "\n   ",
WS_LF1_SP4: ws "\n    ",
WS_LF1_SP5: ws "\n     ",
WS_LF1_SP6: ws "\n      ",
WS_LF1_SP7: ws "\n       ",
WS_LF1_SP8: ws "\n        ",
WS_LF2_SP1: ws "\n\n ",
WS_LF2_SP2: ws "\n\n  ",
WS_LF2_SP3: ws "\n\n   ",
WS_LF2_SP4: ws "\n\n    ",
WS_LF2_SP5: ws "\n\n     ",
WS_LF2_SP6: ws "\n\n      ",
WS_LF2_SP7: ws "\n\n       ",
WS_LF2_SP8: ws "\n\n        ",
WS_SYM_END: mark "###WS_END",
// [Symbols will be added here as they are needed.]
// Marker indicating the end of the static symbols
// (this must always be last).
END_STATIC: mark "###END"
}
}
/// Static 16-bit symbols (pre-allocated).
///
/// These symbols are intended for situations where a smaller symbol size is
/// necessary.
/// Presently,
/// this includes only the [`Span`](crate::span::Span) context.
///
/// See also [st](super::st) for general static symbols.
pub mod st16 {
use super::*;
static_symbols! {
<u16>;
// Special contexts.
CTX_DUMMY: ctx "#!DUMMY",
CTX_UNKNOWN: ctx "#!UNKNOWN",
CTX_LINKER: ctx "#!LINKER",
// [Symbols will be added here as they are needed.]
// Marker indicating the end of the static symbols
// (this must always be last).
END_STATIC: mark16 "###END"
}
}
/// Non-public module that can contain public traits.
///
/// The problem this module tries to solve is preventing anything outside of
/// this crate from implementing the `StaticSymbolId` trait,
/// since doing so opens us up to undefined behavior when transmuting
/// via [`st_as_sym`](super::st_as_sym).
mod private {
/// Extend this trait to prevent other modules from implementing the
/// subtype.
///
/// Since other modules extend [`StaticSymbolId`](super::StaticSymbolId)
/// for their own traits,
/// this trait must be `pub`.
/// But, since it is contained within a private module,
/// it is not possible to import the trait to implement it on other
/// things.
pub trait Sealed {}
}
#[cfg(test)]
mod test {
use super::{st, st16, DecStaticSymbolId};
use crate::sym::{GlobalSymbolIntern, GlobalSymbolResolve, SymbolId};
#[test]
fn global_sanity_check_st() {
// If we _don't_ prefill, make sure we're not starting at the first
// offset when interning, otherwise it'll look correct.
let new: SymbolId = "force offset".intern();
assert!(
new.as_usize() > st::END_STATIC.as_usize(),
"a new global symbol allocation was not > END_STATIC, \
indicating that prefill is either not working or that \
the prefill contains duplicate strings!"
);
// Further sanity check to make sure indexes align as expected,
// not that you wouldn't otherwise notice that the whole system is
// broken, but this ought to offer a more direct hint as to what
// went wrong.
assert_eq!(st::L_TRUE.as_sym(), "true".intern());
assert_eq!(st::L_FALSE.as_sym(), "false".intern());
}
// Just ensure raw symbols are available and match.
#[test]
fn sanity_check_st_raw() {
assert_eq!(st::L_TRUE.as_sym(), st::raw::L_TRUE);
}
#[test]
fn global_sanity_check_st16() {
// If we _don't_ prefill, make sure we're not starting at the first
// offset when interning, otherwise it'll look correct.
let new: SymbolId<u16> = "force offset".intern();
assert!(
new.as_usize() > st16::END_STATIC.as_usize(),
"a new 16-bit global symbol allocation was not > END_STATIC, \
indicating that prefill is either not working or that \
the prefill contains duplicate strings!"
);
}
#[test]
fn decimal1_0_to_9() {
for n in 0..=9 {
assert_eq!(st::decimal1(n).as_sym().lookup_str(), n.to_string());
// From<u8>
assert_eq!(
DecStaticSymbolId::from(n).as_sym().lookup_str(),
n.to_string()
);
}
}
#[test]
#[should_panic]
fn decimal1_gt_9_panics() {
st::decimal1(10);
}
#[test]
fn st_count_matches_actual_count() {
// This assumes that static symbols begin at 1 and end at
// END_STATIC.
assert_eq!(
st::END_STATIC.as_usize(),
st::ST_COUNT,
"st::ST_COUNT does not match the number of static symbols"
);
}
// [`quick_contains_byte`] is asking for trouble if it's not properly
// maintained.
// It is expected that its implementation is manually verified,
// and it is written in a way that is clear and unambiguous.
// With that said,
// this does some minor spot-checking.
#[test]
fn quick_contains_byte_verify() {
use super::super::GlobalSymbolResolve;
use memchr::memchr;
use st::quick_contains_byte;
// No static symbols will contain control characters.
assert_eq!(quick_contains_byte(st::L_TRUE.into(), 0x01), Some(false));
// But we don't know about dynamically-allocated ones.
assert_eq!(
quick_contains_byte("NOT A PREINTERNED SYM".into(), 0x01),
None
);
// We chose to explicitly keep certain characters out of the
// preinterned list.
// Let's verify that is the case by iterating through _all of the
// static interns_.
for sym_id in 1..=st::ST_COUNT {
let sym = unsafe { SymbolId::from_int_unchecked(sym_id as u32) };
// If you get an error in this block,
// that means that you have added a symbol that violates
// assumptions made in `quick_contains_byte`.
// Either that implementation needs changing and this test
// updated,
// or you need to not add that symbol to the static symbol
// list.
for ch in b'{'..=0x7F {
assert_eq!(
memchr(ch, sym.lookup_str().as_bytes()),
None,
"Pre-interned static symbol {sym:?} \
contains unexpected byte 0x{ch:X}"
);
}
}
}
}