tame/tamer/src/xir/tree.rs

// XIR tree representation
//
//  Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
//
//  This file is part of TAME.
//
//  This program is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program.  If not, see <http://www.gnu.org/licenses/>.

//! XIR token stream parsed into a tree-based IR (XIRT).
//!
//! **This is a work-in-progress implementation.**
//! It will be augmented only as needed.
//!
//! Parsing is handled by [`ParserState::parse_token`].
//! An [`Iterator::scan`]-based parser can be constructed using
//!   [`parser_from`] or [`parse`][parse()].
//!
//! ```
//! use tamer::xir::tree::{ParserState, parse, parser_from};
//!# use tamer::xir::Token;
//!
//!# let token_stream: std::vec::IntoIter<Token> = vec![].into_iter();
//! // Lazily parse a stream of XIR tokens as an iterator, yielding the next
//! // fully parsed object.  This may consume any number of tokens.
//! let parser = parser_from(token_stream);
//!
//!# let token_stream: std::vec::IntoIter<Token> = vec![].into_iter();
//! // Consume a single token at a time, yielding either an incomplete state
//! // or the next parsed object.
//! let parser = token_stream.scan(ParserState::new(), parse);
//! ```
//!
//! `parser_from` Or `parse`?
//! =========================
//! [`parser_from`] is implemented in terms of [`parse`][parse()].
//! They have slightly different use cases and tradeoffs:
//!
//! [`parse`][parse()] yields a [`Result`] containing [`Parsed`],
//!   which _may_ contain a [`Parsed::Tree`],
//!   but it's more likely to contain [`Parsed::Incomplete`];
//!     this is because it typically takes multiple [`Token`]s to complete
//!     parsing within a given context.
//!
//! In return, though, you get some important guarantees:
//!
//!   1. [`parse`][parse()] consumes only a _single_ token; and
//!   2. It has a constant upper bound for execution time.
//!
//! This means that [`parse`][parse()] will never cause the system to
//!   hang---you
//!     are in complete control over how much progress parsing makes,
//!       and are free to stop and resume it at any time.
//!
//! However,
//!   if you do not care about those things,
//!   working with [`Parsed`] is verbose and inconvenient;
//!     sometimes you just want the next [`Tree`] object.
//! For this,
//!   we have [`parser_from`],
//!     which does two things:
//!
//!   1. It filters out all [`Parsed::Incomplete`]; and
//!   2. On [`Parsed::Tree`],
//!        it yields the inner [`Tree`].
//!
//! This is a much more convenient API,
//!   but is not without its downsides:
//!     if the context is large
//!       (e.g. the root node of a large XML document),
//!       parsing can take a considerable amount of time,
//!         and the [`Iterator`] produced by [`parser_from`] will cause the
//!         system to process [`Iterator::next`] for that entire duration.
//!
//! See also [`attr_parser_from`] and [`parse_attrs`] for parsing only
//!   attributes partway through a token stream.
//!
//! Cost of Parsing
//! ===============
//! While [`Tree`] is often much easier to work with than a stream of
//!   [`Token`],
//!     there are notable downsides:
//!
//!   - The context in which parsing began
//!       (see _Parser Implementation_ below)
//!       must complete before _any_ token is emitted.
//!     If parsing begins at the root element,
//!       this means that the _entire XML document_ must be loaded into
//!       memory before it is available for use.
//!   - While the token stream is capable of operating using constant memory
//!       (since [`Token`] can be discarded after being consumed),
//!         a [`Tree`] holds a significant amount of data in memory.
//!
//! It is recommended to parse into [`Tree`] only for the portions of the
//!   XML document that will benefit from it.
//! For example,
//!   by avoiding parsing of the root element into a tree,
//!     you can emit [`Tree`] for child elements without having to wait for
//!     the entire document to be parsed.
//!
//!
//! Validity Of Token Stream
//! ========================
//! XIR verifies that each [`Token`] is syntactically valid and follows an
//!   XML grammar subset;
//!     as such,
//!       the tree parser does not concern itself with syntax analysis.
//! It does,
//!   however,
//!   perform _[semantic analysis]_ on the token stream.
//! Given that,
//!   [`ParserState::parse_token`] returns a [`Result`],
//!     with parsing errors represented by this module's [`ParseError`].
//!
//! As an example,
//!   a XIR token stream permits unbalanced tags.
//! However,
//!   we cannot represent an invalid tree,
//!   so that would result in a semantic error.
//!
//! [semantic analysis]: https://en.wikipedia.org/wiki/Semantic_analysis_(compilers)
//!
//!
//! Parser Implementation
//! =====================
//! The parser that lowers the XIR [`Token`] stream into a [`Tree`]
//!   is implemented on [`ParserState`],
//!     which exists to encapsulate the [`Stack`].
//!
//! This parser is a [stack machine],
//!   where the stack represents the [`Tree`] that is under construction.
//! Parsing operates on _context_.
//! At present, the only parsing context is an element---it
//!   begins parsing at an opening tag ([`Token::Open`]) and completes
//!   parsing at a _matching_ [`Token::Close`].
//! All attributes and child nodes encountered during parsing of an element
//!   will automatically be added to the appropriate element,
//!     recursively.
//!
//! [stack machine]: https://en.wikipedia.org/wiki/Stack_machine
//!
//! State Machine With A Typed Stack
//! --------------------------------
//! The parser is a [finite-state machine (FSM)][state machine] with a stack
//!   encoded in variants of [`Stack`],
//!     where each variant represents the current state of the parser.
//! The parser cannot be reasoned about as a pushdown automaton because the
//!   language of the [`Stack`] is completely arbitrary,
//!     but it otherwise operates in a similar manner.
//!
//! Each state transition consumes the entire stack and produces a new one,
//!   which may be identical.
//! Intuitively, though, based on the construction of [`Stack`],
//!   this is equivalent to popping the needed data off of the stack and
//!   optionally pushing additional information.
//!
//! By encoding the stack in [`Stack`] variants,
//!   we are able to verify statically that the stack is always in a valid
//!   state and contains expected data---that
//!     is, our stack is fully type-safe.
//!
//! [state machine]: https://en.wikipedia.org/wiki/Finite-state_machine
//!
//! High-Resolution Attributes
//! --------------------------
//! XIRT supports [`Token::AttrValueFragment`],
//!   which can produce concatenated attribute values that retain the
//!   [`Span`] of each of their constituent parts.
//! This could allow,
//!   for example,
//!   creating an LSP server that would expose all of the TAME templates and
//!     source inputs used to generate an identifier.
//!
//! However,
//!   note that the XIR token stream introduced [`Token::AttrValueFragment`]
//!   primarily to eliminate the need for unnecessary [symbol
//!   lookups](crate::sym), copying, and heap allocations.
//! XIRT must perform extra heap allocations to process these fragments.
//! Once processed,
//!   an [`Attr::Extensible`] object is produced;
//!     the value is _not_ concatenated and interned,
//!       allowing it to be cheaply converted back into a [`Token`] stream
//!       for writing without unnecessary overhead.
//!
//! For more information,
//!   see [`AttrParts`].

mod attr;
mod parse;

use super::{QName, Token, TokenResultStream, TokenStream};
use crate::{span::Span, sym::SymbolId};
use std::{error::Error, fmt::Display, iter, mem::take};

pub use attr::{Attr, AttrList, AttrParts, SimpleAttr};

/// A XIR tree (XIRT).
///
/// This object represents a XIR token stream parsed into a tree
///   representation.
/// This representation is easier to process and manipulate in most contexts,
///   but also requires memory allocation for the entire tree and requires
///   that a potentially significant portion of a token stream be processed
///     (e.g. from start to end tag for a given element).
///
/// _Note that this implementation is incomplete!_
/// It will be augmented as needed.
///
/// For more information,
///  see the [module-level documentation](self).
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Tree {
    /// XML element.
    ///
    /// An element may itself hold child [`Tree`] nodes
    ///   (see [`Element::children`]).
    Element(Element),

    /// Text node.
    ///
    /// A text node cannot contain other [`Tree`] elements;
    ///   sibling text nodes must exist within an [`Element`].
    ///
    /// The [`SymbolId`] and [`Span`] are the value and span carried by the
    ///   [`Token::Text`] from which the node was produced.
    Text(SymbolId, Span),

    /// This variant exists purely because `#[non_exhaustive]` has no effect
    ///   within the crate.
    ///
    /// This ensures that matches must account for other variants that will
    ///   be introduced in the future,
    ///     easing the maintenance burden
    ///       (for both implementation and unit tests).
    _NonExhaustive,
}

impl Into<Option<Element>> for Tree {
    #[inline]
    fn into(self) -> Option<Element> {
        match self {
            Self::Element(ele) => Some(ele),
            _ => None,
        }
    }
}

impl Into<Option<SymbolId>> for Tree {
    #[inline]
    fn into(self) -> Option<SymbolId> {
        match self {
            Self::Text(text, _) => Some(text),
            _ => None,
        }
    }
}

impl Tree {
    /// Borrow the inner [`Element`],
    ///   or yield [`None`] if this node is not an element.
    #[inline]
    pub fn as_element<'a>(&'a self) -> Option<&'a Element> {
        if let Self::Element(ele) = self {
            Some(ele)
        } else {
            None
        }
    }

    /// Consume this node and yield the inner [`Element`],
    ///   or [`None`] if this node is not an element.
    #[inline]
    pub fn into_element(self) -> Option<Element> {
        match self {
            Self::Element(ele) => Some(ele),
            _ => None,
        }
    }

    /// Whether this node is an [`Element`].
    #[inline]
    pub fn is_element(&self) -> bool {
        self.as_element().is_some()
    }

    /// Yield the symbol held by a text node,
    ///   or [`None`] for any other kind of node.
    ///
    /// This is incomplete.
    #[inline]
    pub fn as_sym(&self) -> Option<SymbolId> {
        if let Self::Text(sym, ..) = self {
            Some(*sym)
        } else {
            None
        }
    }
}

/// Element node.
///
/// This represents an [XML element] beginning with an opening tag that is
///   either self-closing or ending with a balanced closing tag.
/// The two spans together represent the span of the entire element with all
///   its constituents.
///
/// [XML element]: https://www.w3.org/TR/REC-xml/#sec-starttags
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Element {
    /// Qualified name of the element.
    name: QName,
    /// Zero or more attributes.
    ///
    /// This is [`None`] until an attribute list has been attached to the
    ///   element.
    attrs: Option<AttrList>,
    /// Zero or more child nodes.
    children: Vec<Tree>,
    /// Spans for opening and closing tags respectively.
    ///
    /// While the element is still under construction,
    ///   both components hold the span of the opening tag
    ///   (see [`Element::open`]).
    span: (Span, Span),
}

impl Element {
    /// Element name.
    #[inline]
    pub fn name(&self) -> QName {
        self.name
    }

    /// Child [`Tree`] objects of this element.
    #[inline]
    pub fn children(&self) -> &Vec<Tree> {
        &self.children
    }

    /// Attributes of this element.
    #[inline]
    pub fn attrs(&self) -> Option<&AttrList> {
        self.attrs.as_ref()
    }

    /// Opens an element for incremental construction.
    ///
    /// This is intended for use by the parser to begin building an element.
    /// It does not represent a completed element and should not be yielded
    ///   to any outside caller until it is complete.
    /// This incomplete state is encoded in [`Stack::BuddingElement`].
    #[inline]
    fn open(name: QName, span: Span) -> Self {
        Self {
            name,
            attrs: None,
            children: vec![],
            span: (span, span), // We do not yet know where the span will end
        }
    }

    /// Complete an element's span by setting its ending span.
    ///
    /// When elements are still budding (see [`Stack::BuddingElement`]),
    ///   the ending span is set to the starting span,
    ///   since the end is not yet known.
    #[inline]
    fn close_span(self, close_span: Span) -> Self {
        Element {
            span: (self.span.0, close_span),
            ..self
        }
    }
}

/// A [`Stack`] representing an element and its (optional) parent's stack.
///
/// Storing the parent of an [`Element`] allows it to be manipulated on the
///   [`Stack`] using the usual operations,
///     while maintaining the context needed to later add it as a child to
///     its parent once the element is completed.
///
/// This is used to represent a [`Stack::BuddingElement`].
/// This type exists because enum variants are not their own types,
///   but we want to nest _only_ element stacks,
///     not any type of stack.
#[derive(Debug, Eq, PartialEq)]
pub struct ElementStack {
    /// The element currently under construction.
    element: Element,

    /// Parent element stack to be restored once element has finished
    ///   processing.
    ///
    /// This is [`None`] when the element has no parent,
    ///   in which case closing it completes parsing of the element
    ///   (see [`ElementStack::consume_child_or_complete`]).
    pstack: Option<Box<ElementStack>>,
}

impl ElementStack {
    /// Try to close the element under construction.
    ///
    /// A `close_name` of [`None`] denotes a self-closing tag and is always
    ///   accepted.
    /// Otherwise the provided name must equal the name of the element being
    ///   built,
    ///     or a [`ParseError::UnbalancedTag`] is returned that carries the
    ///     name and span of both the opening and the closing tag.
    ///
    /// On success,
    ///   the closing span of the element is set to `close_span` and the
    ///   parent stack is carried over unchanged.
    ///
    /// Whether self-closing is permitted here
    ///   (that is, whether the element has no children)
    ///   is _not_ verified;
    ///     the producer of the XIR stream is responsible for ensuring that
    ///     self-closing can only happen during attribute parsing.
    fn try_close(
        self,
        close_name: Option<QName>,
        close_span: Span,
    ) -> Result<Self> {
        let Self { element, pstack } = self;
        let open_name = element.name;
        let open_span = element.span.0;

        match close_name {
            // A named closing tag must balance the opening tag.
            Some(name) if name != open_name => {
                Err(ParseError::UnbalancedTag {
                    open: (open_name, open_span),
                    close: (name, close_span),
                })
            }

            // Either self-closing or balanced.
            // Self-closing with children is syntactically invalid and is
            //   expected never to reach a XIR stream,
            //     so it is not checked for.
            _ => Ok(Self {
                element: element.close_span(close_span),
                pstack,
            }),
        }
    }

    /// Hand the inner element to its parent,
    ///   or complete it if it has none.
    ///
    /// With a parent,
    ///   the element is appended to the parent's children and the result
    ///   is a [`Stack::BuddingElement`] for that parent,
    ///     which is the stack that was put aside with
    ///     [`ElementStack::store`] when the child was opened.
    /// Without a parent,
    ///   the element is yielded as a [`Stack::ClosedElement`].
    fn consume_child_or_complete(self) -> Stack {
        let Self { element, pstack } = self;

        match pstack {
            None => Stack::ClosedElement(element),
            Some(parent) => {
                Stack::BuddingElement(parent.consume_element(element))
            }
        }
    }

    /// Append the provided [`Element`] to the children of the inner
    ///   [`Element`].
    fn consume_element(self, child: Element) -> Self {
        let Self {
            mut element,
            pstack,
        } = self;

        element.children.push(Tree::Element(child));

        Self { element, pstack }
    }

    /// Attach the provided [`AttrList`] to the inner [`Element`],
    ///   replacing any attribute list that it may already hold.
    fn consume_attrs(mut self, attr_list: AttrList) -> Self {
        self.element.attrs = Some(attr_list);
        self
    }

    /// Move self onto the heap so that it can be restored later.
    ///
    /// This is nothing more than boxing;
    ///   the method exists so that call sites document their intent.
    fn store(self) -> Box<Self> {
        Box::new(self)
    }
}

/// The state and typed stack of the XIR parser stack machine.
///
/// Since all possible states of the stack are known statically,
///   we encode the stack into variants,
///     where each variant represents the state of the parser's state
///     machine.
/// This way,
///   we know that the stack is always well-formed,
///   and benefit from strong type checking.
/// This also allows Rust to optimize its use.
///
/// Rust will compile this into a value that exists on the stack,
///   so we wind up with an actual stack machine in the end anyway.
///
/// For more information,
///   see the [module-level documentation](self).
#[derive(Debug, Eq, PartialEq)]
pub enum Stack {
    /// Empty stack.
    ///
    /// This is the [`Default`] state.
    Empty,

    /// Empty stack expected to parse isolated, individual attributes.
    ///
    /// The purpose of this over `Empty` is to ensure that the parser is
    ///   able to properly fail on invalid XIR input when the caller is not
    ///   trying to parse individual attributes.
    IsolatedAttrEmpty,

    /// An [`Element`] that is still under construction.
    ///
    /// (This is a tree IR,
    ///    so here's a plant pun).
    BuddingElement(ElementStack),

    /// A completed [`Element`].
    ///
    /// This should be consumed and emitted.
    ClosedElement(Element),

    /// An [`AttrList`] that is still under construction.
    ///
    /// The [`ElementStack`] is the element that the attributes will be
    ///   attached to;
    ///     it is [`None`] when attributes are parsed in isolation,
    ///       without any element context.
    BuddingAttrList(Option<ElementStack>, AttrList),

    /// An attribute is awaiting its value,
    ///   after which it will be attached to an element.
    ///
    /// The first component is the attribute list context to return to once
    ///   the attribute is complete;
    ///     it is [`None`] for an isolated single attribute.
    /// The remaining components are the name of the attribute and the span
    ///   of that name.
    AttrName(Option<(Option<ElementStack>, AttrList)>, QName, Span),

    /// An attribute whose value is being constructed of value fragments,
    ///   after which it will be attached to an element.
    ///
    /// The first component is the same attribute list context as in
    ///   [`Stack::AttrName`].
    AttrFragments(Option<(Option<ElementStack>, AttrList)>, AttrParts),

    /// A completed [`AttrList`] without any [`Element`] context.
    IsolatedAttrList(AttrList),

    /// A completed [`Attr`] without any [`AttrList`] context.
    IsolatedAttr(Attr),

    /// Parsing has completed relative to the initial context.
    ///
    /// This is the final accepting state of the state machine.
    /// The parser will not operate while in this state,
    ///   which must be explicitly acknowledged and cleared in order to
    ///   indicate that additional tokens are expected and are not in
    ///   error.
    Done,
}

impl Default for Stack {
    fn default() -> Self {
        Self::Empty
    }
}

impl Stack {
    /// Attempt to open a new element.
    ///
    /// If the stack is [`Self::Empty`],
    ///   then the element will be considered to be a root element,
    ///     meaning that it will be completed once it is closed.
    /// If the stack contains [`Self::BuddingElement`],
    ///   then a child element will be started,
    ///     which will be consumed by the parent once closed rather than
    ///     being considered a completed [`Element`].
    /// If the stack contains a [`Self::BuddingAttrList`] with an element
    ///   context,
    ///     the attribute list is implicitly completed and attached to that
    ///     element before the child is started.
    ///
    /// # Errors
    /// - [`ParseError::AttrNameExpected`] if attributes are being parsed in
    ///     isolation
    ///       (there is no element to open a child of).
    /// - [`ParseError::Todo`] for every other state,
    ///     which represents either a semantic error in the token stream or
    ///     a transition that has not yet been implemented.
    fn open_element(self, name: QName, span: Span) -> Result<Self> {
        let pstack = match self {
            // Opening a root element (or lack of context).
            Self::Empty => None,

            // Open a child element.
            Self::BuddingElement(pstack) => Some(pstack.store()),

            // Opening a child element in attribute parsing context.
            // Automatically close the attributes despite a missing
            //   AttrEnd to accommodate non-reader XIR.
            Self::BuddingAttrList(Some(pstack), attr_list) => {
                Some(pstack.consume_attrs(attr_list).store())
            }

            // Attempting to open a child element in an isolated
            //   attribute parsing context means that `AttrEnd` was not
            //   provided
            //     (or that we're not parsing in the correct context).
            Self::BuddingAttrList(None, ..) | Self::IsolatedAttrEmpty => {
                return Err(ParseError::AttrNameExpected(Token::Open(
                    name, span,
                )));
            }

            // Report rather than panic;
            //   the offending token and stack are preserved for diagnosis.
            stack => {
                return Err(ParseError::Todo(Token::Open(name, span), stack));
            }
        };

        Ok(Self::BuddingElement(ElementStack {
            element: Element::open(name, span),
            pstack,
        }))
    }

    /// Attempt to close an element.
    ///
    /// Elements can be either self-closing
    ///   (in which case `name` is [`None`]),
    ///   or have their own independent closing tags.
    /// If a name is provided,
    ///   then it _must_ match the name of the element currently being
    ///     processed---that is,
    ///       the tree must be _balanced_.
    ///
    /// # Errors
    /// - [`ParseError::UnbalancedTag`] if the tree is unbalanced.
    /// - [`ParseError::MissingIsolatedAttrEnd`] if attributes are being
    ///     parsed in isolation.
    /// - [`ParseError::Todo`] for every other state that cannot be closed.
    fn close_element(self, name: Option<QName>, span: Span) -> Result<Self> {
        match self {
            Self::BuddingElement(stack) => stack
                .try_close(name, span)
                .map(ElementStack::consume_child_or_complete),

            // We can implicitly complete the attribute list if there's a
            //   missing `Token::AttrEnd`,
            //     which alleviates us from having to unnecessarily generate
            //     it outside of readers.
            Self::BuddingAttrList(Some(stack), attr_list) => stack
                .consume_attrs(attr_list)
                .try_close(name, span)
                .map(ElementStack::consume_child_or_complete),

            // See the error variant description for more information.
            Self::BuddingAttrList(None, ..) => {
                Err(ParseError::MissingIsolatedAttrEnd(span))
            }

            stack => Err(ParseError::Todo(Token::Close(name, span), stack)),
        }
    }

    /// Begin an attribute on an element.
    ///
    /// An attribute begins with a [`QName`] representing its name.
    /// It will be attached to a parent element after being closed with a
    ///   value via [`Stack::close_attr`].
    ///
    /// # Errors
    /// [`ParseError::Todo`] if an attribute cannot begin in the current
    ///   state.
    fn open_attr(self, name: QName, span: Span) -> Result<Self> {
        match self {
            // Begin construction of an attribute list on a new element.
            Self::BuddingElement(ele_stack) => Ok(Self::AttrName(
                Some((Some(ele_stack), Default::default())),
                name,
                span,
            )),

            // Continuation of attribute list.
            Self::BuddingAttrList(ele_stack, attr_list) => {
                Ok(Self::AttrName(Some((ele_stack, attr_list)), name, span))
            }

            // Isolated single attribute.
            Self::IsolatedAttrEmpty => Ok(Self::AttrName(None, name, span)),

            stack => {
                Err(ParseError::Todo(Token::AttrName(name, span), stack))
            }
        }
    }

    /// Push a value fragment onto an attribute.
    ///
    /// This begins to build an attribute out of value fragments,
    ///   which is also completed by [`Stack::close_attr`].
    /// The attribute information that was previously held in
    ///   [`Stack::AttrName`] is moved into a [`AttrParts`] if that has not
    ///   already happened,
    ///     which is responsible for managing future fragments.
    ///
    /// This will cause heap allocation.
    ///
    /// # Errors
    /// [`ParseError::Todo`] if no attribute is awaiting a value.
    fn push_attr_value(self, value: SymbolId, span: Span) -> Result<Self> {
        match self {
            Self::AttrName(head, name, open_span) => {
                // This initial capacity can be adjusted after we observe
                // empirically what we most often parse, or we can make it
                // configurable.
                let mut parts = AttrParts::with_capacity(name, open_span, 2);

                parts.push_value(value, span);
                Ok(Self::AttrFragments(head, parts))
            }

            Self::AttrFragments(head, mut parts) => {
                parts.push_value(value, span);
                Ok(Self::AttrFragments(head, parts))
            }

            stack => Err(ParseError::Todo(
                Token::AttrValueFragment(value, span),
                stack,
            )),
        }
    }

    /// Assigns a value to an opened attribute and attaches to the parent
    ///   element.
    ///
    /// If the attribute is composed of fragments ([`Stack::AttrFragments`]),
    ///   this serves as the final fragment and will yield an
    ///   [`Attr::Extensible`] with no further processing.
    ///
    /// # Errors
    /// [`ParseError::Todo`] if no attribute is awaiting a value,
    ///   or if the combination of state and value is not yet supported
    ///     (e.g. fragments of an isolated single attribute).
    fn close_attr(self, value: SymbolId, span: Span) -> Result<Self> {
        match self {
            Self::AttrName(Some((ele_stack, attr_list)), name, open_span) => {
                Ok(Self::BuddingAttrList(
                    ele_stack,
                    attr_list.push(Attr::new(name, value, (open_span, span))),
                ))
            }

            Self::AttrFragments(Some((ele_stack, attr_list)), mut parts) => {
                parts.push_value(value, span);

                Ok(Self::BuddingAttrList(
                    ele_stack,
                    attr_list.push(Attr::Extensible(parts)),
                ))
            }

            // Isolated single attribute.
            Self::AttrName(None, name, open_span) => Ok(Self::IsolatedAttr(
                Attr::new(name, value, (open_span, span)),
            )),

            stack => {
                Err(ParseError::Todo(Token::AttrValue(value, span), stack))
            }
        }
    }

    /// End attribute parsing.
    ///
    /// If parsing occurs within an element context,
    ///   the accumulated [`AttrList`] will be attached to the budding
    ///   [`Element`].
    /// Otherwise the list is yielded as a [`Self::IsolatedAttrList`],
    ///   or [`Self::Done`] if isolated single attributes were being
    ///   parsed.
    ///
    /// # Errors
    /// [`ParseError::Todo`] if attributes are not being parsed.
    fn end_attrs(self) -> Result<Self> {
        match self {
            Self::BuddingAttrList(None, attr_list) => {
                Ok(Self::IsolatedAttrList(attr_list))
            }

            Self::BuddingAttrList(Some(ele_stack), attr_list) => {
                Ok(Self::BuddingElement(ele_stack.consume_attrs(attr_list)))
            }

            Self::IsolatedAttrEmpty => Ok(Self::Done),

            stack => Err(ParseError::Todo(Token::AttrEnd, stack)),
        }
    }

    /// Appends a text node as a child of an element.
    ///
    /// This is valid only for a [`Stack::BuddingElement`].
    ///
    /// # Errors
    /// [`ParseError::Todo`] in any other state.
    fn text(self, value: SymbolId, span: Span) -> Result<Self> {
        match self {
            Self::BuddingElement(mut ele) => {
                ele.element.children.push(Tree::Text(value, span));

                Ok(Self::BuddingElement(ele))
            }

            stack => Err(ParseError::Todo(Token::Text(value, span), stack)),
        }
    }
}

/// State while parsing a XIR token stream into a tree.
///
/// [`ParserState`] is responsible only for dispatch and bookkeeping;
///   state transitions and stack manipulation are handled by the various
///   methods on [`Stack`].
///
/// This is a stack machine with the interface of a state machine.
/// The stack is encoded into the variants themselves
///   (which Rust will allocate on the stack),
///     which is sensible given that we always know exactly what and how
///     many arguments we need.
/// This gives us both the stack we want and type safety,
///   and has compile-time guarantees to ensure that we cannot produce a
///   stack that is not suitable for the computation at hand.
///
/// Note that this cannot be reasoned about in terms of a pushdown automaton
///   because there is no set language for the stack---it
///     contains arbitrary data holding the state of the current computation,
///       which is (theoretically, but not practically) unbounded by the
///       recursive nature of [`Element`].
///
/// This is very similar to the [XmlWriter](super::writer::XmlWriter),
///   except that a stack is needed to accumulate tokens until we can begin
///   emitting a tree.
#[derive(Debug, Default)]
pub struct ParserState {
    /// Current state of the parser,
    ///   replaced on each call to [`ParserState::parse_token`].
    stack: Stack,
}

impl ParserState {
    /// Create the state of a parser that has not yet consumed any tokens.
    ///
    /// _Consider using [`parser_from`] instead._
    ///
    /// The parser begins with the default [`Stack`],
    ///   and so is suitable only for valid starting contexts,
    ///     as defined in the [module-level documentation](self).
    pub fn new() -> Self {
        Self::with(Stack::default())
    }

    /// Create the state of a parser that begins with the given [`Stack`].
    fn with(stack: Stack) -> Self {
        Self { stack }
    }

    /// Consume a single XIR [`Token`] and attempt to parse it within the
    ///   context of the current [`Stack`].
    ///
    /// Each call performs one state transition:
    ///   the current stack is taken
    ///     (leaving the default stack in its place),
    ///   the transition matching the token is applied to it,
    ///   and the resulting stack is then either stored for the next call or
    ///     emitted as a parsed object.
    /// If the transition fails,
    ///   the error is returned and the default stack is left in place.
    ///
    /// An invalid transition represents either a semantic error
    ///   (e.g. unbalanced tags)
    ///   or a feature that has not yet been implemented;
    ///     comments, CDATA, and whitespace tokens are among the latter and
    ///     yield [`ParseError::Todo`].
    ///
    /// _Syntax_ is not validated here,
    ///   since valid syntax is already implied by the existence of
    ///   [`Token`];
    ///     only semantic analysis is performed.
    /// All heavy lifting is done by the various methods on [`Stack`].
    ///
    /// See the [module-level documentation](self) for more information on
    ///   the implementation of the parser.
    pub fn parse_token(&mut self, tok: Token) -> Result<Parsed> {
        let prev = take(&mut self.stack);

        let next = match tok {
            Token::Open(name, span) => prev.open_element(name, span),
            Token::Close(name, span) => prev.close_element(name, span),
            Token::AttrName(name, span) => prev.open_attr(name, span),
            Token::AttrValueFragment(value, span) => {
                prev.push_attr_value(value, span)
            }
            Token::AttrValue(value, span) => prev.close_attr(value, span),
            Token::AttrEnd => prev.end_attrs(),
            Token::Text(value, span) => prev.text(value, span),

            // Not yet supported by the tree parser.
            Token::Comment(..) | Token::CData(..) | Token::Whitespace(..) => {
                Err(ParseError::Todo(tok, prev))
            }
        }?;

        Ok(self.store_or_emit(next))
    }

    /// Emit a completed object,
    ///   or store the stack so that parsing can continue with the next
    ///   token.
    ///
    /// Completed objects are moved out of the stack and are not stored.
    /// After a single isolated attribute is emitted,
    ///   the parser is readied for another isolated attribute.
    fn store_or_emit(&mut self, new_stack: Stack) -> Parsed {
        match new_stack {
            // This parser has completed relative to its initial context and
            //   is not expecting any further input.
            Stack::Done => Parsed::Done,

            Stack::ClosedElement(ele) => Parsed::Tree(Tree::Element(ele)),
            Stack::IsolatedAttrList(attr_list) => Parsed::AttrList(attr_list),

            Stack::IsolatedAttr(attr) => {
                self.stack = Stack::IsolatedAttrEmpty;
                Parsed::Attr(attr)
            }

            // Nothing to emit yet.
            incomplete => {
                self.stack = incomplete;
                Parsed::Incomplete
            }
        }
    }
}

/// Result of a XIR tree parsing operation.
///
/// This is [`std::result::Result`] with the error type fixed to this
///   module's [`ParseError`].
pub type Result<T> = std::result::Result<T, ParseError>;

/// Parsing error from [`ParserState`].
#[derive(Debug, Eq, PartialEq)]
pub enum ParseError {
    /// The closing tag does not match the opening tag at the same level of
    ///   nesting.
    UnbalancedTag {
        /// Name and span of the opening tag.
        open: (QName, Span),
        /// Name and span of the closing tag that failed to match it.
        close: (QName, Span),
    },

    /// [`Token::AttrEnd`] was expected in an isolated attribute context,
    ///   but [`Token::Close`] was encountered instead.
    ///
    /// This means that we encountered an element close while parsing
    ///   attributes in an isolated context,
    ///     which may happen if we're parsing only attributes as part
    ///     of a larger XIR stream.
    /// This should never happen if our XIR is well-formed _from a reader_,
    ///     but could happen if we generate XIR that we are not expecting to
    ///     subsequently parse.
    ///
    /// There is nothing the user can do to correct it;
    ///   this represents a bug in the compiler.
    ///
    /// The [`Span`] is that of the offending [`Token::Close`].
    MissingIsolatedAttrEnd(Span),

    /// An attribute was expected as the next [`Token`].
    ///
    /// The [`Token`] is the one that was encountered instead.
    AttrNameExpected(Token),

    /// Token stream ended before attribute parsing was complete.
    UnexpectedAttrEof,

    /// Not yet implemented.
    ///
    /// This holds the [`Token`] that could not be processed along with the
    ///   [`Stack`] at the time that it was encountered.
    Todo(Token, Stack),
}

impl Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // TODO: not a useful error because of symbols and missing span information
            Self::UnbalancedTag {
                open: (open_name, open_span),
                close: (close_name, close_span),
            } => {
                write!(
                    f,
                    "expected closing tag `{}`, but found `{}` at {} \
                     (opening tag at {})",
                    open_name, close_name, close_span, open_span
                )
            }

            Self::MissingIsolatedAttrEnd(span) => {
                // Try to be helpful to developers and users alike.
                #[cfg(test)]
                let testmsg = "or a problem with your test case";
                #[cfg(not(test))]
                let testmsg = "and should be reported";

                write!(
                    f,
                    "internal error: expecting AttrEnd, found Close at {}; \
                       this represents a compiler bug {}",
                    span, testmsg
                )
            }

            Self::AttrNameExpected(tok) => {
                write!(f, "attribute name expected, found {}", tok)
            }

            // TODO: Perhaps we should include the last-encountered Span.
            Self::UnexpectedAttrEof => {
                write!(
                    f,
                    "unexpected end of input during isolated attribute parsing",
                )
            }

            Self::Todo(tok, stack) => {
                write!(
                    f,
                    "TODO: `{:?}` unrecognized.  The parser is not yet \
                        complete, so this could represent either a missing \
                        feature or a semantic error.  Stack: `{:?}`.",
                    tok, stack
                )
            }
        }
    }
}

impl Error for ParseError {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        None
    }
}

/// Outcome of a single parsing step:
///   a completed object
///     (a [`Tree`], an [`AttrList`], or an [`Attr`]),
///   an indication that more tokens are needed to complete the active
///   context,
///   or an indication that parsing has finished.
///
/// This plays a role similar to [`Option`],
///   but is its own type to avoid confusion as to what this type may mean
///   when deeply nested within other types
///     (e.g. `Option<Result<Parsed, ParseError>>` reads a bit better
///       than `Option<Result<Option<Tree>, ParseError>>`).
#[derive(Debug, Eq, PartialEq)]
pub enum Parsed {
    /// Parsing of an object is complete.
    ///
    /// See [`parser_from`].
    Tree(Tree),

    /// Parsing of an isolated attribute list is complete.
    ///
    /// See [`parse_attrs`].
    AttrList(AttrList),

    /// Parsing of a single isolated attribute is complete.
    ///
    /// See [`attr_parser_from`].
    Attr(Attr),

    /// The parser needs more token data to emit an object
    ///   (the active context is not yet complete).
    Incomplete,

    /// All parsing has completed successfully relative to the original
    ///   context.
    ///
    /// This does not necessarily mean that the XIR token stream has ended,
    ///   because parsing may have started for a portion of it.
    /// However,
    ///   if parsing began at the root node for the XIR stream,
    ///     then this _does_ indicate the end of the XML document.
    ///
    /// To continue using this parser after it has reached this state,
    ///   it must be explicitly reset to indicate that further parsing is
    ///   expected and not an error.
    Done,
}

/// Feed a single [`Token`] to [`ParserState::parse_token`] and wrap the
///   result in [`Some`],
///     giving a function with the shape expected by [`Iterator::scan`].
///
/// If you do not need to observe each individual parsing step and simply
///   want the next parsed object,
///     use [`parser_from`] instead.
///
/// Parsing errors are conveyed by the wrapped [`Result`],
///   _never_ by [`None`];
///     this function always returns [`Some`].
/// Consequently,
///   an iterator produced by scanning with this function can only return
///   [`None`] if the iterator it scans returns [`None`].
///
/// ```
/// use tamer::xir::tree::{ParserState, parse};
///# use tamer::xir::Token;
///
///# let token_stream: std::vec::IntoIter<Token> = vec![].into_iter();
/// // Consume a single token at a time, yielding either an incomplete state
/// // or the next parsed object.
/// let parser = token_stream.scan(ParserState::new(), parse);
/// ```
pub fn parse(state: &mut ParserState, tok: Token) -> Option<Result<Parsed>> {
    let step = ParserState::parse_token(state, tok);

    // Unconditionally `Some` so that `scan` never terminates early on our
    //   account.
    Some(step)
}

/// Lazily parse a [`TokenStream`] into [`Tree`]s,
///   yielding only once an object has been fully parsed
///   (or an error has occurred).
///
/// Whereas [`parse`][parse()] is intended for use with [`Iterator::scan`]
///   and yields after every token,
///   this yields _only_ when the underlying parser produces
///   [`Parsed::Tree`],
///     unwrapping the inner [`Tree`] value;
///   [`Parsed::Incomplete`] steps are silently skipped.
/// This interface is far more convenient,
///   but comes at the cost of not knowing how many parsing steps a single
///   [`Iterator::next`] call will take.
///
/// For more information on contexts,
///   and the parser in general,
///   see the [module-level documentation](self).
///
/// # Panics
/// Advancing the returned iterator will panic if the underlying parser
///   yields [`Parsed::Done`] (handling is not yet implemented),
///   or if it yields [`Parsed::AttrList`] or [`Parsed::Attr`],
///     which are not expected in this context.
///
/// ```
/// use tamer::xir::tree::parser_from;
///# use tamer::xir::Token;
///
///# let token_stream: std::vec::IntoIter<Token> = vec![].into_iter();
/// // Lazily parse a stream of XIR tokens as an iterator.
/// let parser = parser_from(token_stream);
/// ```
pub fn parser_from(
    toks: impl TokenStream,
) -> impl Iterator<Item = Result<Tree>> {
    let steps = toks.scan(ParserState::new(), parse);

    steps.filter_map(|step| {
        // Errors are always surfaced to the caller.
        let parsed = match step {
            Ok(parsed) => parsed,
            Err(err) => return Some(Err(err)),
        };

        match parsed {
            Parsed::Tree(tree) => Some(Ok(tree)),

            // Keep consuming tokens until something is complete.
            Parsed::Incomplete => None,

            Parsed::Done => todo!("parser_from Parsed::Done"),

            // These make no sense in this context and should never occur.
            unexpected @ (Parsed::AttrList(_) | Parsed::Attr(_)) => {
                unreachable!(
                    "unexpected yield by XIRT (Tree expected): {:?}",
                    unexpected
                )
            }
        }
    })
}

/// Begin parsing in an isolated attribute context,
///   producing an [`AttrList`] that is detached from any [`Element`].
///
/// This is useful when you wish to consume a XIR stream and collect only
///   the attributes of an element.
/// If you wish to process an entire element,
///   use [`parser_from`] instead.
///
/// Parsing must begin at a [`Token::AttrName`] token.
/// `dest` is the attribute list that the parser starts out with.
///
/// This will consume tokens until reaching [`Token::AttrEnd`],
///   and so it is important that the XIR stream contain this delimiter;
///     this should be the case with all readers.
/// `toks` is only borrowed,
///   so any tokens that remain once the attribute list is complete are
///   still available to the caller.
///
/// # Errors
/// Returns [`ParseError::UnexpectedAttrEof`] if `toks` is exhausted before
///   an attribute list has been produced.
/// Any error produced by the parser for an individual token is returned
///   as-is.
///
/// # Panics
/// Panics if the parser yields [`Parsed::Done`]
///   (handling is not yet implemented),
///   or if it yields [`Parsed::Tree`] or [`Parsed::Attr`],
///     which are not expected in this context.
#[inline]
pub fn parse_attrs(
    toks: &mut impl TokenStream,
    dest: AttrList,
) -> Result<AttrList> {
    let mut state = ParserState::with(Stack::BuddingAttrList(None, dest));

    loop {
        // Running out of tokens before the list has been completed means
        //   that the delimiter was never encountered.
        let tok = toks.next().ok_or(ParseError::UnexpectedAttrEof)?;

        match ParserState::parse_token(&mut state, tok)? {
            Parsed::Incomplete => continue,
            Parsed::AttrList(attr_list) => return Ok(attr_list),

            Parsed::Done => todo!("parse_attrs Parsed::Done"),

            // These make no sense in this context and should never occur.
            x @ (Parsed::Tree(_) | Parsed::Attr(_)) => unreachable!(
                "unexpected yield by XIRT (AttrList expected): {:?}",
                x
            ),
        }
    }
}

/// Lazily parse isolated attributes from a [`TokenStream`],
///   yielding each [`Attr`] as soon as it has been fully parsed.
///
/// This is a specialized parser that begins parsing partway through a XIR
///   token stream.
/// To parse an entire stream as a tree,
///   see [`parser_from`].
///
/// The token stream is only borrowed,
///   not owned,
///   which allows parsing to continue on the underlying token stream after
///   attribute parsing has completed.
/// Once attribute parsing is finished,
///   parsing is able to continue on the underlying token stream as if the
///   attributes were never present in XIR at all;
///     this also allows this parser to be used as an attribute filter while
///     ensuring that the attributes are syntactically valid.
///
/// The returned iterator yields [`None`] when either the token stream is
///   exhausted or the parser reports [`Parsed::Done`].
///
/// For more information on contexts,
///   and the parser in general,
///   see the [module-level documentation](self).
///
/// # Panics
/// Advancing the returned iterator will panic if the parser yields
///   [`Parsed::Tree`] or [`Parsed::AttrList`],
///     which are not expected in this context.
#[inline]
pub fn attr_parser_from<'a>(
    toks: &'a mut impl TokenStream,
) -> impl Iterator<Item = Result<Attr>> + 'a {
    let mut state = ParserState::with(Stack::IsolatedAttrEmpty);

    iter::from_fn(move || loop {
        // Nothing left to read means nothing left to yield.
        let tok = toks.next()?;

        match ParserState::parse_token(&mut state, tok) {
            Ok(Parsed::Incomplete) => continue,
            Ok(Parsed::Attr(attr)) => return Some(Ok(attr)),
            Err(err) => return Some(Err(err)),

            // AttrEnd must have been encountered.
            Ok(Parsed::Done) => return None,

            // These make no sense in this context and should never occur.
            Ok(unexpected @ (Parsed::Tree(_) | Parsed::AttrList(_))) => {
                unreachable!(
                    "unexpected yield by XIRT (Attr expected): {:?}",
                    unexpected
                )
            }
        }
    })
}

#[cfg(test)]
mod test;