// XIR flat (XIRF)
//
//  Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
//
//  This file is part of TAME.
//
//  This program is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program.  If not, see <http://www.gnu.org/licenses/>.

//! Lightly-parsed XIR as a flat stream (XIRF).
//!
//! XIRF lightly parses a raw XIR [`TokenStream`] into a stream of
//!   [`Object`]s that are,
//!     like a [`TokenStream`],
//!     flat in structure.
//! It provides the following features over raw XIR:
//!
//!   1. All closing tags must correspond to a matching opening tag at the
//!        same depth;
//!   2. [`Object`] exposes the [`Depth`] of each opening/closing tag;
//!   3. Attribute tokens are parsed into [`Attr`] objects; and
//!   4. Parsing will fail if input ends before all elements have been
//!        closed.
//!
//! XIRF lowering does not perform any dynamic memory allocation;
//!   maximum element nesting depth is set statically depending on the needs
//!   of the caller.

use super::{
    parse::{ParseState, ParseStateResult, ParseStatus, ParsedResult},
    tree::{
        attr::{AttrParseError, AttrParseState},
        Attr,
    },
    QName, Token, TokenStream, Whitespace,
};
use crate::{span::Span, sym::SymbolId};
use arrayvec::ArrayVec;
use std::{error::Error, fmt::Display, mem::replace};

/// Tag nesting depth
///   (`0` represents the root).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Depth(usize);

impl Display for Depth {
    /// Render the depth as its underlying integer.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        Display::fmt(&self.0, f)
    }
}

/// A lightly-parsed XIRF object.
///
/// Certain XIR [`Token`]s are formed into a single object,
///   such as an [`Attr`].
/// Other objects retain the same format as their underlying token,
///   but are still validated to ensure that they are well-formed and that
///   the XML is well-structured.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Object {
    /// Opening tag of an element.
    Open(QName, Span, Depth),

    /// Closing tag of an element.
    ///
    /// If the name is [`None`],
    ///   then the tag is self-closing.
    /// If the name is [`Some`],
    ///   then the tag is guaranteed to be balanced
    ///     (matching the depth of its opening tag).
    Close(Option<QName>, Span, Depth),

    /// An attribute and its value.
    ///
    /// The associated [`Span`]s can be found on the enclosed [`Attr`]
    ///   object.
    Attr(Attr),

    /// Comment node.
    Comment(SymbolId, Span),

    /// Character data as part of an element.
    ///
    /// See also [`CData`](Object::CData) variant.
    Text(SymbolId, Span),

    /// CData node (`<![CDATA[...]]>`).
    ///
    /// _Warning: It is up to the caller to ensure that the string `]]>` is
    ///   not present in the text!_
    /// This is intended for reading existing XML data where CData is
    ///   already present,
    ///     not for producing new CData safely!
    CData(SymbolId, Span),

    /// Similar to `Text`,
    ///   but intended for use where only whitespace is allowed,
    ///     such as alignment of attributes.
    Whitespace(Whitespace, Span),
}

/// XIRF-compatible attribute parser.
///
/// Any [`ParseState`] that yields [`Attr`] objects and whose error type
///   can be converted into a [`StateError`] may be used by the XIRF parser
///   to process attribute tokens.
pub trait FlatAttrParseState = ParseState<Object = Attr>
where
    <Self as ParseState>::Error: Into<StateError>;

/// Stack of element [`QName`] and [`Span`] pairs,
///   representing the current level of nesting.
///
/// This storage is statically allocated,
///   allowing XIRF's parser to avoid memory allocation entirely.
type ElementStack<const MAX_DEPTH: usize> = ArrayVec<(QName, Span), MAX_DEPTH>;

/// XIRF parser state.
///
/// This parser is a pushdown automaton.
#[derive(Debug, PartialEq, Eq)] pub enum State where SA: FlatAttrParseState, { // TODO: Ensure that non-comment nodes are not encountered before the // root, // and that we do not encounter any non-comment nodes after the // root. /// Parsing nodes. NodeExpected(ElementStack), /// Delegating to attribute parser. AttrExpected(ElementStack, SA), /// Temporary state used to catch missing explicit state transitions in /// `parse_token`. Invalid, } impl Default for State { fn default() -> Self { Self::NodeExpected(Default::default()) } } /// Denotes a state transition. /// /// This newtype was created to produce clear, self-documenting code. struct Transition(T); impl ParseState for State where SA: FlatAttrParseState, { type Object = Object; type Error = StateError; fn parse_token(&mut self, tok: Token) -> ParseStateResult { use ParseStatus::{Dead, Incomplete, Object as Obj}; use State::{AttrExpected, Invalid, NodeExpected}; let result; // This awkward-looking take-reassign forces us to be explicit // about state transitions in every case, // ensuring that we always have documented proof of what state // the system winds up in. // The `Invalid` state prevents using `return`. (Transition(*self), result) = match (replace(self, Invalid), tok) { (NodeExpected(stack), tok) => Self::parse_node(stack, tok), (AttrExpected(stack, mut sa), tok) => match sa.parse_token(tok) { Ok(Incomplete) => { (Transition(AttrExpected(stack, sa)), Ok(Incomplete)) } Ok(Obj(attr)) => ( Transition(AttrExpected(stack, sa)), Ok(Obj(Object::Attr(attr))), ), Ok(Dead(lookahead)) => Self::parse_node(stack, lookahead), Err(x) => (Transition(AttrExpected(stack, sa)), Err(x.into())), }, // See comment at the top of this function. (Invalid, _) => { unreachable!("XIRF parser reached invalid state") } }; result } /// Whether all elements have been closed. /// /// Parsing will fail if there are any open elements. 
/// Intuitively, /// this means that the parser must have encountered the closing tag /// for the root element. fn is_accepting(&self) -> bool { // TODO: It'd be nice if we could also return additional context to // aid the user in diagnosing the problem, // e.g. what element(s) still need closing. matches!(self, Self::NodeExpected(stack) if stack.len() == 0) } } impl State where SA: FlatAttrParseState, { /// Parse a token while in a state expecting a node. fn parse_node( mut stack: ElementStack, tok: Token, ) -> (Transition, ParseStateResult) { use ParseStatus::Object as Obj; use State::{AttrExpected, NodeExpected}; match tok { Token::Open(qname, span) if stack.len() == MAX_DEPTH => ( Transition(NodeExpected(stack)), Err(StateError::MaxDepthExceeded { open: (qname, span), max: Depth(MAX_DEPTH), }), ), Token::Open(qname, span) => { let depth = stack.len(); stack.push((qname, span)); // Delegate to the attribute parser until it is complete. ( Transition(AttrExpected(stack, SA::default())), Ok(Obj(Object::Open(qname, span, Depth(depth)))), ) } Token::Close(close_oqname, close_span) => { match (close_oqname, stack.pop()) { (_, None) => ( Transition(NodeExpected(stack)), Err(StateError::ExtraClosingTag( close_oqname, close_span, )), ), (Some(qname), Some((open_qname, open_span))) if qname != open_qname => { ( Transition(NodeExpected(stack)), Err(StateError::UnbalancedTag { open: (open_qname, open_span), close: (qname, close_span), }), ) } (..) 
=> { let depth = stack.len(); ( Transition(NodeExpected(stack)), Ok(Obj(Object::Close( close_oqname, close_span, Depth(depth), ))), ) } } } Token::Comment(sym, span) => ( Transition(NodeExpected(stack)), Ok(Obj(Object::Comment(sym, span))), ), Token::Text(sym, span) => ( Transition(NodeExpected(stack)), Ok(Obj(Object::Text(sym, span))), ), Token::CData(sym, span) => ( Transition(NodeExpected(stack)), Ok(Obj(Object::CData(sym, span))), ), Token::Whitespace(ws, span) => ( Transition(NodeExpected(stack)), Ok(Obj(Object::Whitespace(ws, span))), ), // We should transition to `State::Attr` before encountering any // of these tokens. Token::AttrName(..) | Token::AttrValue(..) | Token::AttrValueFragment(..) => { unreachable!("attribute token in NodeExpected state: {tok:?}") } } } } /// Produce a streaming parser lowering a XIR [`TokenStream`] into a XIRF /// stream. pub fn parse( toks: impl TokenStream, ) -> impl Iterator>> { State::::parse(toks) } /// Parsing error from [`State`]. #[derive(Debug, Eq, PartialEq)] pub enum StateError { /// Opening tag exceeds the maximum nesting depth for this parser. MaxDepthExceeded { open: (QName, Span), max: Depth }, /// The closing tag does not match the opening tag at the same level of /// nesting. UnbalancedTag { open: (QName, Span), close: (QName, Span), }, /// Attempt to close a tag with no corresponding opening tag /// (which would result in a negative depth). ExtraClosingTag(Option, Span), /// Error from the attribute parser. 
AttrError(AttrParseError), } impl Display for StateError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use StateError::*; match self { MaxDepthExceeded { open: (name, span), max, } => { write!( f, "maximum element nesting depth of {max} exceeded \ by `{name}` at {span}" ) } UnbalancedTag { open: (open_name, open_span), close: (close_name, close_span), } => { write!( f, "expected closing tag `{open_name}`, \ but found `{close_name}` at {close_span} \ (opening tag at {open_span})", ) } ExtraClosingTag(Some(name), span) => { write!(f, "closing tag `{name}` at {span} has no opening tag",) } // If this occurs, its likely that something generated invalid // XIR; // it should be a parsing error on read and no generator // should ever produce this. ExtraClosingTag(None, span) => { write!(f, "self-closing tag at {span} has no opening tag") } AttrError(e) => Display::fmt(e, f), } } } impl Error for StateError { fn source(&self) -> Option<&(dyn Error + 'static)> { todo!() } } impl From for StateError { fn from(e: AttrParseError) -> Self { Self::AttrError(e) } } #[cfg(test)] mod test;