976 lines
31 KiB
Rust
976 lines
31 KiB
Rust
// XIR flat (XIRF)
|
|
//
|
|
// Copyright (C) 2014-2023 Ryan Specialty, LLC.
|
|
//
|
|
// This file is part of TAME.
|
|
//
|
|
// This program is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
//! Lightly-parsed XIR as a flat stream (XIRF).
|
|
//!
|
|
//! XIRF lightly parses a raw XIR [`TokenStream`] into a stream of
|
|
//! [`XirfToken`]s that are,
|
|
//! like a [`TokenStream`],
|
|
//! flat in structure.
|
|
//! It provides the following features over raw XIR:
|
|
//!
|
|
//! 1. All closing tags must correspond to a matching opening tag at the
|
|
//! same depth;
|
|
//! 2. [`XirfToken`] exposes the [`Depth`] of each node-related token;
|
|
//! 3. Attribute tokens are parsed into [`Attr`] objects;
|
|
//! 4. Documents must begin with an element and end with the closing of
|
|
//! that element;
|
|
//! 5. Parsing will fail if input ends before all elements have been
|
|
//! closed.
|
|
//! 6. Text nodes may optionally be parsed into [`RefinedText`] to
|
|
//! distinguish whitespace.
|
|
//!
|
|
//! XIRF lowering does not perform any dynamic memory allocation;
|
|
//! maximum element nesting depth is set statically depending on the needs
|
|
//! of the caller.
|
|
|
|
use super::{
|
|
attr::{Attr, AttrParseError, AttrParseState},
|
|
reader::is_xml_whitespace_char,
|
|
CloseSpan, OpenSpan, QName, Token as XirToken, TokenStream,
|
|
};
|
|
use crate::{
|
|
diagnose::{Annotate, AnnotatedSpan, Diagnostic},
|
|
f::Functor,
|
|
parse::prelude::*,
|
|
span::Span,
|
|
sym::{st::is_common_whitespace, GlobalSymbolResolve, SymbolId},
|
|
xir::EleSpan,
|
|
};
|
|
use arrayvec::ArrayVec;
|
|
use std::{
|
|
convert::Infallible,
|
|
error::Error,
|
|
fmt::{Debug, Display},
|
|
marker::PhantomData,
|
|
};
|
|
|
|
// Used for organization.
|
|
pub use accept::*;
|
|
|
|
/// Nesting depth of a tag within a document
///   (the root is at depth `0`).
///
/// There is deliberately no [`Default`] implementation,
///   so that a depth is never initialized by accident;
///     use [`Depth::root`] when a starting depth is what you need.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd)]
pub struct Depth(pub usize);

impl Depth {
    /// Depth of a root element (`0`).
    pub fn root() -> Depth {
        Self(0)
    }

    /// Depth at which the children of an element at this depth reside,
    ///   which is one greater than this depth.
    pub fn child_depth(&self) -> Depth {
        let Self(current) = *self;
        Self(current + 1)
    }
}

impl Display for Depth {
    /// Render the depth exactly as the wrapped integer would be rendered,
    ///   honoring any formatter flags.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(value) = self;
        Display::fmt(value, f)
    }
}
|
|
|
|
/// A lightly-parsed XIRF object.
|
|
///
|
|
/// Certain XIR [`Token`]s are formed into a single object,
|
|
/// such as an [`Attr`].
|
|
/// Other objects retain the same format as their underlying token,
|
|
/// but are still validated to ensure that they are well-formed and that
|
|
/// the XML is well-structured.
|
|
///
|
|
/// Each token representing a child node contains a numeric [`Depth`]
|
|
/// indicating the nesting depth;
|
|
/// this can be used by downstream parsers to avoid maintaining their
|
|
/// own stack in certain cases.
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
pub enum XirfToken<T: TextType> {
|
|
/// Opening tag of an element.
|
|
Open(QName, OpenSpan, Depth),
|
|
|
|
/// Closing tag of an element.
|
|
///
|
|
/// If the name is [`None`],
|
|
/// then the tag is self-closing.
|
|
/// If the name is [`Some`],
|
|
/// then the tag is guaranteed to be balanced
|
|
/// (matching the depth of its opening tag).
|
|
Close(Option<QName>, CloseSpan, Depth),
|
|
|
|
/// An attribute and its value.
|
|
///
|
|
/// The associated [`Span`]s can be found on the enclosed [`Attr`]
|
|
/// object.
|
|
Attr(Attr),
|
|
|
|
/// Comment node.
|
|
Comment(SymbolId, Span, Depth),
|
|
|
|
/// Character data as part of an element.
|
|
///
|
|
/// See also [`CData`](XirfToken::CData) variant.
|
|
Text(T, Depth),
|
|
|
|
/// CData node (`<![CDATA[...]]>`).
|
|
///
|
|
/// _Warning: It is up to the caller to ensure that the string `]]>` is
|
|
/// not present in the text!_
|
|
/// This is intended for reading existing XML data where CData is
|
|
/// already present,
|
|
/// not for producing new CData safely!
|
|
CData(SymbolId, Span, Depth),
|
|
}
|
|
|
|
impl<T: TextType> XirfToken<T> {
|
|
pub fn open(
|
|
qname: impl Into<QName>,
|
|
span: impl Into<OpenSpan>,
|
|
depth: Depth,
|
|
) -> Self {
|
|
Self::Open(qname.into(), span.into(), depth)
|
|
}
|
|
|
|
pub fn close(
|
|
qname: Option<impl Into<QName>>,
|
|
span: impl Into<CloseSpan>,
|
|
depth: Depth,
|
|
) -> Self {
|
|
Self::Close(qname.map(Into::into), span.into(), depth)
|
|
}
|
|
|
|
pub fn attr(
|
|
qname: impl Into<QName>,
|
|
value: impl Into<SymbolId>,
|
|
span: (impl Into<Span>, impl Into<Span>),
|
|
) -> Self {
|
|
Self::Attr(Attr::new(
|
|
qname.into(),
|
|
value.into(),
|
|
(span.0.into(), span.1.into()),
|
|
))
|
|
}
|
|
|
|
pub fn comment(
|
|
comment: impl Into<SymbolId>,
|
|
span: impl Into<Span>,
|
|
depth: Depth,
|
|
) -> Self {
|
|
Self::Comment(comment.into(), span.into(), depth)
|
|
}
|
|
|
|
pub fn text(text: impl Into<T>, depth: Depth) -> Self {
|
|
Self::Text(text.into(), depth)
|
|
}
|
|
}
|
|
|
|
impl<T: TextType> Token for XirfToken<T> {
|
|
fn ir_name() -> &'static str {
|
|
"XIRF"
|
|
}
|
|
|
|
fn span(&self) -> Span {
|
|
use XirfToken::*;
|
|
|
|
match self {
|
|
Open(_, OpenSpan(span, _), _)
|
|
| Close(_, CloseSpan(span, _), _)
|
|
| Comment(_, span, _)
|
|
| CData(_, span, _) => *span,
|
|
|
|
Text(text, _) => text.span(),
|
|
Attr(attr) => attr.span(),
|
|
}
|
|
}
|
|
}
|
|
|
|
// `XirfToken` is the `ParseState::Object` yielded by `XirToXirf`,
//   so it opts into the `Object` trait here;
//     the impl is empty because no items need to be provided.
impl<T: TextType> Object for XirfToken<T> {}
|
|
|
|
impl<T: TextType> Display for XirfToken<T> {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
use XirfToken::*;
|
|
|
|
match self {
|
|
Open(qname, span, _) => {
|
|
Display::fmt(&XirToken::Open(*qname, *span), f)
|
|
}
|
|
Close(oqname, span, _) => {
|
|
Display::fmt(&XirToken::Close(*oqname, *span), f)
|
|
}
|
|
Attr(attr) => Display::fmt(&attr, f),
|
|
Comment(sym, span, _) => {
|
|
Display::fmt(&XirToken::Comment(*sym, *span), f)
|
|
}
|
|
Text(text, _) => Display::fmt(text, f),
|
|
CData(sym, span, _) => {
|
|
Display::fmt(&XirToken::CData(*sym, *span), f)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<T: TextType> XirfToken<T> {
|
|
pub fn depth(&self) -> Option<Depth> {
|
|
use XirfToken::*;
|
|
|
|
match self {
|
|
Open(_, _, depth)
|
|
| Close(_, _, depth)
|
|
| Comment(_, _, depth)
|
|
| Text(_, depth)
|
|
| CData(_, _, depth) => Some(*depth),
|
|
Attr(_) => None,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<T: TextType> From<Attr> for XirfToken<T> {
|
|
fn from(attr: Attr) -> Self {
|
|
Self::Attr(attr)
|
|
}
|
|
}
|
|
|
|
impl<T: TextType> Functor<Depth> for XirfToken<T> {
    /// Apply `f` to the [`Depth`] of this token,
    ///   leaving everything else untouched.
    ///
    /// Attributes have no depth,
    ///   so they are returned unchanged and `f` is never invoked.
    fn map(self, f: impl FnOnce(Depth) -> Depth) -> Self::Target {
        match self {
            Self::Attr(attr) => Self::Attr(attr),
            Self::Open(name, open_span, at) => {
                Self::Open(name, open_span, f(at))
            }
            Self::Close(opt_name, close_span, at) => {
                Self::Close(opt_name, close_span, f(at))
            }
            Self::Comment(body, node_span, at) => {
                Self::Comment(body, node_span, f(at))
            }
            Self::Text(inner, at) => Self::Text(inner, f(at)),
            Self::CData(body, node_span, at) => {
                Self::CData(body, node_span, f(at))
            }
        }
    }
}
|
|
|
|
/// Token of an optionally refined [`Text`].
|
|
///
|
|
/// XIRF is configurable on the type of processing it performs on [`Text`],
|
|
/// including the detection of [`Whitespace`].
|
|
///
|
|
/// See also [`RefinedText`].
|
|
pub trait TextType = From<Text> + Into<Text> + Token + Eq;
|
|
|
|
#[derive(Debug, PartialEq, Eq, Clone)]
|
|
pub struct Text(pub SymbolId, pub Span);
|
|
|
|
impl Token for Text {
|
|
fn ir_name() -> &'static str {
|
|
"XIRF Text"
|
|
}
|
|
|
|
fn span(&self) -> Span {
|
|
match self {
|
|
Self(_, span) => *span,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Display for Text {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
// TODO: We'll need care to output text so that it does not mess up
|
|
// formatted output.
|
|
// Further,
|
|
// text can be any arbitrary length,
|
|
// and so should probably be elided after a certain length.
|
|
write!(f, "text")
|
|
}
|
|
}
|
|
|
|
/// A sequence of one or more whitespace characters.
|
|
///
|
|
/// Whitespace here is expected to consist of `[ \n\t\r]`
|
|
/// (where the first character in that class is a space).
|
|
#[derive(Debug, PartialEq, Eq, Clone)]
|
|
pub struct Whitespace(pub Text);
|
|
|
|
impl Display for Whitespace {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
// TODO: Escape output as necessary so that we can render the symbol
|
|
// string.
|
|
// See also `<Text as Display>::fmt` TODO.
|
|
write!(f, "whitespace")
|
|
}
|
|
}
|
|
|
|
/// Text that has been refined to a more descriptive form.
|
|
///
|
|
/// This type may be used as a [`TextType`] to instruct XIRF to detect
|
|
/// [`Whitespace`].
|
|
#[derive(Debug, PartialEq, Eq, Clone)]
|
|
pub enum RefinedText {
|
|
/// Provided [`Text`] has been determined to be [`Whitespace`].
|
|
Whitespace(Whitespace),
|
|
/// Provided [`Text`] was not able to be refined into a more specific
|
|
/// type.
|
|
Unrefined(Text),
|
|
}
|
|
|
|
impl Token for RefinedText {
|
|
fn ir_name() -> &'static str {
|
|
"XIRF RefinedText"
|
|
}
|
|
|
|
fn span(&self) -> Span {
|
|
match self {
|
|
Self::Whitespace(Whitespace(text)) | Self::Unrefined(text) => {
|
|
text.span()
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Display for RefinedText {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
match self {
|
|
Self::Whitespace(ws) => Display::fmt(ws, f),
|
|
Self::Unrefined(text) => Display::fmt(text, f),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl From<Text> for RefinedText {
|
|
fn from(text: Text) -> Self {
|
|
match text {
|
|
Text(sym, _) if is_whitespace(sym) => {
|
|
Self::Whitespace(Whitespace(text))
|
|
}
|
|
_ => Self::Unrefined(text),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl From<RefinedText> for Text {
|
|
fn from(value: RefinedText) -> Self {
|
|
match value {
|
|
RefinedText::Whitespace(Whitespace(text))
|
|
| RefinedText::Unrefined(text) => text,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// XIRF-compatible attribute parser.
|
|
pub trait FlatAttrParseState<const MAX_DEPTH: usize> =
|
|
ClosedParseState<Token = XirToken, Object = Attr>
|
|
where
|
|
Self: Default,
|
|
<Self as ParseState>::Error: Into<XirToXirfError>,
|
|
StateContext<MAX_DEPTH>: AsMut<<Self as ParseState>::Context>;
|
|
|
|
/// Stack of element [`QName`] and [`Span`] pairs,
|
|
/// representing the current level of nesting.
|
|
///
|
|
/// This storage is statically allocated,
|
|
/// allowing XIRF's parser to avoid memory allocation entirely.
|
|
type ElementStack<const MAX_DEPTH: usize> = ArrayVec<(QName, Span), MAX_DEPTH>;
|
|
|
|
/// Lower [XIR](XirToken) into [XIRF](XirfToken),
|
|
/// accepting only fully parsed XML documents.
|
|
///
|
|
/// If parsing is expected to stop before reaching the end of the document,
|
|
/// see [`PartialXirToXirf`].
|
|
/// For more information on accepting states,
|
|
/// see [`XirfAcceptor`].
|
|
pub type FullXirToXirf<const MAX_DEPTH: usize, T> =
|
|
XirToXirf<MAX_DEPTH, T, AttrParseState, FullXirfAcceptor>;
|
|
|
|
/// Lower [XIR](XirToken) into [XIRF](XirfToken),
|
|
/// accepting partially parsed XML documents at node boundaries.
|
|
///
|
|
/// If the entire XML document ought to be parsed,
|
|
/// see [`FullXirToXirf`] to provide a guarantee of an error in case the
|
|
/// system stops parsing before completion.
|
|
/// For more information on accepting states,
|
|
/// see [`XirfAcceptor`].
|
|
pub type PartialXirToXirf<const MAX_DEPTH: usize, T> =
|
|
XirToXirf<MAX_DEPTH, T, AttrParseState, PartialXirfAcceptor>;
|
|
|
|
/// Lower [XIR](XirToken) into [XIRF](XirfToken).
|
|
///
|
|
/// This parser is a pushdown automaton that parses a single XML document.
|
|
#[derive(Debug, PartialEq, Eq)]
|
|
pub enum XirToXirf<
|
|
const MAX_DEPTH: usize,
|
|
T,
|
|
SA = AttrParseState,
|
|
A: XirfAcceptor = FullXirfAcceptor,
|
|
> where
|
|
SA: FlatAttrParseState<MAX_DEPTH>,
|
|
T: TextType,
|
|
{
|
|
/// Document parsing has not yet begun.
|
|
PreRoot(PhantomData<(T, A)>),
|
|
/// Parsing nodes.
|
|
NodeExpected,
|
|
/// Delegating to attribute parser.
|
|
AttrExpected(SA),
|
|
/// End of document has been reached.
|
|
Done,
|
|
}
|
|
|
|
impl<const MAX_DEPTH: usize, T, SA, A: XirfAcceptor> Default
|
|
for XirToXirf<MAX_DEPTH, T, SA, A>
|
|
where
|
|
SA: FlatAttrParseState<MAX_DEPTH>,
|
|
T: TextType,
|
|
{
|
|
fn default() -> Self {
|
|
Self::PreRoot(PhantomData::default())
|
|
}
|
|
}
|
|
|
|
pub type StateContext<const MAX_DEPTH: usize> =
|
|
Context<ElementStack<MAX_DEPTH>>;
|
|
|
|
/// Whether the given [`SymbolId`] is all whitespace according to
|
|
/// [`is_xml_whitespace_char`].
|
|
///
|
|
/// This will first consult the pre-interned whitespace symbol list using
|
|
/// [`is_common_whitespace`].
|
|
/// If that check fails,
|
|
/// it will resort to looking up the symbol and performing a linear scan
|
|
/// of the string,
|
|
/// terminating early if a non-whitespace character is found.
|
|
///
|
|
/// Note that the empty string is considered to be whitespace.
|
|
#[inline]
|
|
fn is_whitespace(sym: SymbolId) -> bool {
|
|
// See `sym::prefill`;
|
|
// this may require maintenance to keep the prefill list up-to-date
|
|
// with common whitespace symbols to avoid symbol lookups.
|
|
// This common check is purely a performance optimization.
|
|
is_common_whitespace(sym) || {
|
|
// If this is called often and is too expensive,
|
|
// it may be worth caching metadata about symbols,
|
|
// either for XIRF or globally.
|
|
// This requires multiple dereferences
|
|
// (for looking up the intern for the `SymbolId`,
|
|
// which may result in multiple (CPU) cache misses,
|
|
// but that would have to be profiled since the symbol may
|
|
// have just been interned and may be cached still)
|
|
// and then a linear scan of the associated `str`,
|
|
// though it will terminate as soon as it finds a non-whitespace
|
|
// character.
|
|
sym.lookup_str().chars().all(is_xml_whitespace_char)
|
|
}
|
|
}
|
|
|
|
impl<const MAX_DEPTH: usize, T, SA, A: XirfAcceptor> ParseState
|
|
for XirToXirf<MAX_DEPTH, T, SA, A>
|
|
where
|
|
SA: FlatAttrParseState<MAX_DEPTH>,
|
|
T: TextType,
|
|
{
|
|
type Token = XirToken;
|
|
type Object = XirfToken<T>;
|
|
type Error = XirToXirfError;
|
|
type Context = StateContext<MAX_DEPTH>;
|
|
|
|
fn parse_token(
|
|
self,
|
|
tok: Self::Token,
|
|
stack: &mut Self::Context,
|
|
) -> TransitionResult<Self> {
|
|
use XirToXirf::{AttrExpected, Done, NodeExpected, PreRoot};
|
|
|
|
match (self, tok) {
|
|
// Comments are permitted before and after the first root element.
|
|
(st @ (PreRoot(_) | Done), XirToken::Comment(sym, span)) => {
|
|
let depth = Depth(stack.len());
|
|
Transition(st).ok(XirfToken::Comment(sym, span, depth))
|
|
}
|
|
|
|
// Ignore whitespace before or after root.
|
|
(st @ (PreRoot(_) | Done), XirToken::Text(sym, _))
|
|
if is_whitespace(sym) =>
|
|
{
|
|
Transition(st).incomplete()
|
|
}
|
|
|
|
(PreRoot(_), tok @ XirToken::Open(..)) => {
|
|
Self::parse_node(tok, stack)
|
|
}
|
|
|
|
(st @ PreRoot(_), tok) => {
|
|
Transition(st).err(XirToXirfError::RootOpenExpected(tok))
|
|
}
|
|
|
|
(NodeExpected, tok) => Self::parse_node(tok, stack),
|
|
|
|
(AttrExpected(sa), tok) => sa.delegate(
|
|
tok,
|
|
stack,
|
|
|sa| Transition(AttrExpected(sa)),
|
|
|| Transition(NodeExpected),
|
|
),
|
|
|
|
(Done, tok) => Transition(Done).dead(tok),
|
|
}
|
|
}
|
|
|
|
/// Whether all elements have been closed.
|
|
///
|
|
/// Parsing will fail if there are any open elements.
|
|
/// Intuitively,
|
|
/// this means that the parser must have encountered the closing tag
|
|
/// for the root element.
|
|
fn is_accepting(&self, _: &Self::Context) -> bool {
|
|
// TODO: It'd be nice if we could also return additional context to
|
|
// aid the user in diagnosing the problem,
|
|
// e.g. what element(s) still need closing.
|
|
A::is_accepting(self)
|
|
}
|
|
}
|
|
|
|
/// Configurable accepting states for [`XirToXirf`].
|
|
///
|
|
/// See this module's [`XirfAcceptor`] for more information.
|
|
mod accept {
|
|
use super::*;
|
|
|
|
/// Acceptor for [`XirToXirf`].
|
|
///
|
|
/// This is responsible for determining whether [`XirToXirf`] is in an
|
|
/// accepting state.
|
|
/// There are two acceptors:
|
|
///
|
|
/// 1. [`FullXirfAcceptor`] expects that the _entire_ XML document be
|
|
/// completely parsed up to and including the closing root node;
|
|
/// and
|
|
/// 2. [`PartialXirfAcceptor`] allows parsing to halt part-way through
|
|
/// an XML document,
|
|
/// provided that parsing ends at a node boundary.
|
|
///
|
|
/// See each respective acceptor for more information.
|
|
pub trait XirfAcceptor: Debug + PartialEq + Eq + Display + Default {
|
|
fn is_accepting<const MAX_DEPTH: usize, T, SA, A: XirfAcceptor>(
|
|
st: &XirToXirf<MAX_DEPTH, T, SA, A>,
|
|
) -> bool
|
|
where
|
|
SA: FlatAttrParseState<MAX_DEPTH>,
|
|
T: TextType;
|
|
}
|
|
|
|
/// Acceptor for fully parsed XML documents for [`XirToXirf`].
|
|
///
|
|
/// This acceptor should be used when the intent of the lowering
|
|
/// pipeline is to fully parse the [XIR](XirToken) stream.
|
|
/// In other words:
|
|
/// this should be used when the XML document being read ought to be
|
|
/// read _fully_,
|
|
/// where halting parsing before the root node would indicate a
|
|
/// defect in the system.
|
|
///
|
|
/// For example,
|
|
/// when reading a file with TAME sources in `tamec`,
|
|
/// the compiler ought to ensure that the entire file is read to
|
|
/// completion.
|
|
/// If the lowering pipeline stops requesting tokens before the XIR
|
|
/// stream has ended,
|
|
/// then that means that compilation has halted before the system
|
|
/// has had a chance to consider the rest of the file.
|
|
/// Because the lowering pipeline is intended to parse and present
|
|
/// errors on the entire file each run,
|
|
/// this would represent a bug in the system,
|
|
/// and so we ought to fail.
|
|
///
|
|
/// Downstream parsers ought to fail for their own reasons as well,
|
|
/// but this provides an extra layer of protection for _anything_ that
|
|
/// happens to read XML files.
|
|
///
|
|
/// For an example of a situation where we may not wish to fail,
|
|
/// see [`PartialXirfAcceptor`].
|
|
#[derive(Debug, PartialEq, Eq, Default)]
|
|
pub struct FullXirfAcceptor;
|
|
|
|
impl XirfAcceptor for FullXirfAcceptor {
|
|
fn is_accepting<const MAX_DEPTH: usize, T, SA, A: XirfAcceptor>(
|
|
st: &XirToXirf<MAX_DEPTH, T, SA, A>,
|
|
) -> bool
|
|
where
|
|
SA: FlatAttrParseState<MAX_DEPTH>,
|
|
T: TextType,
|
|
{
|
|
matches!(st, XirToXirf::Done)
|
|
}
|
|
}
|
|
|
|
impl Display for FullXirfAcceptor {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
write!(f, "accepting only full documents")
|
|
}
|
|
}
|
|
|
|
/// Acceptor for either full _or_ partially parsed XML documents for
|
|
/// [`XirToXirf`].
|
|
///
|
|
/// This acceptor is intended to be used when parsing the entirety of a
|
|
/// [XIR](XirToken) stream is not desirable;
|
|
/// it allows parsing to be completed at a node boundary
|
|
/// (when a node is expected).
|
|
/// This acceptor builds on the behavior of [`FullXirfAcceptor`],
|
|
/// and so will also accept all fully parsed documents.
|
|
///
|
|
/// For example,
|
|
/// when reading object files in `tameld`,
|
|
/// the linker is concerned only with header information;
|
|
/// the remainder of the XML document does not contain useful
|
|
/// information and would be wasteful to parse.
|
|
/// In that case,
|
|
/// we rely on downstream parsers to determine whether the document
|
|
/// has been sufficiently parsed.
|
|
///
|
|
/// This acceptor provides one weaker guarantee:
|
|
/// that parsing has _at least_ completed parsing a node,
|
|
/// such as an element.
|
|
/// Parsing must complete at a node boundary,
|
|
/// and so cannot halt in the middle of attribute parsing for an
|
|
/// element,
|
|
/// for example.
|
|
#[derive(Debug, PartialEq, Eq, Default)]
|
|
pub struct PartialXirfAcceptor;
|
|
|
|
impl XirfAcceptor for PartialXirfAcceptor {
|
|
fn is_accepting<const MAX_DEPTH: usize, T, SA, A: XirfAcceptor>(
|
|
st: &XirToXirf<MAX_DEPTH, T, SA, A>,
|
|
) -> bool
|
|
where
|
|
SA: FlatAttrParseState<MAX_DEPTH>,
|
|
T: TextType,
|
|
{
|
|
FullXirfAcceptor::is_accepting(st)
|
|
|| matches!(st, XirToXirf::NodeExpected)
|
|
}
|
|
}
|
|
|
|
impl Display for PartialXirfAcceptor {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
write!(f, "accepting partial documents at node boundaries")
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<const MAX_DEPTH: usize, T, SA, A: XirfAcceptor> Display
|
|
for XirToXirf<MAX_DEPTH, T, SA, A>
|
|
where
|
|
SA: FlatAttrParseState<MAX_DEPTH>,
|
|
T: TextType,
|
|
{
|
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
use XirToXirf::*;
|
|
|
|
match self {
|
|
PreRoot(_) => write!(f, "expecting document root"),
|
|
NodeExpected => write!(f, "expecting a node"),
|
|
AttrExpected(sa) => Display::fmt(sa, f),
|
|
Done => write!(f, "done parsing document root"),
|
|
}?;
|
|
|
|
// e.g. ", accepting ..."
|
|
write!(f, ", ")?;
|
|
Display::fmt(&A::default(), f)
|
|
}
|
|
}
|
|
|
|
impl<const MAX_DEPTH: usize, T, SA, A: XirfAcceptor>
|
|
XirToXirf<MAX_DEPTH, T, SA, A>
|
|
where
|
|
SA: FlatAttrParseState<MAX_DEPTH>,
|
|
T: TextType,
|
|
{
|
|
/// Parse a token while in a state expecting a node.
|
|
fn parse_node(
|
|
tok: <Self as ParseState>::Token,
|
|
stack: &mut ElementStack<MAX_DEPTH>,
|
|
) -> TransitionResult<Self> {
|
|
use XirToXirf::{AttrExpected, Done, NodeExpected};
|
|
|
|
let depth = Depth(stack.len());
|
|
|
|
match tok {
|
|
XirToken::Open(qname, span) if stack.len() == MAX_DEPTH => {
|
|
Transition(NodeExpected).err(XirToXirfError::MaxDepthExceeded {
|
|
open: (qname, span.tag_span()),
|
|
max: Depth(MAX_DEPTH),
|
|
})
|
|
}
|
|
|
|
XirToken::Open(qname, span) => {
|
|
stack.push((qname, span.tag_span()));
|
|
|
|
// Delegate to the attribute parser until it is complete.
|
|
Transition(AttrExpected(SA::default()))
|
|
.ok(XirfToken::Open(qname, span, depth))
|
|
}
|
|
|
|
XirToken::Close(close_oqname, close_span) => {
|
|
match (close_oqname, stack.pop()) {
|
|
(_, None) => unreachable!("parser should be in Done state"),
|
|
|
|
(Some(qname), Some((open_qname, open_span)))
|
|
if qname != open_qname =>
|
|
{
|
|
Transition(NodeExpected).err(
|
|
XirToXirfError::UnbalancedTag {
|
|
open: (open_qname, open_span),
|
|
close: (qname, close_span.tag_span()),
|
|
},
|
|
)
|
|
}
|
|
|
|
// Final closing tag (for root node) completes the document.
|
|
(..) if stack.is_empty() => Transition(Done).ok(
|
|
XirfToken::Close(close_oqname, close_span, Depth(0)),
|
|
),
|
|
|
|
(..) => {
|
|
let depth = stack.len();
|
|
|
|
Transition(NodeExpected).ok(XirfToken::Close(
|
|
close_oqname,
|
|
close_span,
|
|
Depth(depth),
|
|
))
|
|
}
|
|
}
|
|
}
|
|
|
|
XirToken::Comment(sym, span) => Transition(NodeExpected)
|
|
.ok(XirfToken::Comment(sym, span, depth)),
|
|
|
|
XirToken::Text(sym, span) => Transition(NodeExpected)
|
|
.ok(XirfToken::Text(T::from(Text(sym, span)), depth)),
|
|
|
|
XirToken::CData(sym, span) => {
|
|
Transition(NodeExpected).ok(XirfToken::CData(sym, span, depth))
|
|
}
|
|
|
|
// We should transition to `State::Attr` before encountering any
|
|
// of these tokens.
|
|
XirToken::AttrName(..)
|
|
| XirToken::AttrValue(..)
|
|
| XirToken::AttrValueFragment(..) => {
|
|
unreachable!("attribute token in NodeExpected state: {tok:?}")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Produce a streaming parser lowering a XIR [`TokenStream`] into a XIRF
|
|
/// stream.
|
|
pub fn parse<const MAX_DEPTH: usize, T: TextType>(
|
|
toks: impl TokenStream,
|
|
) -> impl Iterator<Item = ParsedResult<XirToXirf<MAX_DEPTH, T>>> {
|
|
XirToXirf::<MAX_DEPTH, T>::parse(toks)
|
|
}
|
|
|
|
/// Parsing error from [`XirToXirf`].
|
|
#[derive(Debug, Eq, PartialEq)]
|
|
pub enum XirToXirfError {
|
|
/// Opening root element tag was expected.
|
|
RootOpenExpected(XirToken),
|
|
|
|
/// Opening tag exceeds the maximum nesting depth for this parser.
|
|
MaxDepthExceeded { open: (QName, Span), max: Depth },
|
|
|
|
/// The closing tag does not match the opening tag at the same level of
|
|
/// nesting.
|
|
UnbalancedTag {
|
|
open: (QName, Span),
|
|
close: (QName, Span),
|
|
},
|
|
|
|
/// Error from the attribute parser.
|
|
AttrError(AttrParseError),
|
|
}
|
|
|
|
impl Display for XirToXirfError {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
use XirToXirfError::*;
|
|
|
|
match self {
|
|
RootOpenExpected(_tok) => {
|
|
write!(f, "missing opening root element",)
|
|
}
|
|
|
|
MaxDepthExceeded {
|
|
open: (_name, _),
|
|
max,
|
|
} => {
|
|
write!(
|
|
f,
|
|
"maximum XML element nesting depth of `{max}` exceeded"
|
|
)
|
|
}
|
|
|
|
UnbalancedTag {
|
|
open: (open_name, _),
|
|
close: (_close_name, _),
|
|
} => {
|
|
write!(f, "expected closing tag for `{open_name}`")
|
|
}
|
|
|
|
AttrError(e) => Display::fmt(e, f),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Error for XirToXirfError {
|
|
fn source(&self) -> Option<&(dyn Error + 'static)> {
|
|
match self {
|
|
Self::AttrError(e) => Some(e),
|
|
_ => None,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Diagnostic for XirToXirfError {
|
|
fn describe(&self) -> Vec<AnnotatedSpan> {
|
|
use XirToXirfError::*;
|
|
|
|
match self {
|
|
RootOpenExpected(tok) => {
|
|
// TODO: Should the span be the first byte,
|
|
// or should we delegate that question to an e.g. `SpanLike`?
|
|
tok.span()
|
|
.error("an opening root node was expected here")
|
|
.into()
|
|
}
|
|
|
|
MaxDepthExceeded {
|
|
open: (_, span),
|
|
max,
|
|
} => span
|
|
.error(format!(
|
|
"this opening tag increases the level of nesting \
|
|
past the limit of {max}"
|
|
))
|
|
.into(),
|
|
|
|
UnbalancedTag {
|
|
open: (open_name, open_span),
|
|
close: (_close_name, close_span),
|
|
} => {
|
|
// TODO: hint saying that the nesting could be wrong, etc;
|
|
// we can't just suggest a replacement,
|
|
// since that's not necessarily the problem
|
|
vec![
|
|
open_span
|
|
.note(format!("element `{open_name}` is opened here")),
|
|
// No need to state the close name since the source line
|
|
// will be highlighted by the diagnostic message.
|
|
close_span.error(format!("expected `</{open_name}>`")),
|
|
]
|
|
}
|
|
|
|
AttrError(e) => e.describe(),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl From<AttrParseError> for XirToXirfError {
|
|
fn from(e: AttrParseError) -> Self {
|
|
Self::AttrError(e)
|
|
}
|
|
}
|
|
|
|
/// Lower a [`XirfToken`] stream into a [`XirToken`] stream.
|
|
///
|
|
/// This is the dual of [`XirToXirf`],
|
|
/// and is intended to be used when the system _generates_ XML.
|
|
/// If you do not need any features of XIRF,
|
|
/// and aren't using any operation that produces it,
|
|
/// then you may also skip a step and just emit XIR to avoid having to
|
|
/// perform this lowering operation.
|
|
#[derive(Debug, PartialEq, Eq)]
|
|
pub enum XirfToXir<T: TextType> {
|
|
Ready(PhantomData<T>),
|
|
AttrVal(PhantomData<T>),
|
|
}
|
|
|
|
impl<T: TextType> Default for XirfToXir<T> {
|
|
fn default() -> Self {
|
|
Self::Ready(Default::default())
|
|
}
|
|
}
|
|
|
|
impl<T: TextType> Display for XirfToXir<T> {
    /// Describe the lowering operation;
    ///   the output is the same in every state.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        f.write_str("translating XIRF to XIR")
    }
}
|
|
|
|
impl<T: TextType> ParseState for XirfToXir<T> {
|
|
type Token = XirfToken<T>;
|
|
type Object = XirToken;
|
|
type Error = Infallible;
|
|
|
|
fn parse_token(
|
|
self,
|
|
tok: Self::Token,
|
|
_: NoContext,
|
|
) -> TransitionResult<Self::Super> {
|
|
use XirToken as Xir;
|
|
use XirfToXir::*;
|
|
use XirfToken as Xirf;
|
|
|
|
macro_rules! to {
|
|
($tok:expr) => {
|
|
Transition(self).ok($tok)
|
|
};
|
|
}
|
|
|
|
match tok {
|
|
Xirf::Open(qname, ospan, _) => to!(Xir::Open(qname, ospan)),
|
|
Xirf::Close(qname, cspan, _) => to!(Xir::Close(qname, cspan)),
|
|
Xirf::Attr(attr) => match self {
|
|
Self::Ready(p) => Transition(AttrVal(p))
|
|
.ok(Xir::AttrName(attr.name(), attr.attr_span().key_span()))
|
|
.with_lookahead(Xirf::Attr(attr)),
|
|
Self::AttrVal(p) => Transition(Ready(p)).ok(Xir::AttrValue(
|
|
attr.value(),
|
|
attr.attr_span().value_span(),
|
|
)),
|
|
},
|
|
Xirf::Comment(sym, span, _) => to!(Xir::Comment(sym, span)),
|
|
Xirf::Text(x, _) => match x.into() {
|
|
Text(sym, span) => to!(Xir::Text(sym, span)),
|
|
},
|
|
Xirf::CData(sym, span, _) => to!(Xir::CData(sym, span)),
|
|
}
|
|
}
|
|
|
|
fn is_accepting(&self, _: &Self::Context) -> bool {
|
|
matches!(self, Self::Ready(_))
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
pub mod test;
|