tame/tamer/src/xir/writer.rs

575 lines
18 KiB
Rust

// XIR writer
//
// Copyright (C) 2014-2023 Ryan Specialty, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//! Lower XIR stream into an XML byte stream via [`Write`].
use super::{DefaultEscaper, Escaper};
use super::{Error as XirError, QName, Token, TokenStream};
use crate::sym::GlobalSymbolResolve;
use std::io::{Error as IoError, Write};
use std::result;
/// Result type used by the XML writer.
///
/// The success type defaults to [`WriterState`],
/// and the failure type is always this module's [`Error`].
pub type Result<T = WriterState> = result::Result<T, Error>;
/// Failures that the XML writer can report.
#[derive(Debug)]
pub enum Error {
/// An I/O error,
/// such as one produced while writing to the sink
/// (converted via `From<IoError>`).
Io(IoError),
/// A wrapped XIR error
/// (converted via `From<XirError>`).
Xir(XirError),
/// A token was provided that has no transition from the given
/// [`WriterState`].
///
/// The `String` holds the `Debug` rendering of the offending token,
/// and the [`WriterState`] is the state the writer was in when the
/// token was encountered.
// TODO: No String here, since then we cannot resolve inner symbols for
// display to the user
UnexpectedToken(String, WriterState),
/// A token/state combination that is planned for but not yet
/// implemented.
///
/// The fields have the same meaning as those of
/// [`Error::UnexpectedToken`].
Todo(String, WriterState),
}
/// User-facing rendering of a writer [`Error`].
///
/// Wrapped I/O and XIR errors are rendered using their own
/// [`std::fmt::Display`] implementations;
/// the remaining variants describe the offending token and the
/// [`WriterState`] in which it was encountered.
impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // The trait is named explicitly so that resolution of `fmt`
            // does not depend on which formatting traits (if any) happen
            // to be imported into this module:
            // method syntax fails to resolve when none are in scope and
            // is ambiguous when both `Display` and `Debug` are.
            Self::Io(e) => std::fmt::Display::fmt(e, f),
            Self::Xir(e) => std::fmt::Display::fmt(e, f),

            Self::UnexpectedToken(tok, state) => write!(
                f,
                "Invalid token {} at XML writer state {:?}",
                tok, state
            ),

            Self::Todo(tok, state) => write!(
                f,
                "Unexpected {} at XML writer state {:?}; TAMER intends to \
                 support this operation, but it was not yet thought to be \
                 needed. More development is needed to support it, but \
                 hopefully very little. However, if you hit this during \
                 normal use of TAME, then this represents a bug.",
                tok, state
            ),
        }
    }
}
impl PartialEq for Error {
fn eq(&self, other: &Self) -> bool {
match (self, other) {
(Error::Io(_), _) => false,
(Error::Xir(a), Error::Xir(b)) => a == b,
(
Error::UnexpectedToken(aa, ab),
Error::UnexpectedToken(ba, bb),
) => aa == ba && ab == bb,
(Error::Todo(aa, ab), Error::Todo(ba, bb)) => aa == ba && ab == bb,
_ => false,
}
}
}
/// Exposes the wrapped error,
/// if any,
/// as the source of a writer [`Error`].
impl std::error::Error for Error {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::Io(io_err) => Some(io_err),
            Self::Xir(xir_err) => Some(xir_err),

            // These variants originate in the writer itself and wrap no
            // underlying error.
            Self::UnexpectedToken(..) | Self::Todo(..) => None,
        }
    }
}
impl From<IoError> for Error {
fn from(e: IoError) -> Self {
Self::Io(e)
}
}
impl From<XirError> for Error {
fn from(e: XirError) -> Self {
Self::Xir(e)
}
}
/// State of the XML writer between write operations.
///
/// Each write consumes the state produced by the previous write and
/// yields the state to hand to the next one.
/// The default state is [`WriterState::NodeExpected`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum WriterState {
    /// A node is expected to be output next.
    ///
    /// This is the default (initial) state.
    #[default]
    NodeExpected,
    /// A node is currently being output and has not yet been closed.
    NodeOpen,
    /// Cursor is adjacent to an attribute name within an element.
    AttrNameAdjacent,
    /// Cursor is adjacent to an attribute fragment within an element.
    AttrFragmentAdjacent,
}
impl WriterState {
    /// Finish a pending opening tag by writing `>` to `sink`.
    ///
    /// Output is produced only in the [`WriterState::NodeOpen`] state;
    /// every other state writes nothing.
    /// A failed write is returned as an error.
    #[inline]
    fn close_tag_if_open<W: Write>(&self, sink: &mut W) -> Result<()> {
        if matches!(self, Self::NodeOpen) {
            sink.write_all(b">")?;
        }

        Ok(())
    }
}
/// Write an XML representation.
///
/// This trait is intended for use with an [XIR](super) [`Token`] stream.
/// It uses a finite state machine (FSM),
///   where states are represented by [`WriterState`],
///   to avoid lookahead requirements.
pub trait XmlWriter<S: Escaper = DefaultEscaper>: Sized {
    /// Write an XML representation of `self` into `sink`.
    ///
    /// The writer is a state machine:
    ///   `prev_state` describes what the previous write left behind
    ///   (for example, whether a tag still needs to be closed),
    ///   and the returned [`WriterState`] describes what this write
    ///   leaves behind
    ///   (which may be the same as `prev_state`).
    /// The returned state must be passed to the next `write` in order to
    ///   produce valid output.
    ///
    /// If you have a series of writes to perform,
    ///   consider using an [`Iterator`] implementing [`XmlWriter`].
    #[must_use = "Write operation may fail"]
    fn write<W: Write>(
        self,
        sink: &mut W,
        prev_state: WriterState,
        escaper: &S,
    ) -> Result;

    /// Write into a freshly allocated buffer,
    ///   returning that buffer along with the resulting writer state.
    ///
    /// See [`write`](XmlWriter::write) for more information.
    ///
    /// This exists primarily for tests;
    ///   prefer [`write`](XmlWriter::write) unless you _really_ need a
    ///   new owned `Vec<u8>`.
    #[cfg(test)]
    fn write_new(
        self,
        prev_state: WriterState,
        escaper: &S,
    ) -> Result<(Vec<u8>, WriterState)> {
        let mut output: Vec<u8> = Vec::new();

        self.write(&mut output, prev_state, escaper)
            .map(|next_state| (output, next_state))
    }
}
/// Writes a qualified name as `prefix:local`,
///   or simply `local` when the name has no prefix.
///
/// The name is written as-is
///   (the escaper is unused),
///   and the writer state is returned unchanged.
impl<S: Escaper> XmlWriter<S> for QName {
    #[inline]
    fn write<W: Write>(
        self,
        sink: &mut W,
        prev_state: WriterState,
        _escaper: &S,
    ) -> Result {
        // The prefix and its `:` separator appear only for prefixed names.
        if let Some(ns_prefix) = self.prefix() {
            sink.write_all(ns_prefix.lookup_str().as_bytes())?;
            sink.write_all(b":")?;
        }

        let local = self.local_name();
        sink.write_all(local.lookup_str().as_bytes())?;

        Ok(prev_state)
    }
}
impl<S: Escaper> XmlWriter<S> for &Token {
fn write<W: Write>(
self,
sink: &mut W,
prev_state: WriterState,
escaper: &S,
) -> Result {
type W = WriterState; // More concise
use Token::*;
match (self, prev_state) {
(Open(name, _), W::NodeExpected | W::NodeOpen) => {
// If a node is still open, then we are a child.
prev_state.close_tag_if_open(sink)?;
sink.write_all(b"<")?;
name.write(sink, prev_state, escaper)?;
Ok(W::NodeOpen)
}
(Close(None, _), W::NodeOpen) => {
sink.write_all(b"/>")?;
Ok(W::NodeExpected)
}
(Close(Some(name), _), W::NodeExpected | W::NodeOpen) => {
// If open, we're going to produce an element of the form
// `<foo></foo>`.
prev_state.close_tag_if_open(sink)?;
sink.write_all(b"</")?;
name.write(sink, prev_state, escaper)?;
sink.write_all(b">")?;
Ok(W::NodeExpected)
}
(AttrName(name, _), W::NodeOpen) => {
sink.write_all(b" ")?;
name.write(sink, prev_state, escaper)?;
Ok(W::AttrNameAdjacent)
}
(AttrValue(value, _), W::AttrNameAdjacent) => {
sink.write_all(b"=\"")?;
sink.write_all(escaper.escape(*value).lookup_str().as_bytes())?;
sink.write_all(b"\"")?;
Ok(W::NodeOpen)
}
(AttrValue(value, _), W::AttrFragmentAdjacent) => {
sink.write_all(escaper.escape(*value).lookup_str().as_bytes())?;
sink.write_all(b"\"")?;
Ok(W::NodeOpen)
}
(AttrValueFragment(value, _), W::AttrNameAdjacent) => {
sink.write_all(b"=\"")?;
sink.write_all(escaper.escape(*value).lookup_str().as_bytes())?;
Ok(W::AttrFragmentAdjacent)
}
(AttrValueFragment(value, _), W::AttrFragmentAdjacent) => {
sink.write_all(escaper.escape(*value).lookup_str().as_bytes())?;
Ok(W::AttrFragmentAdjacent)
}
// TODO: We have no way of knowing if text should be formatted
// as CData,
// which may also be beneficial to avoid escaping if we
// haven't yet encountered the unescaped representation.
(Text(text, _), W::NodeExpected | W::NodeOpen) => {
prev_state.close_tag_if_open(sink)?;
sink.write_all(escaper.escape(*text).lookup_str().as_bytes())?;
Ok(W::NodeExpected)
}
// XXX: While we currently only output comments that have been
// _read_ as comments,
// that will not always be the case and we must escape `--`!
(Comment(comment, _), W::NodeExpected | W::NodeOpen) => {
prev_state.close_tag_if_open(sink)?;
sink.write_all(b"<!--")?;
sink.write_all(comment.lookup_str().as_bytes())?;
sink.write_all(b"-->")?;
Ok(W::NodeExpected)
}
// As-of-yet unsupported operations that weren't needed at the
// time of writing, but were planned for in the design of Xir.
(invalid @ AttrName(_, _), W::AttrNameAdjacent) => {
Err(Error::Todo(format!("{:?}", invalid), prev_state))
}
// Everything else represents either an invalid state transition
// that would produce invalid XML, or something that we forgot
// to account for in the above error.
(invalid, _) => Err(Error::UnexpectedToken(
format!("{:?}", invalid),
prev_state,
)),
}
}
}
/// Writes every token of a stream in order,
///   threading the writer state from one token to the next.
///
/// Writing stops at the first token that fails,
///   and that error is returned.
/// Otherwise the state following the final token is returned
///   (or `initial_state` if the stream is empty).
impl<I: TokenStream, S: Escaper> XmlWriter<S> for I {
    fn write<W: Write>(
        self,
        sink: &mut W,
        initial_state: WriterState,
        escaper: &S,
    ) -> Result {
        let mut state = initial_state;

        for tok in self {
            state = tok.write(sink, state, escaper)?;
        }

        Ok(state)
    }
}
// Unit tests for the XML writer.
//
// Each test writes one or more tokens starting from a chosen `WriterState`
// and then checks both the bytes that were produced and the state that the
// writer ended in.
#[cfg(test)]
mod test {
use std::borrow::Cow;
use super::*;
use crate::{
convert::ExpectInto,
span::{dummy::DUMMY_SPAN, Span},
sym::GlobalSymbolIntern,
xir::{
error::SpanlessError,
test::{close, close_empty, open},
QName,
},
};
// Tests return this so that writer errors can be propagated with `?`.
type TestResult = std::result::Result<(), Error>;
// Escaper stand-in that makes escaping visible in the output,
// allowing tests to verify which values the writer chooses to escape.
#[derive(Debug, Default)]
struct MockEscaper {}
// Simply adds ":ESC" as a suffix to the provided byte slice.
impl Escaper for MockEscaper {
fn escape_bytes(value: &[u8]) -> Cow<[u8]> {
let mut esc = value.to_owned();
esc.extend_from_slice(b":ESC");
Cow::Owned(esc)
}
// The writer only ever escapes,
// so reaching this is treated as a bug in the test.
fn unescape_bytes(
_: &[u8],
) -> result::Result<Cow<[u8]>, SpanlessError> {
unreachable!("Writer should not be unescaping!")
}
}
// Span used for every token in these tests.
const S: Span = DUMMY_SPAN;
// An opening tag for an unprefixed name is written as `<name`,
// without the closing `>`, leaving the node open.
#[test]
fn writes_beginning_node_tag_without_prefix() -> TestResult {
let name = QName::new_local("no-prefix".unwrap_into());
let result = open(name, S)
.write_new(Default::default(), &MockEscaper::default())?;
assert_eq!(result.0, b"<no-prefix");
assert_eq!(result.1, WriterState::NodeOpen);
Ok(())
}
// A prefixed name is written as `prefix:local`.
#[test]
fn writes_beginning_node_tag_with_prefix() -> TestResult {
let name = ("prefix", "element-name");
let result = open(name, S)
.write_new(Default::default(), &MockEscaper::default())?;
assert_eq!(result.0, b"<prefix:element-name");
assert_eq!(result.1, WriterState::NodeOpen);
Ok(())
}
// Opening a node while another is still open first emits the `>` that
// closes the previous opening tag.
#[test]
fn closes_open_node_when_opening_another() -> TestResult {
let name = ("p", "another-element");
let result = open(name, S)
.write_new(WriterState::NodeOpen, &MockEscaper::default())?;
assert_eq!(result.0, b"><p:another-element");
assert_eq!(result.1, WriterState::NodeOpen);
Ok(())
}
// A nameless close on an open node produces a self-closing tag.
#[test]
fn closes_open_node_as_empty_element() -> TestResult {
let result = close_empty(S)
.write_new(WriterState::NodeOpen, &MockEscaper::default())?;
assert_eq!(result.0, b"/>");
assert_eq!(result.1, WriterState::NodeExpected);
Ok(())
}
// A named close when no tag is pending produces only the closing tag.
#[test]
fn closing_tag_when_node_expected() -> TestResult {
let name = ("a", "closed-element");
let result = close(Some(name), S)
.write_new(WriterState::NodeExpected, &MockEscaper::default())?;
assert_eq!(result.0, b"</a:closed-element>");
assert_eq!(result.1, WriterState::NodeExpected);
Ok(())
}
// Does _not_ check that it's balanced. Not our job. In fact, we want
// to explicitly support outputting malformed XML.
#[test]
fn closes_open_node_with_closing_tag() -> TestResult {
let name = ("b", "closed-element");
let result = close(Some(name), S)
.write_new(WriterState::NodeOpen, &MockEscaper::default())?;
assert_eq!(result.0, b"></b:closed-element>");
assert_eq!(result.1, WriterState::NodeExpected);
Ok(())
}
// Attribute names are preceded by a single space and are not escaped
// (note the absence of the mock escaper's ":ESC" suffix).
#[test]
fn writes_attr_name_to_open_node() -> TestResult {
let name_ns = ("some", "attr").unwrap_into();
let name_local = "nons".unwrap_into();
// Namespace prefix
let result = Token::AttrName(name_ns, S)
.write_new(WriterState::NodeOpen, &MockEscaper::default())?;
assert_eq!(result.0, b" some:attr");
assert_eq!(result.1, WriterState::AttrNameAdjacent);
// No namespace prefix
let result = Token::AttrName(name_local, S)
.write_new(WriterState::NodeOpen, &MockEscaper::default())?;
assert_eq!(result.0, b" nons");
assert_eq!(result.1, WriterState::AttrNameAdjacent);
Ok(())
}
// An attribute value following a name is escaped and wrapped as `="..."`,
// returning to the open-node state.
#[test]
fn writes_attr_value_when_adjacent_to_attr() -> TestResult {
let value = "test str".intern();
let result = Token::AttrValue(value, S).write_new(
WriterState::AttrNameAdjacent,
&MockEscaper::default(),
)?;
assert_eq!(result.0, br#"="test str:ESC""#);
assert_eq!(result.1, WriterState::NodeOpen);
Ok(())
}
// Fragments are escaped individually and concatenated within a single
// pair of quotes; the final `AttrValue` terminates the value.
#[test]
fn writes_attr_value_consisting_of_fragments() -> TestResult {
let value_left = "left".intern();
let value_mid = " mid".intern();
let value_right = " right".intern();
let result = vec![
Token::AttrValueFragment(value_left, S),
Token::AttrValueFragment(value_mid, S),
Token::AttrValue(value_right, S),
]
.into_iter()
.write_new(WriterState::AttrNameAdjacent, &MockEscaper::default())?;
assert_eq!(result.0, br#"="left:ESC mid:ESC right:ESC""#);
assert_eq!(result.1, WriterState::NodeOpen);
Ok(())
}
// Text is escaped, and closes a pending opening tag if there is one.
#[test]
fn writes_text() -> TestResult {
let text = "test unescaped".intern();
// When a node is expected.
let result = Token::Text(text, S)
.write_new(WriterState::NodeExpected, &MockEscaper::default())?;
assert_eq!(result.0, b"test unescaped:ESC");
assert_eq!(result.1, WriterState::NodeExpected);
// When a node is still open.
let result = Token::Text(text, S)
.write_new(WriterState::NodeOpen, &MockEscaper::default())?;
assert_eq!(result.0, b">test unescaped:ESC");
assert_eq!(result.1, WriterState::NodeExpected);
Ok(())
}
// Comments are written verbatim between `<!--` and `-->`.
#[test]
fn writes_comment() -> TestResult {
// Just to be sure it's not trying to escape when we say it
// shouldn't, we include a character that must otherwise be escaped.
let comment = "comment > escaped".intern();
// When a node is expected.
let result = Token::Comment(comment, S)
.write_new(WriterState::NodeExpected, &MockEscaper::default())?;
assert_eq!(result.0, b"<!--comment > escaped-->");
assert_eq!(result.1, WriterState::NodeExpected);
// When a node is still open.
let result = Token::Comment(comment, S)
.write_new(WriterState::NodeOpen, &MockEscaper::default())?;
assert_eq!(result.0, b"><!--comment > escaped-->");
assert_eq!(result.1, WriterState::NodeExpected);
Ok(())
}
// An attribute value with no preceding attribute name has no valid
// transition, and the error reports the state the writer was in.
#[test]
fn unsupported_transition_results_in_error() -> TestResult {
assert!(matches!(
Token::AttrValue("".intern(), S).write(
&mut vec![],
WriterState::NodeExpected,
&MockEscaper::default()
),
Err(Error::UnexpectedToken(_, WriterState::NodeExpected)),
));
Ok(())
}
// The cherry on top, just to demonstrate how this will be used in
// practice.
#[test]
fn test_valid_sequence_of_tokens() -> TestResult {
let root: QName = ("r", "root").unwrap_into();
let result = vec![
open(root, S),
Token::AttrName(("an", "attr").unwrap_into(), S),
Token::AttrValue("value".intern(), S),
Token::Text("text".intern(), S),
open(("c", "child"), S),
close_empty(S),
close(Some(root), S),
]
.into_iter()
.write_new(Default::default(), &MockEscaper::default())?;
assert_eq!(
result.0,
br#"<r:root an:attr="value:ESC">text:ESC<c:child/></r:root>"#
);
assert_eq!(result.1, WriterState::NodeExpected);
Ok(())
}
}