tamer: frontend: Begin basic XML parsing

The first step in the process is to emit the raw XML events that can then be
immediately output again to echo the results into another file.  This will
then allow us to begin parsing the input incrementally, and begin to morph
the output into a real `xmlo` file.
main
Mike Gerwitz 2021-07-27 00:28:57 -04:00
parent d9dcfe8777
commit ca6ef3ed36
5 changed files with 200 additions and 22 deletions

View File

@ -32,7 +32,7 @@ use std::path::Path;
#[cfg(feature = "wip-frontends")]
use {
std::io::BufReader,
tamer::frontend::{FrontendParser, XmlFrontendParser},
tamer::frontend::{FrontendEvent, FrontendParser, XmlFrontendParser},
tamer::fs::File,
};
@ -64,7 +64,14 @@ pub fn main() -> Result<(), Box<dyn Error>> {
let file: BufReader<fs::File> = File::open(source)?;
let mut parser = XmlFrontendParser::new(file);
parser.parse_next()?;
// Parse all the way through, but don't do anything with it
// yet.
loop {
match parser.parse_next()? {
FrontendEvent::Eof => break,
_ => continue,
}
}
}
fs::copy(source, dest)?;

View File

@ -49,7 +49,8 @@ mod parser;
mod xml;
pub use parser::{
FrontendError, FrontendEvent, FrontendParser, FrontendResult, Token,
ClosedByteInterval, FrontendError, FrontendEvent, FrontendParser,
FrontendResult, Token,
};
pub use xml::XmlFrontendParser;

View File

@ -21,9 +21,19 @@
//!
//! See the [parent module](super) for more information.
use std::{borrow::Cow, fmt::Display, num::NonZeroUsize};
use std::{borrow::Cow, fmt::Display};
/// Recovering, zero-copy, scannerless parser.
///
/// Note that the lifetime exists on this trait due to the lack of GATs,
/// which prevents us from having a lifetime tied to `parse_next`;
/// this is the same problem that we have with `Iterator`.
/// An alternative would be to forgo a trait altogether for parsers,
/// but that only pushes the problem up the chain.
/// Remember that the parser produces short-lived tokens that are intended
/// to be immediately lowered,
/// and this problem doesn't exist at lower levels where data are owned
/// by a given IR.
pub trait FrontendParser<'l, T, E> {
/// Human-readable short description of parser.
///
@ -57,13 +67,24 @@ pub trait FrontendParser<'l, T, E> {
/// indicates that the caller should take special care in
/// determining whether parsing should be considered to be a
/// failure.
fn parse_next(&mut self) -> FrontendResult<FrontendEvent<'l, T, E>, E>;
fn parse_next(&'l mut self) -> FrontendResult<FrontendEvent<'l, T, E>, E>;
}
/// Raw input string associated with a token.
///
/// The bytes are held in a [`Cow`], so a lexeme may either borrow its
/// data or own it.
#[derive(Debug, PartialEq, Eq)]
pub struct Lexeme<'a>(Cow<'a, [u8]>);
/// Inclusive range of source bytes associated with a token.
///
/// Both endpoints belong to the interval:
/// the first field is the lower endpoint and the second field is the
/// upper endpoint.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct ClosedByteInterval<T: Copy = usize>(pub T, pub T);

impl<T: Copy> From<(T, T)> for ClosedByteInterval<T> {
    /// Build an interval from a `(lower, upper)` pair of endpoints.
    fn from((lower, upper): (T, T)) -> Self {
        ClosedByteInterval(lower, upper)
    }
}
/// A lexeme combined with a type (kind) and location.
///
/// The `interval` represents the starting and ending offset, inclusive, of
@ -80,7 +101,7 @@ pub struct Token<'l, T> {
/// The token kind represents the parsed information and should always
/// be used in place of the lexeme (which may not be available),
/// unless referring back to the source input.
kind: T,
pub kind: T,
/// Raw input from which the token was generated.
///
@ -92,14 +113,18 @@ pub struct Token<'l, T> {
/// a lexeme may be available only immediately after a token is
/// emitted,
/// unless the caller wishes to copy its value.
lexeme: Option<Lexeme<'l>>,
pub lexeme: Option<Lexeme<'l>>,
/// Starting and ending offset of the lexeme, inclusive.
///
/// An interval may not be available if a token was generated by the
/// compiler in a manner that is not associated with any source
/// input.
interval: Option<(usize, NonZeroUsize)>,
///
/// A note on terminology: we use "interval" instead of "span" here,
/// because the latter is intended to hold slightly different data as
/// part of a lower-level IR.
pub interval: Option<ClosedByteInterval>,
}
/// Result of attempting to parse input for the next token.
@ -128,7 +153,7 @@ pub enum FrontendEvent<'l, T, E> {
///
/// Note that recovery tokens may not have interval information if
/// their source input is not sensible.
interval: (usize, usize),
interval: ClosedByteInterval,
/// Zero or more tokens that may be substituted in place of the
/// erroneous input in an attempt to continue parsing.
@ -156,7 +181,7 @@ pub enum FrontendError<E> {
/// Starting and ending byte offsets of source input that produced
/// the error.
interval: (usize, usize),
interval: ClosedByteInterval,
},
/// EOF reached with recoverable errors.

View File

@ -19,8 +19,13 @@
//! XML frontend for the TAME programming language.
use super::{FrontendEvent, FrontendParser, FrontendResult};
use quick_xml::{Error as XmlError, Reader as XmlReader};
use super::{
ClosedByteInterval, FrontendError, FrontendEvent, FrontendParser,
FrontendResult, Token,
};
use crate::tpwrap::quick_xml::Error as XmlError;
use quick_xml::events::Event as XmlEvent;
use quick_xml::Reader as XmlReader;
use std::fmt::Display;
use std::io::BufRead;
@ -29,7 +34,14 @@ pub struct XmlFrontendParser<B>
where
B: BufRead,
{
_reader: XmlReader<B>,
/// XML parser.
reader: XmlReader<B>,
/// Buffer for all XML data besides namespaces.
buf: Vec<u8>,
/// Buffer for namespace data.
nsbuf: Vec<u8>,
}
impl<B> XmlFrontendParser<B>
@ -39,11 +51,29 @@ where
pub fn new(buf_read: B) -> Self {
let reader = XmlReader::from_reader(buf_read);
Self { _reader: reader }
Self {
reader,
buf: Vec::new(),
nsbuf: Vec::new(),
}
}
/// Determine the closed byte interval covered by a given [`XmlEvent`].
///
/// `pos_start` is the reader position before the event was read and
/// `pos_cur` is the reader position after it.
/// For [`XmlEvent::Empty`] the upper endpoint is `pos_cur - 1`,
/// which is the final byte of the event rather than the byte after it.
/// Every other event yields an interval whose endpoints are both
/// `pos_start`.
fn calc_interval(
    pos_start: usize,
    pos_cur: usize,
    ev: &XmlEvent,
) -> ClosedByteInterval {
    let pos_end = if matches!(ev, XmlEvent::Empty(_)) {
        pos_cur - 1
    } else {
        pos_start
    };

    ClosedByteInterval(pos_start, pos_end)
}
}
impl<'l, B> FrontendParser<'l, XmlToken, XmlFrontendError>
impl<'l, B> FrontendParser<'l, XmlToken<'l>, XmlFrontendError>
for XmlFrontendParser<B>
where
B: BufRead,
@ -52,29 +82,73 @@ where
"XML-based package specification language"
}
fn parse_next(&mut self) -> XmlFrontendResult<XmlFrontendEvent<'l>> {
Ok(FrontendEvent::Eof)
/// Read the next XML event from the underlying reader and wrap it as a
/// frontend event.
///
/// An [`XmlEvent::Eof`] becomes [`FrontendEvent::Eof`].
/// Any other event is emitted as a [`FrontendEvent::Token`] of kind
/// [`XmlToken::RawXmlEvent`] with no lexeme and an interval computed by
/// `calc_interval`.
/// A reader failure is returned as
/// [`FrontendError::UnrecoverableError`].
fn parse_next(&'l mut self) -> XmlFrontendResult<XmlFrontendEvent<'l>> {
let reader = &mut self.reader;
// Record the position before reading so that it can serve as the
// lower endpoint of the interval for whatever is read next.
let pos_start = reader.buffer_position();
reader
.read_namespaced_event(&mut self.buf, &mut self.nsbuf)
.map(|(ns, ev)| match ev {
// End of input produces no token.
XmlEvent::Eof => FrontendEvent::Eof,
// Everything else is passed through unmodified as a raw token.
_ => {
let interval = Some(Self::calc_interval(
pos_start,
reader.buffer_position(),
&ev,
));
FrontendEvent::Token(Token {
kind: XmlToken::RawXmlEvent((ns, ev)),
lexeme: None,
interval,
})
}
})
// NOTE(review): the upper endpoint below is the reader's current
// position, not `position - 1` as `calc_interval` uses for `Empty`
// events; confirm that this is intended for a closed interval.
.map_err(|e| FrontendError::UnrecoverableError {
source: XmlFrontendError::XmlError(e.into()),
interval: ClosedByteInterval(
pos_start,
reader.buffer_position(),
),
})
}
}
pub type XmlFrontendEvent<'l> = FrontendEvent<'l, XmlToken, XmlFrontendError>;
pub type XmlFrontendEvent<'l> =
FrontendEvent<'l, XmlToken<'l>, XmlFrontendError>;
pub enum XmlToken {}
/// Namespace of an XML event, as a borrowed byte slice.
type Namespace<'a> = &'a [u8];
/// An [`XmlEvent`] paired with its namespace, if it has one.
type NamespacedXmlEvent<'a> = (Option<Namespace<'a>>, XmlEvent<'a>);
/// Token kinds emitted by the XML frontend parser.
#[derive(Debug)]
pub enum XmlToken<'l> {
/// An XML event together with its optional namespace, wrapped without
/// any further interpretation.
RawXmlEvent(NamespacedXmlEvent<'l>),
}
/// Error type of the XML frontend parser.
#[derive(Debug, PartialEq)]
pub enum XmlFrontendError {
/// An error represented as an [`XmlError`],
/// such as a failure reported while reading XML events.
XmlError(XmlError),
}
impl Display for XmlFrontendError {
fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(fmt, "TODO fmt")
match self {
Self::XmlError(e) => e.fmt(fmt),
}
}
}
impl std::error::Error for XmlFrontendError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
None
match self {
Self::XmlError(e) => Some(e),
}
}
}
impl<E: Into<XmlError>> From<E> for XmlFrontendError {
fn from(err: E) -> Self {
Self::XmlError(err.into())
}
}

View File

@ -17,12 +17,20 @@
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
// NB: Due to the complexity and verbosity of mocking XML events,
// these tests are coupled with the XML parser.
// Care should be taken to try to mitigate minor changes to the library's
// output so as not to make these tests overly fragile.
use super::*;
type Sut<B> = XmlFrontendParser<B>;
// TODO: Just for initial testing; empty files shouldn't be valid, since
// they don't give the parser enough information as to what type of file it
// is.
#[test]
fn emits_eof() {
fn emits_eof_for_empty_file() {
let stub_data: &[u8] = &[];
let mut sut = Sut::new(stub_data);
@ -30,3 +38,66 @@ fn emits_eof() {
assert!(matches!(result, Ok(FrontendEvent::Eof)));
}
// Until the parser is complete, we need raw tokens so that we can echo them
// back out.
#[test]
fn produces_raw_xml_events_as_tokens() -> Result<(), Box<dyn std::error::Error>>
{
// A single self-closing element that declares a default namespace.
let stub_data: &[u8] = r#"<valid-xml xmlns="foons" />"#.as_bytes();
let mut sut = Sut::new(stub_data);
loop {
match sut.parse_next()? {
FrontendEvent::Token(Token {
kind: XmlToken::RawXmlEvent((ns, ev)),
lexeme: _,
interval,
}) => {
// Skip any event that has no associated namespace.
if ns.is_none() {
continue;
}
// Interval should be the starting byte offset to the offset
// of the final byte, not the byte after it.
assert!(matches!(
interval,
Some(ClosedByteInterval(0, hi))
if hi == stub_data.len() - 1
));
// The test ends once the self-closing element has been seen and
// its name verified.
if let XmlEvent::Empty(start) = ev {
assert_eq!(start.name(), b"valid-xml");
break;
}
}
// Anything that is not a raw XML token (EOF included) fails the
// test.
x => panic!("Unexpected: {:?}", x),
}
}
Ok(())
}
// A failure reported by the XML reader must surface as an unrecoverable
// frontend error whose interval begins where the failing input begins.
#[test]
fn produces_error_on_xml_parse_failure() {
    // A well-formed element followed by a comment that is never closed.
    let stub_data: &[u8] = b"<ok /><!-- EOF in comment";
    let mut sut = Sut::new(stub_data);

    // Consume successfully parsed events until the parser reports an
    // error; reaching EOF first means that no error was produced.
    let err = loop {
        match sut.parse_next() {
            Ok(FrontendEvent::Eof) => panic!("Expected error"),
            Ok(_) => continue,
            Err(err) => break err,
        }
    };

    // The error must begin at byte offset 6 and must not end before it
    // begins.
    let is_expected = matches!(
        err,
        FrontendError::UnrecoverableError {
            source: XmlFrontendError::XmlError(_),
            interval: ClosedByteInterval(lo, hi),
        } if lo == 6 && hi >= lo
    );

    if !is_expected {
        panic!("Error mismatch: {:?}", err);
    }
}