tamer: ir::xir::tree::parser_from: Yield parsed trees

Previously, `parser_from` was a simple wrapper around `parse`; now, this provides a more convenient API where `next` will yield the next parsed object. See docs for much more information and rationale.
2021-09-09 13:05:11 -04:00 · 2021-09-09 13:05:11 -04:00 · 4987bc39b0
parent 1452a4186a
commit 4987bc39b0
1 changed files with 98 additions and 13 deletions
--- a/tamer/src/ir/xir/tree.rs
+++ b/tamer/src/ir/xir/tree.rs
@ -32,14 +32,56 @@
 //!
 //!# type Ix = u16;
 //!# let token_stream: std::vec::IntoIter<Token<Ix>> = vec![].into_iter();
-//! // Lazily parse a stream of XIR tokens as an iterator.
+//! // Lazily parse a stream of XIR tokens as an iterator, yielding the next
+//! // fully parsed object.  This may consume any number of tokens.
 //! let parser = parser_from(token_stream);
-//!#
+//!
 //!# let token_stream: std::vec::IntoIter<Token<Ix>> = vec![].into_iter();
-//! // The above is equivalent to:
+//! // Consume a single token at a time, yielding either an incomplete state
+//! // or the next parsed object.
 //! let parser = token_stream.scan(ParserState::new(), parse);
 //! ```
 //!
+//! `parser_from` Or `parse`?
+//! =========================
+//! [`parser_from`] is implemented in terms of [`parse`].
+//! They have slightly different use cases and tradeoffs:
+//!
+//! [`parse`] yields a [`Result`] containing [`Parsed`],
+//!   which _may_ contain a [`Parsed::Object`],
+//!   but it's more likely to contain [`Parsed::Incomplete`];
+//!     this is because it typically takes multiple [`Token`]s to complete
+//!     parsing within a given context.
+//!
+//! In return, though, you get some important guarantees:
+//!
+//!   1. [`parse`] consumes only a _single_ token; and
+//!   2. It has a constant upper bound for execution time.
+//!
+//! This means that [`parse`] will never cause the system to hang---you
+//!   are in complete control over how much progress parsing makes,
+//!     and are free to stop and resume it at any time.
+//!
+//! However,
+//!   if you do not care about those things,
+//!   working with [`Parsed`] is verbose and inconvenient;
+//!     sometimes you just want the next [`Tree`] object.
+//! For this,
+//!   we have [`parser_from`],
+//!     which does two things:
+//!
+//!   1. It filters out all [`Parsed::Incomplete`]; and
+//!   2. On [`Parsed::Object`],
+//!        it yields the inner [`Tree`].
+//!
+//! This is a much more convenient API,
+//!   but is not without its downsides:
+//!     if the context is large
+//!       (e.g. the root node of a large XML document),
+//!       parsing can take a considerable amount of time,
+//!         and the [`Iterator`] produced by [`parser_from`] will cause the
+//!         system to process [`Iterator::next`] for that entire duration.
+//!
 //! Cost of Parsing
 //! ===============
 //! While [`Tree`] is often much easier to work with than a stream of
@ -392,8 +434,9 @@ pub enum Parsed<Ix: SymbolIndexSize> {
 /// Wrap [`ParserState::parse_token`] result in [`Some`],
 ///   suitable for use with [`Iterator::scan`].
 ///
-/// _Consider using [`parser_from`] instead,_
-///   as it encapsulates [`ParserState`].
+/// If you do not require a single-step [`Iterator::next`] and simply want
+///   the next parsed object,
+///     use [`parser_from`] instead.
 ///
 /// Note that parsing errors are represented by the wrapped [`Result`],
 ///   _not_ by [`None`].
@ -417,13 +460,17 @@ pub fn parse<Ix: SymbolIndexSize>(
    Some(ParserState::parse_token(state, tok))
 }

-/// Produce a lazy parser from a given [`Token`] iterator.
+/// Produce a lazy parser from a given [`Token`] iterator,
+///   yielding only when an object has been fully parsed.
 ///
-/// This produces a parser using [`parse`],
-///   which uses [`Iterator::scan`].
-/// This will lazily parse a stream of [`Token`],
-///   emitting [`Parsed::Object`] once enough data has been gathered for the
-///   running context.
+/// Unlike [`parse`],
+///   which is intended for use with [`Iterator::scan`],
+///   this will yield /only/ when the underlying parser yields
+///   [`Parsed::Object`],
+///     unwrapping the inner [`Tree`] value.
+/// This interface is far more convenient,
+///   but comes at the cost of not knowing how many parsing steps a single
+///   [`Iterator::next`] call will take.
 ///
 /// For more information on contexts,
 ///   and the parser in general,
@ -440,8 +487,13 @@ pub fn parse<Ix: SymbolIndexSize>(
 /// ```
 pub fn parser_from<Ix: SymbolIndexSize>(
    toks: impl Iterator<Item = Token<Ix>>,
-) -> impl Iterator<Item = Result<Parsed<Ix>, ParseError>> {
+) -> impl Iterator<Item = Result<Tree<Ix>, ParseError>> {
    toks.scan(ParserState::new(), parse)
+        .filter_map(|parsed| match parsed {
+            Ok(Parsed::Object(tree)) => Some(Ok(tree)),
+            Ok(Parsed::Incomplete) => None,
+            Err(x) => Some(Err(x)),
+        })
 }

 #[cfg(test)]
@ -543,7 +595,7 @@ mod test {
            span: (*S, *S2),
        };

-        let mut sut = parser_from(toks);
+        let mut sut = toks.scan(ParserState::new(), parse);

        assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // Open
        assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // AttrName
@ -556,4 +608,37 @@ mod test {
        );
        assert_eq!(sut.next(), None);
    }
+
+    #[test]
+    fn parser_from_filters_incomplete() {
+        let name = ("ns", "elem").unwrap_into();
+        let attr = "a".unwrap_into();
+        let val = AttrValue::Escaped("val1".intern());
+
+        let toks = std::array::IntoIter::new([
+            Token::<Ix>::Open(name, *S),
+            Token::AttrName(attr, *S),
+            Token::AttrValue(val, *S2),
+            Token::SelfClose(*S2),
+        ]);
+
+        let expected = Element {
+            name,
+            attrs: vec![Attr {
+                name: attr,
+                value: val,
+                span: (*S, *S2),
+            }],
+            children: vec![],
+            span: (*S, *S2),
+        };
+
+        let mut sut = parser_from(toks);
+
+        // Unlike the previous tests, we should filter out all the
+        // `Parsed::Incomplete` and yield only when we have a fully parsed
+        // object.
+        assert_eq!(sut.next(), Some(Ok(Tree::Element(expected))));
+        assert_eq!(sut.next(), None);
+    }
 }