tamer: asg::graph::visit::topo: Introduce topological sort

This is an initial implementation that does not yet produce errors on cycles. Documentation is not yet complete. The implementation is fairly basic, and similar to Petgraph's DFS. A terminology note: the DFS will be ontology-aware (or at least aware of edge metadata) to avoid traversing edges that would introduce cycles in situations where they are permitted, which effectively performs a topological sort on an implicitly _filtered_ graph. This will end up replacing ld::xmle::lower::sort. DEV-13162
2023-04-26 09:49:50 -04:00 · 2023-04-26 09:49:50 -04:00 · e3094e0bad
parent be05fbb833
commit e3094e0bad
7 changed files with 533 additions and 2 deletions
--- a/tamer/Cargo.lock
+++ b/tamer/Cargo.lock
@ -116,6 +116,7 @@ dependencies = [
 "arrayvec",
 "bumpalo",
 "exitcode",
+ "fixedbitset",
 "fxhash",
 "getopts",
 "memchr",
--- a/tamer/Cargo.toml
+++ b/tamer/Cargo.toml
@ -26,6 +26,7 @@ lto = true
 arrayvec = ">= 0.7.1"
 bumpalo = ">= 2.6.0"
 exitcode = "1.1.2"
+fixedbitset = ">= 0.4.1"  # also used by petgraph
 fxhash = ">= 0.2.1"
 getopts = "0.2"
 memchr = ">= 2.3.4"  # quick-xml expects =2.3.4 at the time
--- a/tamer/src/asg/graph.rs
+++ b/tamer/src/asg/graph.rs
@ -182,6 +182,15 @@ impl Asg {
        self.graph
    }

+    /// Number of [`Object`]s on the graph.
+    ///
+    /// This is equivalent to the number of nodes on the graph at the time
+    ///   of writing,
+    ///     but that may not always be the case.
+    fn object_count(&self) -> usize {
+        self.graph.node_count()
+    }
+
    /// Index the provided symbol `name` as representing the
    ///   [`ObjectIndex`] in the immediate environment `imm_env`.
    ///
@ -493,9 +502,9 @@ impl Asg {
    ///   compilation unit,
    ///     which is a package.
    #[inline]
-    pub fn lookup<O: ObjectRelatable, OS: ObjectIndexRelTo<O>>(
+    pub fn lookup<O: ObjectRelatable>(
        &self,
-        imm_env: OS,
+        imm_env: impl ObjectIndexRelTo<O>,
        id: SPair,
    ) -> Option<ObjectIndex<O>> {
        // The type `O` is encoded into the index on [`Self::index`] and so
--- a/tamer/src/asg/graph/object.rs
+++ b/tamer/src/asg/graph/object.rs
@ -210,6 +210,14 @@ macro_rules! object_gen {
            $($kind,)+
        }

+        impl<T: ObjectInner> From<&Object<T>> for ObjectTy {
+            fn from(obj: &Object<T>) -> Self {
+                match obj {
+                    $(Object::$kind(_) => ObjectTy::$kind,)+
+                }
+            }
+        }
+
        /// The collection of potential objects of [`Object`].
        pub trait ObjectInner {
            $(type $kind;)+
@ -350,6 +358,10 @@ impl Object<OnlyObjectInner> {
        }
    }

+    pub fn ty(&self) -> ObjectTy {
+        self.into()
+    }
+
    /// Retrieve an [`Ident`] reference,
    ///   or [`None`] if the object is not an identifier.
    pub fn as_ident_ref(&self) -> Option<&Ident> {
@ -849,6 +861,12 @@ impl<O: ObjectKind> From<ObjectIndex<O>> for NodeIndex {
    }
 }

+impl<O: ObjectKind> From<ObjectIndex<O>> for usize {
+    fn from(value: ObjectIndex<O>) -> Self {
+        Into::<NodeIndex>::into(value).index()
+    }
+}
+
 impl<O: ObjectKind> From<ObjectIndex<O>> for Span {
    fn from(value: ObjectIndex<O>) -> Self {
        match value {
--- a/tamer/src/asg/graph/visit.rs
+++ b/tamer/src/asg/graph/visit.rs
@ -24,5 +24,7 @@
 //!   state of the [`Asg`](super::Asg).

 mod ontree;
+mod topo;

 pub use ontree::{tree_reconstruction, Depth, TreePreOrderDfs, TreeWalkRel};
+pub use topo::{topo_sort, TopoPostOrderDfs};
--- a/tamer/src/asg/graph/visit/topo.rs
+++ b/tamer/src/asg/graph/visit/topo.rs
@ -0,0 +1,169 @@
+// Topological sort ASG traversal
+//
+//  Copyright (C) 2014-2023 Ryan Specialty, LLC.
+//
+//  This file is part of TAME.
+//
+//  This program is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  This program is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+//! Topological sort of [`Asg`] with ontological consideration.
+//!
+//! This topological sort is a depth-first search (DFS) that emits nodes in
+//!   post-order.
+//! Intuitively,
+//!   it emits objects sorted in such a way that they appear after each of
+//!   their dependencies.
+//!
+//! The ordering is deterministic between runs on the same graph,
+//!   but it is only one of potentially many orderings.
+//!
+//! The only information provided by this sort is a stream of
+//!   [`ObjectIndex`]es ordered linearly.
+//! No information about the edge or source object is provided,
+//!   nor is information about the length of the current path,
+//!   since an object may be visited any number of different ways and the
+//!   caller ought not rely on the particular path taken.
+//! Furthermore,
+//!   an object may be visited any number of times from any number of paths,
+//!   but only the first visit is emitted,
+//!     so any additional information would provide an incomplete picture;
+//!       this sort is _not_ intended to provide information about all paths
+//!       to a particular object and cannot be used in that way.
+
+use super::super::{Asg, ObjectIndex};
+use crate::asg::{graph::object::DynObjectRel, AsgError, Object};
+use fixedbitset::FixedBitSet;
+
+pub fn topo_sort(
+    asg: &Asg,
+    init: impl Iterator<Item = ObjectIndex<Object>>,
+) -> TopoPostOrderDfs {
+    TopoPostOrderDfs::new(asg, init)
+}
+
+/// Topological sort implemented as a post-order depth-first search (DFS).
+///
+/// See the [module-level documentation](super) for important information
+///   about this traversal.
+pub struct TopoPostOrderDfs<'a> {
+    /// Reference [`Asg`].
+    ///
+    /// Holding a reference to the [`Asg`] allows this object to serve
+    ///   conveniently as an iterator.
+    asg: &'a Asg,
+
+    /// DFS stack.
+    ///
+    /// As an object (node/vertex) is visited,
+    ///   its relationships (edge targets) are pushed onto the stack.
+    /// Each iteration pops a relationship off the stack and visits it.
+    ///
+    /// The traversal ends once the stack becomes empty.
+    /// It is expected the stack is initialized with at least one initial
+    ///   object prior to beginning the traversal.
+    stack: Vec<ObjectIndex<Object>>,
+
+    /// Objects whose edges have already been pushed onto [`Self::stack`].
+    ///
+    /// An object that has already been visited will _not_ be visited
+    ///   again.
+    /// A visited object is only present in [`Self::stack`] until it is
+    ///   finished,
+    ///     after which it appears in [`Self::finished`].
+    visited: FixedBitSet,
+
+    /// Objects that have been emitted and pop'd from [`Self::stack`].
+    ///
+    /// This is used for cycle detection.
+    /// Before pushing an object onto [`Self::stack`],
+    ///   the system first checks [`Self::visited`].
+    /// If an object has been visited,
+    ///   but has not yet been finished,
+    ///   then it must still be present on the stack and must therefore
+    ///   be part of a cycle.
+    finished: FixedBitSet,
+}
+
+pub trait ObjectRelFilter = Fn(DynObjectRel) -> bool;
+
+/// Initial capacity of the [`TopoPostOrderDfs`] stack.
+///
+/// The stack will need to be able to accommodate all nodes and their
+///   siblings within the longest path taken by the DFS.
+/// If there are many rooted objects
+///   (e.g. for `tameld`),
+///     this may be quite large.
+///
+/// The current number is arbitrary and only intended to reduce initial
+///   small re-allocations;
+///     it is too small for linking and too large for individual packages.
+const INIT_STACK_CAP: usize = 32;
+
+impl<'a> TopoPostOrderDfs<'a> {
+    fn new(
+        asg: &'a Asg,
+        init: impl Iterator<Item = ObjectIndex<Object>>,
+    ) -> Self {
+        let set_cap = asg.object_count();
+
+        let mut stack = Vec::with_capacity(INIT_STACK_CAP);
+        init.collect_into(&mut stack);
+
+        Self {
+            asg,
+            stack,
+            visited: FixedBitSet::with_capacity(set_cap),
+            finished: FixedBitSet::with_capacity(set_cap),
+        }
+    }
+}
+
+impl<'a> Iterator for TopoPostOrderDfs<'a> {
+    type Item = Result<ObjectIndex<Object>, AsgError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // Rust doesn't have guaranteed TCO as of 2023-04
+        loop {
+            let next = *self.stack.last()?;
+
+            if self.visited.put(next.into()) {
+                self.stack.pop(); // next
+
+                if !self.finished.put(next.into()) {
+                    break Some(Ok(next));
+                } else {
+                    // Must have been visited by another path.
+                    continue;
+                };
+            }
+
+            self.asg
+                .edges_dyn(next)
+                .map(|dyn_oi| *dyn_oi.target())
+                .filter(|&oi| {
+                    let finished = self.finished.contains(oi.into());
+
+                    // TODO: report an error when a cycle is detected.
+                    let _is_cycle =
+                        !finished && self.visited.contains(oi.into());
+
+                    !finished
+                })
+                .collect_into(&mut self.stack);
+        }
+    }
+}
+
+#[cfg(test)]
+mod test;
--- a/tamer/src/asg/graph/visit/topo/test.rs
+++ b/tamer/src/asg/graph/visit/topo/test.rs
@ -0,0 +1,331 @@
+// Test topological sort ASG traversal
+//
+//  Copyright (C) 2014-2023 Ryan Specialty, LLC.
+//
+//  This file is part of TAME.
+//
+//  This program is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  This program is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+use super::*;
+use crate::{
+    asg::{
+        air::{Air, AirAggregate},
+        graph::object::{self, ObjectTy, Pkg},
+        ExprOp,
+    },
+    parse::{util::SPair, ParseState},
+    span::{dummy::*, Span, UNKNOWN_SPAN},
+};
+use std::fmt::Debug;
+
+use Air::*;
+
+fn topo_report_only(
+    asg: &Asg,
+    edges: impl Iterator<Item = ObjectIndex<Object>>,
+) -> Vec<Result<(ObjectTy, Span), AsgError>> {
+    topo_sort(asg, edges)
+        .map(|result| {
+            result
+                .map(|oi| oi.resolve(asg))
+                .map(|obj| (obj.ty(), obj.span()))
+        })
+        .collect()
+}
+
+fn topo_report<I: IntoIterator<Item = Air>>(
+    toks: I,
+) -> Vec<Result<(ObjectTy, Span), AsgError>>
+where
+    I::IntoIter: Debug,
+{
+    let mut parser = AirAggregate::parse(toks.into_iter());
+    assert!(parser.all(|x| x.is_ok()));
+
+    let asg = &parser.finalize().unwrap().into_context();
+    let oi_root = asg.root(UNKNOWN_SPAN);
+
+    topo_report_only(
+        asg,
+        oi_root.edges_filtered::<Pkg>(asg).map(ObjectIndex::widen),
+    )
+}
+
+#[test]
+fn sorts_objects_given_single_root() {
+    let id_a = SPair("expr_a".into(), S3);
+    let id_b = SPair("expr_b".into(), S9);
+    let id_c = SPair("expr_c".into(), S12);
+
+    #[rustfmt::skip]
+    let toks = vec![
+        // Packages are auto-rooted as part of the graph's ontology.
+        // There is only one for this test.
+        PkgStart(S1),
+          // Before this can be computed,
+          //   its dependencies must be.
+          ExprStart(ExprOp::Sum, S2),           // -.
+            BindIdent(id_a),                    //  |
+                                                //  |
+            // This is a dependency,            //  |
+            //   but it is owned by this Expr   //  |
+            //   and so would have been emitted //  |
+            //   first anyway.                  //  |
+            ExprStart(ExprOp::Sum, S4),         //  |
+            ExprEnd(S5),                        //  |
+                                                //  v
+            // But this is a reference to another
+            //   Expr that appears later.
+            RefIdent(SPair(id_b.symbol(), S6)),  // --. 
+          ExprEnd(S7),                           //    | 
+                                                 //    |
+          // This will have to be emitted        //    |
+          //   _before_ the above Expr that      //    |
+          //   depends on its value having been  //    |
+          //   computed.                         //   /  
+          ExprStart(ExprOp::Sum, S8),            // <`
+            BindIdent(id_b),
+          ExprEnd(S10),
+
+          // A sibling expression with no dependency on
+          //   other expressions.
+          ExprStart(ExprOp::Sum, S11),
+            BindIdent(id_c),
+          ExprEnd(S13),
+        PkgEnd(S14),
+    ];
+
+    use ObjectTy::*;
+    let m = |a: Span, b: Span| a.merge(b).unwrap();
+
+    #[rustfmt::skip]
+    assert_eq!(
+        Ok(vec![
+            // The first leaf is this anonymous child expression,
+            //   which has no dependencies.
+            (Expr,  m(S4, S5)  ),  // child of id_a
+
+            // The sibling of the above expression is a reference to the
+            //   value of `id_b`.
+            // `id_a` cannot be computed before it.
+            (Expr,  m(S8, S10) ),  // id_b
+            (Ident, S9,        ),  // id_b
+
+            // With `id_b` emitted,
+            //   `id_a` has no more dependencies,
+            //   and so itself can be emitted.
+            (Expr,  m(S2, S7)  ),  // id_a
+            (Ident, S3,        ),  // id_a
+
+            // `id_a` has a sibling `id_c`.
+            // Its ordering is undefined relative to `id_a`
+            //   (it could also be ordered before it),
+            //   but the implementation of the traversal causes it to be
+            //     output in the same order as it appeared in the source
+            //     token stream.
+            (Expr,  m(S11, S13)),  // id_c
+            (Ident, S12        ),  // id_c
+
+            // We end with the root that was explicitly provided to
+            //   `topo_sort` via `topo_report`.
+            (Pkg,   m(S1, S14) ),
+        ]),
+        topo_report(toks).into_iter().collect(),
+    );
+}
+
+// Like the above test,
+//   but the path is deeper to emphasize that the topological sort applies
+//   recursively to dependencies.
+// Multiple expressions depending on the same dependency have an arbitrary
+//   order that is deterministic between runs.
+#[test]
+fn sorts_objects_given_single_root_more_complex() {
+    let id_a = SPair("expr_a".into(), S3);
+    let id_b = SPair("expr_b".into(), S7);
+    let id_c = SPair("expr_c".into(), S11);
+    let id_d = SPair("expr_d".into(), S15);
+
+    #[rustfmt::skip]
+    let toks = vec![
+        PkgStart(S1),
+          ExprStart(ExprOp::Sum, S2),
+            BindIdent(id_a),
+            RefIdent(SPair(id_b.symbol(), S4)),  // ---.
+          ExprEnd(S5),                           //     )
+                                                 //    /
+          ExprStart(ExprOp::Sum, S6),            //   /
+            BindIdent(id_b),                     // <'
+            RefIdent(SPair(id_d.symbol(), S8)),  // -------.
+          ExprEnd(S9),                           // <.      |
+                                                 //   \     |
+          ExprStart(ExprOp::Sum, S10),           //    \    |
+            BindIdent(id_c),                     //     )   |
+            RefIdent(SPair(id_b.symbol(), S12)), // ---'   /
+          ExprEnd(S13),                          //       /
+                                                 //      /
+          ExprStart(ExprOp::Sum, S14),           //     /
+            BindIdent(id_d),                     // <--'
+          ExprEnd(S16),
+        PkgEnd(S17),
+    ];
+
+    use ObjectTy::*;
+    let m = |a: Span, b: Span| a.merge(b).unwrap();
+
+    #[rustfmt::skip]
+    assert_eq!(
+        Ok(vec![
+            (Expr,  m(S14, S16)),  // id_d
+            (Ident, S15        ),  // id_d
+
+            (Expr,  m(S6, S9) ),   // id_b
+            (Ident, S7,        ),  // id_b
+
+            (Expr,  m(S2, S5)  ),  // id_a
+            (Ident, S3,        ),  // id_a
+
+            (Expr,  m(S10, S13)),  // id_c
+            (Ident, S11        ),  // id_c
+
+            (Pkg,   m(S1, S17) ),
+        ]),
+        topo_report(toks).into_iter().collect(),
+    );
+}
+
+// This tests what the linker (tameld) does:
+//   topologically sorts explicitly rooted objects and ignores everything
+//   else.
+// This also gives us dead code elimination.
+#[test]
+fn omits_unreachable() {
+    let id_a = SPair("expr_a".into(), S3);
+    let id_b = SPair("expr_b".into(), S7);
+    let id_c = SPair("expr_c".into(), S11);
+    let id_d = SPair("expr_d".into(), S15);
+
+    // We will only use a portion of this graph.
+    #[rustfmt::skip]
+    let toks = vec![
+        PkgStart(S1),
+          ExprStart(ExprOp::Sum, S2),
+            BindIdent(id_a),
+            RefIdent(SPair(id_b.symbol(), S4)),  // ---.
+          ExprEnd(S5),                           //     )
+                                                 //    /
+          ExprStart(ExprOp::Sum, S6),            //   /
+            BindIdent(id_b),                     // <'
+            RefIdent(SPair(id_d.symbol(), S8)),  // -------.
+          ExprEnd(S9),                           // <.      |
+                                                 //   \     |
+          ExprStart(ExprOp::Sum, S10),           //    \    |
+            BindIdent(id_c),                     //     )   |
+            RefIdent(SPair(id_b.symbol(), S12)), // ---'   /
+          ExprEnd(S13),                          //       /
+                                                 //      /
+          ExprStart(ExprOp::Sum, S14),           //     /
+            BindIdent(id_d),                     // <--'
+          ExprEnd(S16),
+        PkgEnd(S17),
+    ];
+
+    use ObjectTy::*;
+    let m = |a: Span, b: Span| a.merge(b).unwrap();
+
+    let mut parser = AirAggregate::parse(toks.into_iter());
+    assert!(parser.all(|x| x.is_ok()));
+
+    let asg = &parser.finalize().unwrap().into_context();
+
+    let oi_pkg = asg
+        .root(UNKNOWN_SPAN)
+        .edges_filtered::<object::Pkg>(&asg)
+        .next()
+        .expect("cannot find Pkg on graph");
+
+    let oi_b = asg
+        .lookup::<object::Ident>(oi_pkg, id_b)
+        .expect("missing oi_b");
+
+    // We'll use only `oi_b` as the root,
+    //   which will include it and its (only) dependency.
+    // The rest of the graph must be ignored.
+    let report = topo_report_only(&asg, [oi_b.widen()].into_iter());
+
+    #[rustfmt::skip]
+    assert_eq!(
+        Ok(vec![
+            (Expr,  m(S14, S16)),  // id_d
+            (Ident, S15        ),  // id_d
+
+            (Expr,  m(S6, S9) ),   // id_b
+            (Ident, S7,        ),  // id_b
+        ]),
+        report.into_iter().collect(),
+    );
+}
+
+// If multiple roots are given,
+//   and they have entirely independent subgraphs,
+//   then their ordering is deterministic between runs of the same graph,
+//     but undefined.
+//
+// This is no different than the ordering of siblings above;
+//   this simply provides an explicit example for the behavior of provided
+//   roots since that is the entry point for this API.
+#[test]
+fn sorts_objects_given_multiple_roots() {
+    let id_a = SPair("expr_a".into(), S3);
+    let id_b = SPair("expr_b".into(), S8);
+
+    #[rustfmt::skip]
+    let toks = vec![
+        // First root
+        PkgStart(S1),
+          ExprStart(ExprOp::Sum, S2),
+            BindIdent(id_a),
+          ExprEnd(S4),
+        PkgEnd(S5),
+
+        // Second root,
+        //   independent of the first.
+        PkgStart(S6),
+          ExprStart(ExprOp::Sum, S7),
+            BindIdent(id_b),
+          ExprEnd(S9),
+        PkgEnd(S10),
+    ];
+
+    use ObjectTy::*;
+    let m = |a: Span, b: Span| a.merge(b).unwrap();
+
+    #[rustfmt::skip]
+    assert_eq!(
+        Ok(vec![
+            // First root.
+            (Expr,  m(S2, S4) ),
+            (Ident, S3),
+            (Pkg,   m(S1, S5) ),
+
+            // Second root,
+            //   but the fact that it is emitted after the first is not
+            //   behavior that should be relied upon.
+            (Expr,  m(S7, S9) ),
+            (Ident, S8),
+            (Pkg,   m(S6, S10)),
+        ]),
+        topo_report(toks).into_iter().collect(),
+    );
+}