// Topological sort ASG traversal
//
//  Copyright (C) 2014-2023 Ryan Specialty, LLC.
//
//  This file is part of TAME.
//
//  This program is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program.  If not, see <https://www.gnu.org/licenses/>.

//! Topological sort of [`Asg`] with ontological consideration.
//!
//! This topological sort is a depth-first search (DFS) that emits nodes in
//!   post-order.
//! Intuitively,
//!   it emits objects sorted in such a way that they appear after each of
//!   their dependencies.
//!
//! The ordering is deterministic between runs on the same graph,
//!   but it is only one of potentially many orderings.
//!
//! The only information provided by this sort is a stream of
//!   [`ObjectIndex`]es ordered linearly.
//! No information about the edge or source object is provided,
//!   nor is information about the length of the current path,
//!   since an object may be visited any number of different ways and the
//!   caller ought not rely on the particular path taken.
//! Furthermore,
//!   an object may be visited any number of times from any number of paths,
//!   but only the first visit is emitted,
//!   so any additional information would provide an incomplete picture;
//!     this sort is _not_ intended to provide information about all paths
//!     to a particular object and cannot be used in that way.
//!
//! Cutting Of Cycles
//! =================
//! A _cycle_ is a path that references another object earlier in the path,
//!   as if it loops in on itself.
Cycles are generally not permitted, //! as they would require that a value would have to be computed before it //! could compute itself. //! This almost certainly represents an error in the program's specification. //! //! Cycles are permitted for recursion. //! More information can be found in [`ObjectRel::can_recurse`]. //! //! A toplogical ordering is defined only for graphs that do not contain //! cycles. //! To order a graph _with_ cycles, //! the depth-first search performs a _cut_, //! whereby the edge that would have led to the cycle is omitted, //! as if cutting a loop of string at the point that it is tied. //! An example of such a cut can be found in [`ObjectRel::can_recurse`]. //! //! This is done in two scenarios: //! //! 1. An unsupported cycle is an error. //! A cut is performed as a means of error recovery so that the process //! may continue and discover more errors before terminating. //! //! 2. A cycle representing allowed recursion performs a cut since the //! path taken thus far already represents a valid ordering. use crate::{ asg::{ graph::object::DynObjectRel, Asg, Object, ObjectIndex, ObjectIndexResolvedSpan, ObjectKind, }, diagnose::{Annotate, AnnotatedSpan, Diagnostic}, }; use fixedbitset::FixedBitSet; use std::{error::Error, fmt::Display, iter::once}; #[cfg(doc)] use crate::{asg::graph::object::ObjectRel, span::Span}; /// Topological sort with cutting of ontologically permitted cycles. /// /// This is a TAMER-specific topological sort that is aware of the graph's /// ontology and will automatically sort an acyclic subgraph produced by /// cutting permitted cycles. /// See the [module-level documentation](self) for more information. pub fn topo_sort( asg: &Asg, init: impl Iterator>, ) -> TopoPostOrderDfs { TopoPostOrderDfs::new(asg, init.map(ObjectIndex::widen)) } /// Topological sort implemented as a post-order depth-first search (DFS). /// /// See the [module-level documentation](self) for important information /// about this traversal. 
pub struct TopoPostOrderDfs<'a> { /// Reference [`Asg`]. /// /// Holding a reference to the [`Asg`] allows this object to serve /// conveniently as an iterator. asg: &'a Asg, /// DFS stack. /// /// As objects (nodes/vertices) are visited, /// its relationships (edge targets) are pushed onto the stack. /// Each iterator pops a relationship off the stack and visits it. /// /// The inner [`Result`] serves as a cycle flag set by /// [`Self::flag_or_cut_cycle`]. /// Computing the proper [`Cycle`] error before placing it on the stack /// would not only bloat the size of each element of this stack, /// but also use unnecessary memory on the heap. /// The proper [`Cycle`] error will be computed when this element is /// retrieved by [`Self::next_oi`]. /// /// _This may contain duplicate [`ObjectIndex`]es even if the graph /// contains no cycles;_ /// see [`Self::push_neighbors`] for an explanation. /// /// The traversal ends once the stack becomes empty. /// It is expected the stack is initialized with at least one initial /// object prior to beginning the traversal. stack: Vec, ObjectIndex>>, /// Objects that have already been added to [`Self::stack`]. /// /// An object that has already been visited will _not_ be visited /// again. /// A visited object is only present in [`Self::stack`] until it is /// finished, /// after which it appears in [`Self::finished`]. visited: FixedBitSet, /// Objects that have been emitted and pop'd from [`Self::stack`]. /// /// This is used for cycle detection. /// Before pushing an object onto [`Self::stack`], /// the system first checks [`Self::visited`]. /// If an object has been visited, /// but has not yet been finished, /// then it must still be present on the stack and must therefore /// be part of a cycle. finished: FixedBitSet, } pub trait ObjectRelFilter = Fn(DynObjectRel) -> bool; /// Initial capacity of the [`TopoPostOrderDfs`] stack. 
/// /// The stack will need to be able to accommodate all nodes and their /// siblings within the longest path taken by the DFS. /// If there are many rooted objects /// (e.g. for `tameld`), /// this may be quite large. /// /// The current number is arbitrary and only intended to reduce initial /// small re-allocations; /// it is too small for linking and too large for individual packages. const INIT_STACK_CAP: usize = 32; impl<'a> TopoPostOrderDfs<'a> { fn new( asg: &'a Asg, init: impl Iterator>, ) -> Self { let set_cap = asg.object_count(); let mut stack = Vec::with_capacity(INIT_STACK_CAP); init.map(Ok).collect_into(&mut stack); Self { asg, stack, visited: FixedBitSet::with_capacity(set_cap), finished: FixedBitSet::with_capacity(set_cap), } } /// Push the neighbors of the given [`ObjectIndex`] onto [`Self::stack`] /// for later processing. /// /// Placing neighbors on the stack allows us to yield elements from the /// iterator without having to keep track of where we are on the graph /// for each node in the path. /// /// When visiting a node for the first time, /// its neighbors /// (objects to which `src_oi` has an edge) /// are pushed onto [`Self::stack`]. /// It is expected that `src_oi` is left on the stack, /// ensuring that its neighbors are processed before `src_oi` is, /// leading to a post-order traversal. /// /// Objects that have already been emitted will _not_ be pushed onto the /// stack; /// this determination is made by consulting [`Self::finished`]. /// /// Each object that is pushed onto the stack will be checked by /// [`Self::flag_or_cut_cycle`]; /// see that function for more information. /// It is important that each cycle be flagged individually, /// rather than returning an error from this function, /// otherwise only one cycle per object would be found. /// /// Duplicate Stack Entries Without Cycles /// ====================================== /// [`Self::stack`] may contain duplicate [`ObjectIndex`]es even if /// there is no cycle. 
/// /// The reason for this is that a cycle only occurs when an /// [`ObjectIndex`] is part of the path currently being visited. /// But [`Self::stack`] contains objects that have _not yet been visited_; /// they've been placed onto the stack by this method to be visited at /// a future point. /// /// Consider this graph: /// /// ```text /// (A) -> (B) -> (D) /// '---> (C) <---' /// ``` /// /// A traversal might yield this stack if `C` is visited before `B`: /// /// ```text /// [A] // root /// [A, C, B] // self.push_neighbors(A) /// [A, C, B, D] // self.push_neighbors(B) /// [A, C, B, D, C] // self.push_neighbors(D) /// ``` /// /// Since `C` does not contain an edge _to_ any previous object, /// there is no cycle. /// /// For this reason, /// it is important for the implementation to check [`Self::finished`] /// when removing objects from the stack to ensure that they have not /// already been emitted. fn push_neighbors(&mut self, src_oi: ObjectIndex) { self.asg .edges_dyn(src_oi) .filter(|dyn_oi| !self.finished.contains((*dyn_oi.target()).into())) .filter_map(|dyn_oi| { Self::flag_or_cut_cycle(&self.visited, self.asg, dyn_oi) }) .collect_into(&mut self.stack); } /// Determine if the provided relation would introduce a cycle if /// appended to the current path and flag it if so. /// /// This should be called only after having checked [`Self::finished`], /// which means that a node is _not_ in the path because it has /// already been emitted. /// /// With [`Self::finished`] having been ruled out, /// this uses [`Self::visited`] to determine if a node must be part of /// the active path of the DFS. /// If so, /// then introducing it again would produce a cycle. /// /// Cycles are permitted under limited circumstances, /// where the edge represents a recursive target. /// This determination is made utilizing the graph's ontology via /// [`DynObjectRel::can_recurse`]. 
/// If the cycle ends up being permitted, /// then we perform a cut by filtering out the edge entirely, /// as if it did not exist. /// It is up to the graph's ontology to ensure that all such cuts will /// result in a valid ordering. /// (Cuts also occur during error recovery for unsupported cycles.) /// /// We use [`Result`] where `E` is [`ObjectIndex`] to simply flag the /// object as containing a cycle; /// this allows us to defer computation of the cycle and allocation /// of memory for that path until we actually visit the node on /// [`Self::stack`]. /// This allows the element size of [`Self::stack`] to remain small. /// /// See [`Self::find_cycle_path`] for the actual cycle computation that /// will eventually be performed. fn flag_or_cut_cycle( visited: &FixedBitSet, asg: &Asg, dyn_oi: DynObjectRel, ) -> Option, ObjectIndex>> { let oi = *dyn_oi.target(); if visited.contains(oi.into()) { if dyn_oi.can_recurse(asg) { None // cut } else { Some(Err(oi)) } } else { Some(Ok(oi)) } } /// Attempt to retrieve the next [`ObjectIndex`] from the stack for /// processing, /// leaving it on the stack. /// /// If the object atop of the stack has been flagged as a cycle by /// [`Self::flag_or_cut_cycle`], /// then the actual path associated with the cycle will be computed /// by [`Self::find_cycle_path`] and an a [`Cycle`] returned. /// /// See also [`Self::pop_next_oi`]. fn next_oi(&self) -> Option, Cycle>> { self.stack .last() .map(|result| result.map_err(|oi| self.find_cycle_path(oi))) } /// Remove an item from [`Self::stack`]. /// /// A better API for the future would take ownership over the stack and /// know for certain that the element being removed is the element /// previously returned. /// /// See also [`Self::next_oi`]. fn pop_next_oi(&mut self) { self.stack.pop(); } /// Knowing that the provided [`ObjectIndex`] would produce a cycle if /// added to the current path, /// calculate the path representing the cycle. 
/// /// This is a linear-time (`O(n)`) operation that performs a new heap /// allocation. /// Since cycles are an error case, /// it is expected that they will not often occur and so the DFS /// algorithm is optimized for the most common case; /// it is not worth computing the path during the course of the /// search since that path would almost always be discarded. /// /// Deriving a path relies on understanding that: /// /// 1. An [`ObjectIndex`] in [`Self::stack`] is either awaiting /// processing or is _currently_ being processed. /// This means that it contains the path, /// but it also contains neighbors of objects in the path. /// We must filter out those neighbors. /// /// 2. The [`Result`] in [`Self::stack`] indicates whether the object /// causes a cycle. /// A previous object in the path must therefore be [`Ok`], /// otherwise it would not have been traversed, /// and so we must filter all [`Err`]s. /// In doing so, /// we also filter out `next` at the top of the stack, /// and so _this function works correctly regardless of whether /// `next` has already been `pop`'d from the stack_. /// /// 3. [`Self::visited`] is set just before neighbors of an object are /// pushed onto [`Self::stack`]. /// Therefore, /// only objects marked as visited are part of the active path, /// and so to discover that path we need only filter out /// non-visited objects. /// /// 4. [`Self::stack`] contains a path from a provided root. /// We want to cut off the path at the beginning of the cycle. /// The easiest way to do this is to iterate through the stack in /// reverse, /// stopping as soon as we encounter an [`ObjectIndex`] /// matching `next`. /// This has the effect of producing a cycle path in post-order, /// which is consistent with the ordering of [`Self`]'s /// traversal. /// /// 5. The [`ObjectIndex`]es sourced from the [`Asg`] do not contain /// the spans of the target objects. 
/// Cycles will almost certainly result in diagnostic messages, /// which require accurate spans, /// and so we must resolve the [`ObjectIndex`] to retrieve the /// target [`Span`]. /// /// The path produced will therefore be reversed, /// with `next` as the last element. /// `next` will _not_ be duplicated as the first element, /// which means that if you were to repeat the returned path /// indefinitely end-to-end /// (e.g. using [`Iterator::cycle`]), /// you would have precisely this cycle. /// /// With all of that said, /// the implementation is fairly straightforward and concise. fn find_cycle_path(&self, next: ObjectIndex) -> Cycle { let mut path = self .stack .iter() .rev() .copied() .filter_map(Result::ok) .take_while(|&oi| oi != next) .filter(|&oi| self.visited.contains(oi.into())) .map(|oi| oi.resolve_span(self.asg)) .collect::>(); // We stopped _at_ `next`, // so we need to manually add it to the path. path.push(next.resolve_span(self.asg)); Cycle { path } } } impl<'a> Iterator for TopoPostOrderDfs<'a> { type Item = Result, Cycle>; fn next(&mut self) -> Option { // Rust doesn't have guaranteed TCO as of 2023-04 loop { match self.next_oi()? { Ok(next) => { if self.visited.put(next.into()) { self.pop_next_oi(); // See `Self::push_neighbors` for explanation. if !self.finished.put(next.into()) { break Some(Ok(next)); } } else { self.push_neighbors(next); } } Err(cycle) => { self.pop_next_oi(); return Some(Err(cycle)); } }; } } } /// A graph cycle. /// /// A cycle means that a path contains a duplicate node, /// as if it looped back on itself. /// In terms of TAME, /// a cycle implies a circular dependency. /// /// Identifying Cycle Objects /// ========================= /// TODO: Object names need to be derived from the cycle to display /// concisely to the user. /// The cycle very likely contains identifiers that can be used to describe /// the cycle in more concise terms. 
/// /// It used to be the case that cycles contained identifier names, /// but that was before the topological sort was generalized to include /// all graph objects; /// see the commit that introduced this message for more information. /// /// TODO: We also ought to represent the spans associated with _references_, /// _in addition to_ just the referenced object. #[derive(Debug, PartialEq)] pub struct Cycle { /// The path representing the cycle in post-order (reversed). /// /// It is expected that [`ObjectIndex`]'s associated [`Span`] has been /// resolved to that of the target object /// (e.g. using [`ObjectIndex::resolve_span`]). /// This allows the indexes to be useful in a diagnostic context. /// /// See [`Self::path_rev`] for more information. path: Vec>, } impl Cycle { /// The path representing the cycle in post-order (reversed). /// /// The path is truncated such that the first node in the path is the /// beginning of the cycle. /// The final node in the cycle is omitted, /// since it is the same as the first; /// if you repeated this path indefinitely /// (e.g. with [`Iterator::cycle`]) /// then you would have precisely the cycle. /// /// The [`ObjectIndex`]es should have [`Span`]s that are resolved /// against the target so that they are useful in a diagnostic /// context. pub fn path_rev(&self) -> &Vec> { &self.path } } impl Display for Cycle { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { // TODO: See note on [`Cycle`] about deriving names. write!(f, "[...]") } } impl Error for Cycle {} impl Diagnostic for Cycle { fn describe(&self) -> Vec { let path = &self.path; let n = path.len(); let ident = path.last().unwrap(); // TODO: See note on [`Cycle`] about deriving names. path.iter() .rev() .enumerate() .map(|(i, oi)| { oi.note(match i { 0 => format!( "[0/{n}] the cycle begins here, depending on..." 
), // TODO: s/object// _ => { format!("[{i}/{n}] ...this object, which depends on...") } }) }) .chain(once(ident.error(format!( "[{n}/{n}] ...the object once again, \ creating the cycle" )))) .collect::>() } } #[cfg(test)] mod test;