tame/tamer/src/asg/graph/visit/topo.rs

541 lines
20 KiB
Rust
Raw Normal View History

// Topological sort ASG traversal
//
// Copyright (C) 2014-2023 Ryan Specialty, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//! Topological sort of [`Asg`] with ontological consideration.
//!
//! This topological sort is a depth-first search (DFS) that emits nodes in
//! post-order.
//! Intuitively,
//! it emits objects sorted in such a way that they appear only after each of
//! their dependencies has been emitted.
//!
//! The ordering is deterministic between runs on the same graph,
//! but it is only one of potentially many orderings.
//!
//! The only information provided by this sort is a stream of
//! [`ObjectIndex`]es ordered linearly.
//! No information about the edge or source object is provided,
//! nor is information about the length of the current path,
//! since an object may be visited any number of different ways and the
//! caller ought not rely on the particular path taken.
//! Furthermore,
//! an object may be visited any number of times from any number of paths,
//! but only the first visit is emitted,
//! so any additional information would provide an incomplete picture;
//! this sort is _not_ intended to provide information about all paths
//! to a particular object and cannot be used in that way.
//!
//! Cutting Of Cycles
//! =================
//! A _cycle_ is a path that references another object earlier in the path,
//! as if it loops in on itself.
//! Cycles are generally not permitted,
//! as they would require that a value would have to be computed before it
//! could compute itself.
//! This almost certainly represents an error in the program's specification.
//!
//! Cycles are permitted for recursion.
//! More information can be found in [`ObjectRel::can_recurse`].
//!
//! A topological ordering is defined only for graphs that do not contain
//! cycles.
//! To order a graph _with_ cycles,
//! the depth-first search performs a _cut_,
//! whereby the edge that would have led to the cycle is omitted,
//! as if cutting a loop of string at the point that it is tied.
//! An example of such a cut can be found in [`ObjectRel::can_recurse`].
//!
//! This is done in two scenarios:
//!
//! 1. An unsupported cycle is an error.
//! A cut is performed as a means of error recovery so that the process
//! may continue and discover more errors before terminating.
//!
//! 2. A cycle representing allowed recursion performs a cut since the
//! path taken thus far already represents a valid ordering.
use crate::{
asg::{
graph::object::DynObjectRel, Asg, Object, ObjectIndex,
ObjectIndexResolvedSpan, ObjectKind,
},
diagnose::{Annotate, AnnotatedSpan, Diagnostic},
};
use fixedbitset::FixedBitSet;
use std::{error::Error, fmt::Display, iter::once};
#[cfg(doc)]
use crate::{asg::graph::object::ObjectRel, span::Span};
/// Begin an ontology-aware topological sort of `asg`.
///
/// Each [`ObjectIndex`] yielded by `init` is widened to an
///   [`ObjectIndex<Object>`](ObjectIndex) and used as a root of the
///   returned [`TopoPostOrderDfs`].
/// `init` is drained immediately to seed the traversal,
///   but the graph itself is not walked until the returned iterator is
///   advanced.
///
/// Permitted cycles are cut so that the remaining acyclic subgraph can be
///   ordered;
///     see the [module-level documentation](self) for details.
pub fn topo_sort<O: ObjectKind>(
    asg: &Asg,
    init: impl Iterator<Item = ObjectIndex<O>>,
) -> TopoPostOrderDfs {
    // The DFS operates on type-erased indexes only.
    let roots = init.map(|oi| oi.widen());

    TopoPostOrderDfs::new(asg, roots)
}
/// Topological sort implemented as a post-order depth-first search (DFS).
///
/// This type is an [`Iterator`] yielding either the next
/// [`ObjectIndex`] in the ordering or a [`Cycle`] describing an
/// unsupported cycle that was cut.
///
/// See the [module-level documentation](self) for important information
/// about this traversal.
pub struct TopoPostOrderDfs<'a> {
/// Reference [`Asg`].
///
/// Holding a reference to the [`Asg`] allows this object to serve
/// conveniently as an iterator.
asg: &'a Asg,
/// DFS stack.
///
/// As each object (node/vertex) is visited,
/// its relationships (edge targets) are pushed onto the stack.
/// Each iteration of the traversal inspects the relationship atop the
/// stack and visits it.
///
/// The inner [`Result`] serves as a cycle flag set by
/// [`Self::flag_or_cut_cycle`]:
/// [`Ok`] is an ordinary object awaiting processing and [`Err`] is an
/// object that would close an unsupported cycle.
/// Computing the proper [`Cycle`] error before placing it on the stack
/// would not only bloat the size of each element of this stack,
/// but also use unnecessary memory on the heap.
/// The proper [`Cycle`] error will be computed when this element is
/// retrieved by [`Self::next_oi`].
///
/// _This may contain duplicate [`ObjectIndex`]es even if the graph
/// contains no cycles;_
/// see [`Self::push_neighbors`] for an explanation.
///
/// The traversal ends once the stack becomes empty.
/// It is expected the stack is initialized with at least one initial
/// object prior to beginning the traversal.
stack: Vec<Result<ObjectIndex<Object>, ObjectIndex<Object>>>,
/// Objects whose processing has begun.
///
/// An object is marked as visited the first time that it is
/// encountered atop [`Self::stack`],
/// immediately before its neighbors are pushed.
/// An object that has already been visited will _not_ have its
/// neighbors pushed again.
/// A visited object remains in [`Self::stack`] until it is finished,
/// after which it appears in [`Self::finished`].
visited: FixedBitSet,
/// Objects that have been emitted and pop'd from [`Self::stack`].
///
/// This is used for cycle detection.
/// Before pushing an object onto [`Self::stack`],
/// the system first checks [`Self::visited`].
/// If an object has been visited,
/// but has not yet been finished,
/// then it must still be present on the stack and must therefore
/// be part of a cycle.
///
/// It is also consulted when removing objects from the stack so that a
/// duplicate stack entry is never emitted twice.
finished: FixedBitSet,
}
/// Trait alias for a predicate over a [`DynObjectRel`]:
/// any `Fn(DynObjectRel) -> bool`.
///
/// NOTE(review): nothing else in this file appears to reference this alias;
/// presumably it is consumed by other code — confirm before relying on
/// or removing it.
pub trait ObjectRelFilter = Fn(DynObjectRel) -> bool;
/// Initial capacity of the [`TopoPostOrderDfs`] stack.
///
/// This is the value passed to [`Vec::with_capacity`] when the stack is
/// created by `TopoPostOrderDfs::new`;
/// it is only a starting allocation and does not bound the stack.
///
/// The stack will need to be able to accommodate all nodes and their
/// siblings within the longest path taken by the DFS.
/// If there are many rooted objects
/// (e.g. for `tameld`),
/// this may be quite large.
///
/// The current number is arbitrary and only intended to reduce initial
/// small re-allocations;
/// it is too small for linking and too large for individual packages.
const INIT_STACK_CAP: usize = 32;
impl<'a> TopoPostOrderDfs<'a> {
/// Create a traversal over `asg` rooted at each object yielded by `init`.
///
/// The roots are placed on [`Self::stack`] in the order that `init`
///   yields them,
///     each as an unflagged ([`Ok`]) entry.
/// Both bit sets are sized to the number of objects that `asg` reports
///   at the time of this call and begin empty.
fn new(
    asg: &'a Asg,
    init: impl Iterator<Item = ObjectIndex<Object>>,
) -> Self {
    let n_objects = asg.object_count();

    // Roots are never flagged as cycles.
    let mut stack = Vec::with_capacity(INIT_STACK_CAP);
    stack.extend(init.map(Ok));

    Self {
        asg,
        stack,
        visited: FixedBitSet::with_capacity(n_objects),
        finished: FixedBitSet::with_capacity(n_objects),
    }
}
/// Queue the neighbors of `src_oi` on [`Self::stack`] for later
///   processing.
///
/// A neighbor is an object to which `src_oi` has an edge.
/// This is intended to be called when `src_oi` is first visited,
///   with `src_oi` itself _left on the stack_ beneath its neighbors;
///     the neighbors are consequently processed first,
///       which is what makes the traversal post-order.
/// Queuing work on the stack also means that the iterator can yield
///   between elements without remembering its position in each node's
///   edge list.
///
/// For each outgoing edge:
///
///   1. If the target is already in [`Self::finished`] then it has been
///        emitted and the edge is skipped.
///   2. Otherwise the edge is handed to [`Self::flag_or_cut_cycle`],
///        which either drops it (a cut),
///          flags it as an unsupported cycle,
///          or passes it through as an ordinary entry.
///
/// Cycles are flagged per edge rather than by returning an error from
///   this method;
///     doing the latter would report at most one cycle per object.
///
/// Duplicate Stack Entries Without Cycles
/// ======================================
/// [`Self::stack`] may hold the same [`ObjectIndex`] more than once even
///   when the graph is acyclic.
/// A cycle exists only when an object reappears on the path _currently
///   being visited_,
///     but the stack also holds objects that are merely waiting to be
///     visited.
///
/// Consider this graph:
///
/// ```text
///   (A) -> (B) -> (D)
///    '---> (C) <---'
/// ```
///
/// If the edge to `C` is pushed before the edge to `B`,
///   the stack may evolve as follows:
///
/// ```text
///   [A]              // root
///   [A, C, B]        // self.push_neighbors(A)
///   [A, C, B, D]     // self.push_neighbors(B)
///   [A, C, B, D, C]  // self.push_neighbors(D)
/// ```
///
/// `C` has no edge back to any object before it,
///   so there is no cycle despite `C` appearing twice.
///
/// Consequently,
///   whoever removes entries from the stack must consult
///   [`Self::finished`] so that an object is not emitted a second time.
fn push_neighbors(&mut self, src_oi: ObjectIndex<Object>) {
    // Copy the graph reference out so that iterating its edges does not
    //   hold a borrow of `self` while the stack is being mutated.
    let asg = self.asg;

    for rel in asg.edges_dyn(src_oi) {
        let already_emitted =
            self.finished.contains((*rel.target()).into());

        if already_emitted {
            continue;
        }

        // `None` here is a cut of a permitted cycle.
        if let Some(entry) =
            Self::flag_or_cut_cycle(&self.visited, asg, rel)
        {
            self.stack.push(entry);
        }
    }
}
/// Classify an edge about to be placed on the stack,
///   flagging or cutting it if it would close a cycle.
///
/// The caller is expected to have already ruled out targets present in
///   [`Self::finished`].
/// Given that,
///   a target found in `visited` has begun processing but has not been
///   emitted,
///     and so is taken to be on the active DFS path;
///       following the edge would revisit it and form a cycle.
///
/// The result is one of:
///
///   - `Some(Ok(target))` if the target has not been visited
///       (an ordinary edge);
///   - `None` if the target has been visited and
///       [`DynObjectRel::can_recurse`] permits the cycle,
///         in which case the edge is _cut_,
///           as if it did not exist; or
///   - `Some(Err(target))` if the target has been visited and the cycle
///       is not permitted.
///
/// It is the responsibility of the graph's ontology to ensure that
///   cutting permitted cycles still yields a valid ordering.
///
/// The [`Err`] variant carries only the [`ObjectIndex`] as a flag.
/// Constructing the full [`Cycle`] is deferred until the entry is
///   retrieved from [`Self::stack`]
///     (see [`Self::find_cycle_path`]),
///       which keeps stack elements small and avoids allocating for a
///       path at this point.
fn flag_or_cut_cycle(
    visited: &FixedBitSet,
    asg: &Asg,
    dyn_oi: DynObjectRel,
) -> Option<Result<ObjectIndex<Object>, ObjectIndex<Object>>> {
    let target = *dyn_oi.target();

    // Common case: not on the active path, so no cycle is possible.
    if !visited.contains(target.into()) {
        return Some(Ok(target));
    }

    if dyn_oi.can_recurse(asg) {
        // Permitted recursion: cut the edge.
        None
    } else {
        Some(Err(target))
    }
}
/// Peek at the entry atop [`Self::stack`] without removing it.
///
/// Returns [`None`] if the stack is empty.
/// If the entry was flagged as a cycle by [`Self::flag_or_cut_cycle`],
///   the path of that cycle is computed by [`Self::find_cycle_path`]
///   and a [`Cycle`] is returned in its place;
///     otherwise the [`ObjectIndex`] is returned as-is.
///
/// See also [`Self::pop_next_oi`].
fn next_oi(&self) -> Option<Result<ObjectIndex<Object>, Cycle>> {
    let top = *self.stack.last()?;

    Some(match top {
        Ok(oi) => Ok(oi),
        Err(oi) => Err(self.find_cycle_path(oi)),
    })
}
/// Discard the entry atop [`Self::stack`],
///   if there is one.
///
/// Nothing ties this call to the entry previously returned by
///   [`Self::next_oi`];
///     a better API for the future would take ownership over the stack
///     so that the element removed is known to be the one that was
///     inspected.
fn pop_next_oi(&mut self) {
    let _ = self.stack.pop();
}
/// Compute the path of the cycle that would be formed by appending
///   `next` to the current DFS path.
///
/// This makes a single reverse pass over [`Self::stack`] and allocates a
///   new [`Vec`] for the result.
/// Cycles are an error case and are expected to be rare,
///   so the path is reconstructed here on demand rather than maintained
///   during the search,
///     where it would almost always be discarded.
///
/// The reconstruction works as follows:
///
///   1. [`Self::stack`] holds both the objects on the active path and
///        neighbors that are still awaiting processing,
///          so the latter have to be filtered out.
///
///   2. Entries flagged [`Err`] were never traversed and so cannot be
///        part of the path;
///          they are skipped.
///      This also skips the flagged entry for `next` itself,
///        and so _the result is the same whether or not that entry has
///        already been pop'd_.
///
///   3. The stack is walked from the top down,
///        stopping at the first [`Ok`] entry equal to `next`.
///      This truncates the path at the point where the cycle begins and
///        yields it in post-order,
///          consistent with the ordering of [`Self`]'s traversal.
///
///   4. Of the entries encountered before that point,
///        only those marked in [`Self::visited`] are kept;
///          an object is marked just before its neighbors are pushed,
///            so unmarked entries are neighbors awaiting processing.
///
///   5. Each index is passed through [`ObjectIndex::resolve_span`] so
///        that it carries the [`Span`] of the target object,
///          which diagnostics require.
///
/// The returned path is therefore reversed,
///   with `next` as its _last_ element.
/// `next` is _not_ duplicated as the first element,
///   so repeating the path end-to-end
///     (e.g. using [`Iterator::cycle`])
///   reproduces the cycle.
fn find_cycle_path(&self, next: ObjectIndex<Object>) -> Cycle {
    let mut path = Vec::new();

    for entry in self.stack.iter().rev() {
        // Flagged entries were never traversed.
        let Ok(oi) = *entry else { continue };

        // Reached the start of the cycle.
        if oi == next {
            break;
        }

        // Unvisited entries are neighbors awaiting processing.
        if self.visited.contains(oi.into()) {
            path.push(oi.resolve_span(self.asg));
        }
    }

    // The walk stops _at_ `next` without recording it.
    path.push(next.resolve_span(self.asg));

    Cycle { path }
}
}
/// Drives the post-order DFS,
///   yielding one object or one unsupported cycle per call.
impl<'a> Iterator for TopoPostOrderDfs<'a> {
    type Item = Result<ObjectIndex<Object>, Cycle>;

    /// Advance the traversal until there is something to emit.
    ///
    /// Returns [`None`] once [`TopoPostOrderDfs::stack`] is empty.
    /// An entry flagged as a cycle is removed from the stack and yielded
    ///   as an [`Err`].
    /// An object seen for the first time is marked as visited and left
    ///   on the stack while its neighbors are pushed above it.
    /// An object seen again is removed from the stack and yielded as
    ///   [`Ok`],
    ///     unless it was already emitted via a duplicate stack entry,
    ///       in which case it is silently dropped.
    fn next(&mut self) -> Option<Self::Item> {
        // Written as a loop rather than recursively since Rust does not
        //   guarantee tail call optimization (as of 2023-04).
        loop {
            let oi = match self.next_oi()? {
                Ok(oi) => oi,
                Err(cycle) => {
                    self.pop_next_oi();
                    return Some(Err(cycle));
                }
            };

            // `put` reports whether the bit was previously set.
            let first_visit = !self.visited.put(oi.into());

            if first_visit {
                // Leave `oi` on the stack beneath its neighbors.
                self.push_neighbors(oi);
                continue;
            }

            self.pop_next_oi();

            // Duplicate stack entries must not be emitted twice;
            //   see `Self::push_neighbors` for explanation.
            let already_emitted = self.finished.put(oi.into());

            if !already_emitted {
                return Some(Ok(oi));
            }
        }
    }
}
/// A graph cycle.
///
/// A cycle means that a path contains a duplicate node,
/// as if it looped back on itself.
/// In terms of TAME,
/// a cycle implies a circular dependency.
///
/// Identifying Cycle Objects
/// =========================
/// TODO: Object names need to be derived from the cycle to display
/// concisely to the user.
/// The cycle very likely contains identifiers that can be used to describe
/// the cycle in more concise terms.
///
/// It used to be the case that cycles contained identifier names,
/// but that was before the topological sort was generalized to include
/// all graph objects;
/// see the commit that introduced this message for more information.
///
/// TODO: We also ought to represent the spans associated with _references_,
/// _in addition to_ just the referenced object.
#[derive(Debug, PartialEq)]
pub struct Cycle {
/// The path representing the cycle in post-order (reversed).
///
/// It is expected that [`ObjectIndex`]'s associated [`Span`] has been
/// resolved to that of the target object
/// (e.g. using [`ObjectIndex::resolve_span`]).
/// This allows the indexes to be useful in a diagnostic context.
///
/// The path is expected to be non-empty:
/// the traversal always appends the object that closes the cycle as
/// the final element,
/// and the [`Diagnostic`] implementation will panic if the path has
/// no last element.
///
/// See [`Self::path_rev`] for more information.
path: Vec<ObjectIndexResolvedSpan<Object>>,
}
impl Cycle {
    /// Borrow the cycle's path in post-order (reversed).
    ///
    /// The path begins where the cycle begins and omits the final node
    ///   of the cycle,
    ///     since that node is the same as the first;
    ///       repeating the path indefinitely
    ///         (e.g. with [`Iterator::cycle`])
    ///       therefore reproduces the cycle exactly.
    ///
    /// Each [`ObjectIndex`] is expected to carry a [`Span`] resolved
    ///   against its target object so that it is useful in a diagnostic
    ///   context.
    pub fn path_rev(&self) -> &Vec<ObjectIndexResolvedSpan<Object>> {
        let Self { path } = self;
        path
    }
}
/// Placeholder rendering of a [`Cycle`].
///
/// The output does not yet describe the objects involved;
///   it is always the fixed text `[...]`.
impl Display for Cycle {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        // TODO: See note on [`Cycle`] about deriving names.
        f.write_str("[...]")
    }
}
// Marker implementation:
// `Cycle` already provides the `Debug` (derived) and `Display`
// implementations that `Error` requires,
// and no methods of `Error` are overridden.
impl Error for Cycle {}
/// Renders a [`Cycle`] as a numbered chain of annotated spans.
impl Diagnostic for Cycle {
    /// Describe the cycle one object at a time.
    ///
    /// The stored path is reversed
    ///   (see [`Cycle::path_rev`]),
    ///     so it is walked back-to-front here to present the cycle in
    ///     dependency order.
    /// Each object receives a note numbered `[i/n]`,
    ///   where `n` is the length of the path.
    /// The final annotation is an error placed on the last element of
    ///   the stored path,
    ///     marking where the cycle closes.
    ///
    /// # Panics
    /// Panics if the path is empty.
    fn describe(&self) -> Vec<AnnotatedSpan> {
        let total = self.path.len();
        let closing = self.path.last().unwrap();

        let mut annotations = Vec::with_capacity(total + 1);

        // TODO: See note on [`Cycle`] about deriving names.
        for (step, oi) in self.path.iter().rev().enumerate() {
            let label = if step == 0 {
                format!("[0/{total}] the cycle begins here, depending on...")
            } else {
                // TODO: s/object/<TYPE OF OBJECT>/
                format!("[{step}/{total}] ...this object, which depends on...")
            };

            annotations.push(oi.note(label));
        }

        annotations.push(closing.error(format!(
            "[{total}/{total}] ...the object once again, \
                creating the cycle"
        )));

        annotations
    }
}
#[cfg(test)]
mod test;