<?xml version="1.0"?>
|
|
<!--
|
|
Copyright (C) 2014-2019 Ryan Specialty Group, LLC.
|
|
|
|
This file is part of tame-core.
|
|
|
|
tame-core is free software: you can redistribute it and/or modify it
|
|
under the terms of the GNU General Public License as
|
|
published by the Free Software Foundation, either version 3 of the
|
|
License, or (at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
-->
|
|
<package xmlns="http://www.lovullo.com/rater"
|
|
xmlns:c="http://www.lovullo.com/calc"
|
|
xmlns:t="http://www.lovullo.com/rater/apply-template"
|
|
core="true"
|
|
desc="Filtering Vectors and Matrices">
|
|
|
|
<import package="../base" />
|
|
<import package="../when" />
|
|
<import package="list" />
|
|
|
|
|
|
<!-- Integer codes selecting the comparison performed by the matrix
     filtering functions.  The numeric order is significant: mrange tests
     whether op is less than or equal to CMP_OP_LTE to decide if its
     sorted-data early exit applies, so EQ, LT and LTE must remain the
     three lowest values. -->
<typedef name="CmpOp"
         desc="Comparison operator">
  <enum type="integer">
    <!-- DO NOT REORDER; see mrange 'over' check -->
    <item name="CMP_OP_EQ"  value="1" desc="Equal (=)" />
    <item name="CMP_OP_LT"  value="2" desc="Less than (&lt;)" />
    <item name="CMP_OP_LTE" value="3" desc="Less than or equal to (&lt;=)" />
    <item name="CMP_OP_GT"  value="4" desc="Greater than (&gt;)" />
    <item name="CMP_OP_GTE" value="5" desc="Greater than or equal to (&gt;=)" />
  </enum>
</typedef>
|
|
|
|
|
|
<section title="Vector Filtering">
|
|
<!-- Looks up values of vector_src by matching positions in vector_pred.
     Iteration is delegated to the cons-until-empty template (imported;
     its exact contract is defined outside this file), which is given
     vector_pred as the set, `value' as the `only' filter and start_index
     as the index.  The body supplies the element of vector_src at
     start_index. -->
<function name="vfilter_lookup"
          desc="Filter predicate by value and use corresponding index in source vector as a value">
  <param name="vector_pred" type="float" set="vector"
         desc="Vector to filter" />
  <param name="vector_src" type="float" set="vector"
         desc="Source vector to take values from" />
  <param name="value" type="float"
         desc="Predicate value" />
  <param name="start_index" type="integer"
         desc="Starting vector_pred index" />

  <t:cons-until-empty set="vector_pred" only="value" index="start_index">
    <!-- value taken from the source vector at the current index -->
    <c:value-of name="vector_src" index="start_index" />
  </t:cons-until-empty>
</function>
|
|
|
|
|
|
\ref{_vfilter-mask_} allows filtering a vector using a boolean vector as
|
|
a mask.
|
|
If an index in the mask is~$0$,
|
|
then that corresponding index in the source vector will be removed.
|
|
The mask vector \should be the same length as the source vector.\footnote{
|
|
Remember that TAME treats undefined values as~$0$.}
|
|
|
|
<!-- Template front-end for the _vfilter_mask function.  The vector to be
     filtered is either referenced by @name@ or, when no name is given,
     built inline from the template body (@values@).  @mask@ is passed
     through unchanged as the mask argument. -->
<template name="_vfilter-mask_"
          desc="Filter vector using a binary vector as a mask">
  <param name="@values@" desc="Inline vector" />
  <param name="@name@"   desc="Named vector (in place of inline)" />
  <param name="@mask@"   desc="Mask vector" />

  <c:apply name="_vfilter_mask" mask="@mask@">
    <c:arg name="vector">
      <!-- no name supplied: wrap the template body in an inline vector -->
      <unless name="@name@">
        <c:vector>
          <param-copy name="@values@" />
        </c:vector>
      </unless>

      <!-- name supplied: use the named vector as-is -->
      <if name="@name@">
        <c:value-of name="@name@" />
      </if>
    </c:arg>
  </c:apply>
</template>
|
|
|
|
|
|
<!-- Recursive worker behind the _vfilter-mask_ template.  The source
     vector and the mask are consumed in lockstep (both are cdr'd on every
     recursion).  When the head of the mask equals FALSE the head of the
     source vector is dropped; otherwise it is consed onto the result.
     Recursion ends with an empty vector once the source vector has
     length zero. -->
<function name="_vfilter_mask"
          desc="Filter source vector using binary vector as a mask">
  <param name="vector" type="float" set="vector"
         desc="Source vector" />
  <param name="mask" type="integer" set="vector"
         desc="Binary vector used to filter source vector" />

  <c:let>
    <c:values>
      <!-- head of the mask decides the fate of the head of the vector -->
      <!-- TODO: support constants in @index -->
      <c:value name="curmask" type="integer"
               desc="Current mask value">
        <c:value-of name="mask">
          <c:index>
            <c:value-of name="#0" />
          </c:index>
        </c:value-of>
      </c:value>

      <!-- remaining element count; zero terminates the recursion -->
      <c:value name="length" type="integer"
               desc="Length of source vector">
        <c:length-of>
          <c:value-of name="vector" />
        </c:length-of>
      </c:value>
    </c:values>

    <c:cases>
      <!-- base case: nothing left to filter -->
      <c:case label="No more elements in source vector">
        <t:when-eq name="length" value="#0" />

        <c:vector />
      </c:case>

      <!-- masked out: continue with the tails, discarding the head -->
      <c:case label="Skip non-match">
        <t:when-eq name="curmask" value="FALSE" />

        <c:recurse>
          <c:arg name="mask">
            <c:cdr>
              <c:value-of name="mask" />
            </c:cdr>
          </c:arg>
          <c:arg name="vector">
            <c:cdr>
              <c:value-of name="vector" />
            </c:cdr>
          </c:arg>
        </c:recurse>
      </c:case>

      <!-- kept: prepend the head to the filtered tails -->
      <c:otherwise>
        <c:cons>
          <c:value-of name="vector">
            <c:index>
              <c:value-of name="#0" />
            </c:index>
          </c:value-of>

          <c:recurse>
            <c:arg name="mask">
              <c:cdr>
                <c:value-of name="mask" />
              </c:cdr>
            </c:arg>
            <c:arg name="vector">
              <c:cdr>
                <c:value-of name="vector" />
              </c:cdr>
            </c:arg>
          </c:recurse>
        </c:cons>
      </c:otherwise>
    </c:cases>
  </c:let>
</function>
|
|
</section>
|
|
|
|
|
|
<section title="Matrix Filtering">
|
|
\ref{mfilter} handles complex filtering of matrices.
|
|
If the requested column~\tt{@col@} is marked as sequential with~\tt{@seq@}
|
|
\emph{and} the comparison operator is~\ref{CMP_OP_EQ},
|
|
then an~$O(lg n)$ bisect algorithm will be used;
|
|
otherwise,
|
|
it will undergo a~$O(n)$ linear scan.
|
|
|
|
<!-- Entry point for matrix filtering.  mrange is applied once per value
     in vals (iteration and merging are handled by the imported
     merge-until-empty template, with `val' named as the car), scanning up
     to the last row of the matrix.  The scan starts at row 0 unless the
     data is flagged sequential and the operator is CMP_OP_EQ, in which
     case bisect supplies the starting row. -->
<function name="mfilter"
          desc="Filter matrix rows by column value">
  <param name="matrix" type="float"   set="matrix" desc="Matrix to filter" />
  <param name="col"    type="integer" desc="Column index to filter on" />
  <param name="vals"   type="float"   desc="Column value to filter on" />
  <param name="seq"    type="boolean" desc="Is data sequential?" />
  <param name="op"     type="integer" desc="Comparison operator" />

  <!-- The results for each condition in vals are merged into a single
       set, which allows several conditions on one column of data (or
       just one).  Looking each value up separately keeps the condition
       bisect-able. -->
  <t:merge-until-empty set="vals" car="val" glance="TABLE_WHEN_MASK_VALUE">
    <c:apply name="mrange" matrix="matrix" col="col" val="val" seq="seq" op="op">
      <!-- last row index: length of the matrix less one -->
      <c:arg name="end">
        <t:dec>
          <c:length-of>
            <c:value-of name="matrix" />
          </c:length-of>
        </t:dec>
      </c:arg>

      <c:arg name="start">
        <c:cases>
          <!-- Sequential data may let us skip most of the linear search
               (worthwhile when the data set is large and the column
               value is relatively distinct). -->
          <!-- TODO: bisect is currently only performed for CMP_OP_EQ -->
          <c:case>
            <t:when-eq name="seq" value="TRUE" />
            <t:when-eq name="op" value="CMP_OP_EQ" />

            <c:apply name="bisect" matrix="matrix" col="col" val="val">
              <!-- bisect the full length of the matrix -->
              <c:arg name="end">
                <t:dec>
                  <c:length-of>
                    <c:value-of name="matrix" />
                  </c:length-of>
                </t:dec>
              </c:arg>

              <c:arg name="start">
                <c:const value="0" desc="Start bisect at beginning" />
              </c:arg>
            </c:apply>
          </c:case>

          <!-- no good guess available; fall back to a linear search -->
          <c:otherwise>
            <c:const value="0" desc="Start at the first element" />
          </c:otherwise>
        </c:cases>
      </c:arg>
    </c:apply>
  </t:merge-until-empty>
</function>
|
|
|
|
|
|
<!-- Linear scan step over rows start..end (inclusive) of the matrix.
     Yields an empty vector when start has passed end, or when the data
     is sequential, the operator is EQ/LT/LTE, and the current column
     value already exceeds val.  Otherwise the current column value is
     handed to _mrange_cmp, which performs the comparison and calls back
     into this function for the next row. -->
<function name="mrange"
          desc="Filter matrix rows by column value within a certain range of indexes (inclusive)">
  <param name="matrix" type="float"   set="matrix" desc="Matrix to filter" />
  <param name="col"    type="integer" desc="Column index to filter on" />
  <param name="val"    type="float"   desc="Column value to filter on" />
  <param name="start"  type="integer" desc="Starting index (inclusive)" />
  <param name="end"    type="integer" desc="Ending index (inclusive)" />
  <param name="seq"    type="boolean" desc="Is data sequential?" />
  <param name="op"     type="integer" desc="Comparison operator" />

  <c:let>
    <c:values>
      <!-- matrix[start][col] -->
      <c:value name="curval" type="float" desc="Current value">
        <c:value-of name="matrix">
          <c:index>
            <c:value-of name="start" />
          </c:index>

          <c:index>
            <c:value-of name="col" />
          </c:index>
        </c:value-of>
      </c:value>
    </c:values>

    <!-- a second let is required so that curval is visible to `over' -->
    <c:let>
      <c:values>
        <!-- In a sorted column, once the current value exceeds the
             requested one the requested value cannot appear later. -->
        <c:value name="over" type="boolean"
                 desc="Did we pass the potential value in a sorted list?">
          <c:value-of name="TRUE">
            <t:when-gt name="curval" value="val" />
            <t:when-eq name="seq" value="TRUE" />
          </c:value-of>
        </c:value>
      </c:values>

      <c:cases>
        <!-- range exhausted: nothing more to return -->
        <c:case>
          <t:when-gt name="start" value="end" />

          <c:vector />
        </c:case>

        <!-- Sorted data and already past the requested value: stop early.
             The operator test relies on the CmpOp ordering and covers
             equality and LT{,E} only (a GT{,E} version is still
             needed). -->
        <c:case>
          <t:when-eq name="over" value="TRUE" />
          <t:when-lte name="op" value="CMP_OP_LTE" />

          <c:vector />
        </c:case>

        <!-- otherwise compare the current row's column value -->
        <c:otherwise>
          <c:apply name="_mrange_cmp" matrix="matrix" col="col" val="val"
                   start="start" end="end" seq="seq" op="op"
                   cur="curval" />
        </c:otherwise>
      </c:cases>
    </c:let>
  </c:let>
</function>
|
|
|
|
|
|
<!-- Comparison half of the mrange scan (mutually recursive with mrange).
     Compares cur against val using the operator selected by op.  On a
     match, the row at index start is consed onto the result of mrange
     applied from start + 1; otherwise the result is simply mrange applied
     from start + 1.  If op matches none of the CMP_OP_* cases below, no
     case sets `found', so the no-match branch is presumably taken
     (TAME treats undefined values as 0). -->
<function name="_mrange_cmp" desc="mrange helper for value comparison">
  <param name="matrix" type="float"   set="matrix" desc="Matrix to filter" />
  <param name="col"    type="integer" desc="Column index to filter on" />
  <param name="val"    type="float"   desc="Column value to filter on" />
  <param name="start"  type="integer" desc="Starting index (aka current index)" />
  <param name="end"    type="integer" desc="Ending index" />
  <!-- boolean, matching the declaration of seq on mrange and mfilter -->
  <param name="seq"    type="boolean" desc="Is data sequential?" />
  <param name="op"     type="integer" desc="Comparison operator" />
  <param name="cur"    type="float"   desc="Current value" />

  <c:let>
    <c:values>
      <c:value name="found" type="boolean"
               desc="Whether comparison matches">
        <!-- exactly one case applies for any valid CmpOp -->
        <c:cases>
          <c:case label="Equal">
            <t:when-eq name="op" value="CMP_OP_EQ" />

            <c:value-of name="TRUE">
              <t:when-eq name="cur" value="val" />
            </c:value-of>
          </c:case>

          <c:case label="Less than">
            <t:when-eq name="op" value="CMP_OP_LT" />

            <c:value-of name="TRUE">
              <t:when-lt name="cur" value="val" />
            </c:value-of>
          </c:case>

          <c:case label="Less than or equal to">
            <t:when-eq name="op" value="CMP_OP_LTE" />

            <c:value-of name="TRUE">
              <t:when-lte name="cur" value="val" />
            </c:value-of>
          </c:case>

          <c:case label="Greater than">
            <t:when-eq name="op" value="CMP_OP_GT" />

            <c:value-of name="TRUE">
              <t:when-gt name="cur" value="val" />
            </c:value-of>
          </c:case>

          <c:case label="Greater than or equal to">
            <t:when-eq name="op" value="CMP_OP_GTE" />

            <c:value-of name="TRUE">
              <t:when-gte name="cur" value="val" />
            </c:value-of>
          </c:case>
        </c:cases>
      </c:value>
    </c:values>

    <c:cases>
      <!-- if the value matches, cons the whole row onto the result -->
      <c:case>
        <t:when-eq name="found" value="TRUE" />

        <c:cons>
          <c:value-of name="matrix">
            <c:index>
              <c:value-of name="start" />
            </c:index>
          </c:value-of>

          <c:apply name="mrange" matrix="matrix" col="col" val="val"
                   end="end" seq="seq" op="op">
            <c:arg name="start">
              <c:sum>
                <c:value-of name="start" />
                <c:const value="1" desc="Check next element" />
              </c:sum>
            </c:arg>
          </c:apply>
        </c:cons>
      </c:case>

      <!-- no match, continue (mutual) recursion -->
      <c:otherwise>
        <c:apply name="mrange" matrix="matrix" col="col" val="val"
                 end="end" seq="seq" op="op">
          <c:arg name="start">
            <c:sum>
              <c:value-of name="start" />
              <c:const value="1" desc="Check next element" />
            </c:sum>
          </c:arg>
        </c:apply>
      </c:otherwise>
    </c:cases>
  </c:let>
</function>
|
|
|
|
|
|
<section title="Bisecting">
|
|
Perform an~$O(lg n)$ bisect on a data set.
|
|
|
|
This is intended to limit recursion on very large data sets (and
|
|
consequently will increase performance).
|
|
This will bisect up until a certain point (the gap),
|
|
unless it finds the value in question.
|
|
After finding the value,
|
|
it will perform an~$O(n)$ linear backward search to find the first
|
|
occurrence of the value.
|
|
If the value is not found,
|
|
it will halt at the gap and return the first index of the gap,
|
|
which we will consider its "best guess",
|
|
at which point a linear search can be performed by the caller to
|
|
determine if the value does in fact exist at all.
|
|
|
|
(The reason this operates so oddly is because of its caller;
|
|
we could get rid of the gap and make this a general-purpose function if need be.
|
|
Technically,
|
|
the gap is useless and saves $lg g$ steps,
|
|
which may be very small.)
|
|
|
|
|
|
<!-- Sentinel yielded by the _mask-unless_ template for masked values and
     passed by mfilter as the `glance' argument of merge-until-empty. -->
<const name="TABLE_WHEN_MASK_VALUE"
       type="float"
       value="-0.0000001"
       desc="Used to mask when conditions" />

<!-- bisect stops narrowing once (end - start) is at or below this value -->
<const name="MFILTER_BISECT_GAP_MAX"
       type="integer"
       value="10"
       desc="Quit bisect if size is less than or equal to this value" />
|
|
|
|
|
|
<!-- Narrows the row range start..end toward the first row whose column
     col holds val, assuming that column is sorted ascending.

     Returns `start' once (end - start) is at most MFILTER_BISECT_GAP_MAX.
     Otherwise the row at start + ceil(gap / 2) is probed: a lower value
     moves `start' to that row, a higher value moves `end' to it, and an
     exact match is resolved by `foremost', which walks backwards to the
     first row holding the same value.  The returned index is therefore a
     starting point for a linear search, not a guarantee that val exists. -->
<function name="bisect"
          desc="Bisect a matrix toward the requested column value">
  <param name="matrix" type="float"   set="matrix" desc="Matrix to bisect" />
  <param name="col"    type="integer" desc="Column index to filter on" />
  <param name="val"    type="float"   desc="Column value to filter on" />
  <param name="start"  type="integer" desc="Start index" />
  <param name="end"    type="integer" desc="End index" />

  <c:let>
    <c:values>
      <!-- the gap represents the number of indexes between the current
           start and end indexes -->
      <c:value name="gap" type="integer" desc="Gap between current start and end">
        <c:sum>
          <c:value-of name="end" />

          <t:negate>
            <c:value-of name="start" />
          </t:negate>
        </c:sum>
      </c:value>
    </c:values>

    <!--
      At this point, we need to determine if we should continue the bisect
      or halt.  The purpose of the gap is based on the understanding that
      (with our use cases) we will arrive, almost always, at one of two
      scenarios: we found the value, but it's part of a larger set of the
      same values, or the value we are looking for may not even exist at
      all.

      The gap just limits recursion (but just barely) at smaller levels,
      since bisect will take lg(n) steps.  Increase the gap limit to
      decrease the number of steps, or decrease it to 1 if you want a
      complete bisection.
    -->
    <c:cases>
      <!-- give up if we've reached our gap limit -->
      <c:case>
        <t:when-lte name="gap" value="MFILTER_BISECT_GAP_MAX" />

        <!-- we tried our best; return our current position -->
        <c:value-of name="start" />
      </c:case>

      <!-- we have not yet reached our gap limit; keep going -->
      <c:otherwise>
        <c:let>
          <c:values>
            <c:value name="mid_index" type="integer" desc="Middle index">
              <!-- to determine the new mid index, add half of the gap
                   (rounded up) to the current start index -->
              <c:sum>
                <c:value-of name="start" />
                <c:ceil>
                  <c:quotient>
                    <c:value-of name="gap" />
                    <c:const value="2" desc="Bisect" />
                  </c:quotient>
                </c:ceil>
              </c:sum>
            </c:value>
          </c:values>

          <!-- nested let so that mid_index is available to `mid' -->
          <c:let>
            <c:values>
              <c:value name="mid" type="float" desc="Middle value">
                <c:value-of name="matrix">
                  <!-- row -->
                  <c:index>
                    <c:value-of name="mid_index" />
                  </c:index>

                  <!-- column -->
                  <c:index>
                    <c:value-of name="col" />
                  </c:index>
                </c:value-of>
              </c:value>
            </c:values>

            <c:cases>
              <!-- if the middle value is lower than our value, then take
                   the upper half -->
              <c:case>
                <t:when-lt name="mid" value="val" />

                <c:recurse start="mid_index" />
              </c:case>

              <!-- similarly, take the lower half if we over-shot -->
              <c:case>
                <t:when-gt name="mid" value="val" />

                <c:recurse end="mid_index" />
              </c:case>

              <!-- if we have an exact match, that doesn't necessarily mean
                   that we have every value; we may have intersected a set
                   of them -->
              <c:otherwise>
                <!-- this will return an exact index: the first index
                     containing the element we've been looking for (it is
                     a linear backwards search) -->
                <c:apply name="foremost" matrix="matrix" col="col" i="mid_index" />
              </c:otherwise>
            </c:cases>
          </c:let>
        </c:let>
      </c:otherwise>
    </c:cases>
  </c:let>
</function>
|
|
|
|
|
|
<!-- Starting at row i, steps backwards one row at a time while the
     previous row holds the same value in column col, and returns the
     index at which that stops (or 0 when the first row is reached). -->
<function name="foremost"
          desc="Search backwards for the first occurrence in a sorted list">
  <param name="matrix" type="float"   set="matrix" desc="Matrix to bisect" />
  <param name="col"    type="integer" desc="Column index to search on" />
  <param name="i"      type="integer" desc="Current index" />

  <c:let>
    <c:values>
      <!-- we calculate this rather than accept it via an argument so that
           this function may be called directly in a more convenient
           manner -->
      <c:value name="val" type="float" desc="Current value">
        <c:value-of name="matrix">
          <!-- row -->
          <c:index>
            <c:value-of name="i" />
          </c:index>

          <!-- column -->
          <c:index>
            <c:value-of name="col" />
          </c:index>
        </c:value-of>
      </c:value>

      <!-- value in the row before i; the i = 0 case below is tested
           before this value is ever compared -->
      <c:value name="prev" type="float" desc="Previous value">
        <c:value-of name="matrix">
          <!-- row -->
          <c:index>
            <t:dec>
              <c:value-of name="i" />
            </t:dec>
          </c:index>

          <!-- column -->
          <c:index>
            <c:value-of name="col" />
          </c:index>
        </c:value-of>
      </c:value>
    </c:values>

    <c:cases>
      <!-- if we have no more indexes to check, then we're done -->
      <c:case>
        <t:when-eq name="i" value="#0" />

        <c:value-of name="i" />
      </c:case>

      <!-- if the previous column value is the same value, then continue
           checking -->
      <c:case>
        <t:when-eq name="prev" value="val" />

        <c:recurse>
          <c:arg name="i">
            <t:dec>
              <c:value-of name="i" />
            </t:dec>
          </c:arg>
        </c:recurse>
      </c:case>

      <!-- otherwise, we've found the foremost index -->
      <c:otherwise>
        <c:value-of name="i" />
      </c:otherwise>
    </c:cases>
  </c:let>
</function>
|
|
|
|
|
|
<!-- Yields TABLE_WHEN_MASK_VALUE when @name@ (optionally at @index@)
     equals FALSE; otherwise yields the template body unchanged.  @desc@
     is declared but not referenced within this template. -->
<template name="_mask-unless_"
          desc="Mask a value unless the condition is truthful">
  <param name="@values@" desc="Body" />
  <param name="@name@"   desc="Scalar to check" />
  <param name="@index@"  desc="Optional index" />
  <param name="@desc@"   desc="Optional description" />

  <c:cases>
    <!-- condition is false: substitute the mask value -->
    <c:case>
      <t:when-eq name="@name@" index="@index@" value="FALSE" />

      <!-- TODO: configurable mask via meta and/or param -->
      <c:value-of name="TABLE_WHEN_MASK_VALUE" />
    </c:case>

    <!-- condition holds: pass the body through -->
    <c:otherwise>
      <param-copy name="@values@" />
    </c:otherwise>
  </c:cases>
</template>
|
|
</section>
|
|
</section>
|
|
</package>
|