tame/core/vector/filter.xml

<?xml version="1.0"?>
<!--
  Copyright (C) 2014-2023 Ryan Specialty, LLC.

  This file is part of tame-core.

  tame-core is free software: you can redistribute it and/or modify it
  under the terms of the GNU General Public License as
  published by the Free Software Foundation, either version 3 of the
  License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
-->
<package xmlns="http://www.lovullo.com/rater"
         xmlns:c="http://www.lovullo.com/calc"
         xmlns:t="http://www.lovullo.com/rater/apply-template"
         core="true"
         desc="Filtering Vectors and Matrices">

  <import package="../base" />
  <import package="../when" />
  <import package="list" />


  <typedef name="CmpOp"
           desc="Comparison operator">
    <enum type="integer">
      <!-- DO NOT REORDER; see mrange 'over' check.
           mrange_accum decides whether it may stop scanning early by testing
           `op <= CMP_OP_LTE', which only works while EQ, LT and LTE hold
           the three lowest values and GT/GTE sit above them. -->
      <item name="CMP_OP_EQ"  value="1" desc="Equal (=)" />
      <item name="CMP_OP_LT"  value="2" desc="Less than (&lt;)" />
      <item name="CMP_OP_LTE" value="3" desc="Less than or equal to (&lt;=)" />
      <item name="CMP_OP_GT"  value="4" desc="Greater than (&gt;)" />
      <item name="CMP_OP_GTE" value="5" desc="Greater than or equal to (&gt;=)" />
    </enum>
  </typedef>


  <section title="Vector Filtering">
    <!-- Builds a vector out of elements of vector_src, driven by the
         contents of vector_pred and beginning at start_index.  The
         iteration itself, and the exact meaning of the `only' attribute,
         come from the cons-until-empty template, which is defined outside
         of this file; presumably only indexes whose vector_pred element
         equals `value' contribute their vector_src element (TODO: confirm
         against the template definition). -->
    <function name="vfilter_lookup"
              desc="Filter predicate by value and use corresponding index in
                    source vector as a value">
      <param name="vector_pred" type="float" set="vector" desc="Vector to filter" />
      <param name="vector_src" type="float" set="vector" desc="Source vector supplying the resulting values" />
      <param name="value"  type="float"              desc="Predicate value" />
      <param name="start_index" type="integer" desc="Starting vector_pred index" />

      <t:cons-until-empty set="vector_pred" only="value" index="start_index">
        <!-- value contributed for the current index -->
        <c:value-of name="vector_src" index="start_index" />
      </t:cons-until-empty>
    </function>


    \ref{_vfilter-mask_} allows filtering a vector using a boolean vector as
      a mask.
    If an index in the mask is~$0$,
      then that corresponding index in the source vector will be removed.
    The mask vector \should be the same length as the source vector.\footnote{
      Remember that TAME treats undefined values as~$0$.}

    <template name="_vfilter-mask_"
              desc="Filter vector using a binary vector as a mask">
      <param name="@values@" desc="Inline vector" />
      <param name="@name@"   desc="Named vector (in place of inline)" />
      <param name="@mask@"   desc="Mask vector" />

      <!-- Delegates to the _vfilter_mask function; the template only
           decides where the source vector comes from -->
      <c:apply name="_vfilter_mask">
        <c:arg name="mask">
          <c:value-of name="@mask@" />
        </c:arg>

        <c:arg name="vector">
          <!-- no @name@ given: wrap the template body in an inline vector -->
          <unless name="@name@">
            <c:vector>
              <param-copy name="@values@" />
            </c:vector>
          </unless>

          <!-- @name@ given: reference the named vector directly -->
          <if name="@name@">
            <c:value-of name="@name@" />
          </if>
        </c:arg>
      </c:apply>
    </template>


    <function name="_vfilter_mask"
              desc="Filter source vector using binary vector as a mask">
      <param name="vector" type="float" set="vector"
             desc="Source vector" />
      <param name="mask" type="integer" set="vector"
             desc="Binary vector used to filter source vector" />

      <!-- Recursive walk: on every step the head of both vectors is
           inspected and the function recurses on their tails (cdr) until
           the source vector is exhausted -->
      <c:let>
        <c:values>
          <c:value name="curmask" type="integer"
                   desc="Current mask value">
            <c:value-of name="mask">
              <c:index>
                <c:const value="0" desc="Head of mask vector" />
              </c:index>
            </c:value-of>
          </c:value>

          <c:value name="length" type="integer"
                   desc="Length of source vector">
            <c:length-of>
              <c:value-of name="vector" />
            </c:length-of>
          </c:value>
        </c:values>

        <c:cases>
          <!-- base case: nothing left to filter -->
          <c:case label="No more elements in source vector">
            <t:when-eq name="length" value="#0" />

            <c:vector />
          </c:case>

          <!-- head is masked out: drop it and carry on with the tails -->
          <c:case label="Skip non-match">
            <t:when-eq name="curmask" value="FALSE" />

            <c:recurse>
              <c:arg name="mask">
                <c:cdr>
                  <c:value-of name="mask" />
                </c:cdr>
              </c:arg>
              <c:arg name="vector">
                <c:cdr>
                  <c:value-of name="vector" />
                </c:cdr>
              </c:arg>
            </c:recurse>
          </c:case>

          <!-- head is kept: prepend it to the filtered tail -->
          <c:otherwise>
            <c:cons>
              <c:value-of name="vector">
                <c:index>
                  <c:const value="0" desc="Head of source vector" />
                </c:index>
              </c:value-of>

              <c:recurse>
                <c:arg name="mask">
                  <c:cdr>
                    <c:value-of name="mask" />
                  </c:cdr>
                </c:arg>
                <c:arg name="vector">
                  <c:cdr>
                    <c:value-of name="vector" />
                  </c:cdr>
                </c:arg>
              </c:recurse>
            </c:cons>
          </c:otherwise>
        </c:cases>
      </c:let>
    </function>
  </section>


  <section title="Matrix Filtering">
    \ref{mfilter} handles complex filtering of matrices.
    If the requested column~\tt{@col@} is marked as sequential with~\tt{@seq@}
      \emph{and} the comparison operator is~\ref{CMP_OP_EQ},
        then an~$O(lg n)$ bisect algorithm will be used;
          otherwise,
            it will undergo an~$O(n)$ linear scan.

    <function name="mfilter"
              desc="Filter matrix rows by column value">
      <param name="matrix" type="float" set="matrix" desc="Matrix to filter" />
      <param name="col"    type="integer"            desc="Column index to filter on" />
      <param name="vals"   type="float"              desc="Column value to filter on" />
      <param name="seq"    type="boolean"            desc="Is data sequential?" />
      <param name="op"     type="integer"            desc="Comparison operator" />

      <!-- Every condition in vals is looked up on its own (as `val') and the
           individual results are merged into a single set.  This allows
           several conditions on one column while keeping each individual
           lookup eligible for a bisect. -->
      <t:merge-until-empty set="vals" car="val" glance="TABLE_WHEN_MASK_VALUE">
        <c:apply name="mrange">
          <c:arg name="matrix">
            <c:value-of name="matrix" />
          </c:arg>
          <c:arg name="col">
            <c:value-of name="col" />
          </c:arg>
          <c:arg name="val">
            <c:value-of name="val" />
          </c:arg>
          <c:arg name="seq">
            <c:value-of name="seq" />
          </c:arg>
          <c:arg name="op">
            <c:value-of name="op" />
          </c:arg>

          <!-- first row to examine -->
          <c:arg name="start">
            <c:cases>
              <!-- Sequential data queried for equality: let bisect narrow
                   down where the linear scan has to begin -->
              <!-- TODO: bisect is currently only performed for CMP_OP_EQ -->
              <c:case>
                <t:when-eq name="op" value="CMP_OP_EQ" />
                <t:when-eq name="seq" value="TRUE" />

                <c:apply name="bisect">
                  <c:arg name="matrix">
                    <c:value-of name="matrix" />
                  </c:arg>
                  <c:arg name="col">
                    <c:value-of name="col" />
                  </c:arg>
                  <c:arg name="val">
                    <c:value-of name="val" />
                  </c:arg>

                  <c:arg name="start">
                    <c:const value="0" desc="Start bisect at beginning" />
                  </c:arg>

                  <!-- the bisect covers the whole matrix, so it ends on the
                       last row index -->
                  <c:arg name="end">
                    <t:dec>
                      <c:length-of>
                        <c:value-of name="matrix" />
                      </c:length-of>
                    </t:dec>
                  </c:arg>
                </c:apply>
              </c:case>

              <!-- nothing to go by: scan linearly from the top -->
              <c:otherwise>
                <c:const value="0" desc="Start at the first element" />
              </c:otherwise>
            </c:cases>
          </c:arg>

          <!-- last row to examine is always the final row of the matrix -->
          <c:arg name="end">
            <t:dec>
              <c:length-of>
                <c:value-of name="matrix" />
              </c:length-of>
            </t:dec>
          </c:arg>
        </c:apply>
      </t:merge-until-empty>
    </function>


    <function name="mrange"
              desc="Filter matrix rows by column value within a certain
                    range of indexes (inclusive)">
      <param name="matrix" type="float" set="matrix" desc="Matrix to filter" />
      <param name="col" type="integer" desc="Column index to filter on" />
      <param name="val" type="float" desc="Column value to filter on" />
      <param name="start" type="integer" desc="Starting index (inclusive)" />
      <param name="end" type="integer" desc="Ending index (inclusive)" />
      <param name="seq" type="boolean" desc="Is data sequential?" />
      <param name="op" type="integer" desc="Comparison operator" />

      <!-- Two passes: mrange_accum collects the indexes of the matching
           rows, then _mextract_rows turns those indexes into rows -->
      <c:let>
        <c:values>
          <c:value name="matches" type="integer" set="vector"
                   desc="Matching row indexes of matrix">
            <c:apply name="mrange_accum">
              <c:arg name="matrix">
                <c:value-of name="matrix" />
              </c:arg>
              <c:arg name="col">
                <c:value-of name="col" />
              </c:arg>
              <c:arg name="val">
                <c:value-of name="val" />
              </c:arg>
              <c:arg name="start">
                <c:value-of name="start" />
              </c:arg>
              <c:arg name="end">
                <c:value-of name="end" />
              </c:arg>
              <c:arg name="seq">
                <c:value-of name="seq" />
              </c:arg>
              <c:arg name="op">
                <c:value-of name="op" />
              </c:arg>

              <!-- Matching indexes are gathered into this (initially empty)
                   vector in reverse order, which permits TCO -->
              <c:arg name="accum">
                <c:vector />
              </c:arg>
            </c:apply>
          </c:value>
        </c:values>

        <c:apply name="_mextract_rows">
          <c:arg name="matrix">
            <c:value-of name="matrix" />
          </c:arg>
          <c:arg name="indexes">
            <c:value-of name="matches" />
          </c:arg>
          <c:arg name="i">
            <c:const value="0" desc="Begin with the first matching index" />
          </c:arg>

          <!-- Computed once here so that _mextract_rows need not do so
               itself -->
          <c:arg name="length">
            <c:length-of>
              <c:value-of name="matches" />
            </c:length-of>
          </c:arg>

          <!-- The resulting matrix is accumulated as well to permit TCO;
               doing so reverses the reversal mentioned above, leaving the
               final matrix in the right order -->
          <c:arg name="accum">
            <c:vector />
          </c:arg>
        </c:apply>
      </c:let>
    </function>


    <!-- Builds a matrix out of the rows of `matrix' named by `indexes'.
         Starting from the given `i', each step conses the row
         matrix[indexes[i]] onto the head of `accum' and advances `i' by one;
         once `i' equals `length' the accumulator is yielded.  Because rows
         are prepended, the result lists them in the opposite order of
         `indexes'. -->
    <function name="_mextract_rows"
              desc="Pull rows from a matrix by index">
      <param name="matrix" type="float" set="matrix" desc="Source matrix" />
      <param name="indexes" type="integer" set="vector" desc="Indexes to extract" />
      <param name="length" type="integer" desc="Length of indexes vector" />
      <param name="i" type="integer" desc="Current index offset" />
      <param name="accum"  type="float" set="matrix" desc="Accumulator (matrix)" />

      <!-- Declared so that the recursive call below can pass the flag of
           the same name; callers in this file never supply it -->
      <param name="__experimental_guided_tco" type="float" desc="Experimental guided TCO" />

      <c:cases>
        <!-- When we're done, yield the accumulated value, representing our
             final matrix -->
        <c:case>
          <t:when-eq name="i" value="length" />
          <c:value-of name="accum" />
        </c:case>

        <c:otherwise>
          <!-- Recursive call marked as being in tail position; arguments
               not listed here (matrix, indexes, length) are not overridden -->
          <c:recurse __experimental_guided_tco="TRUE">
            <c:arg name="i">
              <c:sum>
                <c:value-of name="i" />
                <c:const value="1" desc="Proceed to next index" />
              </c:sum>
            </c:arg>

            <c:arg name="accum">
              <c:cons>
                <!-- Add the row identified by the current index to the
                     accumulator.  Note that this uses cons, so it adds it
                     to the head, but since the original mrange results are
                     reversed, this is precisely what we want—to reverse the
                     reversal -->
                <c:value-of name="matrix">
                  <c:index>
                    <c:value-of name="indexes" index="i" />
                  </c:index>
                </c:value-of>

                <c:value-of name="accum" />
              </c:cons>
            </c:arg>
          </c:recurse>
        </c:otherwise>
      </c:cases>
    </function>


    <!-- Linear scan over rows start..end (inclusive) of `matrix'.  The
         index of every row whose value in column `col' satisfies
         `<value> op val' is consed onto `accum', so the yielded vector
         lists the matching indexes in reverse order.  For sequential data
         and the operators EQ, LT and LTE the scan stops as soon as a column
         value greater than `val' is encountered. -->
    <function name="mrange_accum"
              desc="Accumulate (in reverse) the indexes of matrix rows
                    matching a column value within a certain range of
                    indexes (inclusive)">
      <param name="matrix" type="float" set="matrix" desc="Matrix to filter" />
      <param name="col" type="integer" desc="Column index to filter on" />
      <param name="val" type="float" desc="Column value to filter on" />
      <param name="start" type="integer" desc="Starting index (inclusive)" />
      <param name="end" type="integer" desc="Ending index (inclusive)" />
      <param name="seq" type="boolean" desc="Is data sequential?" />
      <param name="op" type="integer" desc="Comparison operator" />
      <param name="accum"  type="integer" set="vector" desc="Accumulator (row indexes)" />

      <!-- Declared so that the recursive call below can pass the flag of
           the same name -->
      <param name="__experimental_guided_tco" type="float" desc="Experimental guided TCO" />

      <c:let>
        <c:values>
          <!-- value of the requested column in the current row; used both
               by the early-exit check and by the comparison itself -->
          <c:value name="curval" type="float" desc="Current value">
            <c:value-of name="matrix">
              <!-- current row -->
              <c:index>
                <c:value-of name="start" />
              </c:index>

              <!-- requested column -->
              <c:index>
                <c:value-of name="col" />
              </c:index>
            </c:value-of>
          </c:value>
        </c:values>

        <!-- nested let needed so that the curval is available to over
             in the body below -->
        <c:let>
          <c:values>
            <!-- determine if the current value is over the value we're
                 looking for in a sorted list (meaning that we will not
                 find it any further down) -->
            <c:value name="over" type="boolean"
                     desc="Did we pass the potential value in a sorted list?">
              <c:value-of name="TRUE">
                <t:when-eq name="seq" value="TRUE" />
                <t:when-gt name="curval" value="val" />
              </c:value-of>
            </c:value>
          </c:values>

          <c:cases>
            <!-- if we're done filtering (the range is exhausted), then
                 yield whatever has been accumulated so far -->
            <c:case>
              <t:when-gt name="start" value="end" />

              <c:value-of name="accum" />
            </c:case>

            <!-- if the data is sequential and the current element is over
                 the requested value, then we're done (can only be used for
                 equality and LT{,E}; need a GT{,E} version).  The operator
                 test relies on the ordering of the CmpOp values. -->
            <c:case>
              <t:when-lte name="op" value="CMP_OP_LTE" />
              <t:when-eq name="over" value="TRUE" />

              <c:value-of name="accum" />
            </c:case>


            <c:otherwise>
              <c:let>
                <c:values>
                  <!-- no case below applies to an operator outside of
                       CmpOp, so such an operator never yields TRUE here -->
                  <c:value name="found" type="boolean"
                           desc="Whether comparison matches">
                    <c:cases>
                      <c:case label="Equal">
                        <t:when-eq name="op" value="CMP_OP_EQ" />

                        <c:value-of name="TRUE">
                          <t:when-eq name="curval" value="val" />
                        </c:value-of>
                      </c:case>

                      <c:case label="Less than">
                        <t:when-eq name="op" value="CMP_OP_LT" />

                        <c:value-of name="TRUE">
                          <t:when-lt name="curval" value="val" />
                        </c:value-of>
                      </c:case>

                      <c:case label="Less than or equal to">
                        <t:when-eq name="op" value="CMP_OP_LTE" />

                        <c:value-of name="TRUE">
                          <t:when-lte name="curval" value="val" />
                        </c:value-of>
                      </c:case>

                      <c:case label="Greater than">
                        <t:when-eq name="op" value="CMP_OP_GT" />

                        <c:value-of name="TRUE">
                          <t:when-gt name="curval" value="val" />
                        </c:value-of>
                      </c:case>

                      <c:case label="Greater than or equal to">
                        <t:when-eq name="op" value="CMP_OP_GTE" />

                        <c:value-of name="TRUE">
                          <t:when-gte name="curval" value="val" />
                        </c:value-of>
                      </c:case>
                    </c:cases>
                  </c:value>
                </c:values>

                <!-- continue recursion using TCO so that we do not
                     exhaust the stack (this is an undocumented,
                     experimental feature that requires explicitly stating
                     that a recursive call is in tail position)  -->
                <c:recurse __experimental_guided_tco="TRUE">
                  <c:arg name="accum">
                    <c:cases>
                      <!-- If match, add the current row index to the
                           accumulator (cons, so note that it is added
                           in reverse) -->
                      <c:case>
                        <t:when-eq name="found" value="TRUE" />

                        <c:cons>
                          <c:value-of name="start" />
                          <c:value-of name="accum" />
                        </c:cons>
                      </c:case>

                      <!-- If no match, no change to accumulator -->
                      <c:otherwise>
                        <c:value-of name="accum" />
                      </c:otherwise>
                    </c:cases>
                  </c:arg>

                  <c:arg name="start">
                    <c:sum>
                      <c:value-of name="start" />
                      <c:const value="1" desc="Check next element" />
                    </c:sum>
                  </c:arg>
                </c:recurse>
              </c:let>
            </c:otherwise>
          </c:cases>
        </c:let>
      </c:let>
    </function>


    <section title="Bisecting">
      Perform an~$O(lg n)$ bisect on a data set.

      This is intended to limit recursion on very large data sets (and
        consequently will increase performance).
      This will bisect up until a certain point (the gap),
        unless it finds the value in question.
      After finding the value,
        it will perform an~$O(n)$ linear backward search to find the first
          occurrence of the value.
      If the value is not found,
        it will halt at the gap and return the first index of the gap,
          which we will consider its "best guess",
            at which point a linear search can be performed by the caller to
              determine if the value does in fact exist at all.

      (The reason this operates so oddly is because of its caller;
        we could get rid of the gap and make this a general-purpose function
          if need be.
      Technically,
        the gap is unnecessary and saves only $lg g$ steps,
          which may be very few.)


      <!-- Sentinel yielded by the _mask-unless_ template in place of a
           masked value; mfilter hands the same constant to
           merge-until-empty as its `glance' -->
      <const name="TABLE_WHEN_MASK_VALUE" type="float" value="-0.0000001"
             desc="Used to mask when conditions" />
      <!-- bisect stops subdividing and yields its current start index once
           (end - start) is no greater than this.  Must remain at least 1:
           bisect only makes progress while the gap is 2 or more. -->
      <const name="MFILTER_BISECT_GAP_MAX" type="integer" value="10"
             desc="Quit bisect if size is less than or equal to this value" />


      <!-- Narrows the inclusive row range start..end of a matrix sorted on
           column `col' toward `val'.  Yields either the index of the first
           row of the run of equal values that the bisect landed on (by way
           of foremost), or, once the range is no wider than
           MFILTER_BISECT_GAP_MAX, the current start index, from which the
           caller is expected to scan linearly. -->
      <function name="bisect"
                desc="Bisect a matrix toward the requested column value">
        <param name="matrix" type="float" set="matrix" desc="Matrix to bisect" />
        <param name="col" type="integer" desc="Column index to filter on" />
        <param name="val" type="float" desc="Column value to filter on" />
        <param name="start" type="integer" desc="Start index" />
        <param name="end" type="integer" desc="End index" />

        <c:let>
          <c:values>
            <!-- the gap represents the number of indexes between the current start
                 and end indexes -->
            <c:value name="gap" type="integer" desc="Gap between current start and end">
              <c:sum>
                <c:value-of name="end" />

                <t:negate>
                  <c:value-of name="start" />
                </t:negate>
              </c:sum>
            </c:value>
          </c:values>

          <!--
            At this point, we need to determine if we should continue the bisect or
            halt. The purpose of the gap is based on the understanding that (with
            our use cases) we will arrive, almost always, at one of two scenarios:
            we found the value, but it's part of a larger set of the same values, or
            the value we are looking for may not even exist at all.

            The gap just limits recursion (but just barely) at smaller levels, since
            bisect will take lg(n) steps. Increase the gap limit to decrease the
            number of steps, or decrease it to 1 if you want a complete bisection.

            Do not go below 1: with a gap of 2 or more the middle index computed
            below lies strictly between start and end, so each recursion shrinks
            the range; with a gap of 1 the middle index equals end and the
            "lower half" recursion would no longer make progress.
          -->
          <c:cases>
            <!-- give up if we've reached our gap limit -->
            <c:case>
              <t:when-lte name="gap" value="MFILTER_BISECT_GAP_MAX" />

              <!-- we tried our best; return our current position -->
              <c:value-of name="start" />
            </c:case>


            <!-- we have not yet reached our gap limit; keep going -->
            <c:otherwise>
              <c:let>
                <c:values>
                  <c:value name="mid_index" type="integer" desc="Middle index">
                    <!-- to determine the new mid index, add half of the gap
                         (rounded up) to the current start index -->
                    <c:sum>
                      <c:value-of name="start" />
                      <c:ceil>
                        <c:quotient>
                          <c:value-of name="gap" />
                          <c:const value="2" desc="Bisect" />
                        </c:quotient>
                      </c:ceil>
                    </c:sum>
                  </c:value>
                </c:values>

                <!-- nested let, since mid depends on mid_index -->
                <c:let>
                  <c:values>
                    <c:value name="mid" type="float" desc="Middle value">
                      <c:value-of name="matrix">
                        <!-- row -->
                        <c:index>
                          <c:value-of name="mid_index" />
                        </c:index>

                        <!-- column -->
                        <c:index>
                          <c:value-of name="col" />
                        </c:index>
                      </c:value-of>
                    </c:value>
                  </c:values>

                  <c:cases>
                    <!-- if the middle value is lower than our value, then take the upper half -->
                    <c:case>
                      <t:when-lt name="mid" value="val" />

                      <c:recurse start="mid_index" />
                    </c:case>

                    <!-- similarly, take the lower half if we over-shot -->
                    <c:case>
                      <t:when-gt name="mid" value="val" />

                      <c:recurse end="mid_index" />
                    </c:case>

                    <!-- if we have an exact match, that doesn't necessarily mean that we
                         have every value; we may have intersected a set of them -->
                    <c:otherwise>
                      <!-- this will return an exact index: the first index
                           containing the element we've been looking for (it is a
                           linear backwards search) -->
                      <c:apply name="foremost" matrix="matrix" col="col" i="mid_index" />
                    </c:otherwise>
                  </c:cases>
                </c:let>
              </c:let>
            </c:otherwise>
          </c:cases>
        </c:let>
      </function>


      <!-- Starting at row `i', steps backwards one row at a time for as long
           as the previous row holds the same value in column `col', and
           yields the index of the first row of that run (or 0 when the run
           reaches the top of the matrix). -->
      <function name="foremost"
                desc="Search backwards for the first occurrence in a sorted list">
        <param name="matrix" type="float" set="matrix" desc="Matrix to search" />
        <param name="col" type="integer"               desc="Column index to search on" />
        <param name="i" type="integer"                 desc="Current index" />

        <c:let>
          <c:values>
            <!-- we calculate this rather than accept it via an argument so that
                 this function may be called directly in a more convenient manner
                 -->
            <c:value name="val" type="float" desc="Current value">
              <c:value-of name="matrix">
                <!-- row -->
                <c:index>
                  <c:value-of name="i" />
                </c:index>

                <!-- column -->
                <c:index>
                  <c:value-of name="col" />
                </c:index>
              </c:value-of>
            </c:value>

            <!-- note that this is looked up even when i is 0, in which case
                 the row index is out of range; the result is never consulted
                 then, because the first case below matches -->
            <c:value name="prev" type="float" desc="Previous value">
              <c:value-of name="matrix">
                <!-- row -->
                <c:index>
                  <t:dec>
                    <c:value-of name="i" />
                  </t:dec>
                </c:index>

                <!-- column -->
                <c:index>
                  <c:value-of name="col" />
                </c:index>
              </c:value-of>
            </c:value>
          </c:values>

          <c:cases>
            <!-- if we have no more indexes to check, then we're done -->
            <c:case>
              <t:when-eq name="i" value="#0" />

              <c:value-of name="i" />
            </c:case>

            <!-- if the previous column value is the same value, then continue checking -->
            <c:case>
              <t:when-eq name="prev" value="val" />

              <c:recurse>
                <c:arg name="i">
                  <t:dec>
                    <c:value-of name="i" />
                  </t:dec>
                </c:arg>
              </c:recurse>
            </c:case>

            <!-- otherwise, we've found the foremost index -->
            <c:otherwise>
              <c:value-of name="i" />
            </c:otherwise>
          </c:cases>
        </c:let>
      </function>


      <!-- Expands to a two-way choice: when @name@ (optionally at @index@)
           equals FALSE the expression yields TABLE_WHEN_MASK_VALUE,
           otherwise it yields the template body.  @desc@ is accepted but
           not referenced by the expansion below. -->
      <template name="_mask-unless_"
                desc="Mask a value unless the condition is truthful">
        <param name="@values@" desc="Body" />
        <param name="@name@"   desc="Scalar to check" />
        <param name="@index@"  desc="Optional index" />
        <param name="@desc@"   desc="Optional description" />

        <c:cases>
          <!-- if masked -->
          <c:case>
            <t:when-eq name="@name@" index="@index@" value="FALSE" />

            <!-- TODO: configurable mask via meta and/or param -->
            <c:value-of name="TABLE_WHEN_MASK_VALUE" />
          </c:case>

          <!-- if not masked: yield the caller-supplied body unchanged -->
          <c:otherwise>
            <param-copy name="@values@" />
          </c:otherwise>
        </c:cases>
      </template>
    </section>
  </section>
</package>