tame/core/vector/filter.xml

<?xml version="1.0"?>
<!--
  Copyright (C) 2014-2019 Ryan Specialty Group, LLC.

  This file is part of tame-core.

  tame-core is free software: you can redistribute it and/or modify it
  under the terms of the GNU General Public License as
  published by the Free Software Foundation, either version 3 of the
  License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
-->
<package xmlns="http://www.lovullo.com/rater"
         xmlns:c="http://www.lovullo.com/calc"
         xmlns:t="http://www.lovullo.com/rater/apply-template"
         core="true"
         desc="Filtering Vectors and Matrices">

  <import package="../base" />
  <import package="../when" />
  <import package="list" />


  <!-- Integer codes for the comparison operators that the `op' parameter of
       mfilter, mrange, and _mrange_cmp is compared against. -->
  <typedef name="CmpOp"
           desc="Comparison operator">
    <enum type="integer">
      <!-- DO NOT REORDER; see mrange 'over' check: it treats every operator
           whose value is less than or equal to CMP_OP_LTE as one for which a
           sorted scan may stop early, so EQ, LT, and LTE must keep the three
           lowest values -->
      <item name="CMP_OP_EQ"  value="1" desc="Equal (=)" />
      <item name="CMP_OP_LT"  value="2" desc="Less than (&lt;)" />
      <item name="CMP_OP_LTE" value="3" desc="Less than or equal to (&lt;=)" />
      <item name="CMP_OP_GT"  value="4" desc="Greater than (&gt;)" />
      <item name="CMP_OP_GTE" value="5" desc="Greater than or equal to (&gt;=)" />
    </enum>
  </typedef>


  <section title="Vector Filtering">
    <!-- Builds a vector of vector_src values through the cons-until-empty
         template, which is driven by vector_pred, value, and start_index.
         The body contributes the vector_src element found at the current
         start_index.

         NOTE(review): cons-until-empty is defined outside this file.
         Presumably it walks vector_pred from start_index, keeps only the
         positions whose element equals `value' (the `only' argument), and
         advances start_index on each step; confirm against its definition. -->
    <function name="vfilter_lookup"
              desc="Filter predicate by value and use corresponding index in
                    source vector as a value">
      <param name="vector_pred" type="float" set="vector" desc="Vector to filter" />
      <param name="vector_src" type="float" set="vector" desc="Source vector to take values from" />
      <param name="value"  type="float"              desc="Predicate value" />
      <param name="start_index" type="integer" desc="Starting vector_pred index" />

      <t:cons-until-empty set="vector_pred" only="value" index="start_index">
        <!-- the value contributed for this position comes from the source
             vector, not from the predicate vector -->
        <c:value-of name="vector_src" index="start_index" />
      </t:cons-until-empty>
    </function>
  </section>


  <section title="Matrix Filtering">
    \ref{mfilter} handles complex filtering of matrices.
    If the requested column~\tt{@col@} is marked as sequential with~\tt{@seq@}
      \emph{and} the comparison operator is~\ref{CMP_OP_EQ},
        then an~$O(lg n)$ bisect algorithm will be used;
          otherwise,
            it will undergo an~$O(n)$ linear scan.

    <!-- Filters the rows of `matrix' on column `col'.  Each value taken from
         `vals' is handed to mrange together with `op' and `seq'; the scan
         for that value ends at the last row (length of the matrix minus one)
         and starts either at the index returned by bisect (when `op' is
         CMP_OP_EQ and `seq' is TRUE) or at row 0 (every other case).

         NOTE(review): merge-until-empty is defined outside this file; the
         exact meaning of its `car' and `glance' arguments (here `val' and
         TABLE_WHEN_MASK_VALUE) should be confirmed against its definition. -->
    <function name="mfilter"
              desc="Filter matrix rows by column value">
      <param name="matrix" type="float" set="matrix" desc="Matrix to filter" />
      <param name="col"    type="integer"            desc="Column index to filter on" />
      <param name="vals"   type="float"              desc="Column value to filter on" />
      <param name="seq"    type="boolean"            desc="Is data sequential?" />
      <param name="op"     type="integer"            desc="Comparison operator" />

      <!-- merge the result of each condition in vals into a single set, which
           has the effect of supporting multiple conditions on a single column
           of data (or just one).  By performing the lookups separately for
           each, we preserve the bisect-ability of the condition. -->
      <t:merge-until-empty set="vals" car="val" glance="TABLE_WHEN_MASK_VALUE">
        <c:apply name="mrange" matrix="matrix" col="col" val="val" seq="seq" op="op">
          <!-- first row that mrange will inspect -->
          <c:arg name="start">
            <c:cases>
              <!-- if we know that the data is sequential, then we may not need to
                   perform a linear search (if the dataset is large enough and the
                   column value is relatively distinct) -->
              <!-- TODO: bisect is currently only performed for CMP_OP_EQ -->
              <c:case>
                <t:when-eq name="op" value="CMP_OP_EQ" />
                <t:when-eq name="seq" value="TRUE" />

                <c:apply name="bisect" matrix="matrix" col="col" val="val">
                  <c:arg name="start">
                    <c:const value="0" desc="Start bisect at beginning" />
                  </c:arg>

                  <c:arg name="end">
                    <!-- bisect the length of the matrix; the last valid row
                         index is the length minus one -->
                    <t:dec>
                      <c:length-of>
                        <c:value-of name="matrix" />
                      </c:length-of>
                    </t:dec>
                  </c:arg>
                </c:apply>
              </c:case>

              <!-- we have no good guess; linear search :x -->
              <c:otherwise>
                <c:const value="0" desc="Start at the first element" />
              </c:otherwise>
            </c:cases>
          </c:arg>

          <!-- last row that mrange will inspect (inclusive): the final row
               of the matrix -->
          <c:arg name="end">
            <t:dec>
              <c:length-of>
                <c:value-of name="matrix" />
              </c:length-of>
            </t:dec>
          </c:arg>
        </c:apply>
      </t:merge-until-empty>
    </function>


    <!-- Scans rows `start' through `end' (both inclusive) of `matrix' and
         yields the rows whose value in column `col' satisfies operator `op'
         relative to `val'.

         Yields an empty vector when `start' has passed `end', or when the
         early-exit condition holds (see `over' below).  Otherwise the value
         at the current row is handed to _mrange_cmp, which performs the
         comparison and calls back into this function for the next row. -->
    <function name="mrange"
              desc="Filter matrix rows by column value within a certain
                    range of indexes (inclusive)">
      <param name="matrix" type="float" set="matrix" desc="Matrix to filter" />
      <param name="col" type="integer" desc="Column index to filter on" />
      <param name="val" type="float" desc="Column value to filter on" />
      <param name="start" type="integer" desc="Starting index (inclusive)" />
      <param name="end" type="integer" desc="Ending index (inclusive)" />
      <param name="seq" type="boolean" desc="Is data sequential?" />
      <param name="op" type="integer" desc="Comparison operator" />

      <c:let>
        <c:values>
          <!-- value at the current row (`start') in the requested column;
               used for the `over' check and handed to _mrange_cmp -->
          <!-- NOTE(review): this is bound before the start > end case is
               tested, so it is looked up even when `start' is beyond `end';
               confirm that such a lookup is harmless -->
          <c:value name="curval" type="float" desc="Current value">
            <c:value-of name="matrix">
              <c:index>
                <c:value-of name="start" />
              </c:index>

              <c:index>
                <c:value-of name="col" />
              </c:index>
            </c:value-of>
          </c:value>
        </c:values>

        <!-- nested let needed so that the curval is available to over
             in the body below -->
        <c:let>
          <c:values>
            <!-- determine if the value we're looking for is over the current value
                 in a sorted list (meaning that we will not find it) -->
            <c:value name="over" type="boolean"
                     desc="Did we pass the potential value in a sorted list?">
              <c:value-of name="TRUE">
                <t:when-eq name="seq" value="TRUE" />
                <t:when-gt name="curval" value="val" />
              </c:value-of>
            </c:value>
          </c:values>

          <c:cases>
            <!-- if we're done filtering, then return an empty set -->
            <c:case>
              <t:when-gt name="start" value="end" />

              <c:vector />
            </c:case>

            <!-- if the data is sequential and the next element is over the
                 requested value, then we're done (can only be used for
                 equality and LT{,E}; need a GT{,E} version); the op test
                 relies on EQ, LT, and LTE being the lowest-valued
                 operators -->
            <c:case>
              <t:when-lte name="op" value="CMP_OP_LTE" />
              <t:when-eq name="over" value="TRUE" />

              <c:vector />
            </c:case>


            <!-- compare the current row; curval already holds the value at
                 the current row and requested column, so it is passed along
                 rather than being looked up a second time -->
            <c:otherwise>
              <c:apply name="_mrange_cmp" matrix="matrix" col="col" val="val"
                       start="start" end="end" seq="seq" op="op"
                       cur="curval" />
            </c:otherwise>
          </c:cases>
        </c:let>
      </c:let>
    </function>


    <!-- mutually recursive with mrange -->
    <!-- Compares `cur' (the value at row `start') against `val' using
         operator `op'.  When the comparison matches, row `start' of the
         matrix is consed onto the result of mrange for the remaining rows;
         otherwise the result is simply that of mrange for the remaining
         rows.  In both cases mrange is applied with `start' plus one.

         NOTE(review): the operator cases have no otherwise branch, so an
         `op' outside the five CMP_OP_ values matches no case; presumably
         `found' is then not TRUE and no row is kept.  Confirm. -->
    <function name="_mrange_cmp" desc="mrange helper for value comparison">
      <param name="matrix" type="float" set="matrix" desc="Matrix to filter" />
      <param name="col" type="integer"               desc="Column index to filter on" />
      <param name="val" type="float"                 desc="Column value to filter on" />
      <param name="start" type="integer"             desc="Starting index (aka current index)" />
      <param name="end" type="integer"               desc="Ending index" />
      <!-- typed as boolean to agree with the seq parameter of mfilter and
           mrange, to which this value is passed straight through -->
      <param name="seq" type="boolean"               desc="Is data sequential?" />
      <param name="op" type="integer"                desc="Comparison operator" />
      <param name="cur" type="float"                 desc="Current value" />


      <c:let>
        <c:values>
          <!-- one case per operator; each yields TRUE only when `cur'
               relates to `val' as that operator requires -->
          <c:value name="found" type="boolean"
                   desc="Whether comparison matches">
            <c:cases>
              <c:case label="Equal">
                <t:when-eq name="op" value="CMP_OP_EQ" />

                <c:value-of name="TRUE">
                  <t:when-eq name="cur" value="val" />
                </c:value-of>
              </c:case>

              <c:case label="Less than">
                <t:when-eq name="op" value="CMP_OP_LT" />

                <c:value-of name="TRUE">
                  <t:when-lt name="cur" value="val" />
                </c:value-of>
              </c:case>

              <c:case label="Less than or equal to">
                <t:when-eq name="op" value="CMP_OP_LTE" />

                <c:value-of name="TRUE">
                  <t:when-lte name="cur" value="val" />
                </c:value-of>
              </c:case>

              <c:case label="Greater than">
                <t:when-eq name="op" value="CMP_OP_GT" />

                <c:value-of name="TRUE">
                  <t:when-gt name="cur" value="val" />
                </c:value-of>
              </c:case>

              <c:case label="Greater than or equal to">
                <t:when-eq name="op" value="CMP_OP_GTE" />

                <c:value-of name="TRUE">
                  <t:when-gte name="cur" value="val" />
                </c:value-of>
              </c:case>
            </c:cases>
          </c:value>
        </c:values>

        <c:cases>
          <!-- if values matches, cons it -->
          <c:case>
            <t:when-eq name="found" value="TRUE" />

            <c:cons>
              <!-- the whole current row, not just the compared column -->
              <c:value-of name="matrix">
                <c:index>
                  <c:value-of name="start" />
                </c:index>
              </c:value-of>

              <c:apply name="mrange" matrix="matrix" col="col" val="val"
                       end="end" seq="seq" op="op">
                <c:arg name="start">
                  <c:sum>
                    <c:value-of name="start" />
                    <c:const value="1" desc="Check next element" />
                  </c:sum>
                </c:arg>
              </c:apply>
            </c:cons>
          </c:case>


          <!-- no match, continue (mutual) recursion -->
          <c:otherwise>
            <c:apply name="mrange" matrix="matrix" col="col" val="val"
                     end="end" seq="seq" op="op">
              <c:arg name="start">
                <c:sum>
                  <c:value-of name="start" />
                  <c:const value="1" desc="Check next element" />
                </c:sum>
              </c:arg>
            </c:apply>
          </c:otherwise>
        </c:cases>
      </c:let>
    </function>


    <section title="Bisecting">
      Perform an~$O(lg n)$ bisect on a data set.

      This is intended to limit recursion on very large data sets (and
        consequently will increase performance).
      This will bisect up until a certain point (the gap),
        unless it finds the value in question.
      After finding the value,
        it will perform an~$O(n)$ linear backward search to find the first
          occurrence of the value.
      If the value is not found,
        it will halt at the gap and return the first index of the gap,
          which we will consider its "best guess",
            at which point a linear search can be performed by the caller to
              determine if the value does in fact exist at all.

      (The reason this operates so oddly is because of its caller;
        we could get rid of the gap and make this a general-purpose function
          if need be.
      Technically,
        the gap is nearly useless, since it saves only $lg g$ steps,
          which may be very small.)


      <!-- Sentinel yielded by the _mask-unless_ template when its condition
           is FALSE; mfilter also passes it as the `glance' argument of
           merge-until-empty. -->
      <const name="TABLE_WHEN_MASK_VALUE" type="float" value="-0.0000001"
             desc="Used to mask when conditions" />
      <!-- bisect stops and returns its current start index once the distance
           between its end and start indexes is at or below this value. -->
      <const name="MFILTER_BISECT_GAP_MAX" type="integer" value="10"
             desc="Quit bisect if size is less than or equal to this value" />


      <!-- Narrows the row range between `start' and `end' toward the rows
           whose column `col' holds `val', and yields a row index from which
           the caller can begin a linear scan.

           The result is one of:
             - `start', once the gap (end minus start) is at or below
               MFILTER_BISECT_GAP_MAX;
             - the index returned by foremost, when the middle row holds
               exactly `val';
             - the result of recursing on the upper portion (middle value
               below `val') or the lower portion (middle value above `val').

           The choice of portion assumes column `col' is sorted in ascending
           order; mfilter only applies this function when its seq flag is
           TRUE. -->
      <function name="bisect"
                desc="Bisect a matrix toward the requested column value">
        <param name="matrix" type="float" set="matrix" desc="Matrix to bisect" />
        <param name="col" type="integer" desc="Column index to filter on" />
        <param name="val" type="float" desc="Column value to filter on" />
        <param name="start" type="integer" desc="Start index" />
        <param name="end" type="integer" desc="End index" />

        <c:let>
          <c:values>
            <!-- the gap represents the number of indexes between the current start
                 and end indexes -->
            <c:value name="gap" type="integer" desc="Gap between current start and end">
              <c:sum>
                <c:value-of name="end" />

                <t:negate>
                  <c:value-of name="start" />
                </t:negate>
              </c:sum>
            </c:value>
          </c:values>

          <!--
            At this point, we need to determine if we should continue the bisect or
            halt. The purpose of the gap is based on the understanding that (with
            our use cases) we will arrive, almost always, at one of two scenarios:
            we found the value, but it's part of a larger set of the same values, or
            the value we are looking for may not even exist at all.

            The gap just limits recursion (but just barely) at smaller levels (since
            bisect will take lg(n) steps). Increase the gap limit to decrease the
            number of steps, or decrease it to 1 if you want a complete bisection.
          -->
          <c:cases>
            <!-- give up if we've reached our gap limit -->
            <c:case>
              <t:when-lte name="gap" value="MFILTER_BISECT_GAP_MAX" />

              <!-- we tried our best; return our current position -->
              <c:value-of name="start" />
            </c:case>


            <!-- we have not yet reached our gap limit; keep going -->
            <c:otherwise>
              <c:let>
                <c:values>
                  <c:value name="mid_index" type="integer" desc="Middle index">
                    <!-- to determine the new mid index, add half of the gap
                         (rounded up) to the current index -->
                    <c:sum>
                      <c:value-of name="start" />
                      <c:ceil>
                        <c:quotient>
                          <c:value-of name="gap" />
                          <c:const value="2" desc="Bisect" />
                        </c:quotient>
                      </c:ceil>
                    </c:sum>
                  </c:value>
                </c:values>

                <c:let>
                  <c:values>
                    <c:value name="mid" type="float" desc="Middle value">
                      <c:value-of name="matrix">
                        <!-- row -->
                        <c:index>
                          <c:value-of name="mid_index" />
                        </c:index>

                        <!-- column -->
                        <c:index>
                          <c:value-of name="col" />
                        </c:index>
                      </c:value-of>
                    </c:value>
                  </c:values>

                  <c:cases>
                    <!-- if the middle value is lower than our value, then take the upper half -->
                    <c:case>
                      <t:when-lt name="mid" value="val" />

                      <c:recurse start="mid_index" />
                    </c:case>

                    <!-- similarly, take the lower half if we over-shot -->
                    <c:case>
                      <t:when-gt name="mid" value="val" />

                      <c:recurse end="mid_index" />
                    </c:case>

                    <!-- if we have an exact match, that doesn't necessarily mean that we
                         have every value; we may have intersected a set of them -->
                    <c:otherwise>
                      <!-- this will return an exact index: the first index
                           containing the element we've been looking for (it is a
                           linear backwards search) -->
                      <c:apply name="foremost" matrix="matrix" col="col" i="mid_index" />
                    </c:otherwise>
                  </c:cases>
                </c:let>
              </c:let>
            </c:otherwise>
          </c:cases>
        </c:let>
      </function>


      <!-- Starting at row `i', steps backwards one row at a time for as long
           as the preceding row holds the same value in column `col', and
           yields the index at which it stops.  It stops when `i' is 0 or
           when the preceding row's value differs from the current one. -->
      <function name="foremost"
                desc="Search backwards for the first occurrence in a sorted list">
        <param name="matrix" type="float" set="matrix" desc="Matrix to bisect" />
        <param name="col" type="integer"               desc="Column index to search on" />
        <param name="i" type="integer"                 desc="Current index" />

        <c:let>
          <c:values>
            <!-- we calculate this rather than accept it via an argument so that
                 this function may be called directly in a more convenient manner
                 -->
            <c:value name="val" type="float" desc="Current value">
              <c:value-of name="matrix">
                <!-- row -->
                <c:index>
                  <c:value-of name="i" />
                </c:index>

                <!-- column -->
                <c:index>
                  <c:value-of name="col" />
                </c:index>
              </c:value-of>
            </c:value>

            <!-- NOTE(review): this is bound even when `i' is 0, in which case
                 the row index is one below the first row; the i = 0 case
                 below is tested first, so the value goes unused there, but
                 confirm that such a lookup is harmless -->
            <c:value name="prev" type="float" desc="Previous value">
              <c:value-of name="matrix">
                <!-- row -->
                <c:index>
                  <t:dec>
                    <c:value-of name="i" />
                  </t:dec>
                </c:index>

                <!-- column -->
                <c:index>
                  <c:value-of name="col" />
                </c:index>
              </c:value-of>
            </c:value>
          </c:values>

          <c:cases>
            <!-- if we have no more indexes to check, then we're done -->
            <c:case>
              <t:when-eq name="i" value="#0" />

              <c:value-of name="i" />
            </c:case>

            <!-- if the previous column value is the same value, then continue checking -->
            <c:case>
              <t:when-eq name="prev" value="val" />

              <c:recurse>
                <c:arg name="i">
                  <t:dec>
                    <c:value-of name="i" />
                  </t:dec>
                </c:arg>
              </c:recurse>
            </c:case>

            <!-- otherwise, we've found the foremost index -->
            <c:otherwise>
              <c:value-of name="i" />
            </c:otherwise>
          </c:cases>
        </c:let>
      </function>


      <!-- Expands to a case expression that yields TABLE_WHEN_MASK_VALUE when
           the scalar @name@ (at @index@) equals FALSE, and otherwise yields
           the template body supplied through @values@.

           Note that @desc@ is declared as a parameter but is not referenced
           anywhere in the expansion below. -->
      <template name="_mask-unless_"
                desc="Mask a value unless the condition is truthful">
        <param name="@values@" desc="Body" />
        <param name="@name@"   desc="Scalar to check" />
        <param name="@index@"  desc="Optional index" />
        <param name="@desc@"   desc="Optional description" />

        <c:cases>
          <!-- if masked -->
          <c:case>
            <t:when-eq name="@name@" index="@index@" value="FALSE" />

            <!-- TODO: configurable mask via meta and/or param -->
            <c:value-of name="TABLE_WHEN_MASK_VALUE" />
          </c:case>

          <!-- if not masked: emit the caller-provided body unchanged -->
          <c:otherwise>
            <param-copy name="@values@" />
          </c:otherwise>
        </c:cases>
      </template>
    </section>
  </section>
</package>