<?xml version="1.0" encoding="utf-8"?>
<!--
Copyright (C) 2014-2023 Ryan Specialty, LLC.
This file is part of tame-core.
tame-core is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
-->
<package xmlns="http://www.lovullo.com/rater"
         xmlns:c="http://www.lovullo.com/calc"
         xmlns:t="http://www.lovullo.com/rater/apply-template"
         core="true"
         desc="Functions for performing table lookups">
<import package="../base" />
<!-- since templates are inlined, we need to make these symbols available to
     avoid terrible confusion -->
<import package="../numeric/common" export="true"/>
<import package="../when" export="true"/>
<import package="common" export="true" />
<import package="filter" export="true" />
<import package="matrix" export="true" />
<!--
  Create a constant table

  This definition must appear within a `constants' block.

  Permitted children:
    - _table-column_+ - Column definitions
    - _table-rows_    - Begin table data definition
-->
<template name="_create-table_"
          desc="Create an arbitrary table for querying">
  <param name="@name@"   desc="Table name" />
  <param name="@values@" desc="Table definition" />

  <!-- optional; defaults to the empty string -->
  <param name="@desc@" desc="Table description">
    <text></text>
  </param>

  <!-- derived from @name@ (upper-cased and snake-cased) -->
  <param name="@__tid@"
         desc="Internal table identifier">
    <param-value name="@name@" upper="true" snake="true" />
  </param>

  <!-- expose the table id, name and description as metadata so that the
       child templates can pick them up via param-inherit -->
  <param-copy name="@values@">
    <param-meta name="create-table-id"   value="@__tid@" />
    <param-meta name="create-table-name" value="@name@" />
    <param-meta name="create-table-desc" value="@desc@" />
  </param-copy>
</template>
<!--
  Declare a table column name

  A column definition assigns a field name to a column index.  If the
  column always contains ordered (sequenced) data, then @seq@ should
  be set to "true"; this allows a more efficient query strategy to
  be used.

  If a table is especially large, the first column should be treated
  as the index and always be sequenced.
-->
<template name="_table-column_"
          desc="Declare name for table column">
  <param name="@name@"  desc="Column name" />
  <param name="@index@" desc="Column index (0-indexed)" />

  <!-- the description defaults to the column name -->
  <param name="@desc@" desc="Column description">
    <param-value name="@name@" />
  </param>

  <!-- use carefully; leave alone unless data is definitely sorted,
       or query results may be incorrect -->
  <param name="@seq@" desc="Column is sorted (sequential)">
    <text></text>
  </param>

  <!-- table id as published by the enclosing _create-table_ -->
  <param name="@__tid@"
         desc="Internal table identifier">
    <param-inherit meta="create-table-id" />
  </param>

  <!-- FIXME: this doesn't contain @__tid@ because of a bug in the
       preprocessor; this is fixed in the new DSL -->
  <param name="@__constname@"
         desc="Name of internal constant used for column lookup">
    <text>RATE_TABLE_</text>
    <param-value name="@name@" upper="true" snake="true" />
  </param>

  <!-- column index identifier -->
  <const name="{@__tid@}_{@__constname@}"
         value="@index@" type="integer"
         desc="@desc@" />

  <!-- column sequential flag to permit query optimizations; the two
       conditionals are complementary, so the _IS_SEQ constant is
       always defined (1 when @seq@ is "true", 0 otherwise) -->
  <if name="@seq@" eq="true">
    <const name="{@__tid@}_{@__constname@}_IS_SEQ"
           value="1" type="integer"
           desc="{@name@} is sequenced" />
  </if>
  <unless name="@seq@" eq="true">
    <const name="{@__tid@}_{@__constname@}_IS_SEQ"
           value="0" type="integer"
           desc="{@name@} is unordered" />
  </unless>
</template>
<!--
  Begin table data definition (rows)

  Each _table-row_ child inserts a constant row into the table.  Note
  that all data must be constant.

  Use only one of @data@ or children to define rows.

  Permitted children:
    - _table-row_* - Table row definitions
-->
<template name="_table-rows_"
          desc="Begin table data definition">
  <param name="@values@" desc="Row definitions" />
  <param name="@data@"   desc="GNU Octave/MATLAB-style data definition" />

  <!-- the following three are published by the enclosing
       _create-table_ via param-meta -->
  <param name="@__tid@"
         desc="Table identifier">
    <param-inherit meta="create-table-id" />
  </param>

  <param name="@__tname@"
         desc="Table name as provided by caller">
    <param-inherit meta="create-table-name" />
  </param>

  <param name="@__desc@"
         desc="Table description">
    <param-inherit meta="create-table-desc" />
  </param>

  <!-- rows given as a single @data@ string -->
  <if name="@data@">
    <!-- `@values="-"` tells TAME to read the value from the child text
         node instead of from the attribute.  Attribute values have
         their newlines removed by whitespace normalization, so a large
         table (tens of MiB) in const/@values ends up on one enormous
         line in the xmlo file, which Saxon is very slow to parse when
         the package is imported; large text nodes do not have this
         problem.  `-` is never a valid matrix specification, so it is
         safe to use as a marker.  (DEV-15131) -->
    <const name="{@__tid@}_RATE_TABLE"
           type="float"
           desc="{@__tname@} table; {@__desc@}"
           values="-">
      <param-copy name="@data@" />
    </const>
  </if>

  <!-- rows given as _table-row_ children -->
  <unless name="@data@">
    <const name="{@__tid@}_RATE_TABLE"
           type="float"
           desc="{@__tname@} table; {@__desc@}">
      <param-copy name="@values@" />
    </const>
  </unless>
</template>
<!--
  Define a constant table row

  Rows will be inserted into the table in the order in which they
  appear.  Note that, if the column is marked as sequenced, it is
  important that the order is itself sequenced.

  All values must be constant.

  Permitted children:
    - _table-value_+ - Row column value
-->
<template name="_table-row_"
          desc="Define a constant table row (ordered)">
  <param name="@values@" desc="Column values" />

  <!-- one `set' per row; its items are supplied by the children -->
  <set desc="Row">
    <param-copy name="@values@" />
  </set>
</template>
<!--
  Set a column value for the parent row

  Column value order should match the defined column order.  All
  values must be constants.
-->
<template name="_table-value_"
          desc="Table column value for row (ordered)">
  <param name="@const@" desc="Constant column value" />

  <item value="@const@" desc="Column value" />
</template>
<!--
  Return the requested field from the first row matched by a query

  This wraps _query-field_ in c:car; the table, field selection and
  query parameters are all forwarded to _query-field_ unchanged.
-->
<template name="_query-first-field_"
          desc="Return the requested field from the first row returned by a query">
  <param name="@table@"  desc="Table (matrix) to query" />
  <param name="@values@" desc="Query parameters" />

  <!-- use either field or name[+index] -->
  <param name="@field@" desc="Column to select (variable/constant); do not use with @name@" />
  <param name="@name@"  desc="Name of field to query (variable/constant); overrides @field@" />
  <param name="@index@" desc="Optional index for field name lookup (using @name@)" />

  <c:car label="First row of query result">
    <t:query-field table="@table@" field="@field@" name="@name@" index="@index@">
      <param-copy name="@values@" />
    </t:query-field>
  </c:car>
</template>
<!--
  Query-style syntax for matrix searches

  Overhead of this algorithm is minimal.  Let us consider that a direct access
  to an element in a vector is O(1).  Therefore, given a set of p predicates
  in a direct matrix access, the time would be O(p).

  In the case of this query method, we perform first a bisect and then a
  linear search backward to the first element matching the given predicate.
  Therefore, per field, the best case average is O(lg n), where n is the
  number of rows; this assumes that no backward linear search is required.
  Should such a search be required, the worst case per field is O(lg n + m),
  where m is the length of the largest subset.

  Once again considering p predicates, each predicate P_k where k > 0
  (0-indexed) will operate on a subset of the first predicate.  As already
  discussed, the worst-case scenario for the length of the subset is m.  Since
  further queries on this subset are rarely likely to use the bisect
  algorithm, we're looking at O(m) time per field.  Therefore, the total query
  time for the best-case scenario is still O(lg n) if m is sufficiently small
  and O(lg n + pm) if m is sufficiently large.

  An important case to note is when m approaches (or is equal to) n; in such
  a case, the algorithm degrades to a worst-case linear search of O(pn) and
  best case of O(n) if early predicates are sufficiently effective at
  reducing the subsets to further predicates: That is, the bisect algorithm
  either cannot be used or is ineffective.  For large m, this may cause a
  stack overflow due to the recursive nature of the algorithm.  It is
  therefore important to order the first column of the table such that it is
  both sorted and produces small m.  Additionally, it is ideal for the first
  predicate to query the first field to quickly reduce the size of the set
  for the next predicate.
-->
<template name="_query-field_"
          desc="Return the requested field from rows returned by a query">
  <param name="@table@"  desc="Table (matrix) to query" />
  <param name="@values@" desc="Query parameters" />

  <!-- use one or the other -->
  <param name="@field@" desc="Column to select (variable/constant); do not use with @name@" />
  <param name="@index@" desc="Optional index for field name lookup (using @name@)" />

  <!-- by default, if @field@ is provided instead of @name@, the field
       constant will be generated (same concept as the 'when' template) -->
  <param name="@name@" desc="Name of field to query (variable/constant); overrides @field@">
    <!-- convert @table@ to uppercase and snake case (replace - with _) -->
    <param-value name="@table@" upper="true" snake="true" />
    <text>_RATE_TABLE_</text>
    <param-value name="@field@" upper="true" snake="true" />
  </param>

  <c:apply name="mcol" label="Query result">
    <!-- the matrix (vector of rows) returned by the query -->
    <c:arg name="matrix">
      <t:query-row table="@table@">
        <param-copy name="@values@" />
      </t:query-row>
    </c:arg>

    <!-- the field (column) to retrieve; 0-based index -->
    <c:arg name="col">
      <!-- no index lookup needed -->
      <unless name="@index@">
        <c:value-of name="@name@" />
      </unless>

      <!-- index lookup required -->
      <if name="@index@">
        <c:value-of name="@name@" index="@index@" />
      </if>
    </c:arg>
  </c:apply>
</template>
<!--
  Query a table (matrix) for matching rows

  The predicates given as children are collected into the `_qparams'
  vector and handed to the _mquery function along with the index of
  the last predicate.
-->
<template name="_query-row_"
          desc="Query a table (matrix) for a row (vector) of values">
  <param name="@table@"  desc="Table (matrix)" />
  <param name="@values@" desc="Query parameters" />

  <!-- this defaults to a table name constant as generated from the csv2xml
       script; either this or @table@ should be used -->
  <param name="@matrix@" desc="Matrix to look up from">
    <!-- convert @table@ to uppercase and snake case (replace - with _) -->
    <param-value name="@table@" upper="true" snake="true" />
    <text>_RATE_TABLE</text>
  </param>

  <c:let>
    <c:values>
      <c:value name="_qparams" type="integer" set="matrix"
               desc="Query parameters">
        <c:vector>
          <!-- table_basename lets the predicate templates derive
               column constant names from a bare field name -->
          <param-copy name="@values@">
            <param-meta name="table_basename" value="@matrix@" />
          </param-copy>
        </c:vector>
      </c:value>
    </c:values>

    <c:apply name="_mquery" matrix="@matrix@">
      <c:arg name="criteria">
        <c:value-of name="_qparams" />
      </c:arg>

      <c:arg name="i">
        <!-- begin with the last predicate (due to the way we'll recurse, it
             will be applied *last*) -->
        <t:dec>
          <c:length-of>
            <c:value-of name="_qparams" />
          </c:length-of>
        </t:dec>
      </c:arg>
    </c:apply>
  </c:let>
</template>
There is a series of \tt{_where-*_} templates for query predicates that
are analogous to the \tt{_match-*_} and \tt{_when-*_} templates used in
other contexts.
<!-- Comparison-specific query predicates.  Each `set' below supplies a
     template name, a comparison operator constant and a description
     for the template definition that follows; the generated template
     simply delegates to _where_ with that operator.
     NOTE(review): presumably one template is produced per `set';
     confirm against the inline-template documentation. -->
<inline-template>
  <for-each>
    <set tplname="_where-eq_"  op="CMP_OP_EQ"  desc="equal" />
    <set tplname="_where-lt_"  op="CMP_OP_LT"  desc="less than" />
    <set tplname="_where-lte_" op="CMP_OP_LTE" desc="less than or equal to" />
    <set tplname="_where-gt_"  op="CMP_OP_GT"  desc="greater than" />
    <set tplname="_where-gte_" op="CMP_OP_GTE" desc="greater than or equal to" />
  </for-each>

  <template name="@tplname@"
            desc="Field predicate for table query ({@desc@})">
    <param name="@values@" desc="Field value (provide only one node)" />
    <param name="@id@"     desc="Field index" />
    <param name="@field@"  desc="Field name (to be used with base)" />

    <!-- both of these default to the empty string -->
    <param name="@name@" desc="Field name (as a variable/constant)">
      <text></text>
    </param>
    <param name="@seqvar@" desc="Var/constant containing whether field is sequential">
      <text></text>
    </param>

    <t:where id="@id@"
             field="@field@" name="@name@"
             seqvar="@seqvar@" op="@op@">
      <expand-barrier>
        <param-copy name="@values@" />
      </expand-barrier>
    </t:where>
  </template>
</inline-template>
<!--
  Create a single field predicate for a query

  The predicate is a four-element vector: column index, expected
  value(s), sequential flag and comparison operator.
-->
<template name="_where_" desc="Create field predicate for query definition">
  <param name="@id@"         desc="Field index" />
  <param name="@values@"     desc="Field value (provide only one node)" />
  <param name="@sequential@" desc="Is data sequential?" />

  <!-- @name@ may be provided directly, or @field@ may be used when the
       basename is available (set by a query template), giving the illusion of
       querying the table columns by name directly (magic!); pure syntactic
       sugary goodness -->
  <param name="@field@" desc="Field name (to be used with base)" />
  <param name="@name@" desc="Field name (as a variable/constant)">
    <param-inherit meta="table_basename" />
    <text>_</text>
    <param-value name="@field@" upper="true" snake="true" />
  </param>

  <param name="@seqvar@" desc="Var/constant containing whether field is sequential">
    <param-inherit meta="table_basename" />
    <text>_</text>
    <param-value name="@field@" upper="true" snake="true" />
    <text>_IS_SEQ</text>
  </param>

  <param name="@op@"
         desc="Comparison operator (default CMP_OP_EQ; see CmpOp typedef)">
    <text>CMP_OP_EQ</text>
  </param>

  <c:vector label="Conditional for {@field@}">
    <!-- the first element will represent the column (field) index -->
    <unless name="@name@">
      <c:const value="@id@" desc="Field index" />
    </unless>
    <if name="@name@">
      <c:value-of name="@name@" />
    </if>

    <!-- the second element will represent the expected value(s) -->
    <c:vector>
      <param-copy name="@values@" />
    </c:vector>

    <!-- the third element will represent whether or not this field is sequential -->
    <if name="@sequential@">
      <c:const value="@sequential@" desc="Whether data is sequential" />
    </if>
    <unless name="@sequential@">
      <!-- if a field name was given, we can get the sequential information
           that was already generated for us -->
      <if name="@field@">
        <c:value-of name="@seqvar@" />
      </if>

      <!-- otherwise, default to non-sequential -->
      <unless name="@field@">
        <c:value-of name="FALSE" />
      </unless>
    </unless>

    <!-- the fourth and final element is the comparison operator -->
    <c:value-of name="@op@" />
  </c:vector>
</template>
<!--
  Deprecated equality predicate; use _where-eq_ instead.

  _when_ was meant to mirror the use of `when' elsewhere (for cases
  and value predicates), but it reads awkwardly in a query
  abstraction.  It emits a deprecation warning and then delegates to
  _where_ with the operator fixed to CMP_OP_EQ; the @op@ parameter is
  accepted but is not forwarded.
-->
<template name="_when_"
          desc="Create field predicate for query definition (deprecated; use _where-*_)">
  <param name="@values@"     desc="Field value (provide only one node)" />
  <param name="@id@"         desc="Field index" />
  <param name="@sequential@" desc="Is data sequential?" />
  <param name="@field@"      desc="Field name (to be used with base)" />

  <!-- the remaining parameters all default to the empty string -->
  <param name="@name@" desc="Field name (as a variable/constant)">
    <text></text>
  </param>
  <param name="@seqvar@" desc="Var/constant containing whether field is sequential">
    <text></text>
  </param>
  <param name="@op@"
         desc="Comparison operator (default CMP_OP_EQ; see CmpOp typedef)">
    <text></text>
  </param>

  <warning>
    _when_ is deprecated; use _where-eq_ instead
  </warning>

  <t:where id="@id@"
           field="@field@" name="@name@"
           sequential="@sequential@" seqvar="@seqvar@"
           op="CMP_OP_EQ">
    <param-copy name="@values@" />
  </t:where>
</template>
<!--
These functions make the magic happen
They are hideous. Look away.
-->
<!--
  Apply query criteria to a matrix, one predicate per recursion step

  This function is intended to be called by the _query-row_ template,
  not directly.

  Each row of `criteria' is a predicate of the form
  [column, value(s), sequential flag, comparison operator].  The
  function recurses with i decremented until i is -1, at which point
  the matrix itself is returned; on the way back out, each predicate
  is applied with mfilter to the result of the predicates before it.
-->
<function name="_mquery" desc="Query for vectors using a set of column criteria">
  <param name="matrix"   type="float" set="matrix" desc="Matrix to query" />
  <param name="criteria" type="float" set="matrix" desc="Query criteria" />
  <param name="i"        type="integer" desc="Current criteria index" />

  <c:cases>
    <c:case>
      <!-- it's important that we allow index 0, since that is a valid
           predicate -->
      <t:when-eq name="i" value="#-1" />

      <!-- we're done; stick with the result -->
      <c:value-of name="matrix" />
    </c:case>

    <c:otherwise>
      <c:apply name="mfilter">
        <!-- matrix to search -->
        <c:arg name="matrix">
          <!-- >> recursion happens here << -->
          <c:recurse>
            <c:arg name="i">
              <t:dec>
                <c:value-of name="i" />
              </t:dec>
            </c:arg>
          </c:recurse>
        </c:arg>

        <!-- field (column) -->
        <c:arg name="col">
          <c:value-of name="criteria">
            <c:index>
              <c:value-of name="i" />
            </c:index>
            <c:index>
              <c:const value="0" desc="Field id" />
            </c:index>
          </c:value-of>
        </c:arg>

        <!-- value(s) to search for -->
        <c:arg name="vals">
          <c:value-of name="criteria">
            <c:index>
              <c:value-of name="i" />
            </c:index>
            <c:index>
              <c:const value="1" desc="Field value" />
            </c:index>
          </c:value-of>
        </c:arg>

        <!-- if it's sequential, we can cut down on the search substantially -->
        <c:arg name="seq">
          <c:value-of name="criteria">
            <c:index>
              <c:value-of name="i" />
            </c:index>
            <c:index>
              <c:const value="2" desc="Sequential flag" />
            </c:index>
          </c:value-of>
        </c:arg>

        <!-- comparison operator -->
        <c:arg name="op">
          <c:value-of name="criteria">
            <c:index>
              <c:value-of name="i" />
            </c:index>
            <c:index>
              <c:const value="3" desc="Comparison operator" />
            </c:index>
          </c:value-of>
        </c:arg>
      </c:apply>
    </c:otherwise>
  </c:cases>
</function>
</package>