tame: Introduce __pkguniq and preproc:pkg-generate-id to replace generate-id

This modifies the XSLT-based compiler to generate ids that are expected to
be unique across packages.  No such guarantee exists today; `generate-id()`
relies on the position of the node within a tree, which could easily be the
same across multiple compiler invocations for separate packages.

This situation seldom occurs, but has happened with increased frequency
lately in a system with >1000 packages.  It is more likely to occur in
packages that are very similar to one another or where the beginning of the
package is similar (such as packages used as configuration for taxes for
each individual state).

This derives a SHA-256 hash from the canonical package name (well, not
canonical according to TAMER, but close: without the leading slash),
truncating it to 32 bits.  I used a birthday attack to estimate what the
size of this value ought to be: sqrt(2^32) = 65536, which is way more
packages than the poor XSLT-based compiler is going to handle.

If ever it needs to be increased due to conflicts, that is simple enough.

DEV-14965
main
Mike Gerwitz 2023-09-19 16:18:04 -04:00
parent bdd98a5d92
commit 418bd34005
5 changed files with 85 additions and 9 deletions

View File

@ -26,7 +26,9 @@
template that is intended for use with dslc should include this.
-->
<stylesheet version="2.0"
xmlns="http://www.w3.org/1999/XSL/Transform">
xmlns="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:preproc="http://www.lovullo.com/rater/preproc">
<!--
Absolute path to root of TAME
@ -80,6 +82,26 @@
-->
<param name="__rseed" />
<!--
A package-unique string
You should use `preproc:pkg-generate-id` instead of this value directly.
This value is deterministic, derived from `__srcpkg`, and so will not
change between runs; it can be used to generate identifier names that are
unique across packages, which is not something that we can rely on
`generate-id()` for on its own.
In practice, this can be concatenated with other generated strings,
including `generate-id()`-derived strings.
_There is no guarantee that this string will begin with a letter_, so you
should generate your identifiers accordingly.
See `DslCompiler.java` for implementation.
-->
<param name="__pkguniq" as="xs:string" />
<!--
Root node of template on which stylesheet was invoked
@ -114,4 +136,12 @@
</choose>
</template>
<function name="preproc:pkg-generate-id" as="xs:string">
<param name="refnode" as="node()" />
<sequence select="concat(
'_pu', $__pkguniq, '_',
generate-id( $refnode ) )" />
</function>
</stylesheet>

View File

@ -166,7 +166,7 @@
<template match="c:let[ not( @name ) ]" mode="preproc:expand" priority="5">
<copy>
<sequence select="@*" />
<attribute name="name" select="generate-id(.)" />
<attribute name="name" select="preproc:pkg-generate-id(.)" />
<apply-templates select="*" mode="preproc:expand" />
</copy>

View File

@ -415,11 +415,7 @@
<template match="lv:any|lv:all" mode="preproc:class-groupgen" priority="5">
<param name="legacy-classify" as="xs:boolean" tunnel="yes" />
<!-- this needs to be unique enough that there is unlikely to be a conflict
between generated ids in various packages; generate-id is not enough for
cross-package guarantees (indeed, I did witness conflicts), so there is
a random seed passed into the stylesheet externally -->
<variable name="id" select="concat( $__rseed, generate-id(.) )" />
<variable name="id" select="preproc:pkg-generate-id(.)" />
<variable name="parent-name" select="ancestor::lv:classify/@as" />
<variable name="yields" select="concat( 'is', $id )" />

View File

@ -411,7 +411,7 @@
with an optional looping construct
-->
<template match="lv:inline-template" mode="preproc:macros" priority="5">
<variable name="name" select="concat( '___i', generate-id(.), '___' )" />
<variable name="name" select="concat( '___i', preproc:pkg-generate-id(.), '___' )" />
<variable name="inline" select="." />
<!-- generate template -->
@ -1155,7 +1155,7 @@
tunnel="yes" />
<value-of select="." />
<value-of select="generate-id( $apply )" />
<value-of select="preproc:pkg-generate-id( $apply )" />
</template>

View File

@ -31,6 +31,7 @@ package com.lovullo.dslc;
import java.io.*;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.util.HashMap;
import java.util.Map;
import javax.xml.transform.Source;
@ -163,6 +164,10 @@ public class DslCompiler
new QName( "__rseed" ),
XdmValue.makeValue( (int)( Math.random() * 10e6 ) )
);
t.setParameter(
new QName( "__pkguniq" ),
XdmValue.makeValue( _createPkgUniq( srcpkg ) )
);
_setTemplateParams( t, params );
@ -173,6 +178,51 @@ public class DslCompiler
}
// Generate an identifier that is expected to be unique given a
// canonical package name.
//
// This produces a string that is expected to
// - Be usable as a non-leading component of a C-style identifier;
// - Provide enough entropy so as to be unlikely to cause
// conflicts between thousands of packages; and
// - Is reasonably short so as not to bloat generated identifier
// sizes too greatly.
private String _createPkgUniq(String srcpkg) throws Exception {
// This is used only for uniqueness, not security. The choice
// of hash function is not particularly important so long as it
// provides a good distribution. If collisions are encountered
// between packages with a good algorithm, increase the
// truncation length.
MessageDigest md = MessageDigest.getInstance( "SHA-256" );
byte[] digest = md.digest( srcpkg.getBytes( "UTF-8" ) );
// Ensure we received a string of the expected length, otherwise
// we risk sending a non-unique id off to the XSLT-based
// compiler.
int len_expected = 32; // 256 bits
int len = digest.length;
if ( len != len_expected ) {
throw new Exception(
String.format(
"Unexpected pkguniq length (expected %d): %d",
len_expected,
len
)
);
}
// 32 bits = 4 bytes = 8 hex chars.
// Birthday attack: sqrt(2^32) = 65536 packages
StringBuilder hex = new StringBuilder();
int use_len = 4;
for ( int i = 0; i < use_len; i++ ) {
hex.append( String.format( "%02x", digest[i] ) );
}
return hex.toString();
}
private void _setTemplateParams(
XsltTransformer t,
HashMap<String,String> params