csvm: Auto-sort expanded output

master
Mike Gerwitz 2018-10-03 14:44:55 -04:00
commit b716e8c2cd
3 changed files with 339 additions and 148 deletions

View File

@ -0,0 +1,195 @@
#!/usr/bin/awk -f
#
# Expands a "magic" CSV file into a normal CSV
#
# Copyright (C) 2016, 2018 R-T Specialty, LLC.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# "Magic" CSVs simply exist to make life easier: they permit comments, blank
# lines, variables, sub-delimiter expansion, and any number of ranges per line.
# Ranges will be expanded in every combination, making rate tables highly
# maintainable.
#
# Variables are also supported when defined using :var=val. Variables may
# expand into ranges, 'cause they're awesome. Multiple variables may be
# delimited by semi-colons, as may multiple values.
#
# For example:
# :foo=1--3
# $foo;7;9--10;$foo, 5--10,1/1/2017
#
# Would generate:
# 1, 5, 1483246800
# 1, 6, 1483246800
# ...
# 5, 10, 1483246800
# 2, 5, 1483246800
# ...
# 9, 5, 1483246800
# ...
# 1, 5, 1483246800
# 1, 6, 1483246800
# ...
##
# Expand variable with its value, if any
#
# If S consists solely of a variable reference of the form `$name', the
# value stored under that name in the global `vars' array is returned (the
# value may itself be a range or a delimited list).  Any other string is
# returned unmodified.
#
# Referencing a name that has no entry in `vars' is an error: a message is
# written to stderr and the program exits with status 1.
#
# VALUE and M are locals (awk declares locals as extra parameters); callers
# pass only S.
function expand_vars( s,    value, m )
{
    # not a variable reference; nothing to expand
    if ( !match( s, /^\$([a-zA-Z_-]+)$/, m ) )
    {
        return s
    }

    # test for membership rather than comparing the value against "" so
    # that a variable defined with an empty value is not misreported as
    # unknown, and so that the lookup itself does not create the element
    if ( !( m[1] in vars ) )
    {
        print "error: unknown variable reference: `$" m[1] "'" > "/dev/stderr"
        exit 1
    }

    value = vars[ m[1] ]
    return value
}
# Expand line
#
# Expands the current record starting at field I and prints every resulting
# row.  Fields are handled left-to-right by recursion: once I exceeds NF the
# record is complete and is printed.  For field I, in order:
#   - a variable reference is replaced with its value (see expand_vars);
#   - a date of the form N/N/N is replaced with the output of
#     `date --date=N/N/N +%s';
#   - a `;'-delimited list is expanded once for its first value and once
#     for the remaining value(s);
#   - a range `a--b' is expanded once for each integer from a through b.
#
# An invalid range or a date that cannot be converted is an error: a message
# is written to stderr and the program exits with status 1.
#
# $I is restored to its original value before returning so that callers
# further up the recursion see the field as they left it.
#
# M, J, ME, ORIG and CMD are locals; callers pass only I.
function parseline( i,    m, j, me, orig, cmd )
{
    # every field has been expanded; output the row
    if ( i > NF )
    {
        print
        return
    }

    orig = $i

    # expand variables before any processing so that expansions
    # can include any type of formatting
    $i = expand_vars( $i )

    if ( match( $i, /^([0-9]+\/){2}[0-9]+$/, m ) )
    {
        # the pattern above admits only digits and slashes, so the value
        # can be placed on the command line as-is
        cmd = "date --date=" $i " +%s"

        # getline yields no record when the command produces no output;
        # without this check the unconverted date would silently end up
        # in the output
        if ( ( cmd |& getline $i ) <= 0 )
        {
            close( cmd )
            print "error: invalid date: `" $i "'" > "/dev/stderr"
            exit 1
        }

        close( cmd )
    }

    # check first for delimiters
    if ( match( $i, /^([^;]+);(.*)$/, m ) )
    {
        # give it a shot with the first value
        $i = m[1]
        parseline( i )

        # strip off the first value and process with following value(s)
        $i = m[2]
        parseline( i )

        # we've delegated; we're done
        $i = orig
        return
    }

    # parse range
    if ( match( $i, /^([^-]+)--([^-]+)$/, m ) )
    {
        # either bound may itself be a variable reference
        j  = expand_vars( m[1] )
        me = expand_vars( m[2] )

        if ( !match( j, /^[0-9]+$/ ) || !match( me, /^[0-9]+$/ ) )
        {
            print "error: invalid range: `" $i "'" > "/dev/stderr"
            exit 1
        }

        # the body runs at least once, so the lower bound is always output
        do
        {
            $i = j
            parseline( i + 1 )
        } while ( j++ < me )
    }
    else
    {
        # nothing further to expand in this field; move on to the next
        parseline( i + 1 );
    }

    # restore to original value
    $i = orig
}
BEGIN {
    # no directive line has been seen yet; this placeholder stands in for
    # one until (and unless) the input provides its own
    directives     = "!(NODIRECTIVES)"
    has_directives = 0

    # input fields are separated by a comma with optional surrounding
    # spaces; output fields are joined with a bare comma
    OFS = ","
    FS  = " *, *"
}
# comment lines (those beginning with `#') and empty lines are dropped
$0 ~ /^#|^$/ {
    next
}
# A line beginning with `!' is a directive line.  Directives are not
# interpreted here: the line is recorded and later echoed back for
# processing by the parent csvm2csv script.
/^!/ {
    # once output has begun, the directive line has already been written
    if ( output_started )
    {
        print "error: directive must appear before header: `" $0 "'" > "/dev/stderr"
        exit 1
    }

    # only a single directive line is permitted
    if ( has_directives )
    {
        print "error: all directives must be on one line: `" $0 "'" > "/dev/stderr"
        exit 1
    }

    directives     = $0
    has_directives = 1
    next
}
# A line beginning with a colon defines a variable (`:name=value'); the
# value is stored as written in `vars' under its name
/^:/ {
    if ( match( $0, /^:([a-zA-Z_-]+)=(.*?)$/, m ) )
    {
        vars[ m[1] ] = m[2]
        next
    }

    # began with a colon but is not of the form `:name=value'
    print "error: invalid variable definition: `" $0 "'" > "/dev/stderr"
    exit 1
}
# The first line of output is always a directive line, even when the input
# has none (the placeholder is used instead).  Consumers can therefore treat
# the top line uniformly rather than conditionally ignoring it.
!output_started {
    output_started = 1
    print directives
}
# Every remaining line is a row of the CSV
{
    if ( $0 ~ /--|;|\$[a-zA-Z_-]|\// )
    {
        # the row needs some sort of processing (ranges, dates, etc)
        parseline( 1 )
    }
    else
    {
        # a normal row; reassigning a field makes awk rebuild the record
        # with OFS, which strips extra spaces between commas
        $1 = $1
        print
    }
}

View File

@ -1,8 +1,7 @@
#!/usr/bin/awk -f
#
#!/bin/bash
# Compiles a "magic" CSV file into a normal CSV
#
# Copyright (C) 2016, 2018 R-T Specialty, LLC.
# Copyright (C) 2018 R-T Specialty, LLC.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -17,150 +16,95 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# "Magic" CSVs simply exist to make life easier: they permit comments, blank
# lines, variables, sub-delimiter expansion, and any number of ranges per line.
# Ranges will be expanded in every combination, making rate tables highly
# maintainable.
# For format of CSVMs, see `csvm-expand'.
#
# Variables are also supported when defined using :var=val. Variables may
# expand into ranges, 'cause they're awesome. Multiple variables may be
# delimited by semi-colons, as may multiple values.
#
# For example:
# :foo=1--3
# $foo;7;9--10:$foo, 5--10,1/1/2017
#
# Would generate:
# 1, 5, 1483246800
# 1, 6, 1483246800
# ...
# 5, 10, 1483246800
# 2, 5, 1483246800
# ...
# 9, 5, 14832468005
# ...
# 1, 5, 1483246800
# 1, 6, 1483246800
# ...
# To disable sorting of CSVM output, use the `!NOSORT' directive before the
# header line.
##
# account for symlinks, since historically this script lives in a different
# directory and has been symlinked for compatibility
declare -r mypath=$( dirname "$( readlink -f "$0" )" )
# Expand variable with its value, if any
function expand_vars( s, value )
# Generate -k arguments for GNU sort given a CSV header
#
# The generated arguments will be of the form -k1,1n ... -kl,ln, where `l'
# is the total number of header entries.
#
# For example, given this header:
# foo, bar, baz
# the output would be:
# -k1,1n -k2,2n -k3,3n
sort-key-args()
{
# attempt to parse variable (may expand into a range)
if ( match( s, /^\$([a-zA-Z_-]+)$/, m ) )
{
value = vars[ m[1] ];
local -r header="${1?Missing CSV header}"
if ( value == "" )
{
print "error: unknown variable reference: `$" m[1] "'" > "/dev/stderr"
exit 1
local -i i=0
# generate -ki,in for each column (notice that a trailing
# comma is added to the header because of the read delimiter)
while read -d,; do
echo -n "-k$((++i)),${i}n "
done <<< "$header,"
}
# Sort every column of CSV
#
# Reads a CSV from stdin whose first line is the header.  The header is left
# in place as the first row of output; all remaining rows follow, sorted by
# every column left-to-right using the keys produced by `sort-key-args'.
#
# Nothing is output when stdin is empty.
csv-sort()
{
    # the first line of the expanded CSVM is the CSV header; if there is no
    # input at all then there is nothing to sort, and echoing the (empty)
    # header would turn empty input into a blank line of output
    local header
    read -r header || [[ -n "$header" ]] || return 0

    local -r keys=$( sort-key-args "$header" )

    # printf rather than echo so that a header that happens to look like an
    # echo option (e.g. `-n') is still output verbatim
    printf '%s\n' "$header"

    # all remaining input (which is now sans header) is sorted; $keys is
    # deliberately unquoted so that it splits into separate -k arguments
    sort -t, $keys -
}
# Output usage information
#
# Kudos to you if you understand the little Easter egg.
#
# The usage text is written to stdout, after which the script is terminated
# with status 64 (EX_USAGE); this function does not return to its caller.
usage()
{
cat <<EOU
Usage: $0 [FILE]
Expand CSVM represented by FILE or stdin into a CSV
The columns of the expanded CSV will be automatically sorted
left-to-right. To inhibit this behavior, use the \`!NOSORT'
directive anywhere before the header line in the source CSVM.
Options:
--help Output usage information.
This program has magic CSV powers.
EOU
exit 64 # EX_USAGE
}
# Sort CSV rows left-to-right unless the `!NOSORT' directive is provided
main()
{
test ! "$1" == --help || usage
"$mypath/csvm-expand" "$@" \
| {
    # the first line of the expanded output is the directive line; it is
    # consumed here so that it never reaches the final CSV
    local directives; read -r directives

    # ignore sorting if given NOSORT directive
    if [[ "$directives" =~ NOSORT ]]; then
        cat
    else
        # csv-sort reads the CSV from stdin and takes no arguments (this
        # previously passed "$sort", which is not set anywhere in this
        # script)
        csv-sort
    fi
}
return value
}
return s
}
# Expand line
function parseline( i, m, j, me, orig )
{
if ( i > NF )
{
print
return
}
orig = $i
# expand variables before any processing so that expansions
# can include any type of formatting
$i = expand_vars( $i )
if ( match( $i, /^([0-9]+\/){2}[0-9]+$/, m ) )
{
cmd = "date --date=" $i " +%s"
cmd |& getline $i
close(cmd)
}
# check first for delimiters
if ( match( $i, /^([^;]+);(.*)$/, m ) )
{
# give it a shot with the first value
$i = m[1]
parseline( i )
# strip off the first value and process with following value(s)
$i = m[2]
parseline( i )
# we've delegated; we're done
$i = orig
return
}
# parse range
if ( match( $i, /^([^-]+)--([^-]+)$/, m ) )
{
j = expand_vars( m[1] )
me = expand_vars( m[2] )
if ( !match( j, /^[0-9]+$/ ) || !match( me, /^[0-9]+$/ ) )
{
print "error: invalid range: `" $i "'" > "/dev/stderr"
exit 1
}
do
{
$i = j
parseline( i + 1 )
} while ( j++ < me )
}
else
{
parseline( i + 1 );
}
# restore to original value
$i = orig
}
BEGIN {
# we're parsing CSVs
FS = " *, *"
OFS = ","
}
# skip all lines that begin with `#', which denotes a comment, or are empty
/^#|^$/ { next; }
# lines that begin with a colon are variable definitions
/^:/ {
if ( !match( $0, /^:([a-zA-Z_-]+)=(.*?)$/, m ) )
{
print "error: invalid variable definition: `" $0 "'" > "/dev/stderr"
exit 1
}
vars[ m[1] ] = m[2]
next
}
# lines that need any sort of processing (ranges, dates, etc)
/--|;|\$[a-zA-Z_-]|\// { parseline( 1 ); next; }
# all other lines are normal; simply output them verbatim
{
# this assignment will ensure that awk processes the output, ensuring that
# extra spaces between commas are stripped
$1=$1
print
}
main "$@"

View File

@ -38,7 +38,10 @@ run-test()
test $? -eq 0 || return 1
# expected output
diff <( cat <<< "$expected" ) <( cat <<< "$given" )
diff <( cat <<< "$expected" ) <( cat <<< "$given" ) || {
echo "test $testsum failure" >&2
return 1
}
}
@ -92,11 +95,11 @@ test-delim()
declare -r expected='header,line
1,2
3,6
3,9
4,2
4,6
4,9
3,6
3,9'
4,9'
run-test "$input" "$expected"
}
@ -179,11 +182,12 @@ test-var-with-var()
:baz=$range;$foo
$baz, 5'
# note that the output is sorted
declare -r expected='header,line
2,5
2,5
3,5
4,5
2,5'
4,5'
run-test "$input" "$expected"
}
@ -203,6 +207,51 @@ $foo'
}
# A directive line in the input is expected to be absent from the output
test-directive-stripped()
{
    local -r input='!DIRECTIVE
header, line'

    local -r expected='header,line'

    run-test "$input" "$expected"
}
# With the `!NOSORT' directive, rows are expected in their input order
test-no-sort()
{
    local -r input='!NOSORT
header, line
1,1
0,0'

    local -r expected='header,line
1,1
0,0'

    run-test "$input" "$expected"
}
# all directives should be put on a single line
#
# The combined stdout/stderr of csvm2csv is captured and must mention the
# offending second directive line.
test-fail-multi-directive()
{
    local -r input='!DIRECTIVE1
!DIRECTIVE2
header, line'

    ((testsum++))

    # a marker is appended to the captured output should the command
    # unexpectedly succeed
    local -r result=$(
        ../csvm2csv 2>&1 <<< "$input" \
            && echo '(test failure: expected failure)'
    )

    if ! grep -q '!DIRECTIVE2' <<< "$result"; then
        return 1
    fi
}
test-fail-unknown-var-ref()
{
((testsum++))
@ -254,6 +303,9 @@ test-comment \
&& test-var-with-range-delim \
&& test-var-with-var \
&& test-var-zero-ref \
&& test-directive-stripped \
&& test-no-sort \
&& test-fail-multi-directive \
&& test-fail-unknown-var-ref \
&& test-fail-non-numeric-range \
&& test-fail-invalid-var-dfn \
@ -263,7 +315,7 @@ test-comment \
}
# safety check
test "$testsum" -eq 12 || {
test "$testsum" -eq 15 || {
echo 'error: did not run all csvm2csv tests!' >&2
exit 1
}