2018-10-03 14:21:35 -04:00
|
|
|
#!/usr/bin/awk -f
|
|
|
|
#
|
|
|
|
# Expands a "magic" CSV file into a normal CSV
|
|
|
|
#
|
2019-02-07 13:23:09 -05:00
|
|
|
# Copyright (C) 2014-2019 Ryan Specialty Group, LLC.
|
2018-10-03 14:21:35 -04:00
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
#
|
|
|
|
# "Magic" CSVs simply exist to make life easier: they permit comments, blank
|
|
|
|
# lines, variables, sub-delimiter expansion, and any number of ranges per line.
|
|
|
|
# Ranges will be expanded in every combination, making rate tables highly
|
|
|
|
# maintainable.
|
|
|
|
#
|
|
|
|
# Variables are also supported when defined using :var=val. Variables may
|
|
|
|
# expand into ranges, 'cause they're awesome. Multiple variables may be
|
|
|
|
# delimited by semi-colons, as may multiple values.
|
|
|
|
#
|
|
|
|
# For example:
|
|
|
|
# :foo=1--3
|
|
|
|
# $foo;7;9--10:$foo, 5--10,1/1/2017
|
|
|
|
#
|
|
|
|
# Would generate:
|
|
|
|
# 1, 5, 1483246800
|
|
|
|
# 1, 6, 1483246800
|
|
|
|
# ...
|
|
|
|
# 5, 10, 1483246800
|
|
|
|
# 2, 5, 1483246800
|
|
|
|
# ...
|
|
|
|
# 9, 5, 1483246800
|
|
|
|
# ...
|
|
|
|
# 1, 5, 1483246800
|
|
|
|
# 1, 6, 1483246800
|
|
|
|
# ...
|
|
|
|
##
|
|
|
|
|
2019-04-02 10:58:12 -04:00
|
|
|
# Command line of the coprocess that parse_date() talks to: it reads one
# date string per line on standard input and writes a Unix timestamp for it
# (output is unbuffered via stdbuf so that each answer can be read back
# right away).
BEGIN {
  date_cmd = "stdbuf -o0 date -f- +%s"
}

# Shut down the date coprocess once processing has finished.
END {
  close( date_cmd )
}
|
|
|
|
|
|
|
|
|
|
|
|
# Parse a date string into a Unix timestamp (memoized)
#
# Field `i' of the current record is replaced in place with the timestamp
# that the `date_cmd' coprocess reports for it; nothing is returned.
#
# This spawns a single process for date and reads from standard in.  Even
# then, though, date parsing is very slow for many thousands of rows, so the
# output is also cached in `date_cache', keyed by the original date string.
#
# Parameters:
#   i   - index of the field to convert
#   src - (local) original contents of the field
#   ts  - (local) timestamp read back from the coprocess
#
# Exits with status 1 if no result can be read back from the coprocess.
#
# NOTE(review): if date rejects its input and writes nothing to standard
# out, the read below presumably blocks; callers should only pass
# date-shaped strings -- confirm.
function parse_date( i,    src, ts )
{
  src = $i

  # test membership rather than truthiness so that a cached value of `0'
  # is still a hit and so that lookups do not create empty entries
  if ( src in date_cache )
  {
    $i = date_cache[ src ]
    return
  }

  print src |& date_cmd

  # getline yields 0 on end-of-file and -1 on error; in either case there
  # is no timestamp, and leaving the raw date in the output would be wrong
  if ( ( date_cmd |& getline ts ) <= 0 )
  {
    print "error: unable to parse date: `" src "'" > "/dev/stderr"
    exit 1
  }

  date_cache[ src ] = ts
  $i = ts
}
|
|
|
|
|
2018-10-03 14:21:35 -04:00
|
|
|
|
|
|
|
# Expand variable with its value, if any
#
# If `s' consists solely of a variable reference (`$name', where the name is
# made up of letters, underscores and dashes), the value stored for that
# name in `vars' is returned; the value may itself be a range or list that
# the caller goes on to process.  Any other string is returned unchanged.
#
# Parameters:
#   s     - string to expand
#   value - (local) value looked up for the variable
#   m     - (local) capture groups from match(); kept local so that the
#           global `m' used by the variable-definition rule is not touched
#
# Exits with status 1 if the referenced variable has no (or an empty) value.
function expand_vars( s,    value, m )
{
  # anything that is not exactly a variable reference passes through
  if ( !match( s, /^\$([a-zA-Z_-]+)$/, m ) )
  {
    return s
  }

  value = vars[ m[1] ];

  if ( value == "" )
  {
    print "error: unknown variable reference: `$" m[1] "'" > "/dev/stderr"
    exit 1
  }

  return value
}
|
|
|
|
|
|
|
|
|
|
|
|
# Expand line
#
# Processes the current record starting at field `i', recursing once per
# alternative so that every combination of list items and range values is
# printed as its own record.  Each invocation puts field `i' back to the
# value it had on entry before returning.
#
# Parameters:
#   i     - index of the field to process
#   m     - (local) capture groups from match()
#   lo    - (local) current value within a range (starts at the lower bound)
#   hi    - (local) upper bound of a range
#   saved - (local) contents of field `i' on entry
#
# Exits with status 1 if a range does not have unsigned integer bounds.
function parseline( i,    m, lo, hi, saved )
{
  # ran past the last field: the record is fully expanded, so emit it
  if ( i > NF )
  {
    print
    return
  }

  saved = $i

  # variables are expanded first so that their values may contain any
  # of the formatting handled below
  $i = expand_vars( $i )

  # something that looks like N/N/N is treated as a date
  if ( $i ~ /^([0-9]+\/){2}[0-9]+$/ )
  {
    parse_date( i )
  }

  # semicolon-delimited list: handle the first item, then whatever is left,
  # re-entering at this same field for each
  if ( match( $i, /^([^;]+);(.*)$/, m ) )
  {
    $i = m[1]
    parseline( i )

    $i = m[2]
    parseline( i )

    # the recursive calls did all of the work
    $i = saved
    return
  }

  # not a range: nothing more to do for this field, move on to the next
  if ( !match( $i, /^([^-]+)--([^-]+)$/, m ) )
  {
    parseline( i + 1 )

    $i = saved
    return
  }

  # range: either bound may itself be a variable reference
  lo = expand_vars( m[1] )
  hi = expand_vars( m[2] )

  if ( lo !~ /^[0-9]+$/ || hi !~ /^[0-9]+$/ )
  {
    print "error: invalid range: `" $i "'" > "/dev/stderr"
    exit 1
  }

  # the lower bound is always emitted, followed by each successive value
  # up to and including the upper bound
  do
  {
    $i = lo
    parseline( i + 1 )
  } while ( lo++ < hi )

  # restore to original value
  $i = saved
}
|
|
|
|
|
|
|
|
|
|
|
|
BEGIN {
  # input is CSV: fields are separated by a comma along with any
  # whitespace on either side of it
  FS = "[[:space:]]*,[[:space:]]*"

  # output is CSV without any padding around the commas
  OFS = ","

  # placeholder printed as the directive line unless the input provides a
  # directive line of its own
  directives = "!(NODIRECTIVES)"
  has_directives = 0
}
|
|
|
|
|
|
|
|
|
|
|
|
# comments (lines starting with `#') and empty lines produce no output
/^(#|$)/ { next }
|
|
|
|
|
|
|
|
# Lines beginning with `!' are directives.  They are not interpreted here:
# the line is remembered and echoed back as the first line of output, and is
# intended for processing by the parent csvm2csv script.
#
# Only a single directive line is accepted, and it must come before any
# line that produces output.
/^!/ {
  if ( output_started )
  {
    print "error: directive must appear before header: `" $0 "'" > "/dev/stderr"
    exit 1
  }

  if ( has_directives )
  {
    print "error: all directives must be on one line: `" $0 "'" > "/dev/stderr"
    exit 1
  }

  directives = $0
  has_directives = 1

  next
}
|
|
|
|
|
|
|
|
# Lines that begin with a colon are variable definitions of the form
# `:name=value'.  The name may contain letters, underscores and dashes;
# everything after the first `=' is stored as the value, exactly as written,
# in `vars' for later lookup by expand_vars().
#
# Exits with status 1 if the line does not have that form.
/^:/ {
  # the value is captured with a plain greedy `.*': awk regexes are EREs,
  # which have no lazy quantifiers, so `.*?' is merely a doubled quantifier
  if ( !match( $0, /^:([a-zA-Z_-]+)=(.*)$/, m ) )
  {
    print "error: invalid variable definition: `" $0 "'" > "/dev/stderr"
    exit 1
  }

  vars[ m[1] ] = m[2]
  next
}
|
|
|
|
|
|
|
|
# The first line of output is always the directive line, whether or not the
# input supplied one (a placeholder is printed otherwise).  Subsequent
# processing is much easier when the top line never has to be ignored
# conditionally.
!output_started {
  output_started = 1

  print directives
}
|
|
|
|
|
|
|
|
# Every remaining line is data
{
  # lines that need any sort of processing (ranges, lists, variable
  # references, dates) are expanded field by field
  if ( $0 ~ /--|;|\$[a-zA-Z_-]|\// )
  {
    parseline( 1 )
    next
  }

  # All other lines are normal and are output as they are, apart from
  # spacing: assigning to a field makes awk rebuild the record using OFS,
  # which strips the extra spaces between commas.
  $1 = $1
  print
}
|