tame/build-aux/csvm-expand

#!/usr/bin/awk -f
#
# Expands a "magic" CSV file into a normal CSV
#
#   Copyright (C) 2014-2019 Ryan Specialty Group, LLC.
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# "Magic" CSVs simply exist to make life easier: they permit comments, blank
# lines, variables, sub-delimiter expansion, and any number of ranges per line.
# Ranges will be expanded in every combination, making rate tables highly
# maintainable.
#
# Variables are also supported when defined using :var=val. Variables may
# expand into ranges, 'cause they're awesome. Multiple variables may be
# delimited by semi-colons, as may multiple values.
#
# For example:
#   :foo=1--3
#   $foo;7;9--10;$foo, 5--10,1/1/2017
#
# Would generate:
#   1, 5, 1483246800
#   1, 6, 1483246800
#   ...
#   1, 10, 1483246800
#   2, 5, 1483246800
#   ...
#   9, 5, 1483246800
#   ...
#   1, 5, 1483246800
#   1, 6, 1483246800
#   ...
##

# Command used to convert dates into Unix timestamps.  `parse_date' runs it
# as a single long-lived coprocess: `date -f-' reads one date per line from
# standard in and `+%s' prints each one as seconds since the epoch.
# `stdbuf -o0' turns off date's output buffering so that each result can be
# read back before the next date is written.
BEGIN {
  date_cmd = "stdbuf -o0 date -f- +%s"
}

# Shut down the date coprocess at the end of the run.  END rules also run
# after `exit', so this covers the error paths as well.
END {
  close( date_cmd )
}


# Parse the date string in field `i' into a Unix timestamp (memoized)
#
# The field is replaced in place with the timestamp; nothing is returned.
# All dates are fed to a single `date' coprocess (see `date_cmd') rather
# than spawning one process per date.  Even then, date parsing is very slow
# for many thousands of rows, so each result is also cached in `date_cache',
# keyed by the original date string.
#
# If nothing can be read back from the coprocess (for example, because it
# could not be started), an error is written to standard error and the
# script exits with status 1 instead of silently emitting the unparsed date.
#
# NOTE(review): if `date' rejects a string and writes only to standard
# error, the read below presumably blocks waiting for output -- confirm.
function parse_date( i,   src, ts )
{
  src = $i

  # test membership rather than truthiness so that a cached timestamp of 0
  # is still a hit and a miss does not create an empty cache entry
  if ( src in date_cache )
  {
    $i = date_cache[ src ]
    return
  }

  print src |& date_cmd

  # read into a local first so that a failed read cannot clobber the field
  if ( ( date_cmd |& getline ts ) <= 0 )
  {
    print "error: unable to parse date: `" src "'" > "/dev/stderr"
    exit 1
  }

  date_cache[ src ] = ts
  $i = ts
}


# Expand a variable reference into its value, if `s' is one
#
# `s' is treated as a reference only when it consists entirely of `$'
# followed by a variable name (letters, underscores and hyphens); any other
# string is returned unchanged.  Values are looked up in `vars' and may
# themselves be ranges or `;'-delimited lists for the caller to expand.
#
# A reference to a variable with no value---either never defined, or defined
# as the empty string---is an error: a message is written to standard error
# and the script exits with status 1.
#
# Only `s' is meant to be passed; `value' and `m' are locals.  `m' is
# declared here so that the match does not overwrite the global array of
# the same name.
function expand_vars( s,   value, m )
{
  # attempt to parse variable (may expand into a range)
  if ( match( s, /^\$([a-zA-Z_-]+)$/, m ) )
  {
    value = vars[ m[1] ];

    if ( value == "" )
    {
      print "error: unknown variable reference: `$" m[1] "'" > "/dev/stderr"
      exit 1
    }

    return value
  }

  return s
}


# Recursively expand field `i' and every field after it
#
# Each field is resolved in turn---variable expansion, date conversion,
# `;'-delimited alternatives and `--' ranges---and the record is printed once
# per combination when the recursion walks past the last field.  Fields are
# rewritten in place while recursing and put back before returning, so the
# caller sees the field as it left it.
#
# Only `i' is meant to be passed; the remaining parameters are locals.
function parseline( i,   parts, cur, stop, saved )
{
  # every field has been resolved: emit this combination
  if ( i > NF )
  {
    print
    return
  }

  saved = $i

  # variables are expanded first so that their values may themselves be
  # dates, lists or ranges
  $i = expand_vars( $i )

  # dates (three `/'-separated numbers) become Unix timestamps
  if ( $i ~ /^([0-9]+\/){2}[0-9]+$/ )
  {
    parse_date( i )
  }

  if ( match( $i, /^([^;]+);(.*)$/, parts ) )
  {
    # `;'-delimited alternatives: handle the first on its own, then the
    # remainder, which may hold further alternatives; both calls stay on
    # this field so that each alternative is itself fully expanded
    $i = parts[1]
    parseline( i )

    $i = parts[2]
    parseline( i )
  }
  else if ( match( $i, /^([^-]+)--([^-]+)$/, parts ) )
  {
    # inclusive range; either bound may itself be a variable
    cur  = expand_vars( parts[1] )
    stop = expand_vars( parts[2] )

    if ( !( cur ~ /^[0-9]+$/ && stop ~ /^[0-9]+$/ ) )
    {
      print "error: invalid range: `" $i "'" > "/dev/stderr"
      exit 1
    }

    # the lower bound is always emitted, even if it exceeds the upper bound
    while ( 1 )
    {
      $i = cur
      parseline( i + 1 )

      if ( !( cur++ < stop ) )
      {
        break
      }
    }
  }
  else
  {
    # plain value: move on to the next field
    parseline( i + 1 )
  }

  # put the field back for our caller
  $i = saved
}


BEGIN {
  # until a directive line is seen, this placeholder is what gets echoed
  # as the first line of output
  directives     = "!(NODIRECTIVES)"
  has_directives = 0

  # input fields are comma-delimited, with any whitespace around the comma
  # treated as part of the delimiter; output uses a bare comma
  OFS = ","
  FS  = "[[:space:]]*,[[:space:]]*"
}


# Skip comments and blank lines.  A comment is a line whose first character
# is `#'.  A line counts as blank if it is empty or holds nothing but
# whitespace (including a stray carriage return from CRLF line endings);
# such a line would otherwise be emitted as a bogus row.
/^#/ || /^[[:space:]]*$/ { next; }

# A line beginning with `!' holds directives.  It is not output here;
# instead it is remembered and later echoed as the first line of output,
# where it is intended for processing by the parent csvm2csv script.  Only
# one such line is permitted, and it must appear before any row has been
# output.
/^!/ {
  if ( output_started )
  {
    print "error: directive must appear before header: `" $0 "'" > "/dev/stderr"
    exit 1
  }

  if ( has_directives )
  {
    print "error: all directives must be on one line: `" $0 "'" > "/dev/stderr"
    exit 1
  }

  directives     = $0
  has_directives = 1

  next
}

# A line beginning with `:' defines a variable as `:name=value'.  The name
# may contain letters, underscores and hyphens; the value is everything
# after the first `=' and is stored as-is in `vars', replacing any earlier
# definition of the same name.  Anything else starting with `:' is an error.
/^:/ {
  if ( match( $0, /^:([a-zA-Z_-]+)=(.*?)$/, m ) )
  {
    vars[ m[1] ] = m[2]
    next
  }

  print "error: invalid variable definition: `" $0 "'" > "/dev/stderr"
  exit 1
}

# The first line to get this far triggers the directive line, which is
# printed before that line's own output.  It is printed even when no
# directives were given (as a placeholder), so that subsequent processing
# never has to decide whether the top line should be ignored.
{
  if ( !output_started )
  {
    output_started = 1

    print directives
  }
}

# Output the row(s) for this line.
{
  # anything that looks like a range, a `;'-delimited list, a variable
  # reference or a date needs to be expanded, possibly into many rows
  if ( $0 ~ /--|;|\$[a-zA-Z_-]|\// )
  {
    parseline( 1 )
    next
  }

  # all other lines are normal and are output as they are, except that
  # assigning a field makes awk rebuild the line, which strips the extra
  # spaces between commas
  $1 = $1
  print
}
csvm: Auto-sort expanded output This will allow the variable abstractions to fully encapsulate values while still permitting binary searches on sorted rows. * csvm-expand: Renamed from `csvm2csv'. Add directive support. * csvm2csv: New script to perform sorting. Invokes aforementioned. * test/test-csvm2csv: Update for sorting. 2018-10-03 14:21:35 -04:00			`#!/usr/bin/awk -f`
			`#`
			`# Expands a "magic" CSV file into a normal CSV`
			`#`
Copyright year simplification and update to Ryan Specialty Group This now uses year ranges, which I'll update annually. This also renames "R-T Specialty" to "Ryan Specialty Group". The latter is the parent company of the former. I was originally employed under the former when LoVullo Associates was purchased, but I now work for the parent company. 2019-02-07 13:23:09 -05:00			`# Copyright (C) 2014-2019 Ryan Specialty Group, LLC.`
csvm: Auto-sort expanded output This will allow the variable abstractions to fully encapsulate values while still permitting binary searches on sorted rows. * csvm-expand: Renamed from `csvm2csv'. Add directive support. * csvm2csv: New script to perform sorting. Invokes aforementioned. * test/test-csvm2csv: Update for sorting. 2018-10-03 14:21:35 -04:00			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`#`
			`# "Magic" CSVs simply exist to make life easier: they permit comments, blank`
			`# lines, variables, sub-delimiter expansion, and any number of ranges per line.`
			`# Ranges will be expanded in every combination, making rate tables highly`
			`# maintainable.`
			`#`
			`# Variables are also supported when defined using :var=val. Variables may`
			`# expand into ranges, 'cause they're awesome. Multiple variables may be`
			`# delimited by semi-colons, as may multiple values.`
			`#`
			`# For example:`
			`# :foo=1--3`
			`# $foo;7;9--10:$foo, 5--10,1/1/2017`
			`#`
			`# Would generate:`
			`# 1, 5, 1483246800`
			`# 1, 6, 1483246800`
			`# ...`
			`# 5, 10, 1483246800`
			`# 2, 5, 1483246800`
			`# ...`
			`# 9, 5, 14832468005`
			`# ...`
			`# 1, 5, 1483246800`
			`# 1, 6, 1483246800`
			`# ...`
			`##`

build-aux/csvm-expand: Spawn only one date and memoize A table with a couple hundred thousand rows was taking minutes to generate. This gets it down to a few seconds. * build-aux/csvm-expand (parse_date): New function. (parseline): use it. 2019-04-02 10:58:12 -04:00			`BEGIN {`
			`date_cmd = "stdbuf -o0 date -f- +%s"`
			`}`

			`END {`
			`close( date_cmd )`
			`}`


			`# Parse a date string into a Unix timestamp (memoized)`
			`#`
			`# This spawns a single process for date and reads from standard in. Even`
			`# then, though, date parsing is very slow for many thousands of rows, so the`
			# output is also cached in `date_cache'.
build-aux/csvm-expand: {orig=>src} local arg typo fix This does not affect its functionality. 2019-04-02 11:05:02 -04:00			`function parse_date( i, src )`
build-aux/csvm-expand: Spawn only one date and memoize A table with a couple hundred thousand rows was taking minutes to generate. This gets it down to a few seconds. * build-aux/csvm-expand (parse_date): New function. (parseline): use it. 2019-04-02 10:58:12 -04:00			`{`
			`src = $i`

			`if ( date_cache[ src ] )`
			`{`
			`$i = date_cache[ src ]`
			`return`
			`}`

			`print $i \|& date_cmd`
			`date_cmd \|& getline $i`

			`date_cache[ src ] = $i;`
			`}`

csvm: Auto-sort expanded output This will allow the variable abstractions to fully encapsulate values while still permitting binary searches on sorted rows. * csvm-expand: Renamed from `csvm2csv'. Add directive support. * csvm2csv: New script to perform sorting. Invokes aforementioned. * test/test-csvm2csv: Update for sorting. 2018-10-03 14:21:35 -04:00
			`# Expand variable with its value, if any`
			`function expand_vars( s, value )`
			`{`
			`# attempt to parse variable (may expand into a range)`
			`if ( match( s, /^\$([a-zA-Z_-]+)$/, m ) )`
			`{`
			`value = vars[ m[1] ];`

			`if ( value == "" )`
			`{`
			print "error: unknown variable reference: `$" m[1] "'" > "/dev/stderr"
			`exit 1`
			`}`

			`return value`
			`}`

			`return s`
			`}`


			`# Expand line`
			`function parseline( i, m, j, me, orig )`
			`{`
			`if ( i > NF )`
			`{`
			`print`
			`return`
			`}`

			`orig = $i`

			`# expand variables before any processing so that expansions`
			`# can include any type of formatting`
			`$i = expand_vars( $i )`

			`if ( match( $i, /^([0-9]+\/){2}[0-9]+$/, m ) )`
			`{`
build-aux/csvm-expand: Spawn only one date and memoize A table with a couple hundred thousand rows was taking minutes to generate. This gets it down to a few seconds. * build-aux/csvm-expand (parse_date): New function. (parseline): use it. 2019-04-02 10:58:12 -04:00			`parse_date( i );`
csvm: Auto-sort expanded output This will allow the variable abstractions to fully encapsulate values while still permitting binary searches on sorted rows. * csvm-expand: Renamed from `csvm2csv'. Add directive support. * csvm2csv: New script to perform sorting. Invokes aforementioned. * test/test-csvm2csv: Update for sorting. 2018-10-03 14:21:35 -04:00			`}`

			`# check first for delimiters`
			`if ( match( $i, /^([^;]+);(.*)$/, m ) )`
			`{`
			`# give it a shot with the first value`
			`$i = m[1]`
			`parseline( i )`

			`# strip off the first value and process with following value(s)`
			`$i = m[2]`
			`parseline( i )`

			`# we've delegated; we're done`
			`$i = orig`
			`return`
			`}`

			`# parse range`
			`if ( match( $i, /^([^-]+)--([^-]+)$/, m ) )`
			`{`
			`j = expand_vars( m[1] )`
			`me = expand_vars( m[2] )`

			`if ( !match( j, /^[0-9]+$/ ) \|\| !match( me, /^[0-9]+$/ ) )`
			`{`
			print "error: invalid range: `" $i "'" > "/dev/stderr"
			`exit 1`
			`}`

			`do`
			`{`
			`$i = j`
			`parseline( i + 1 )`
			`} while ( j++ < me )`
			`}`
			`else`
			`{`
			`parseline( i + 1 );`
			`}`

			`# restore to original value`
			`$i = orig`
			`}`


			`BEGIN {`
			`# we're parsing CSVs`
csvm: Permit all whitespace (including tabs) While tabs aren't desirable, users that are not developers will be modifying these files, and so we need to be permissive in what we want to accept. That doesn't mean that we need to forego occasional formatting, though. 2019-04-08 15:16:58 -04:00			`FS = "[[:space:]],[[:space:]]"`
csvm: Auto-sort expanded output This will allow the variable abstractions to fully encapsulate values while still permitting binary searches on sorted rows. * csvm-expand: Renamed from `csvm2csv'. Add directive support. * csvm2csv: New script to perform sorting. Invokes aforementioned. * test/test-csvm2csv: Update for sorting. 2018-10-03 14:21:35 -04:00			`OFS = ","`

			`has_directives = 0`
			`directives = "!(NODIRECTIVES)"`
			`}`


			# skip all lines that begin with `#', which denotes a comment, or are empty
			`/^#\|^$/ { next; }`

			`# directives are echoed back and are intended for processing by`
			`# the parent csvm2csv script`
			`/^!/ && output_started {`
			print "error: directive must appear before header: `" $0 "'" > "/dev/stderr"
			`exit 1`
			`}`
			`/^!/ && has_directives {`
			print "error: all directives must be on one line: `" $0 "'" > "/dev/stderr"
			`exit 1`
			`}`
			`/^!/ {`
			`has_directives = 1`
			`directives = $0`

			`next`
			`}`

			`# lines that begin with a colon are variable definitions`
			`/^:/ {`
			`if ( !match( $0, /^:([a-zA-Z_-]+)=(.*?)$/, m ) )`
			`{`
			print "error: invalid variable definition: `" $0 "'" > "/dev/stderr"
			`exit 1`
			`}`

			`vars[ m[1] ] = m[2]`
			`next`
			`}`

			`# Always begin output with a line for directives, even if there are`
			`# none. This makes subsequent processing much easier, since we won't have`
			`# to conditionally ignore the top line.`
			`!output_started {`
			`print directives`

			`output_started = 1`
			`}`

			`# lines that need any sort of processing (ranges, dates, etc)`
			`/--\|;\|\$[a-zA-Z_-]\|\// { parseline( 1 ); next; }`

			`# all other lines are normal; simply output them verbatim`
			`{`
			`# this assignment will ensure that awk processes the output, ensuring that`
			`# extra spaces between commas are stripped`
			`$1=$1`
			`print`
			`}`