repo2html/msgfmt

#!/bin/bash
#
# Formats a Git commit message
#
#  Copyright (C) 2012  Mike Gerwitz
#
#  This file is part of repo2html.
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
# #

# optional id (for cref errors)
id=$1

# default (HTML) replacements; the typographic entities end up in the
# replacement half of sed expressions, where a bare `&` stands for the
# matched text, hence the escaped ampersands
opar="<p>"
epar="</p>"
lquo="\\&ldquo;"
rquo="\\&rdquo;"
mdash="\\&mdash;"

# switches the typographic replacements over to their plain-text forms
# (straight double quotes and a triple hyphen) in place of HTML entities
nohtml()
{
  mdash='---'
  lquo='"'
  rquo=$lquo
}

# suppresses paragraph tags by blanking both the opening and the closing tag
nopar()
{
  opar='' epar=''
}


# Parses the command line: any leading option flags followed by the
# optional id.
#
#   -n  yield plain-text replacements instead of HTML entities
#   -P  do not output paragraph tags
#
# The id is (re-)read once the flags have been consumed; taken from $1 before
# parsing, it would be the first flag itself (e.g. `-P`) whenever flags are
# given, and that bogus value would then be written to the cref error log.
parse_args()
{
  local opt
  OPTIND=1

  while getopts nP opt; do
    case "$opt" in
      n) nohtml;;
      P) nopar;;
    esac
  done

  shift $((OPTIND - 1))
  id="$1"
}

parse_args "$@"

# calculate this after options have been parsed

# Outputs the opening tag of a reference (footnote) paragraph, derived from
# the paragraph opening tag given as $1 by replacing its closing `>` with id
# and class attributes. Outputs nothing when $1 is empty (paragraph tags
# disabled).
#
# `\2` is left in place as a sed back-reference: the footnote expression that
# uses this value captures the reference number in its second group.
#
# The attribute quotes live in a single-quoted literal on purpose; nested
# inside a double-quoted ${var:+...} expansion bash would strip them, leaving
# the attributes unquoted.
ref_opar_tag()
{
  [ -n "$1" ] || return 0
  printf '%s%s' "${1%>}" ' id="ref-\2" class="ref">'
}

refopar="$(ref_opar_tag "$opar")"

# format the commit message, stopping at the diff (if any)
#
# Reads the raw message on stdin and writes the formatted message to stdout.
# Uses the variables id, url_root and cref_errlog (commit ref handling) and
# the replacement strings lquo, rquo, mdash, opar, epar and refopar.
#
# The awk stage resolves [cref:...] markers through ./hashcache and cuts the
# message off at the diff; the sed stage (GNU sed syntax) performs escaping
# and markup on the message as a whole.
format_message()
{
  awk -v id="$id" -v url_root="${url_root%/}" \
      -v cref_errlog="$cref_errlog" -v sq="'" '
    # quote a string so that the shell sees it as one literal word (sq holds
    # a single quote character)
    function shquote(s)
    {
      gsub(sq, sq "\\\\" sq sq, s)
      return sq s sq
    }

    # look up the URL of a commit ref in the hashcache; yields the empty
    # string when the lookup produces no output
    function cref_url(ref,    cmd, url)
    {
      # the ref comes straight from the commit message and therefore must
      # never reach the shell unquoted
      cmd = "./hashcache " shquote(ref)

      # a failed getline leaves its variable untouched, so start out empty
      url = ""
      if ( (cmd | getline url) <= 0 ) {
        url = ""
      }

      # closing the command allows the same ref to be looked up again
      close(cmd)

      return url
    }

    # replace commit refs with generated URL (allows linking to prior commits
    # without hard-coding the configurable links that could change or be
    # relative to where the content is hosted); this will then be processed as
    # a normal URL by the remainder of the script
    #
    # each ref on the line is resolved on its own; a ref ends at the first
    # closing bracket
    /\[cref:[^]]*\]/ {
      head = ""
      tail = $0

      while ( match(tail, /\[cref:[^]]*\]/) ) {
        # strip the leading "[cref:" (6 chars) and the trailing "]"
        ref = substr(tail, RSTART + 6, RLENGTH - 7)
        url = cref_url(ref)

        # if a cref error logfile path was provided, log unknown refs so that
        # they can be re-processed (if commits are processed in reverse order
        # and the hashcache is cleared before the run, then this is likely to
        # occur for every cref)
        if ( url == "" && cref_errlog != "" && id != "" ) {
          printf "%s\n", id >> cref_errlog
        }

        # plain concatenation: the URL is inserted verbatim
        head = head substr(tail, 1, RSTART - 1) url_root "/" url
        tail = substr(tail, RSTART + RLENGTH)
      }

      $0 = head tail
    }

    # stop printing at diff
    /^diff --git/ { exit }

    # otherwise, print everything
    { print }
  ' \
  | sed ':a
    # slurp the whole message into the pattern space; N must not run on the
    # last line, since GNU sed would then print the buffer and quit before any
    # expression below is applied (leaving single-line messages unformatted
    # and unescaped)
    $!{
      N
      ba
    }

    # handle <>-delimited links (strip delimiters)
    s#<\([fh]ttps\?://[^ ]\+\)>#\1#g;

    # escaping
    s/\&/\&amp;/g;
    s/</\&lt;/g;
    s/>/\&gt;/g;

    # quoting (initiated by an indented paragraph and terminated by a new
    # paragraph, unless that paragraph is also indented)
    #
    # this must be applied exactly once: a second pass matches the indented
    # paragraphs inside the quote just produced, nesting the quotes and
    # swallowing the paragraph break that follows them
    s#\n\n  \+\(\([^\n]\+\n\(\n  \+\)\?\)\+\)#<blockquote>\1</blockquote>#g

    # NOTE: markdown-style pre-formatted blocks are not handled; they are
    # rendered as quotes by the expression above

    # unfortunately, non-greedy matches make it difficult to exclude punctuation
    # at the end of a link, so we will handle it in a separate expression
    s#[fh]ttps\?://[^]\n )]\+#<a href="&">&</a>#g;
    s#<a href="\([^"]\+\)\([.;,!]\)">\([^<]\+\).</a>#<a href="\1">\3</a>\2#g;

    # reference definitions (footnotes)
    s#\(\n\[\([0-9]\+\)\]\):\?#'"$epar$refopar"'\1#g;

    # references in text (note that references that enclose text as a hyperlink
    # must not start with a number, otherwise they will be considered to be a
    # reference number)
    s|\[\([^0-9][^]]\+\)\]\[\([0-9]\+\)\]|<a href="#ref-\2">\1</a>\[\2\]|g
    s|\[\([0-9]\+\)\]|<sup><a href="#ref-\1">&</a></sup>|g

    # paragraphs
    s#\n\n#'"$epar"'&'"$opar"'#g;
    /^/i'"$opar"'
    /$/a'"$epar"'

    # basic formatting
    s/---/'"$mdash"'/g;
    s#``#'"$lquo"'#g;
    s#'\'\''#'"$rquo"'#g;
    s#\(\W\|^\)\*\*\([^\*]\+\)\*\*\(\W\)#\1<strong>\2</strong>\3#g;
    s#\(\W\)\*\([^\*]\+\)\*\(\W\)#\1<em>\2</em>\3#g;
  '
}

format_message