thoughts/src/post2meta

83 lines
2.8 KiB
Awk
Executable File

#!/usr/bin/gawk -f
# Cache post data in metadata recutils file
#
# Copyright (C) 2019 Mike Gerwitz
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Generates database of metadata for a given post in recutils format for use
# by other scripts. The post must have already been converted to HTML using
# `post2html' or some equivalent means.
#
# This script is also responsible for determining what constitutes the
# abstract, which we consider to be everything after the subject line but
# before the end-of-abstract marker "<!-- more -->". If no such marker
# exists then the script exits in error.
##
# Output author and post date derived from the file name.
#
# The base name of the input file is left in name[0] for use by later rules
# (the slug is derived from it as well).
#
# gawk runs BEGINFILE even when the file could not be opened, in which case
# ERRNO is non-empty.  Nothing is output in that case so that no partial
# record is written before gawk reports its usual fatal error for the
# unreadable file.
BEGINFILE {
    if ( ERRNO == "" ) {
        match( FILENAME, /[^/]+$/, name )

        # TODO: configurable
        print "author: Mike Gerwitz <mtg@gnu.org>"

        # The date is taken to be the first ten characters of the base name
        # (presumably YYYY-MM-DD, going by the slug pattern used later).
        printf "date: %s\n",
            gensub( /^(.{10}).*$/, "\\1", 1, name[0] )
    }
}
# Everything before <main> is the HTML header, which is of no interest here.
# Discard lines until the opening tag is seen; the <main> line itself is
# passed on to the rules that follow.
!main {
    if ( $0 !~ /^ *<main>/ ) next

    main = 1
}
# The first header represents the subject/title of the post.  Only the first
# <h1> is treated this way:  once the abstract has begun (`a' is set), any
# further <h1> line is ordinary abstract text and is left to the rules below.
#
# Outputs the `subject' and `slug' fields followed by the first line of the
# `abstract' field.
main && !a && /^<h1 / {
    # Strip header tags from subject.
    print "subject: " gensub( /<\/?h[^>]+>/, "", "g" )

    # Generate the slug from the base name of the input file (name[0], set
    # when the file was opened):  YEAR-MONTH-DAY-TITLE.EXT becomes
    # YEAR/MONTH/TITLE.  A name that does not fit this form is output as-is.
    printf "slug: %s\n",
        gensub( /^([0-9]+)-([0-9]+)-[0-9]+-(.*)\.[a-z]+$/,
                "\\1/\\2/\\3",
                1,
                name[0] )

    # Skip the date line immediately following the header and grab the first
    # line of the abstract.
    getline
    getline

    # The end-of-abstract marker may come right after the date line, meaning
    # that the abstract is empty.  This record never reaches the marker rule
    # below (because of `next'), so it has to be recognized here; otherwise
    # the marker would be output as abstract text and then reported missing.
    if ( $0 ~ /^<!-- more -->/ ) {
        print "abstract: "
        exit
    }

    printf "abstract: %s\n", $0

    # From here on, lines are continuations of the abstract.
    a = 1
    next
}
# "<!-- more -->" marks the end of the abstract; once it is seen there is
# nothing further to extract and we are done.
$0 ~ /^<!-- more -->/ { exit }

# While inside the abstract (`a' is set once its first line has been output),
# each line is output prefixed by a `+', which is the recutils line
# continuation marker.
a { print "+ " $0 }
# Reaching the end of the file means that no end-of-abstract marker was
# encountered (the marker rule exits before we could get here).  This is
# treated as an error in case the author simply forgot to add one.  To have
# the entire post be considered the abstract, place the marker at the very
# end of the post.
ENDFILE {
    printf "%s\n", "error: missing '<!-- more -->'" > "/dev/stderr"
    exit 1
}