#!/usr/bin/perl
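# Given a directory containing a set of wiki HTML topic files, group the
# files by the first two characters of their names and zip each group,
# together with the images the pages reference, into "<prefix>.zip".
# An index.html linking every topic into its zip file is written alongside.
# By default, the "NewHtml" directory is searched for HTML files, with the
# zip files and index.html stored in "Zips"; a different HTML directory and
# zip directory may be given as the first and second arguments.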
#
# (c) 2008 Warren Toomey wkt@tuhs.org, under the GPL3 license.
#
use strict;
use warnings;

# Default locations; either may be overridden on the command line:
#   argument 1 = directory holding the HTML files to package
#   argument 2 = directory in which the zip files and index.html are written
my $zipdir  = "Zips";
my $htmldir = "NewHtml";

my %prefix;     # Hash of arrays, e.g. key ab -> ("abalone.html", "abacus.html")
my %imglist;    # Hash of arrays: html filename -> image paths found in that file

## MAIN PROGRAM ##

# Get any directory details
$htmldir = $ARGV[0] if ( @ARGV > 0 );
$zipdir  = $ARGV[1] if ( @ARGV > 1 );

# Create the output directory when it is missing, and stop straight away if
# that fails rather than finding out only after every HTML file was scanned.
if ( !-d $zipdir ) {
    mkdir($zipdir) || die("Can't create directory $zipdir: $!");
}

# Collect the names in $htmldir that end in ".html", in sorted order.
# The directory handle lives only inside the do-block.
my @htmllist = do {
    opendir( my $dirhandle, $htmldir ) or die "can't opendir $htmldir: $!";
    my @entries = readdir($dirhandle);
    closedir($dirhandle);
    sort( grep { /\.html$/ } @entries );
};

# For all html files, get the first 2 letters and build a
# hash of arrays containing these filenames.
foreach my $f (@htmllist) {
    my $key = substr( $f, 0, 2 );
    push( @{ $prefix{$key} }, $f );

    # Now open the file, find all the image filenames, and determine which
    # directory contains each image file.
    open( my $IN, "<", "$htmldir/$f" ) || die("Can't open $htmldir/$f: $!");
    while ( my $line = <$IN> ) {

        # The /g match in the loop condition resumes after the previous hit,
        # so every <img> tag on the line is seen, not only the first one.
        while ( $line =~ m{<img.*?src="(.*?)"}g ) {
            my $image = $1;

            # Prefer the copy in NewImages; otherwise fall back to Images
            # (the fallback path is recorded without checking that it exists).
            my $imgdir = ( -f "NewImages/$image" ) ? "NewImages" : "Images";
            push( @{ $imglist{$f} }, "$imgdir/$image" );
        }
    }
    close($IN);
}

# Create an index.html file which has the topics in alphabetical order
# with hyperlinks into the Zip files. Topics are grouped under a heading
# for their first character and separated by commas within a group.
my $oldfirstletter = "";
open( my $OUT, ">", "$zipdir/index.html" )
  || die("Can't create $zipdir/index.html: $!");
print( $OUT "<html><head>\n" );
print( $OUT " <meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\" />\n" );
print( $OUT "</head><body>\n" );

foreach my $f (@htmllist) {
    my $topic = substr( $f, 0, -5 );    # Lose the .html

    # Show %XX escapes in the filename as HTML numeric character references.
    # Only real hex digits qualify: the range must be A-F, since "A-f" would
    # also cover G-Z and several punctuation characters.
    $topic =~ s{%([0-9a-fA-F]{2})}{&#x$1;}g;

    # Spit out a new heading if needed
    my $firstletter = substr( $f, 0, 1 );
    if ( $firstletter ne $oldfirstletter ) {
        print( $OUT "\n<p><h1>$firstletter</h1><p>\n" );
        $oldfirstletter = $firstletter;
    }
    else {
        print( $OUT ",\n" );
    }

    # The link points inside the zip file named after the 2-char prefix.
    print( $OUT "<a href=\"" . substr( $f, 0, 2 ) . ".zip/$f\">$topic</a>" );
}
print( $OUT "</body></html>\n" );

# Buffered write errors (e.g. a full disk) only show up when the handle is
# closed, so check the result.
close($OUT) || die("Can't write $zipdir/index.html: $!");

# We should have html files grouped by 2-char prefix and the
# associated images. Build up lists, and then zip up these files,
# producing one "$zipdir/<prefix>.zip" per prefix.
for my $key ( sort( keys(%prefix) ) ) {

    # The HTML pages for this prefix, followed by every image they reference.
    my @list = map( "$htmldir/$_", @{ $prefix{$key} } );
    foreach my $f ( @{ $prefix{$key} } ) {
        push( @list, @{ $imglist{$f} } ) if ( defined( $imglist{$f} ) );
    }

    # Pages often share images; pass each path to zip only once.
    my %seen;
    @list = grep { !$seen{$_}++ } @list;

    #print map "\t$_\n", @list;

    # List-form system() hands the arguments straight to zip without a shell,
    # so quotes, spaces and other metacharacters in names are passed verbatim.
    my $zipfile = "$zipdir/$key.zip";
    my $status  = system( "zip", "-9", "-q", "-j", $zipfile, @list );

    # Report problems but carry on, so one bad group does not stop the rest.
    if ( $status == -1 ) {
        warn("Can't run zip for $zipfile: $!\n");
    }
    elsif ( $status != 0 ) {
        warn("zip failed for $zipfile (wait status $status)\n");
    }
}
exit(0);
