#!/usr/bin/perl
# Given a directory containing a set of wiki HTML topic files, bundle the
# files into zip archives grouped by the first two characters of their
# filenames, together with the images each page references, and write an
# index.html listing every topic under a heading per first letter.
#
# Usage: script [htmldir [zipdir]]
# By default, the "NewHtml" directory is searched for HTML files, with the
# zip files and index.html stored in "Zips".
#
# NOTE(review): the copy of this script that was reviewed had its embedded
# HTML markup stripped out: the <img> pattern, all code between that pattern
# and the opening of index.html, and the tags inside the index page strings.
# Those parts are reconstructed below and individually marked with
# NOTE(review); confirm each of them against a pristine copy.
#
# (c) 2008 Warren Toomey wkt@tuhs.org, under the GPL3 license.
#
use strict;
use warnings;

my $zipdir  = "Zips";
my $htmldir = "NewHtml";
my %prefix;     # Hash of arrays, e.g. key ab -> ("abalone.html", "abacus.html")
my %imglist;    # List of images per html file which is used as the key

## MAIN PROGRAM ##

# Get any directory details
$htmldir = $ARGV[0] if ( @ARGV > 0 );
$zipdir  = $ARGV[1] if ( @ARGV > 1 );
if ( !-d $zipdir ) {
    mkdir($zipdir) or die("Can't mkdir $zipdir: $!");
}

# Get the list of html files in the $htmldir.
# Build a list of html filenames.
opendir( my $DIR, $htmldir ) or die "can't opendir $htmldir: $!";
my @htmllist = sort( grep { /\.html$/ } readdir($DIR) );
closedir($DIR);

# For all html files, get the first 2 letters and build a
# hash of arrays containing these filenames.
foreach my $f (@htmllist) {
    my $key = substr( $f, 0, 2 );
    push( @{ $prefix{$key} }, $f );

    # Now open the file, find all the image filenames, and determine which
    # directory contains the image file
    open( my $IN, "<", "$htmldir/$f" ) or die("Can't open $htmldir/$f: $!");
    while (<$IN>) {

        # NOTE(review): reconstructed. The original pattern and the list of
        # directories it searched were lost. This version takes the src
        # attribute of an <img> tag and looks for that file first relative
        # to $htmldir, then relative to the current directory.
        if (m{<img\b[^>]*?\bsrc="([^"]+)"}i) {
            my $src = $1;
            foreach my $path ( "$htmldir/$src", $src ) {
                if ( -f $path ) {
                    push( @{ $imglist{$f} }, $path );
                    last;
                }
            }
        }
    }
    close($IN);
}

# Build the index page: one heading per first letter, followed by a
# comma-separated list of the topics starting with that letter.
# NOTE(review): $oldfirstletter's declaration was in the lost region; an
# empty initial value makes the first file always start a new heading.
my $oldfirstletter = "";
open( my $OUT, ">", "$zipdir/index.html" )
  or die("Can't create $zipdir/index.html: $!");

# NOTE(review): the tags in the strings printed to $OUT (page skeleton,
# heading element, topic link and its href) are reconstructed; only the
# text between the tags survived in the reviewed copy.
print( $OUT "<html>\n" );
print( $OUT " <head></head>\n" );
print( $OUT "<body>\n" );
foreach my $f (@htmllist) {
    my $topic = substr( $f, 0, -5 );    # Lose the .html

    # Show URL-style %XX escapes in the filename as numeric character
    # references. The class must be exactly the hex digits: the earlier
    # range "A-f" also matched G-Z and several punctuation characters.
    $topic =~ s{%([0-9a-fA-F][0-9a-fA-F])}{&#x$1;}g;

    # Spit out a new heading if needed
    my $firstletter = substr( $f, 0, 1 );
    if ( $firstletter ne $oldfirstletter ) {
        print( $OUT "\n<h2>$firstletter</h2>\n" );
        $oldfirstletter = $firstletter;
    }
    else {
        print( $OUT ",\n" );
    }
    print( $OUT "<a href=\"$f\">$topic</a>" );
}
print( $OUT "</body></html>\n" );

# Buffered write errors only surface when the handle is closed.
close($OUT) or die("Can't write $zipdir/index.html: $!");

# We should have html files grouped by 2-char prefix and the
# associated images. Build up lists, and then zip up these files.
for my $key ( sort( keys(%prefix) ) ) {
    my @list = map { "$htmldir/$_" } @{ $prefix{$key} };

    # Several pages in one group can reference the same image file;
    # hand each path to zip only once.
    my %seen;
    foreach my $f ( @{ $prefix{$key} } ) {
        next if ( !defined( $imglist{$f} ) );
        push( @list, grep { !$seen{$_}++ } @{ $imglist{$f} } );
    }

    # List-form system() bypasses the shell, so file names and prefixes
    # containing quotes, spaces, '&' and the like reach zip unchanged.
    # A failed archive is reported but does not stop the remaining ones.
    system( "zip", "-9", "-q", "-j", "$zipdir/$key.zip", @list ) == 0
      or warn("zip failed for $zipdir/$key.zip (status $?)\n");
}
exit(0);