#!/usr/bin/perl
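# Given a directory containing a set of downloaded wiki HTML topic files,
# clean up the HTML code in each file and save the page into a new directory.
# By default, the "Html" directory is searched for HTML files, and the new
# files are stored in "NewHtml".
#
# Usage: [-ni] [-nh] [olddir [newdir]]
#   -ni  skip lines that contain images
#   -nh  remove all hyperlinks except in-page (#) anchors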
#
# (c) 2008 Warren Toomey wkt@tuhs.org, under the GPL3 license.
#
use strict;
use warnings;

# Directories: $olddir holds the downloaded pages, $newdir receives the
# cleaned pages, and $newimg is checked for .jpg versions of images.
my ( $olddir, $newdir, $newimg ) = ( "Html", "NewHtml", "NewImages" );

# Option flags, both off by default: -ni sets $noimages, -nh sets $nohrefs.
my ( $noimages, $nohrefs ) = ( 0, 0 );

# Clean up one downloaded wiki page.
#
# Arguments:
#   $IN  - filehandle open for reading on the original HTML page
#   $OUT - filehandle open for writing; receives the cleaned-up page
#
# Reads the file-level settings $nohrefs, $noimages, $olddir and $newimg.
#
# The page is handled in two phases. Everything before the first <h1> is
# reduced to a minimal <head> (DOCTYPE, <html>, http-equiv <meta> and
# <title> lines only). Everything from the <h1> onwards is copied line by
# line with citations, [edit] sections, <div>/<span> tags, class attributes
# and unwanted hyperlinks removed, stopping at the "See also" or
# "External links" headline or at the parser cache comment.
sub CleanHtml {
    my ( $IN, $OUT ) = @_;

    # Phase 1: the lines before the initial <h1>. $heading stays empty when
    # the page has no <h1> at all, so nothing undefined is ever printed.
    my $heading = '';
    while ( my $line = <$IN> ) {

        # Misc cleanups
        $line =~ s/^[ \t]+//;

        if ( $line =~ m{^<!DOCTYPE} ) { print {$OUT} $line; next; }
        if ( $line =~ m{^<html} ) {
            print {$OUT} $line;
            print {$OUT} "<head>\n";
            next;
        }
        if ( $line =~ m{<meta http-equiv} ) { print {$OUT} $line; next; }
        if ( $line =~ m{<title} )           { print {$OUT} $line; next; }

        if ( $line =~ m{<h1} ) {

            # Drop whatever precedes the <h1> on its own line
            ( $heading = $line ) =~ s{.*<h1}{<h1};
            last;
        }
    }

    # Finish off the head and start the body with the heading line
    print {$OUT} "</head><body>\n$heading";

    # Phase 2: the main body
    while ( my $line = <$IN> ) {

        # Misc cleanups
        $line =~ s/^[ \t]+//;

        # Remove everything after "See also" and "External links"
        last if $line =~ m{<span class="mw-headline">See also</span>};
        last if $line =~ m{<span class="mw-headline">External links</span>};
        last if $line =~ m{Saved in parser cache with key enwiki};

        # Remove the "Jump to"
        next if $line =~ m{<div id="jump-to-nav"};

        # Remove citations
        $line =~ s{<sup id="cite.*?</sup>}{}g;

        # Remove [edit] sections
        $line =~ s{<span class="editsection.*?</span>}{}g;

        # Deal with hyperlinks: with $nohrefs set, just keep the in-page
        # (#anchor) ones and replace every other link by its text
        if ($nohrefs) {
            $line =~ s{<a href="[^#].*?>(.*?)</a>}{$1}g;
        }
        else {
            # Remove external hyperlinks
            $line =~ s{<a href="[a-z].*?>(.*?)</a>}{$1}g;

            # Remove /skins/ and /w/ hyperlinks
            $line =~ s{<a href="(/skins/|/w/).*?>(.*?)</a>}{$2}g;

            # Rewrite /wiki links to point into Zip files named after the
            # first two characters of the page name. A link whose page is
            # not present in $olddir is replaced by its text. Each link is
            # decided on its own, so a dead link never causes a different
            # link earlier on the same line to be stripped.
            $line =~ s{<a href="/wiki/(..)(.*?)".*?>(.*?)</a>}{
                my ( $prefix, $tail, $text ) = ( $1, $2, $3 );
                ( -f "$olddir/$prefix$tail.html" )
                  ? qq{<a href="../$prefix.zip/$prefix$tail.html">$text</a>}
                  : $text;
            }ge;
        }

        # Remove class=""
        $line =~ s{class=".*?"}{}g;

        # Remove <div>s and <span>s
        $line =~ s{<div.*?>}{}g;
        $line =~ s{</div.*?>}{}g;
        $line =~ s{<span.*?>}{}g;
        $line =~ s{</span.*?>}{}g;

        # Remove magnify-clip.png
        $line =~ s{<img.*?src=".*?magnify-clip.png".*?>}{}g;

        # A line holding an embedded image is not copied as-is: it is
        # dropped completely when $noimages is on, and otherwise replaced
        # by just the first <img> tag with its src rewritten.
        if ( $line =~ m{<img (.*?)src="(.*?)"(.*?)>} ) {
            next if ($noimages);
            my ( $first, $href, $rest ) = ( $1, $2, $3 );

            # Keep only the file name part of the image URL
            my $file = substr( $href, rindex( $href, "/" ) + 1 );

            # Replace the last three characters of the name with "jpg" and
            # see if that file exists in the $newimg directory. If so, use
            # that filename. If not, keep the old filename, as we will get
            # it from the untouched images directory.
            my $newfile = substr( $file, 0, -3 ) . "jpg";
            my $src = ( -f "$newimg/$newfile" ) ? $newfile : $file;
            print {$OUT} qq{<img $first src="$src" $rest>};
            next;
        }
        print {$OUT} $line;
    }
    return;
}

## MAIN PROGRAM ##

# Parse the leading option flags. Each option is shifted off exactly once,
# so an unrecognised option is reported instead of being looked at forever,
# and nothing is compared against an already-empty argument list.
while ( @ARGV > 0 && $ARGV[0] =~ /^-/ ) {
    my $opt = shift(@ARGV);
    if    ( $opt eq "-ni" ) { $noimages = 1; }
    elsif ( $opt eq "-nh" ) { $nohrefs  = 1; }
    else {
        die("Unknown option $opt\nUsage: $0 [-ni] [-nh] [olddir [newdir]]\n");
    }
}

# Get any directory details
$olddir = $ARGV[0] if ( @ARGV > 0 );
$newdir = $ARGV[1] if ( @ARGV > 1 );

# Create the output directory when it does not exist yet
if ( !-d $newdir ) {
    mkdir($newdir) || die("Can't create $newdir: $!");
}

# Clean every .html file found in $olddir into a file of the same name
# in $newdir
opendir( my $DIR, $olddir ) || die "can't opendir $olddir: $!";
foreach my $file ( grep { /\.html$/ } readdir($DIR) ) {
    print("Doing $file\n");
    open( my $IN, "<", "$olddir/$file" ) || die("Can't open $olddir/$file: $!");
    open( my $OUT, ">", "$newdir/$file" ) || die("Can't open $newdir/$file: $!");
    CleanHtml( $IN, $OUT );
    close($IN);

    # Buffered write errors only surface when the handle is closed
    close($OUT) || die("Can't write $newdir/$file: $!");
}
closedir($DIR);
