#!/usr/bin/perl # Given a directory containing a set of downloaded wiki HTML topics files, # clean up the HTML code in each file and save the page into a new directory. # By default, the "Html" directory is searched for HTML files, with new files # stored in "NewHtml". # # (c) 2008 Warren Toomey wkt@tuhs.org, under the GPL3 license. # use strict; use warnings; my $olddir = "Html"; my $newdir = "NewHtml"; my $newimg = "NewImages"; my $noimages = 0; my $nohrefs = 0; # Given a web page as a list of lines, clean it up and fix images sub CleanHtml { my ( $IN, $OUT ) = @_; # Loop for lines before the initial
# NOTE(review): this copy of the script has lost its line breaks. In Perl a
# "#" starts a comment that runs to the end of the physical line, so on the
# line above everything after the shebang -- including "use strict",
# "use warnings", the five "my" declarations and the "sub CleanHtml {"
# header -- is comment text as written and is never executed.
#
# Purpose, as far as the surviving text shows: CleanHtml( $IN, $OUT ) reads
# lines from the filehandle $IN, rewrites them, and prints the result to the
# filehandle $OUT. It consults the file-level settings $olddir, $newimg,
# $noimages and $nohrefs.

while (<$IN>) { # Misc cleanups s/^[ \t]+//; if (m{^\n" ); next; } if (m{, so finish off the head s{.*\n$_" ); # Loop for the main body while (<$IN>) { # Misc cleanups s/^[ \t]+//; # Remove everything after "See also" and "External links" last if (m{See also}); last if (m{External links}); last if (m{Saved in parser cache with key enwiki}); # Remove the "Jump to" next if (m{
# NOTE(review): several patterns and print strings on the line above are
# truncated or unbalanced: "if (m{^\n\" );", "if (m{, so finish off the head",
# "s{.*\n$_\" );" and the trailing "next if (m{". Presumably tag-like text
# inside the regexes and string literals was stripped when this copy was
# produced -- TODO confirm against a pristine copy. The missing text cannot be
# derived from what remains, so the pre-body loop must be restored from the
# original rather than re-invented.
#
# What is still readable: leading spaces/tabs are stripped from each line, and
# the main-body loop stops at the first line matching "See also",
# "External links" or "Saved in parser cache with key enwiki".
}{}g; # Deal with hyperlinks: with $nohrefs set, just # keep the internal ones if ($nohrefs) { s{(.*?)}{$1}g; # Remove /skins/ and /w/ hyperlinks s{.*?}) { if ( -f "$olddir/$1$2.html" ) { s{(.*?)}{$3}; } else { s{s and s s{}{}g; s{}{}g; s{}{}g; s{}{}g; # Remove magnify-clip.png s{}{}g; # Rewrite the reference to an embedded image # or skip the images if $noimages is on if (m{}) { next if ($noimages); my ( $first, $href, $rest ) = ( $1, $2, $3 ); my $file = substr( $href, rindex( $href, "/" ) + 1 ); # Replace the suffix with .jpg and see if the file # exists in the $newimg directory. If so, use that filename. # If not, keep the old filename, as we will get it from the # untouched images directory my $newfile= substr($file, 0, -3) . "jpg"; if (-f "$newimg/$newfile") { print( $OUT "" ); } else { print( $OUT "" ); } next; } print( $OUT $_ ); } } ## MAIN PROGRAM ## while ( @ARGV > 0 && $ARGV[0] =~ /^-/ ) { if ( $ARGV[0] eq "-ni" ) { $noimages = 1; shift; } if ( $ARGV[0] eq "-nh" ) { $nohrefs = 1; shift; } } # Get any directory details $olddir = $ARGV[0] if ( @ARGV > 0 ); $newdir = $ARGV[1] if ( @ARGV > 1 ); mkdir($newdir) if ( !-d $newdir ); # Read and open in/out files from the $olddir and $newdir opendir( my $DIR, $olddir ) || die "can't opendir $olddir: $!"; foreach my $file ( grep { /\.html$/ } readdir($DIR) ) { print("Doing $file\n"); open( my $IN, "<", "$olddir/$file" ) || die("Can't open $olddir/$file: $!"); open( my $OUT, ">", "$newdir/$file" ) || die("Can't open $newdir/$file: $!"); CleanHtml( $IN, $OUT ); close($IN); close($OUT); } closedir($DIR);
# NOTE(review), rest of CleanHtml (line above, up to "## MAIN PROGRAM ##"):
#   * The same loss of text applies here: "s{.*?}) {", "s{s and s s{}{}g;",
#     the run of empty "s{}{}g;" substitutions, "if (m{})" and the two
#     'print( $OUT "" )' calls have no usable pattern or output left, although
#     the code after "if (m{})" still reads $1, $2 and $3. Restore these from
#     the original source.
#   * The new image name is built with substr( $file, 0, -3 ) . "jpg", i.e. the
#     last three characters of the name are replaced. That only yields a clean
#     ".jpg" name when the existing suffix is exactly three characters long.
#   * The -f tests look for "$olddir/$1$2.html" and "$newimg/$newfile" relative
#     to the current working directory.
#
# NOTE(review), main program:
#   * Only "-ni" (sets $noimages) and "-nh" (sets $nohrefs) are handled. Any
#     other argument beginning with "-" is never shifted off @ARGV, so the
#     while condition stays true and the loop does not terminate.
#   * After "-ni" is shifted, $ARGV[0] is compared with "-nh" straight away;
#     if "-ni" was the last argument, $ARGV[0] is undef at that point.
#   * The first remaining argument overrides $olddir and the second overrides
#     $newdir; $newimg cannot be changed from the command line.
#   * The return values of mkdir(), close($IN) and close($OUT) are not checked.
#   * Each file name in $olddir ending in ".html" is opened for reading, a file
#     of the same name is opened for writing in $newdir, and both handles are
#     passed to CleanHtml.