#!/usr/bin/perl # Given a directory containing a set of downloaded wiki HTML topics files, # search the documents for other topic files which have not yet been # downloaded. The list of other topics is sent to standard output. # By default, the "Html" directory is searched for HTML files. # # (c) 2008 Warren Toomey wkt@tuhs.org, under the GPL3 license. # use strict; use warnings; my $htmldir = "Html"; my %KnownTopic; my %NewTopic; ## MAIN PROGRAM ## if ( @ARGV > 1 ) { print( STDOUT "Usage: $0 [htmldir]\n" ); exit(1); } # Get any directory details $htmldir = $ARGV[0] if ( @ARGV > 0 ); # Get a hashed list of all the existing topics opendir( my $DIR, $htmldir ) || die "can't opendir $htmldir: $!"; %KnownTopic = map( { $_ => 1 } grep {/\.html$/} readdir($DIR) ); closedir($DIR); foreach my $topic ( keys(%KnownTopic) ) { #print("Topic $topic\n"); open( my $IN, "<", "$htmldir/$topic" ) || next; while (<$IN>) { # Search each of the tags on the line foreach my $tagline ( split( /