#!/usr/bin/perl
# Given a directory containing a set of downloaded wiki HTML topics files,
# search the documents for other topic files which have not yet been
# downloaded. The list of other topics is sent to standard output.
# By default, the "Html" directory is searched for HTML files.
#
# (c) 2008 Warren Toomey wkt@tuhs.org, under the GPL3 license.
#
use strict;
use warnings;

# Directory holding the previously-downloaded wiki topic pages;
# may be overridden by the single optional command-line argument.
my $htmldir = "Html";

# Topics already downloaded ("Foo.html" => 1) and topics referenced
# by those pages but not yet downloaded ("Foo" => 1).
my %KnownTopic;
my %NewTopic;

## MAIN PROGRAM ##
# At most one argument is accepted. Usage errors are printed to
# STDERR (not STDOUT) to match the non-zero exit status, and so the
# topic list on STDOUT stays clean for piping.
if ( @ARGV > 1 ) { print( STDERR "Usage: $0 [htmldir]\n" ); exit(1); }

# Get any directory details
$htmldir = $ARGV[0] if ( @ARGV > 0 );

# Read the HTML directory and record every existing *.html file as a
# known (already-downloaded) topic.
opendir( my $DIR, $htmldir ) or die "can't opendir $htmldir: $!";
%KnownTopic = map { ( $_ => 1 ) } grep { /\.html$/ } readdir($DIR);
closedir($DIR);
# Scan every downloaded topic file for internal wiki links, and
# collect in %NewTopic any linked topic we do not yet hold on disk.
foreach my $topic ( keys(%KnownTopic) ) {

    # A file we cannot open simply contributes no links.
    open( my $IN, "<", "$htmldir/$topic" ) || next;
    while (<$IN>) {

        # Split at '<' so each piece starts with at most one tag.
        foreach my $tagline ( split( /</, $_ ) ) {

            # If it's an internal topic link, grab the topic name
            if ( $tagline =~ m{href="/wiki/(.*?)"} ) {

                # Trim off any #section anchor
                my $t = $1; $t =~ s{#.*}{};

                # Skip images, Wikipedia meta-pages, and any link
                # containing a slash (namespaced/sub-page links)
                next if ( $t =~ m{Image:} || $t =~ m{Wikipedia} || $t =~ m{/} );

                # Save the topic unless its HTML file already exists.
                # NOTE: the previous version mis-parenthesized the
                # defined() tests, so known topics were never excluded
                # and already-downloaded topics were reported as new.
                $NewTopic{$t} = 1
                    if ( !exists( $KnownTopic{"$t.html"} ) );
            }
        }
    }
    close($IN);
}

# Finally, write each newly-discovered topic to STDOUT in sorted
# order, one topic per line.
foreach my $newtopic ( sort( keys(%NewTopic) ) ) {
    print("$newtopic\n");
}
exit(0);
