#!/usr/bin/perl
# Given a topic name, fetch it from Wikipedia, along with all
# of the images in the article. Do not modify the downloaded files.
# By default, the HTML file is stored in the "Html" directory, and
# the images in the "Images" directory
#
# Alternatively the first argument can be a text file containing a
# list of topics, each on its own line
#
# (c) 2008 Warren Toomey wkt@tuhs.org, under the GPL3 license.
#
use strict;
use warnings;
use LWP::UserAgent;

my $wikiurl  = "http://en.wikipedia.org/w/index.php?title=";
my $baseurl  = "http://en.wikipedia.org";

my $htmldir  = "Html";
my $imgdir   = "Images";
my $doimages = 1;

# Fetch a URL and, if a filename is given, save the response body to it.
#
# Arguments:
#   $url  - the URL to retrieve
#   $file - path to write the body to; may be undef, in which case the
#           body is only returned and nothing is written
#
# Returns the response body on success. Returns undef when $file already
# exists (it is never re-fetched) or when the HTTP request fails. If the
# body cannot be written to $file, an error is printed on STDERR and the
# body is still returned.
sub FetchSave {
    my ( $url, $file ) = @_;

    # Don't re-fetch the file
    return (undef) if ( defined($file) && -f $file );
    # print( STDERR "Fetching $file\n" ) if ( defined($file) );

    # Create the agent, fetch the URL
    my $ua = LWP::UserAgent->new;
    $ua->agent('Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; Q312461)');
    $ua->env_proxy;
    my $response = $ua->get($url);
    # print( STDERR "Fetching $url\n" );
    return (undef) if ( !$response->is_success );

    my $result = $response->content;

    # Nothing to save if no filename was supplied.
    return ($result) if ( !defined($file) );

    # Three-argument open keeps characters in $file from being read as an
    # open mode; the :raw layer writes image bytes without translation.
    my $OUT;
    if ( !open( $OUT, '>:raw', $file ) ) {
        print( STDERR "Err: cannot write $file: $!\n" );
        return ($result);
    }

    # Buffered write errors may only surface at close, so check both.
    if ( !( print( $OUT $result ) && close($OUT) ) ) {
        print( STDERR "Err: error writing $file: $!\n" );

        # Remove the partial file; otherwise the existence check above
        # would treat it as already downloaded on every later run.
        unlink($file);
    }
    return ($result);
}

# Given a href, return a full URL and a filename.
#
# Argument:
#   $href - the value of a src/href attribute
#
# Returns the list ( $url, $file ):
#   $url  - $href made absolute (see the rules below)
#   $file - the text after the last "/" in $href, with %xx escapes
#           decoded; the whole of $href if it contains no "/"
sub Href2URL {
    my $href = shift;
    my $file = substr( $href, rindex( $href, "/" ) + 1 );

    # Decode %xx hex-encoded characters in the filename. This is done in
    # a single pass so that decoded text is never decoded a second time
    # ("%2541" becomes "%41", not "A"), and only genuine hex digits are
    # accepted.
    $file =~ s{%([0-9a-fA-F]{2})}{chr( hex($1) )}ge;

    # If $href starts with http: or https:, leave it. If it starts with
    # //, it names another host, so only a scheme is prepended (the one
    # $baseurl uses). If it starts with a single /, prepend $baseurl.
    # Otherwise, prepend $wikiurl.
    my $url;
    if ( $href =~ m{^https?:}i ) {
        $url = $href;
    }
    elsif ( $href =~ m{^//} ) {
        my ($scheme) = $baseurl =~ m{^([A-Za-z][A-Za-z0-9+.-]*:)};
        $scheme = "http:" if ( !defined($scheme) );
        $url = $scheme . $href;
    }
    elsif ( $href =~ m{^/} ) {
        $url = $baseurl . $href;
    }
    else {
        $url = $wikiurl . "/" . $href;
    }

    return ( $url, $file );
}

# Given a web page as a scalar, find and fetch any images.
#
# Argument:
#   the HTML text of the page
#
# Every <img ... src="..."> on every line is passed through Href2URL and
# then downloaded with FetchSave into $imgdir. The page text itself is
# not modified. Returns nothing.
sub FindImages {
    my ($page) = @_;
    return if ( !defined($page) );

    # Iterate with a named variable: testing the line itself for truth
    # would end the scan at the first blank line, and assigning to $_
    # would clobber the caller's $_.
    for my $line ( split( /\n/, $page ) ) {

        # A line may hold several images, hence /g. [^>] keeps the match
        # inside a single <img> tag, and the whitespace before "src"
        # stops attributes such as data-src from matching.
        while ( $line =~ m{<img\s+(?:[^>]*?\s)?src="([^"]*)"}gi ) {
            my $href = $1;
            my ( $url, $file ) = Href2URL($href);

            # Skip hrefs that give no usable filename. A decoded "/"
            # (from %2F) would otherwise let a page write outside
            # $imgdir.
            next if ( $file eq "" || $file =~ m{[/\0]} );

            # Fetch the image
            FetchSave( $url, "$imgdir/$file" );
        }
    }
    return;
}

# Download the printable version of one topic into $htmldir and, when
# image fetching is enabled, every image the page refers to.
#
# Argument:
#   the topic name as it appears in a Wikipedia URL, e.g. M._C._Escher
#
# An already-downloaded topic is reported on STDERR and skipped. A failed
# download is reported on STDERR and ends the program with exit status 1.
sub FetchTopic {
    my ($topic) = @_;

    # Never overwrite a page that has been downloaded before.
    if ( -f "$htmldir/$topic.html" ) {
        print( STDERR "Err: $htmldir/$topic.html exists\n" );
        return;
    }

    # Retrieve the page; FetchSave also writes it to disk.
    my $page
        = FetchSave( "$wikiurl$topic&printable=yes", "$htmldir/$topic.html" );
    unless ( defined($page) ) {
        print( STDERR "Err fetching $topic\n" );
        exit(1);
    }

    # Pull down the images unless the user turned that off.
    FindImages($page) if ($doimages);
}

## MAIN PROGRAM ##

# The usage message shows -ni after the topic, while older invocations
# put it first, so accept the flag in any position.
my @args = grep { $_ ne "-ni" } @ARGV;
$doimages = 0 if ( @args != @ARGV );

# Re-check the count after removing -ni so that a lone "-ni" does not
# leave us without a topic.
if ( @ARGV > 4 || @args < 1 ) {
    print( STDERR
            "Usage: $0 topic [-ni] [htmldir] [imgdir], e.g. $0 M._C._Escher\n" );
    exit(1);
}

# Get any directory details
$htmldir = $args[1] if ( @args > 1 );
$imgdir  = $args[2] if ( @args > 2 );

# Make sure the output directories exist; without them every download
# would be fetched and then thrown away. The image directory is only
# needed when images are being fetched.
for my $dir ( $htmldir, ( $doimages ? ($imgdir) : () ) ) {
    next if ( -d $dir );
    if ( !mkdir($dir) ) {
        print( STDERR "Err: Cannot create directory $dir: $!\n" );
        exit(1);
    }
}

# If there exists a file named by the first argument, open it and fetch
# each of the named topics therein. If not, simply fetch the named topic
my $target = $args[0];
if ( !-f $target ) { FetchTopic($target); exit(0); }

open( my $IN, "<", $target ) || die("Err: Cannot open $target: $!");
while ( my $topic = <$IN> ) {

    # Strip the line ending (including the \r of CRLF files) and any
    # surrounding blanks, and ignore empty lines.
    $topic =~ s/\A\s+|\s+\z//g;
    next if ( $topic eq "" );

    print("Fetching $topic\n");
    FetchTopic($topic);
}
close($IN);

exit(0);