#!/usr/bin/perl
# Given a topic name, fetch it from Wikipedia, along with all
# of the images in the article. Do not modify the downloaded files.
# By default, the HTML file is stored in the "Html" directory, and
# the images in the "Images" directory
#
# Alternatively the first argument can be a text file containing a
# list of topics, each on its own line
#
# The -ni option (anywhere on the command line) disables image fetching.
#
# (c) 2008 Warren Toomey wkt@tuhs.org, under the GPL3 license.
#
use strict;
use warnings;
use LWP::UserAgent;

my $wikiurl  = "http://en.wikipedia.org/w/index.php?title=";
my $baseurl  = "http://en.wikipedia.org";
my $htmldir  = "Html";
my $imgdir   = "Images";
my $doimages = 1;

# Fetch a URL and, when a filename is given, save the response body there.
#
#   $url  - the URL to fetch
#   $file - optional path to save the body to
#
# Returns the fetched content. Returns undef when $file already exists
# (nothing is fetched) or when the request is not successful. A failure
# to write $file is reported on STDERR, but the content is still returned.
sub FetchSave {
    my ( $url, $file ) = @_;

    # Don't re-fetch the file
    return (undef) if ( defined($file) && -f $file );

    # Create the agent, fetch the URL
    my $ua = LWP::UserAgent->new;
    $ua->agent('Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; Q312461)');
    $ua->env_proxy;
    my $response = $ua->get($url);
    return (undef) if ( !$response->is_success );

    # Save the file only if we were asked to.
    my $result = $response->content;
    return ($result) if ( !defined($file) );

    # Three-argument open keeps odd characters in $file from being read
    # as a mode; the :raw layer writes the bytes exactly as received.
    my $OUT;
    if ( !open( $OUT, '>:raw', $file ) ) {
        print( STDERR "Err: cannot write $file: $!\n" );
        return ($result);
    }
    print( $OUT $result );
    close($OUT) || print( STDERR "Err: cannot close $file: $!\n" );
    return ($result);
}

# Given a href, return a full URL and a local filename.
#
# The filename is the part of the href after the last "/", with %xx
# escapes decoded once and any path separator replaced by "_" so that
# it always names a file directly inside the target directory.
sub Href2URL {
    my $href = shift;
    my $file = substr( $href, rindex( $href, "/" ) + 1 );

    # Deal with %xx hex-encoded characters in the filename. A single
    # pass, so that a decoded "%" is never decoded a second time.
    $file =~ s{%([0-9a-fA-F]{2})}{chr( hex($1) )}ge;

    # A decoded %2F or %5C must not become a directory separator.
    $file =~ s{[/\\]}{_}g;

    # If $href starts with http: or https:, leave it. If it starts with
    # //, it is protocol-relative: prepend the scheme of $baseurl. If it
    # starts with a single /, prepend $baseurl. Otherwise, prepend $wikiurl
    my $url;
    if ( $href =~ m{^https?:}i ) {
        $url = $href;
    }
    elsif ( $href =~ m{^//} ) {
        my ($scheme) = $baseurl =~ m{^([A-Za-z][A-Za-z0-9+.-]*:)};
        $url = ( defined($scheme) ? $scheme : "http:" ) . $href;
    }
    elsif ( $href =~ m{^/} ) {
        $url = $baseurl . $href;
    }
    else {
        $url = $wikiurl . "/" . $href;
    }
    return ( $url, $file );
}

# Given a web page as a scalar, find and fetch any images.
#
# Every <img> tag with a quoted src attribute is located, and the image
# is saved under $imgdir using the filename chosen by Href2URL.
sub FindImages {
    my $page = shift;
    return if ( !defined($page) );

    # Scan the whole page rather than line by line, so that blank lines
    # do not end the scan and several images on one line are all found.
    # "\ssrc" requires whitespace before the attribute name, so that
    # attributes such as data-src are not mistaken for src.
    while ( $page =~ m{<img\b[^>]*?\ssrc\s*=\s*(["'])(.*?)\1}gis ) {
        my $href = $2;
        next if ( $href eq '' );

        my ( $url, $file ) = Href2URL($href);

        # A href ending in "/" gives no filename to save to
        next if ( $file eq '' );

        # Fetch the image
        FetchSave( $url, "$imgdir/$file" );
    }
    return;
}

# Fetch the printable version of one topic into $htmldir, then fetch its
# images unless image fetching is disabled. A topic whose HTML file
# already exists is reported and skipped; a failed fetch ends the program
# with exit status 1.
sub FetchTopic {
    my $topic = shift;

    # Stop if the HTML file exists
    if ( -f "$htmldir/$topic.html" ) {
        print( STDERR "Err: $htmldir/$topic.html exists\n" );
        return;
    }

    # Fetch the content
    my $result =
      FetchSave( "$wikiurl$topic&printable=yes", "$htmldir/$topic.html" );
    if ( !defined($result) ) {
        print( STDERR "Err fetching $topic\n" );
        exit(1);
    }

    # Fetch the associated images
    FindImages($result) if ($doimages);
    return;
}

# Main program: parse the command-line arguments given in @_ and fetch
# either the single named topic or every topic listed in the named file.
# Returns the exit status for the process.
sub Main {
    if ( @_ < 1 || @_ > 4 ) {
        print( STDERR
"Usage: $0 topic [-ni] [htmldir] [imgdir], e.g. $0 M._C._Escher\n"
        );
        return 1;
    }

    # Accept -ni in any position: the usage text shows it after the
    # topic, while older invocations put it first.
    my @args = grep { $_ ne "-ni" } @_;
    $doimages = 0 if ( @args != @_ );

    # "-ni" alone leaves nothing to fetch
    if ( @args < 1 ) {
        print( STDERR
"Usage: $0 topic [-ni] [htmldir] [imgdir], e.g. $0 M._C._Escher\n"
        );
        return 1;
    }

    # Get any directory details
    my $target = $args[0];
    $htmldir = $args[1] if ( @args > 1 );
    $imgdir  = $args[2] if ( @args > 2 );

    # Make sure the output directories exist, so that saving cannot
    # fail for every single file without the run being stopped.
    foreach my $dir ( $htmldir, ( $doimages ? ($imgdir) : () ) ) {
        next if ( -d $dir );
        mkdir($dir) || die("Err: Cannot create directory $dir: $!\n");
    }

    # If there exists a file $target, open it and fetch each of the named
    # topics therein. If not, simply fetch the named topic
    if ( !-f $target ) {
        FetchTopic($target);
        return 0;
    }

    open( my $IN, "<", $target ) || die("Err: Cannot open $target: $!");
    while ( my $topic = <$IN> ) {

        # Drop the line ending (including a CR) and trailing blanks
        $topic =~ s{\s+$}{};
        next if ( $topic eq '' );
        print("Fetching $topic\n");
        FetchTopic($topic);
    }
    close($IN);
    return 0;
}

# Run the program only when executed as a script, so that the file can
# also be loaded with require/do without fetching anything.
exit( Main(@ARGV) ) unless caller;
1;