#!/usr/bin/perl
##################################################################
# grabba.pl, A script to download multiple files from a page
# Copyright (C) 2001 Lev Epshteyn (www.levik.com)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# (Or visit http://www.gnu.org/licenses/gpl.html)
###############################################################
# Usage: grabba.pl url [ext]
# Example: grabba.pl http://my.book.com txt
# will download all the .txt files linked from the page
# The files will be saved in the current directory, with the names
# preserved (the part of the url after the last "/"). If no extension
# is provided, all the linked files will be grabbed.
#
# grabba.pl has two notable features:
# * it pretends to be IE 5.5
# * when it grabs the files, it sends the original URL in the referer
# header, so it looks like another web-surfer clicking links on a page
# These features make it a very useful script for sites that try to block
# you from downloading their files wholesale.
###############################################################
use strict;
use URI::URL;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Request::Common;
my $ua = LWP::UserAgent->new;
# Pretend to be IE 5.5 on Win98 so servers that sniff the User-Agent
# header will serve us the same content as a normal browser.
$ua->agent('Mozilla/4.0 (compatible; MSIE 5.5; Windows 98)');

# A URL argument is required; fail early with a usage hint instead of
# passing undef into URI::URL.
die "Usage: $0 url [ext]\n" unless defined $ARGV[0];
my $url = url $ARGV[0];

# Determine the URL base (everything up to and including the last "/")
# for resolving relative links; fall back to the URL itself plus "/"
# when there is no path component, so $urlbase is never left stale.
my $urlbase = ($url =~ /^(.*\/)[^\/]*$/) ? $1 : $url . '/';

# Determine the URL host (scheme://host) for resolving root-relative
# links; guard the match so $urlhost cannot pick up a stale $1.
my $urlhost = ($url =~ /^(\w+:\/\/[^\/]+)/) ? $1 : '';

# Optional extension filter (e.g. "txt"); undef means grab everything.
my $ext = $ARGV[1];

# Fetch the index page we will scan for links, and bail out with the
# server's status line if the fetch failed.
my $res = $ua->request(GET $url);
die "Failed to fetch $url: " . $res->status_line . "\n"
    unless $res->is_success;
# Split the page on HTML tags; the capturing parens make each tag its
# own list element, so every anchor tag lands on a "line" of its own.
# NOTE(review): the original pattern read /(]*>)/ — the leading "<[^"
# had clearly been eaten by HTML-stripping; restored to match full tags.
my @lines = split(/(<[^>]*>)/i, $res->content);
my @links;
# Collect the href target of every anchor tag, optionally filtered by
# the requested file extension.
foreach my $line (@lines) {
    # NOTE(review): this pattern was also mangled (/]*href=.../); it is
    # restored to match an <a ...> tag with a double-quoted href.
    if ($line =~ /<a[^>]*href=\"([^"]*)\"[^>]*>/i) {
        my $link = $1;
        print (".");
        # Keep the link when no filter was given, or when it ends in
        # ".$ext" — \Q..\E quotes regex metacharacters in the user's
        # extension, and the leading \. stops "txt" matching "foo.htxt".
        if (!$ext || $link =~ /\.\Q$ext\E$/i) {
            push(@links, $link);
        }
    }
}
# Download each collected link, one at a time.
# (The loop variable is scoped to the loop — the original redeclared
# "my $link" at file scope, shadowing the earlier declaration; and
# &download() is called without the legacy & sigil.)
foreach my $link (@links) {
    download($link);
}
# download($addr): resolve $addr against the page's host/base URL, then
# fetch it with the original page as the Referer and save it to the
# current directory under its basename.
sub download {
    my ($addr) = @_;
    if ($addr =~ /^\//) {
        # Root-relative URL ("/path"): prepend scheme://host.
        $addr = $urlhost . $addr;
    } elsif ($addr !~ /^\w+:\/\//) {
        # No scheme prefix ("http://", "ftp://", ...): resolve against
        # the page's base directory.
        $addr = $urlbase . $addr;
    }
    # Local file name is the part after the last "/"; fall back to
    # "index.html" when the URL ends in a slash (the original left
    # $fname holding a stale $1 in that case).
    my $fname = ($addr =~ /([^\/]+)$/) ? $1 : 'index.html';
    print ($addr . " => " . $fname . "\n");
    my $req = HTTP::Request->new(GET => $addr);
    # Send the original page URL as the Referer header so the request
    # looks like a normal click-through from that page.
    $req->referer($ARGV[0]);
    # Second argument to request() streams the response body to $fname.
    my $resp = $ua->request($req, $fname);
    warn "Failed to download $addr: " . $resp->status_line . "\n"
        unless $resp->is_success;
}