#!/usr/bin/perl -w
#### MODIFY LINE ABOVE TO HAVE PROPER PERL PATH FOR YOUR COMPUTER

# This program is provided by Hot Neuron LLC free of charge without
# any warranty of any kind whatsoever. It is entirely your responsibility
# to assess the suitability of this program for your use. By using this
# program you accept all responsibility for any damage that it may cause.

# NOTE: This program checks the robots.txt file on the server and it will
#       not send requests too rapidly (don't strain the server), so it
#       normally takes a little time to run: 1 minute for each page request.

# You will need the LWP module, which is part of "libwww-perl"; you can
# get it at http://search.cpan.org/search?module=LWP

use LWP::RobotUA;
use HTTP::Request;

$robot_from = 'your@email.com';   ######## MODIFY - PROVIDE YOUR EMAIL ADDRESS
$robot_name = 'mp_fetch_feed';    # don't change - name of the spider

$ua = new LWP::RobotUA($robot_name, $robot_from);

#######################################################
# MODIFY - Put in one call to the 'fetch' function for each
#          page you want to fetch below.
#          1st argument = URL of page to fetch
#          2nd argument = full path of file to store page in
#                         (FILE WILL BE OVERWRITTEN)
#######################################################
# Example:
# fetch('http://MagPortal.com/nr/feed.php?c=92&t=1&i=33', '/tmp/mp_cache/uspolitics.js');


# Print an error message, prefixed with the program name, to STDERR.
sub print_error($)
{
    my $msg = shift;

    print STDERR "mp_fetch_feed.pl: ERROR - $msg\n";
}

# Fetch a URL (retrying once on a server error) and write the response
# body to the given file, overwriting any existing contents.
sub fetch($$)
{
    my ($url, $filename) = @_;

    my $req = new HTTP::Request(GET => $url);
    my $page = $ua->request($req);

    # if we got some sort of server error, try the request one more time
    if ($page->code() >= 500 && $page->code() < 600) {
        $page = $ua->request($req);
    }

    if ($page->is_success) {
        if (open(OUTFILE, ">$filename")) {
            print OUTFILE $page->content;
            close OUTFILE;
        } else {
            print_error("Unable to open output file: $filename");
        }
    } else {
        print_error("Unable to fetch data from URL: $url");
    }
}
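
#######################################################
# Example sketch: if you have several feeds to fetch, one way to keep the
# calls tidy is a hash of URL => output-file mappings looped over with
# fetch(). This is only an illustration -- the URL and cache path below are
# the sample values from the example above; replace them with your own
# feeds and output locations, then uncomment.
#
# my %feeds = (
#     'http://MagPortal.com/nr/feed.php?c=92&t=1&i=33' => '/tmp/mp_cache/uspolitics.js',
# );
# while (my ($url, $file) = each %feeds) {
#     fetch($url, $file);
# }
#######################################################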