Apr 24 2009 NukeZone Programming

A Perl WWW::Mechanize script to fetch web pages, and a companion script to parse them.
Together they serve as a backend to my IRC bOt.

#!/usr/bin/perl

# e. dziewa april 2008

use warnings;
use strict;
use WWW::Mechanize;

# Whole hours since the epoch followed by "--"; used as the prefix of every
# file saved below so each hourly run gets its own set of files.
my $timestamp = int( time() / 3600 ) . "--";    ### hours since epoch
my $mechcookies = "/home/armorbot/nukezonetop10/mech_cookies";

# The four portal listings that get downloaded.
my $uri          = 'http://portal.nukezone.nu/currentPtsClans.aspx';
my $nw_uri       = 'http://portal.nukezone.nu/currentNetClans.aspx';
my $combo_uri    = 'http://portal.nukezone.nu/currentCboClans.aspx';
my $province_uri = 'http://portal.nukezone.nu/currentProvinces.aspx';

# autocheck => 1 makes Mechanize die on a failed request, so a bad fetch
# stops the script instead of continuing silently.
my $mech = WWW::Mechanize->new(
    agent      => 'ArmOrbOt',
    cookie_jar => { file => $mechcookies, autosave => 1 },
    autocheck  => 1,
);

# Each entry pairs a listing URL with the short tag used in its file name.
# The pages are fetched in the same order as before: pts, nw, cb, pr.
for my $fetch (
    [ $uri,          'pts' ],
    [ $nw_uri,       'nw' ],
    [ $combo_uri,    'cb' ],
    [ $province_uri, 'pr' ],
  )
{
    my ( $page, $tag ) = @$fetch;

    # ":content_file" asks the user agent to write the response body to disk.
    $mech->get( $page,
        ":content_file" => "/home/armorbot/nukezonetop10/files/${timestamp}${tag}.aspx" );
}


Download
Simple enough. On to the parsing...


#!/usr/bin/perl -w

# breakdown a portal.nukezone.nu top 100 list
# e. dziewa april 2008

use warnings;
use strict;

# Hour stamp: whole hours since the epoch followed by "--". It has to equal
# the stamp the fetch script used when it saved the pages.
# NOTE(review): it is recomputed here from the current time, so a parse run
# that starts in a later hour than the fetch will look for files that do not
# exist -- confirm the two scripts are always scheduled within the same hour.
my $time      = time();
my $timestamp = int( $time / 3600 ) . "--";

# Directory layout under the bot's working tree.
my $base_dir  = "/home/armorbot/nukezonetop10";
my $files_dir = "$base_dir/files";       # raw downloaded pages
my $tmp_dir   = "$base_dir/tmpfiles";    # intermediate pass output
my $final_dir = "$base_dir/final";       # finished, time-stamped reports

# Networth clans
my $nw_from_file = "$files_dir/${timestamp}nw.aspx";
my $nwoutfile    = "$tmp_dir/nw.outfile";
my $nwpasstwo    = "$tmp_dir/nw.passtwo";
my $nwfinal      = "$final_dir/${timestamp}nw.final";

# Points clans
my $pts_from_file = "$files_dir/${timestamp}pts.aspx";
my $ptoutfile     = "$tmp_dir/pt.outfile";
my $ptpasstwo     = "$tmp_dir/pt.passtwo";
my $ptfinal       = "$final_dir/${timestamp}pt.final";

# Combo clans
my $combo_from_file = "$files_dir/${timestamp}cb.aspx";
my $combooutfile    = "$tmp_dir/cb.outfile";
my $cbpasstwo       = "$tmp_dir/cb.passtwo";
my $cbfinal         = "$final_dir/${timestamp}cb.final";

# Provinces
my $province_from_file = "$files_dir/${timestamp}pr.aspx";
my $provinceoutfile    = "$tmp_dir/pr.outfile";
my $prpasstwo          = "$tmp_dir/pr.passtwo";
my $prfinal            = "$final_dir/${timestamp}pr.final";

##############
# strip html #
##############

use HTML::Parser 3.00 ();

# Running open-minus-close count for every tag name seen so far.
my %inside;

# Start/end tag handler. Receives the tag name and a '+1' / '-1' delta,
# updates the count for that tag, and prints one space to the currently
# selected filehandle so text on either side of a tag does not run together.
sub tag {
    my ( $name, $delta ) = @_;
    $inside{$name} += $delta;
    print " ";
}

# Text handler. Prints the text chunk to the currently selected filehandle,
# unless we are inside a <script> or <style> element, whose content is dropped.
sub text {
    my ($chunk) = @_;
    return if grep { $inside{$_} } qw(script style);
    print $chunk;
}

# Pass one: strip the HTML markup from a downloaded page.
#
#   work( $infile, $outfile )
#
# Reads $infile line by line, skips the first 338 lines ("garbage" that
# precedes the listing), and runs every remaining line through a fresh
# HTML::Parser. The tag/text handlers print to the currently selected
# filehandle, so the output file is selected for the duration of the loop.
# One "\n" is written after every processed input line.
#
# Dies if either file cannot be opened or the output cannot be closed.
#
# NOTE(review): the input is read through the :utf8 layer but the output has
# no encoding layer. That is left as-is because the next pass matches the raw
# \xa0 byte; confirm before adding an output layer.
sub work { # give it an infile and outfile
    my ( $infile, $outfile ) = @_;

    # Lexical handles: the old bareword IFH/OFH were globals shared with the
    # other passes.
    open( my $in,  "<:utf8", $infile )  or die "couldn't open $infile -> $!\n";
    open( my $out, ">",      $outfile ) or die "couldn't open $outfile -> $!\n";

    # Remember whatever handle was selected so it can be restored afterwards
    # instead of unconditionally switching to STDOUT.
    my $previous = select $out;
    while ( my $line = <$in> ) {
        next if ( $. < 339 ); # garbage
        HTML::Parser->new( # ripped right from the documentation
            api_version => 3,
            handlers    => [
                start => [ \&tag,  "tagname, '+1'" ],
                end   => [ \&tag,  "tagname, '-1'" ],
                text  => [ \&text, "dtext" ],
            ],
            marked_sections => 1,
        )->parse($line) || die "Couldn't parse\(\)\n";
        print "\n";
    }
    select $previous;

    close $in;

    # Buffered write errors only surface at close, so check it.
    close $out or die "couldn't close $outfile -> $!\n";
    return;
}

# Pass one over all four listings: downloaded page in, tag-free text out.
for my $job (
    [ $nw_from_file,       $nwoutfile ],
    [ $pts_from_file,      $ptoutfile ],
    [ $combo_from_file,    $combooutfile ],
    [ $province_from_file, $provinceoutfile ],
  )
{
    work( $job->[0], $job->[1] );
}

print "Finished culling HTML tags\n";

######################
# cull space and hex #
######################

# Pass two: remove \xa0 characters and leading/trailing whitespace.
#
#   passtwo( $infile, $outfile )
#
# Every input line has all \xa0 characters deleted and its leading and
# trailing whitespace -- including the line terminator -- stripped, then is
# written to $outfile. Because the terminators are removed, the output is the
# cleaned lines joined into one long line.
#
# Dies if either file cannot be opened or the output cannot be closed.
sub passtwo { # give it an infile and outfile
    my ( $infile, $outfile ) = @_;

    open( my $in,  "<", $infile )  or die "couldn't open $infile -> $!\n";
    open( my $out, ">", $outfile ) or die "couldn't open $outfile -> $!\n";

    while ( my $line = <$in> ) {

        # The line is printed whether or not anything was stripped. The old
        # "next unless s///" threw away any line that needed no cleaning,
        # e.g. an unterminated last line.
        $line =~ s/\xa0|^\s+|\s+$//g;
        print {$out} $line;
    }

    # Buffered write errors only surface at close, so check it.
    close $out or die "couldn't close $outfile -> $!\n";
    close $in;
    return;
}

# Pass two over all four listings: tag-free text in, whitespace-culled text out.
for my $job (
    [ $nwoutfile,       $nwpasstwo ],
    [ $ptoutfile,       $ptpasstwo ],
    [ $combooutfile,    $cbpasstwo ],
    [ $provinceoutfile, $prpasstwo ],
  )
{
    passtwo( $job->[0], $job->[1] );
}

print "Finished culling space and hex\n";

######################
# format what's left #
######################

# Pass three: format a whitespace-culled listing into its final report.
#
#   final( $infile, $outfile )
#
# The kind of listing is recognised from the input path (pt.passtwo,
# nw.passtwo, cb.passtwo or pr.passtwo; the first match in that order wins).
# For each input line the page heading and the copyright footer are deleted
# and the entries are broken onto separate lines:
#   * points listing: a newline replaces the "p" that follows each number;
#   * other listings: a newline is inserted before the first occurrence of
#     "N." for every N from 1 to 100.
# An unrecognised input path still creates/truncates $outfile but writes
# nothing to it.
#
# Dies if either file cannot be opened or the output cannot be closed.
#
# NOTE(review): the "N." split takes the first textual match, so digits that
# happen to precede a dot elsewhere in the data could be split instead of the
# intended rank -- verify against real listings before tightening the pattern.
sub final { #give it an infile and an outfile
    my ( $infile, $outfile ) = @_;

    # Lexical handles replace the bareword globals, which were also left open
    # when the input name was not recognised.
    open( my $in,  "<", $infile )  or die "couldn't open $infile -> $!\n";
    open( my $out, ">", $outfile ) or die "couldn't open $outfile -> $!\n";

    # One row per listing: [ input-name pattern, heading to delete,
    # true when entries end in "<digits>p" rather than starting with "N." ].
    my @layouts = (
        [
            qr/pt\.passtwo/,
            qr/^Top\s+Points\s+Clans\s+from\s+current\s+round\s+Clan\s+Name\s+Clan\s+Points/,
            1,
        ],
        [
            qr/nw\.passtwo/,
            qr/^Top\s+Networth\s+Clans\s+from\s+current\s+round\s+Clan\s+Name\s+Networth/,
            0,
        ],
        [
            qr/cb\.passtwo/,
            qr/^Top\s+Combo\s+Clans\s+from\s+current\s+round\s+Clan\s+Name\s+Combo\s+Points/,
            0,
        ],
        [
            qr/pr\.passtwo/,
            qr/^Top\s+Provinces\s+from\s+current\s+round\s+Province\s+Name\s+Networth/,
            0,
        ],
    );
    my ($layout) = grep { $infile =~ $_->[0] } @layouts;

    if ($layout) {
        my ( undef, $heading, $split_on_points ) = @$layout;

        while ( my $line = <$in> ) {
            $line =~ s/$heading//;

            if ($split_on_points) {
                $line =~ s/(\d+)p/$1\n/g;
            }
            else {
                for my $rank ( 1 .. 100 ) {
                    $line =~ s/$rank\./\n$rank./;
                }
            }

            $line =~ s/Copyright \xa9 2003-2006  Primetic . All Rights Reserved.//g;
            print {$out} $line;
        }
    }

    # Buffered write errors only surface at close, so check it.
    close $out or die "couldn't close $outfile -> $!\n";
    close $in;
    return;
}

# Pass three over all four listings: culled text in, time-stamped report out.
for my $job (
    [ $nwpasstwo, $nwfinal ],
    [ $ptpasstwo, $ptfinal ],
    [ $cbpasstwo, $cbfinal ],
    [ $prpasstwo, $prfinal ],
  )
{
    final( $job->[0], $job->[1] );
}

print "construction complete\n";


Download

   
Comments
May 16 2009
Comment #1 posted by emerlu at 6:13 pm

I wrote a similar script in php not too long ago that parsed a page and output it as XML (RSS). In this way I could monitor changes on a webpage that did not utilize RSS. I find this one more useful though.

Comments for this entry available via RSS.
Comment Area
Your Name
Your Email (will not be published)
Your Website
Your Comment
Profanity is Prohibited
eric.dziewa.com is running WordPress.
WhiteSpace theme designed by E. Dziewa.
All content © E. Dziewa.
Thanks for stopping by.