Jump to content

New problem with Spidering Hack Tutorial ~ Bareword

dmgmn's Photo
Posted Jun 25 2010 03:28 PM
3442 Views

Thanks to everyone that helped with my earlier problems.

I'm making progress but I've run into a new problem.

This is the new code:

#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use HTML::TreeBuilder;

my $url = 'http://oreilly.com/store/complete.html';
# get() returns undef on failure and is not documented to set $!, so name the URL instead.
my $page = get( $url ) or die "Could not fetch $url\n";
my $p = HTML::TreeBuilder->new_from_content( $page );

# Find every catalog link; \Q...\E quotes the literal URL prefix inside the /x pattern.
my @links = $p->look_down(
        _tag => 'a',
        href => qr{^ \Qhttp://www.oreilly.com/catalog/\E \w+ $}x
);

my @rows = map { $_->parent->parent } @links;   # grandparent of each link (presumably the enclosing table row - TODO confirm)

my @books;   # one hash reference per row, filled in by the loop below
for my $row (@rows) {
        my %book;
        my @cells = $row->look_down( _tag => 'td' );   # every <td> element found under this row
        $book{title}    =$cells[0]->as_trimmed-text;   # NOTE(review): parses as as_trimmed() minus the bareword "text"; as_trimmed_text is intended
        $book{price}    =$cells[2]->as_trimmed-text;   # NOTE(review): same hyphen-for-underscore typo as the line above
        $book{price} =~ s/^\$//;   # strip a leading dollar sign from the price
        
        # Each of these stores whatever get_url returns for the given cell.
        $book{url}              = get_url( $cells[0] );
        $book{ebook}    = get_url( $cells[3] );
        $book{safari}   = get_url( $cells[4] );
        $book{examples} = get_url( $cells[5] );
        push @books, \%book;   # keep a reference to this row's hash
}

# Return the href of the first <a> element found under $node, with trailing
# whitespace removed.
#
# $node - an element object that supports look_down (for example a table
#         cell); may be undef when a row has fewer cells than expected.
#
# Returns nothing (undef in scalar context) when $node is undefined, when it
# contains no <a> element, or when the first <a> has no href attribute.
sub get_url {
        my $node = shift;
        return unless defined $node;
        my @hrefs = $node->look_down( _tag => 'a');
        return unless @hrefs;
        # attr() is the HTML::Element attribute accessor.
        my $url = $hrefs[0]->attr('href');
        return unless defined $url;
        $url =~ s/\s+$//;
        return $url;
}

# Release the catalog parse tree; nothing below reads $p again.
$p = $p->delete; #we don't need this anymore.

# List the titles that mention Perl, cheapest first, as numbered rows of
# "count <TAB> price <TAB> title".
{
        my $count = 1;
        my @perlbooks = sort { $a->{price} <=> $b->{price} }
                                        grep { $_->{title} =~/perl/i } @books;
        # Terminate each row with a newline so the rows do not run together.
        print $count++, "\t", $_->{price}, "\t", $_->{title}, "\n" for @perlbooks;
}

# Compare how many titles mention Perl with how many mention Java
# (both matches are case-insensitive).
{
        my @perlbooks = grep { $_->{title} =~ /perl/i } @books;
        my @javabooks = grep { $_->{title} =~ /java/i } @books;
        # Arrays in numeric and string context yield their element counts.
        my $diff =  @javabooks - @perlbooks;
        # End with a newline so the next output starts on its own line.
        print "There are ".@perlbooks." Perl books and ".@javabooks.
                " Java books. $diff more Java than Perl.\n";
}

# Fetch the detail page of one sample book (index 34 of @books) and extract
# its page count, edition number, publication date and cover image.
for my $book ( $books[34] ) {
        # Fewer than 35 books were collected, or the row had no link: skip.
        next unless defined $book && defined $book->{url};

        my $url = $book->{url};
        my $page = get( $url );
        unless ( defined $page ) {
                warn "Could not fetch $url\n";
                next;
        }

        my $tree = HTML::TreeBuilder->new_from_content( $page );
        my ($pubinfo) = $tree->look_down(
                _tag  => 'span',
                class => 'secondary2',
        );
        if ($pubinfo) {
                my $html = $pubinfo->as_HTML;
                print $html;
                # Each match is assigned in list context, so the variable
                # receives the first capture group, or undef on no match.
                my ($pages)   = $html =~ /(\d+) pages/;
                my ($edition) = $html =~ /(\d)(?:st|nd|rd|th) Edition/;
                # The century alternation is non-capturing; only the whole
                # "Month YYYY" text is captured.
                my ($date)    = $html =~ /(\w+ (?:19|20)\d\d)/;

                # Print missing fields as empty strings rather than warning
                # about undefined values.
                print "\n",
                        join( ' ', map { defined $_ ? $_ : '' } $pages, $edition, $date ),
                        "\n";
        }
        else {
                warn "No publication info found on $url\n";
        }

        my ($img_node) = $tree->look_down(
                _tag => 'img',
                src  => qr{^/catalog/covers/},
        );
        if ($img_node) {
                my $img_url = 'http://www.oreilly.com'.$img_node->attr('src');
                my $cover = get( $img_url );
                # now save $cover to disk
        }
        else {
                warn "No cover image found on $url\n";
        }

        # Release this page's parse tree, as is done for the catalog tree.
        $tree->delete;
}




Now I'm getting these errors:


Bareword "text" not allowed while "strict subs" in use at ./SpiderTutorial_19_06.pl line 23.
Bareword "text" not allowed while "strict subs" in use at ./SpiderTutorial_19_06.pl line 24.
Execution of ./SpiderTutorial_19_06.pl aborted due to compilation errors.

Again, any help would be greatly appreciated.

Tags:
1 Subscribe


1 Reply

0
  dmgmn's Photo
Posted Jun 25 2010 04:16 PM

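I found the problem: I had typed a hyphen instead of an underscore in the method name, so Perl read it as a subtraction involving the bareword "text".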

as_trimmed-text

should have been:

as_trimmed_text