Skip to content

Commit

Permalink
Move status handling to JobFunnel
Browse files Browse the repository at this point in the history
This isn't as clear as I'd like, but the previous handling wasn't all that clear either
  • Loading branch information
Max Maischein committed Jun 1, 2024
1 parent 5fa6711 commit 6b87f39
Showing 1 changed file with 60 additions and 45 deletions.
105 changes: 60 additions & 45 deletions script/crawler.pl
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
use DateTime;
use DateTime::Format::ISO8601;
use File::Path 'mkpath';
use JobFunnel;
use JobFunnel::ProgressItem;

GetOptions(
'config|c=s' => \my $config_file,
Expand Down Expand Up @@ -61,27 +63,48 @@

# The progress output
my $printer = Term::Output::List->new();
my @scoreboard;

sub status($res, $r) {
my $size = $res->content->progress;
my $url = $r->{req}->req->url;
my $len = $res->headers->content_length;

my $viz = $url;
# XXX Get terminal size
if( length $viz > 80 ) {
substr( $viz, 77 ) = '...';
my $funnel = JobFunnel->new(
new_job => \&add_request
);
$funnel->on( update => sub { output_scoreboard() });

my $crawler;
# Job dispatcher passed to JobFunnel->new as the C<new_job> callback.
#
# Looks at C<< $job->{action} >> and hands the job to the matching crawler
# method:
#
#   'request'  - $crawler->submit_request($job); when $verbose is set, a
#                "Queueing"/"Skipping" message is printed depending on
#                whether the crawler returned a true value
#   'download' - $crawler->submit_download($job => $job->{target})
#
# Returns whatever the crawler method returned, or undef when the action is
# neither of the above.
sub add_request( $job ) {
    my $url = $job->{info}->{url};
    my $submitted;

    if( $job->{action} eq 'request' ) {
        $submitted = $crawler->submit_request($job);
        if( $verbose ) {
            # A false return value means the crawler did not take the request
            msg( $submitted ? "Queueing $url" : "Skipping $url" );
        }
    }
    elsif( $job->{action} eq 'download' ) {
        $submitted = $crawler->submit_download( $job, $job->{target} );
    }

    return $submitted
}

return sprintf "% 3d %s %s", $size == $len ? 100 : int($size / ($len / 100)), $url;
# Render one scoreboard line for a progress item.
#
# $item must respond to ->percent, ->visual and ->action. The line consists
# of three space-separated fields:
#
#   * the percentage, formatted with "% 3d%%", or ' -- ' when ->percent
#     returns undef
#   * the item's action
#   * the item's visual text, or '?' when ->visual returns undef
#
# Returns the formatted string.
sub status($item) {
    my $percent = $item->percent;

    my $progress;
    if( defined $percent ) {
        $progress = sprintf "% 3d%%", $percent;
    }
    else {
        # Percentage not known - show a placeholder of the same width
        $progress = ' -- ';
    }

    my $label = $item->visual // '?';
    return sprintf "%s %s %s", $progress, $item->action, $label;
}

sub output_scoreboard() {
#my $debug = sprintf "%d requests, %d pending", scalar(keys %scoreboard), scalar $crawler->queue->@*;
sub output_scoreboard(@) {
my @scoreboard;
if( $funnel ) {
@scoreboard = $funnel->jobs->@*;
};
$printer->output_list(
#$debug,
map { status( @$_ ) } @scoreboard
map { status( $_ ) } @scoreboard
);
}

Expand All @@ -108,34 +131,28 @@ ( $config_file )
}

sub create_crawler( $config, $cache ) {
my $crawler = COWS::Crawler->new(
$crawler = COWS::Crawler->new(
#base => $config->{base},
cache => $cache,
debug => $debug,
);

$crawler->on('progress' => sub($c, $r, $res) {
return unless my $len = $res->headers->content_length;

# Check if we already have this request in our list
if( ! grep { $_->[0] == $res and $_->[1] == $r } @scoreboard) {
push @scoreboard, [$res,$r]
}

output_scoreboard();
});
#$crawler->on('progress' => sub($c, $r, $res) {
# return unless my $len = $res->headers->content_length;
#
# output_scoreboard();
#});

$crawler->on('error' => sub($c, $r) {
msg( sprintf "Couldn't fetch %s", $r->{req}->req->url );
output_scoreboard();
});

# remove things on complete
$crawler->on('finish' => sub($c, $r, $res) {
@scoreboard = grep { $_->[0] != $res or $_->[1] != $r } @scoreboard;
#msg( sprintf "Finished %s", $r->{req}->req->url );
output_scoreboard();
});
#$crawler->on('finish' => sub($c, $r, $res) {
# @scoreboard = grep { $_->[0] != $res or $_->[1] != $r } @scoreboard;
# output_scoreboard();
#});

return $crawler
}
Expand Down Expand Up @@ -178,16 +195,8 @@ ( $crawler, $page, $url )
from => $page->{info}->{url},
};

if(
$crawler->submit_request({info => $info, method => 'GET', url => "$url"})
) {
if( $verbose ) {
msg("Queueing $url");
}
#msg( sprintf "Queueing %s (%x, %x)", $r->{req}->req->url, $res, $r );
} else {
#msg("Skipping $url");
}
my $job = {info => $info, method => 'GET', url => "$url", action => 'request'};
$funnel->add($job);
}

my @known_extensions = (qw(
Expand Down Expand Up @@ -222,12 +231,15 @@ ( $crawler, $page, $url, $filename=undef )

my $target = File::Spec->catfile( $target_directory, $filename );

$crawler->submit_download({info => $info, method => 'GET', url => "$url",
my $job = {info => $info, method => 'GET', url => "$url",
headers => {
Referer => $page->{info}->{url},
# cookies?
}
} => $target);
},
target => $target,
action => 'download',
};
$funnel->add($job);
}

my %actions = (
Expand Down Expand Up @@ -287,7 +299,10 @@ ($config, @items)

my @rows;
for my $url (@items) {
$crawler->submit_request({ method => 'GET', url => $url, info => { url => $url }} ) ;
my $job = { method => 'GET', url => $url, info => { url => $url },
action => 'request',
};
$funnel->add($job);
}

my @res;
Expand Down

0 comments on commit 6b87f39

Please sign in to comment.