#!/usr/bin/perl -w
use strict;
use CGI;
use HTML::LinkExtractor;
# Get filenames from the command line
my @files = @ARGV;
die 'No files given' unless (@files);

# Every movie link found, one hash ref per occurrence, with the keys
# 'url', 'title' and 'file'.
my @matches = ();

# Process all files and extract movie links and titles
foreach my $file (@files) {
    # Three-argument open with a lexical handle, so that a file name
    # starting with a mode character ('>', '|', ...) is never interpreted
    # as an open mode.
    open(my $fh, '<', $file) or die "Couldn't read from $file : $!\n";

    # Join all lines into one string (newlines removed) so that the
    # patterns below can match across the original line breaks.
    my $data = '';
    while (my $line = <$fh>) {
        chomp($line);
        $data .= $line;
    }
    close($fh);

    # Remove 'Bottom Rated' and everything that follows it
    $data =~ s/Bottom\s+Rated\s+.*\z//s;

    # Get movie links out of data
    # Sample: <a href="/title/tt0056172/">Lawrence of Arabia</a>
    my $lx = HTML::LinkExtractor->new();
    $lx->parse(\$data);

    # There are a lot of links. We need only those that are for
    # movies. In other words, have title/tt in them.
    foreach my $item (@{ $lx->links }) {
        my $url = $item->{'href'};
        next unless $url && $url =~ m#title/tt#;

        # Not every extracted link necessarily carries text; fall back to
        # an empty title instead of warning on an undefined value.
        my $text = defined $item->{'_TEXT'} ? $item->{'_TEXT'} : '';

        # Get title: keep only what is between the opening and closing
        # anchor tags.
        # NOTE(review): the previous pattern here was '(.*)?', which
        # replaces the text with itself. This assumes _TEXT holds the raw
        # markup including the enclosing <a> tag, as shown in the
        # HTML::LinkExtractor documentation -- confirm.
        $text =~ s#<a\b[^>]*>(.*?)</a>#$1#is;

        # Save URL (as a plain string), title, and the file we found it in
        push @matches, {
            'url'   => "$url",
            'title' => $text,
            'file'  => $file,
        };
    }
}
# Collapse @matches into one entry per URL.
# %uniqs maps URL => { 'title' => title of the first occurrence,
#                      'files' => comma-separated list of source files }
my %uniqs = ();
{
    # Remembers which (URL, file) pairs have already been recorded. If one
    # file links to the same URL more than once, that file must still be
    # listed only once for the URL.
    my %seen_in = ();

    # Remove dups and save match files
    foreach my $match (@matches) {
        my $url  = $match->{'url'};
        my $file = $match->{'file'};

        next if $seen_in{$url}{$file}++;

        if (exists $uniqs{$url}) {
            # Known URL found in a further file: extend the file list
            $uniqs{$url}{'files'} .= ', ' . $file;
        }
        else {
            # First occurrence of this URL: it supplies the title
            $uniqs{$url}{'title'} = $match->{'title'};
            $uniqs{$url}{'files'} = $file;
        }
    }
}
# Render the wishlist as an HTML document on standard output
my $wishlist_title = 'Ultimate movie wishlist';
my $q = CGI->new;

# Document start, heading, and two summary paragraphs
my $movie_count = scalar keys %uniqs;
print $q->start_html($wishlist_title);
print $q->h1($wishlist_title);
print $q->p('Total number of movies: ' . $movie_count);
print $q->p('Processed files: ' . join(', ', @ARGV));

# One linked line per movie, ordered by title using string comparison
my @urls_by_title =
    sort { $uniqs{$a}{'title'} cmp $uniqs{$b}{'title'} } keys %uniqs;
for my $link (@urls_by_title) {
    my $entry = $uniqs{$link};

    # Drop every ".html" (in any letter case) from the source file list;
    # this edits the value stored in %uniqs in place.
    $entry->{'files'} =~ s/\.html//ig;

    my $anchor = $q->a({ href => $link }, $entry->{'title'});
    print $anchor . ' (' . $entry->{'files'} . ')' . $q->br . "\n";
}

# Close the document
print $q->end_html();