#!/usr/bin/perl -w
#
# Build an HTML "movie wishlist" from one or more saved HTML pages.
#
# Usage:  <this script> page1.html [page2.html ...] > wishlist.html
#
# Every link whose href contains "title/tt" is treated as a movie link.
# Links are de-duplicated by URL; the generated page lists each movie once,
# sorted by title, followed by the input files in which it was found.
#
# Dies if no files are given or if an input file cannot be opened.

use strict;
use warnings;

use CGI;
use HTML::LinkExtractor;

# Get filenames from the command line
my @files = @ARGV;
die 'No files given' unless @files;

my @matches = ();

# Process all files and extract movie links and titles
foreach my $file (@files) {
    my $data = read_joined($file);

    # Remove the 'Bottom Rated' section: everything from that heading to the
    # end of the (single-line) document is discarded.
    $data =~ s/Bottom\s+Rated\s+.*$//s;

    # Get movie links out of data
    my $lx = HTML::LinkExtractor->new();
    $lx->parse(\$data);

    # There are a lot of links. We need only those that are for movies,
    # in other words those that have title/tt in their href.
    foreach my $item (@{ $lx->links }) {
        my $url = $item->{'href'};
        next unless $url && $url =~ m{title/tt};

        # Save URL, title, and the file we found it in
        push @matches, {
            'url'   => $url,
            'title' => link_title($item->{'_TEXT'}),
            'file'  => $file,
        };
    }
}

# Remove duplicate URLs. The first title seen for a URL wins; each file is
# recorded at most once per URL, in order of first appearance.
my %uniqs = ();
foreach my $match (@matches) {
    my $entry = $uniqs{ $match->{'url'} } ||= {
        'title' => $match->{'title'},
        'files' => [],
        'seen'  => {},
    };

    next if $entry->{'seen'}{ $match->{'file'} }++;
    push @{ $entry->{'files'} }, $match->{'file'};
}

# Generate proper HTML
my $q              = CGI->new;
my $wishlist_title = 'Ultimate movie wishlist';

# Put some header in HTML file
print $q->start_html($wishlist_title);
print $q->h1($wishlist_title);
print $q->p('Total number of movies: ' . scalar(keys %uniqs));
print $q->p('Processed files: ' . join(', ', @files));

# Print the list, sorted by title alphabetically. The URL is used as a
# tie-break so that the output order does not depend on hash ordering.
my @sorted_urls = sort {
    $uniqs{$a}{'title'} cmp $uniqs{$b}{'title'}
        or $a cmp $b
} keys %uniqs;

foreach my $url (@sorted_urls) {
    # Show file names without their .html extension
    my $found_in = join(', ', @{ $uniqs{$url}{'files'} });
    $found_in =~ s/\.html//ig;

    print $q->a({ href => $url }, $uniqs{$url}{'title'})
        . ' (' . $found_in . ')'
        . $q->br . "\n";
}

print $q->end_html();

# Read $file and return its contents as one string with the line endings
# removed (lines are concatenated without a separator).
# Dies if the file cannot be opened.
sub read_joined {
    my ($file) = @_;

    open(my $fh, '<', $file) or die "Couldn't read from $file : $!\n";

    my $data = '';
    while (my $line = <$fh>) {
        chomp($line);
        $data .= $line;
    }
    close($fh);

    return $data;
}

# Return the title for a link given the extractor's '_TEXT' value.
# NOTE(review): HTML::LinkExtractor appears to store the complete
# <a ...>...</a> markup in '_TEXT'; the wrapper is removed here. If no such
# wrapper is present the text is returned unchanged. An undefined value
# yields an empty title.
sub link_title {
    my ($text) = @_;
    return '' unless defined $text;

    $text =~ s{\A\s*<a\b[^>]*>(.*)</a>\s*\z}{$1}is;

    return $text;
}