00001 #!/usr/bin/perl -w
00002 # Author: Andrei Rjeousski ( arjeousski at gmail com )
00003 # Version 0.6
00004 # Changes:
00005 # 0.1
00006 # Initial release
00007 # 0.2
00008 # Added more handling to linked .mov files
00009 # 0.3
00010 # Searching trailer pages that don't have "large" or "high" in them
00011 # 0.4
00012 # If nothing is found based on IMDB title, try looking for movie's short title
00013 # 0.5
00014 # If apple.com doesn't return anything, try googling it :)
00015 # 0.6
00016 # Updated and added more recognition in getting the large trailer from a page with choices of small/med/large
00017
00018 use LWP::Simple; # libwww-perl providing simple HTML get actions
00019 use HTML::Entities;
00020 use URI::Escape;
00021
00022
00023
00024 use vars qw($opt_r $opt_d $opt_T);
00025 use Getopt::Std;
00026
00027
# Print the command line synopsis to STDOUT and terminate the script.
#
# Takes no arguments and never returns: the process exits with status -1.
sub usage {
    print "usage: $0 \n";
    print " -T <movieid> get movie trailer\n";
    # Terminate the last lines with a newline so the shell prompt does not
    # end up glued to the usage text.
    print " -d enable debug\n";
    # -r is accepted by getopts and dumps the fetched trailer page.
    print " -r dump the fetched trailer page\n";
    exit(-1);
}
00035
# Return the text of $data that sits between the first occurrence of $beg and
# the next occurrence of $end after it (idea taken from imdb.pl).
#
#   $data - text to search, typically a fetched HTML page; may be undef
#   $beg  - literal start marker (matched case-insensitively)
#   $end  - literal end marker (matched case-insensitively)
#
# Returns the enclosed text with HTML character references decoded
# (see http://www.w3.org/TR/html4/charset.html#h-5.3.1), or "" when either
# marker cannot be found or any argument is undefined.
sub parseBetween {
    my ($data, $beg, $end) = @_; # grab parameters

    # A failed page fetch hands us undef; answer "nothing found" instead of
    # tripping uninitialized-value warnings under -w.
    return "" unless defined $data && defined $beg && defined $end;

    # Match against the original text (not an lc() copy) so the extracted
    # range can never drift from the searched range; \Q..\E keeps the markers
    # literal, /s lets the enclosed text span several lines.
    return "" unless $data =~ /\Q$beg\E(.*?)\Q$end\E/is;
    my $result = $1;

    # decode_entities() rewrites its argument in place
    decode_entities($result);
    return $result;
}
00052
# Print one "# ..." debug line to STDOUT when -d was given.
# Uses print rather than printf so "%" in titles or URLs is shown verbatim.
sub _trailer_debug {
    my ($message) = @_;
    print "# $message\n" if defined $opt_d;
    return;
}

# Strip the year, a trailing ", The" / ", A" and ampersands from a title.
# NOTE(review): the tails of these substitutions were cut off in the listing
# this code was recovered from; an empty replacement is assumed (the text
# breaks exactly where "//" would start) and the modifiers are unknown --
# confirm against the upstream script.
sub _clean_title {
    my ($title) = @_;
    return "" unless defined $title;

    $title =~ s/\ *\(.+?\)//;        # get rid of the year
    $title =~ s/\ *,\ *(The|A)$//;   # drop a trailing article
    $title =~ s/\&//g;               # a raw "&" would split the query string
    return $title;
}

# Given the search result page that led to a studio index, pick the first
# link containing a <b> element, fetch that page and look for a link to the
# large/high quality trailer.  Returns the combined URI or "" when nothing
# usable is found.
sub _follow_studio_page {
    my ($search_response) = @_;

    # Only trust $1 when the match actually succeeded.
    return "" unless $search_response =~ /href=\"([^"]+)\"[^>]*>.*?<b>.*?<\/a>/i;
    my $movie_page_uri = $1;
    _trailer_debug("new trailer page: $movie_page_uri");

    # get the actual page
    my $page = get($movie_page_uri);
    return "" unless defined $page;

    if (
        $page =~ m/<a.*?href=\"([^\"]*)\"[^>]*?>.*?large/i ||    # large in image name after A
        $page =~ m/<a.*?href=\"([^\"]*)\"[^>]*?large[^>]*?>/i || # large within javascript
        $page =~ m/\"(.*large[^\"]*)\"/i ||                      # large in actual filename
        $page =~ m/\"([^<>\"]*?high[^\"<>]*?)\"/i ||             # high in filename
        $page =~ m/<a.*?href=\"([^\"]*)\"[^>]*?>.*?high/i ||     # high after A
        $page =~ m/\"([^\"]*?lrg[^\"]*?)\"/i ||                  # lrg in filename
        $page =~ m/\"([^\"]*?_lg.[^\"]*?)\"/i ||                 # _lg. in filename
        $page =~ m/\"([^\"]*?480[^\"]*?)\"/i                     # 480 in filename
    ) {
        # m/\"([^\"]*?high[^\"]*?)\"/i - gangs of new york
        my $link = $1;
        _trailer_debug("found large trailer");

        # A link that is already absolute must not be glued onto the page
        # URI; anything else is appended to it as before.
        return $link if $link =~ m{^https?://}i;
        return $movie_page_uri . $link;
    }
    return "";
}

# Query the apple.com trailer search for $search_string (words joined with
# "+") and return the URI of the best matching trailer page, or "".
sub _search_apple_trailers {
    my ($search_string) = @_;

    # do the search
    my $host = "http://searchcgi.apple.com/cgi-bin/sp/nph-searchpre1.pl?q=".$search_string."+site:www.apple.com/trailers&restrict=us_trailers_only&client=www_collection&site=www_collection&lr=lang_en&output=xml&sort=&filter=0&access=p";
    my $response = get($host);
    return "" unless defined $response;

    my $page_uri = "";
    if ($response =~ m/\"(http:\/\/.*large.*)\"/i) {
        # URL with "large"
        $page_uri = $1;
        _trailer_debug("found large");
    }
    elsif ($response =~ m/\"(http:\/\/(.*high.*))\"/i) {
        # URL with "high"
        $page_uri = $1;
        _trailer_debug("found high");
    }
    elsif ($response =~ m/href=\"(http:\/\/[\w.\/]+)\">(Apple - Trailers)[^<]+/i) {
        # must not have "large" or "high"
        $page_uri = $1;
        _trailer_debug("found other");
    }
    elsif ($response =~ m/<a.*?href=\"([^\"]*)\"[^>]*?>.*?large.*?<\/a>/i) {
        # "large" in the link title
        $page_uri = $1;
        _trailer_debug("found title - large");
    }

    # a URI ending in "/" means we must be at the studios page
    if ($page_uri =~ m/\/$/) {
        _trailer_debug("didnt find the movie, found studio: $page_uri");
        # if the movie is on the studios page, it must be linked from here
        $page_uri = _follow_studio_page($response);
    }
    return $page_uri;
}

# Small .mov files are pointers to the real movie.  While the file behind
# $trailer_uri is smaller than 100000 bytes, download it, pull the referenced
# file name out of it and substitute that name into the URI.  The walk stops
# as soon as a request fails, no file name can be extracted or the URI stops
# changing, and after a handful of hops at most, so an unresponsive server
# can no longer cause an endless loop.  Returns the last URI reached.
sub _resolve_reference_movie {
    my ($trailer_uri) = @_;

    my $min_movie_size = 100000;
    my $max_hops       = 5;

    for my $hop (1 .. $max_hops) {
        # head() returns an empty list on failure; element 1 is the length
        my @headers = head($trailer_uri);
        last unless @headers;
        my $file_size = $headers[1];
        last unless defined $file_size;
        last if $file_size >= $min_movie_size;

        my $file = get($trailer_uri);
        last unless defined $file;
        last unless $file =~ /(.*\.mov.*)*url.{5}([\w-]+.mov).*$/is;
        my $real_name = $2;
        _trailer_debug("actual filename is $real_name");

        my $next_uri = $trailer_uri;
        $next_uri =~ s/[^\/]+.mov$/$real_name/gi;
        last if $next_uri eq $trailer_uri;   # no progress, give up
        $trailer_uri = $next_uri;
    }
    return $trailer_uri;
}

# Look up the trailer for an IMDb title.
#
#   $movieid - numeric part of the IMDb id (the digits after "tt")
#
# Steps: read the title (and short title) from the IMDb page, search the
# apple.com trailer site for it, fall back to a Google search, then extract
# the movie URL embedded in the trailer page and resolve reference movies.
# Honours the global switches $opt_d (debug lines on STDOUT) and $opt_r
# (dump the fetched trailer page).
#
# Returns the trailer URI, or "" when nothing could be found or a page could
# not be fetched.
sub getTrailerURL {
    my ($movieid) = @_;

    # get the name of the movie
    my $host = "http://www.imdb.com/title/tt" . $movieid . "/";
    my $response = get($host);
    unless (defined $response) {
        _trailer_debug("could not fetch $host");
        return "";
    }

    # get the title
    my $movie_title = _clean_title(parseBetween($response, "<title>", "<\/title>"));

    # Only use the capture when the match succeeded, otherwise $1 may hold a
    # leftover from an unrelated earlier match.
    my $short_movie_title = "";
    if ($response =~ /<br>([\w\s\(\)^<^>]+)\S+?\(short title\)/i) {
        $short_movie_title = $1 if ($1);
    }
    $short_movie_title = _clean_title($short_movie_title);

    _trailer_debug("looking for movie id: $movieid title: $movie_title short title: $short_movie_title");

    # add pluses
    (my $movie_title_plus = $movie_title) =~ tr/ /+/;
    (my $short_movie_title_plus = $short_movie_title) =~ tr/ /+/;

    # try the full title first, then the short title if it differs
    my @search_strings = ($movie_title_plus);
    if ($short_movie_title ne "" and $short_movie_title_plus ne $movie_title_plus) {
        push @search_strings, $short_movie_title_plus;
    }

    my $trailer_page_uri = "";
    for my $search_string (@search_strings) {
        _trailer_debug("looking for: $search_string");
        $trailer_page_uri = _search_apple_trailers($search_string);
        last if $trailer_page_uri ne "";
    }

    # lets try googling (cant merge with above code yet)
    if ($trailer_page_uri eq "") {
        _trailer_debug("googling for: $movie_title_plus");

        # google requires the following hack, otherwise it gives 403
        # (try googling a search page with wget)
        require LWP::UserAgent;

        my $ua = LWP::UserAgent->new;
        # gotta trick google
        $ua->agent('Mozilla/5.0');

        # NOTE(review): everything after "http:" was cut off in the listing
        # this code was recovered from; the query below mirrors the apple.com
        # search above and is an assumption -- confirm against upstream.
        my $google = $ua->get("http://www.google.com/search?q=" . $movie_title_plus . "+site:www.apple.com/trailers");
        my $content = $google->is_success ? $google->content : undef;

        # must not have "large" or "high"
        if (defined $content and $content =~ m/href=(http:\/\/[\w.\/]+)[^>]*?>(Apple - Trailers)[^<]+/i) {
            $trailer_page_uri = $1;
            _trailer_debug("googled other");
        }
    }

    # only proceed, if something was found
    return "" if $trailer_page_uri eq "";
    _trailer_debug("trailer page found: $trailer_page_uri");

    # get the trailer page
    my $page = get($trailer_page_uri);
    return "" unless defined $page;

    # NOTE(review): lowercasing the page also lowercases the URL extracted
    # below; kept as-is to preserve existing results -- confirm it is wanted.
    $page = lc($page);

    # print, not printf: the page is data, not a format string
    if (defined $opt_r) { print $page; }

    # try parsing: the first embed marker that yields a value wins
    my @embed_markers = (
        "name=\"href\" value=\"",
        "controller=\"false\" href=\"",
        "target=\"myself\" src=\"",
        "controller=\"true\" src=\"",
        "param name=\"src\" value=\"",
    );
    my $trailer_uri = "";
    for my $marker (@embed_markers) {
        $trailer_uri = parseBetween($page, $marker, "\"");
        last if $trailer_uri ne "";
    }
    return "" if $trailer_uri eq "";

    # now we need to get the filename of the ACTUAL file
    _trailer_debug("actual trailer found: $trailer_uri");
    _trailer_debug("starting to download");
    $trailer_uri = _resolve_reference_movie($trailer_uri);
    _trailer_debug("final trailer uri is $trailer_uri");

    return $trailer_uri;
}
00240
00241
00242
00243 #
00244 # Main Program
00245 #
00246
# parse command line arguments; -T is a plain flag, the movie id follows it
# as the first non-option argument.  An unknown switch shows the usage text.
getopts('drT') or usage();

# print out usage if needed (no movie id left on the command line)
if ($#ARGV<0) { usage(); }

if (defined $opt_T) {
    # take movieid from cmdline arg
    my $movieid = shift(@ARGV) || die "Usage : $0 -T <movieid>\n";
    my $trailer_uri = getTrailerURL($movieid);

    # print, not printf: percent-escapes in the URI (e.g. %20) must reach the
    # caller untouched instead of being read as format directives
    print "$trailer_uri\n";

    # if ($trailer_uri ne "") {
    # system "wget -O $movieid.mov $trailer_uri";
    # }
}
00264
00265
00266