#!/usr/bin/perl -w

#
# This perl script is intended to perform movie data lookups based on
# www.ilovecinema.ru, a Russian analog of the www.imdb.com website.
#
# For more information on MythVideo's external movie lookup mechanism, see
# the README file in this directory.
#
# Author: Oleksiy Kokachev (kokachev AT gmail DOT com), based on the imdb.pl
# script made by Tim Harvey.
#

use LWP::Simple;      # libwww-perl providing simple HTML get actions
use HTML::Entities;   # decode_entities(), used by parseBetween()
use URI::Escape;      # uri_escape() / uri_unescape(), used by getMovieList()

# Package variables that getopts() fills in for each recognised switch.
use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_M $opt_P);
use Getopt::Std;

# Identification strings printed by version().
$title   = "ilovecinema query";
$version = "v1.0";
$author  = "Oleksiy Kokachev";

# Not referenced anywhere else in this file.
my @countries = qw(Russia);

# Everything this script prints goes out through a UTF-8 encoding layer.
binmode(STDOUT, ":utf8");

# display usage
#
# Prints the command-line synopsis to STDOUT and then terminates the
# process with exit(-1); this sub never returns to its caller.
sub usage {
   print <<"END_USAGE";
usage: $0 -hdrviMPD [parameters]
 -h help
 -d debug
 -r dump raw query result data only
 -v display version
 -i display info

 -M [options] <query> get movie list
 some known options are:
 mode=films Show only films
 Note: multiple options must be separated by ';'
 -P <movieid> get movie poster
 -D <movieid> get movie data
END_USAGE
   exit(-1);
}

# display 1-line of info that describes the version of the program
#
# The line is built from the file-level $title, $version and $author
# variables and written to STDOUT.
sub version {
   print "$title ($version) by $author\n";
}

# display 1-line of info that can describe the type of query used
#
# Writes a fixed sentence naming the queried website to STDOUT.
sub info {
   print "Performs queries using the www.ilovecinema.ru website.\n";
}

# display detailed help
#
# Prints the version line, the info line and the usage text, in that order.
# usage() ends the process, so this sub does not return.
sub help {
   version();
   info();
   usage();
}

# Returns a copy of the given string with leading and trailing whitespace
# removed. Whitespace inside the string is left untouched.
#
#   $str - the string to clean up
sub trim {
   my ($str) = @_;
   $str =~ s/^\s+//;   # drop leading whitespace
   $str =~ s/\s+$//;   # drop trailing whitespace (including a final newline)
   return $str;
}

# returns text within 'data' between 'beg' and 'end' matching strings
#
#   $data - text to search
#   $beg  - marker that precedes the wanted text
#   $end  - marker that follows the wanted text (searched for after $beg)
#
# Both markers are matched case-insensitively. The text between them is
# returned in its original case with HTML entities decoded. An empty string
# is returned when either marker is missing or any argument is undefined.
sub parseBetween {
   my ($data, $beg, $end) = @_;   # grab parameters
   return "" unless defined $data && defined $beg && defined $end;

   # Search a lower-cased copy so the markers match regardless of case;
   # the result is still cut out of the untouched $data.
   my $ldata = lc($data);

   my $beg_pos = index($ldata, lc($beg));
   return "" if $beg_pos == -1;

   my $start  = $beg_pos + length($beg);
   my $finish = index($ldata, lc($end), $start);
   return "" if $finish == -1;

   my $result = substr($data, $start, $finish - $start);
   # return w/ decoded numeric character references
   # (see http://www.w3.org/TR/html4/charset.html#h-5.3.1)
   decode_entities($result);
   return $result;
}

# get Movie Data
#
# Downloads the ilovecinema.ru film page for the given movie id, scrapes
# title, year, plot and cast out of the HTML and prints them to STDOUT as
# "Name:value" lines. Director and Genres are printed as well but are never
# filled in by this sub, so they are always empty.
#
#   $movieid - id of the film, inserted into the /films/<id>/ URL
#
# Honours the -d (debug trace) and -r (dump raw page) switches. If the page
# cannot be downloaded the field names are still printed, with empty values.
sub getMovieData {
   my ($movieid) = @_;   # grab movieid parameter
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

   # get the film page
   my $request = "http://www.ilovecinema.ru/films/" . $movieid . "/";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);
   # get() gives undef on failure; carry on with an empty page instead of
   # feeding undef to every parsing step below.
   $response = "" unless defined $response;
   utf8::decode($response);
   if (defined $opt_r) { printf("%s", $response); }

   # these two are printed below but nothing in this sub fills them in
   my $director = "";
   my $lgenres  = "";

   # parse title
   my $header = parseBetween($response, "<div class=\"col1\">", "/div>");
   my $title  = parseBetween($header, "<h1>", "</h1>");

   # grab original english title if available; the matched text is also
   # removed from $header before the year is looked up
   if ($header =~ s/<p class=\"eng_name\">(.+)<//) {
      $title = $title . '(' . $1 . ')';
   }

   # grab year: the first run of digits in the subtitle span
   my $year = "";
   my $subtitle = parseBetween($header, "<span id=\"filmInfoSubtitle\">", "</span>");
   if ($subtitle =~ /(\d+)/) {
      $year = $1;
   }

   # parse movie description, unless the block is flagged with no_descr
   my $plot = "";
   my $plot_data = parseBetween($response, "<div class=\"film_descr\">", "</div>");
   if ($plot_data !~ /.+no_descr/) {
      if ($plot_data =~ s/<p>(.+)<\/p>//) {
         $plot = $1;
      }
   }

   if (defined $opt_d) {
      printf("############################ plot raw data ##########################\n");
      printf("%s\n", $plot_data);
      printf("######################## end of plot raw data #######################\n");
   }

   # parse cast data: one name per alt="..." attribute in the persons table
   my $cast = "";
   my $cast_data = parseBetween($response, "<table class=\"film_persons\">", "</table>");
   if ($cast_data) {
      $cast = join(",", ($cast_data =~ m/alt=\"(.+)\"/g));
   }

   if (defined $opt_d) {
      printf("############################ cast raw data ##########################\n");
      printf("%s\n", $cast_data);
      printf("######################## end of cast raw data #######################\n");
   }

   # output fields (these field names must match what MythVideo is looking for)
   print "Title:$title\n";
   print "Year:$year\n";
   print "Director:$director\n";
   print "Plot:$plot\n";
   print "Cast: $cast\n";
   print "Genres: $lgenres\n";
}

# dump Movie Poster
#
# Prints the URL of a poster for the given ilovecinema movie id to STDOUT.
# The film page is searched for a poster image first. If none is found, the
# page's "details_links" block is searched for an IMDB title link and the
# lookup is handed over to getIMDBMoviePoster(). Nothing is printed when the
# page cannot be downloaded or neither source is found.
#
#   $movieid - id of the film on ilovecinema
#
# Honours the -d (debug trace) and -r (dump raw page) switches.
sub getMoviePoster {
   my ($movieid) = @_;   # grab movieid parameter
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

   # get the film page
   # NOTE(review): this URL literal was cut off after "http:" in the reviewed
   # text. It is rebuilt to match the film page that getMovieData() requests,
   # which fits the markup searched for below - confirm against upstream.
   my $request = "http://www.ilovecinema.ru/films/" . $movieid . "/";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);

   if (!defined $response) { return; }
   if (defined $opt_r) { printf("%s", $response); }

   my $poster_data = parseBetween($response, "<div class=\"main\">", "<div class=\"tag_cloud\">");

   if (defined $opt_d) {
      printf("############################ poster raw data ##########################\n");
      printf("%s\n", $poster_data);
      printf("######################## end of poster raw data #######################\n");
   }

   # "no_film" in this block means there is no poster to pick up here
   if ($poster_data !~ /no_film/) {
      if ($poster_data =~ /src=\"(.+)_/) {
         # NOTE(review): everything after "http:" on this line was lost in
         # the reviewed text. The host prefix and the ".jpg" suffix are a
         # best-effort reconstruction - verify against the upstream script
         # before relying on the resulting URL.
         my $uri = "http://www.ilovecinema.ru" . $1 . ".jpg";
         print "$uri\n";
         return;
      }
   }

   if (defined $opt_d) {
      print "Poster not found on ilovecinema. Trying to find it on IMDB";
   }

   # if no poster available on ilovecinema, let's try to find it on imdb.
   my $links = parseBetween($response, "<div class=\"details_links\">", "</div>");
   if ($links =~ /<a href=\"http:\/\/.{0,3}imdb.com\/title\/tt(\d+)\//) {
      my $imdb_id = $1;   # saved before any further regex can clobber $1
      if (defined $opt_d) {
         print "Found IMDB number:" . $imdb_id;
      }
      getIMDBMoviePoster($imdb_id);
   }
}

# Prints the URL of a poster for the given IMDB title number to STDOUT.
#
# Sources are tried in this order, stopping at the first one that yields a
# URL: impawards.com, nexbase, cinemablend, posters.imdb.com and finally
# the low resolution image on the IMDB title page. If the first page cannot
# be downloaded nothing is printed; otherwise exactly one line is printed,
# which is empty when no poster was found.
#
#   $movieid - digits of the IMDB title id (the part after "tt")
#
# Honours the -d (debug trace) and -r (dump raw pages) switches.
sub getIMDBMoviePoster {
   my ($movieid) = @_;   # grab movieid parameter
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

   # get the posters page
   # NOTE(review): this URL literal was cut off after "http:" in the reviewed
   # text. The title URL matches the one used for the low-res lookup further
   # down; the "/posters" suffix is assumed from the imdb.pl script this file
   # says it is based on - confirm against upstream.
   my $request = "http://www.imdb.com/title/tt" . $movieid . "/posters";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);

   if (!defined $response) { return; }
   if (defined $opt_r) { printf("%s", $response); }

   my $uri = "";

   # look for references to impawards.com posters - they are high quality
   my $site = "http://www.impawards.com";
   my $impsite = parseBetween($response, "<a href=\"" . $site, "\">" . $site);

   # jersey girl fix
   if ($impsite eq "") {
      $impsite = parseBetween($response, "<a href=\"http://impawards.com", "\">http://impawards.com");
   }

   if ($impsite) {
      $impsite = $site . $impsite;

      if (defined $opt_d) { print "# Searching for poster at: " . $impsite . "\n"; }
      my $impres = get($impsite);
      $impres = "" unless defined $impres;   # failed download: nothing to parse
      if (defined $opt_d) { printf("# got %i bytes\n", length($impres)); }
      if (defined $opt_r) { printf("%s", $impres); }

      # making sure it isnt redirect
      my $redirect = parseBetween($impres, "0;URL=..", "\">");
      if ($redirect ne "") {
         if (defined $opt_d) { printf("# processing redirect to %s\n", $redirect); }
         # this was redirect
         $impsite = $site . $redirect;
         $impres = get($impsite);
         $impres = "" unless defined $impres;
      }

      # do stuff normally
      $uri = parseBetween($impres, "<img SRC=\"posters/", "\" ALT");
      # Only build a URL when an image was actually found; otherwise $uri
      # stays empty so that the fallbacks below still get their turn.
      if ($uri ne "") {
         # uri here is relative... patch it up to make a valid uri
         if ($uri !~ /http:(.*)/) {
            my $path = substr($impsite, 0, rindex($impsite, '/') + 1);
            $uri = $path . "posters/" . $uri;
         }
         if (defined $opt_d) { print "# found ipmawards poster: $uri\n"; }
      }
   }

   # try looking on nexbase
   if ($uri eq "" && $response =~ m/<a href="([^"]*)">([^"]*?)nexbase/i) {
      my $page_url = $1;
      if ($page_url ne "") {
         if (defined $opt_d) { print "# found nexbase poster page: $page_url \n"; }
         my $cinres = get($page_url);
         $cinres = "" unless defined $cinres;
         if (defined $opt_d) { printf("# got %i bytes\n", length($cinres)); }
         if (defined $opt_r) { printf("%s", $cinres); }

         if ($cinres =~ m/<a id="photo_url" href="([^"]*?)" ><\/a>/i) {
            if (defined $opt_d) { print "# nexbase url retreived\n"; }
            $uri = $1;
         }
      }
   }

   # try looking on cinemablend
   if ($uri eq "" && $response =~ m/<a href="([^"]*)">([^"]*?)cinemablend/i) {
      my $page_url = $1;
      if ($page_url ne "") {
         if (defined $opt_d) { print "# found cinemablend poster page: $page_url \n"; }
         my $cinres = get($page_url);
         $cinres = "" unless defined $cinres;
         if (defined $opt_d) { printf("# got %i bytes\n", length($cinres)); }
         if (defined $opt_r) { printf("%s", $cinres); }

         if ($cinres =~ m/<td align=center><img src="([^"]*?)" border=1><\/td>/i) {
            if (defined $opt_d) { print "# cinemablend url retreived\n"; }
            $uri = "http://www.cinemablend.com/" . $1;
         }
      }
   }

   # if the impawards site attempt didn't give a filename grab it from imdb
   if ($uri eq "") {
      if (defined $opt_d) { print "# looking for imdb posters\n"; }
      my $host = "http://posters.imdb.com/posters/";

      $uri = parseBetween($response, $host, "\"><td><td><a href=\"");
      if ($uri ne "") {
         $uri = $host . $uri;
      } else {
         if (defined $opt_d) { print "# no poster found\n"; }
      }
   }

   # no poster found, take lowres image from imdb
   if ($uri eq "") {
      if (defined $opt_d) { print "# looking for lowres imdb posters\n"; }
      my $host = "http://www.imdb.com/title/tt" . $movieid . "/";
      $response = get($host);
      $response = "" unless defined $response;

      # Better handling for low resolution posters
      if ($response =~ m/<a name="poster".*<img.*src="([^"]*).*<\/a>/i) {
         if (defined $opt_d) { print "# found low res poster at: $1\n"; }
         $uri = $1;
      } else {
         if (defined $opt_d) { print "# no low res poster found\n"; }
         $uri = "";
      }

      # The candidate titles are only reported in the debug trace; nothing
      # in this file uses them afterwards, so skip the scan otherwise.
      if (defined $opt_d) {
         print "# starting to look for movie title\n";

         # get main title
         print "# Getting possible movie titles:\n";
         print "# Title: " . parseBetween($response, "<title>", "<\/title>") . "\n";

         # now report all other possible movie titles
         while ($response =~ m/>([^>^\(]*)([ ]{0,1}\([^\)]*\)[^\(^\)]*[ ]{0,1}){0,1}\(informal title\)/g) {
            print "# Title: " . trim($1) . "\n";
         }
      }
   }

   print "$uri\n";
}

# Searches ilovecinema.ru for films matching a file name and prints one
# "<movieid>:<title>" line per hit to STDOUT.
#
#   $filename - file name (possibly URI-escaped) to turn into a search query
#   $options  - extra query-string options appended to the search URL,
#               e.g. "mode=films"; may be empty or undefined
#
# With -r the raw search page is printed and the process exits with status
# 0. With -d a debug trace is printed along the way. Nothing is printed when
# the search result block is missing or empty.
sub getMovieList {
   my ($filename, $options) = @_;   # grab parameters

   # If we wanted to inspect the file for any reason we can do that now

   # Convert filename into a query string
   # (use same rules that Metadata::guesTitle does)
   my $query = uri_unescape($filename);   # in case it was escaped

   # Strip off the file extension
   my $dot = rindex($query, '.');
   $query = substr($query, 0, $dot) if $dot != -1;

   # Strip off anything following '(' and then '[' - people use these for
   # general comments
   for my $bracket ('(', '[') {
      my $pos = rindex($query, $bracket);
      $query = substr($query, 0, $pos) if $pos != -1;
   }

   # IMDB searches do better if any trailing ,The is left off.
   # $1 is only read when this very match succeeded.
   if ($query =~ /(.+), The$/i) {
      $query = $1;
   }

   # prepare the url
   $query = uri_escape($query);
   $options = "" unless $options;
   if (defined $opt_d) {
      printf("# query: '%s', options: '%s'\n", $query, $options);
   }

   # get the search results page
   #    some known ilovecinema options are:
   #       mode=films  Show only films
   my $request = "http://ilovecinema.ru/search/?q=$query&$options";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);
   $response = "" unless defined $response;   # failed download: treat as empty
   if (defined $opt_r) {
      print $response;
      exit(0);
   }

   # extract the block that holds the search hits
   my $exact_matches = parseBetween($response, "<div class=\"search_result\">", "</div>");
   if (defined $opt_d) { printf("# exact_matches: '%s'\n", $exact_matches); }

   if ($exact_matches eq "") {
      if (defined $opt_d) { printf("# no results\n"); }
      return;
   }

   # parse movie list from matches: every <tr>...</tr> is one candidate.
   # A <tr> without its closing tag simply does not match, so the scan
   # always terminates.
   my @movies;
   while ($exact_matches =~ m{<tr>(.*?)</tr>}gs) {
      my $entry = $1;

      if ($entry =~ /<a href="\/films\/(.+)\/".+alt="(.+)"/) {
         my ($movienum, $title) = ($1, $2);
         utf8::decode($title);
         push @movies, $movienum . ":" . $title;
      } elsif (defined $opt_d) {
         print("Unrecognized entry format ($entry)\n");
      }
   }

   # display array of values
   for my $movie (@movies) { print "$movie\n"; }
}

#
# Main Program
#

# parse command line arguments
getopts('ohrdivDMP');

# print out info
if (defined $opt_v) { version(); exit 1; }
if (defined $opt_i) { info(); exit 1; }

# print out usage if needed (help() does not return)
if (defined $opt_h || $#ARGV < 0) { help(); }

if (defined $opt_D) {
   # take movieid from cmdline arg
   my $movieid = shift @ARGV;
   die "Usage : $0 -D <movieid>\n" unless defined $movieid && $movieid ne "";
   getMovieData($movieid);
}
elsif (defined $opt_P) {
   # take movieid from cmdline arg
   my $movieid = shift @ARGV;
   die "Usage : $0 -P <movieid>\n" unless defined $movieid && $movieid ne "";
   getMoviePoster($movieid);
}
elsif (defined $opt_M) {
   # take query from cmdline arg
   my $options = shift @ARGV;
   die "Usage : $0 -M [options] <query>\n" unless defined $options && $options ne "";
   my $query = shift @ARGV;
   # a single argument is the query itself, with no options
   if (!defined $query || $query eq "") {
      $query = $options;
      $options = "";
   }
   getMovieList($query, $options);
}
# vim: set expandtab ts=3 sw=3 :