00001 #!/usr/bin/perl -w
00002
00003 #
# This Perl script performs movie data lookups in French using
# the www.allocine.fr website.
00006 #
00007 # For more information on MythVideo's external movie lookup mechanism, see
00008 # the README file in this directory.
00009 #
00010 # Author: Xavier Hervy (maxpower44 AT tiscali DOT fr)
00011 #
00012
00013 # changes:
00014 # 9-10-2006: Anduin Withers
00015 # Changed output to utf8
00016 # Made -u option a dummy for this release, it is deprecated and will be
00017 # removed
00018
00019 use LWP::Simple; # libwww-perl providing simple HTML get actions
00020 use HTML::Entities;
00021 use URI::Escape;
00022
00023
00024 use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_M $opt_P $opt_originaltitle $opt_casting $opt_u_dummy);
00025 use Getopt::Long;
00026
# Identification strings printed by version().
($title, $version, $author) = ("Allocine Query", "v2.02", "Xavier Hervy");

# Everything written to STDOUT goes through the :utf8 layer.
binmode STDOUT, ":utf8";
00032
# Print the command-line usage summary on STDOUT, then end the script with
# exit(-1). This sub never returns to its caller.
sub usage {
   print "usage: $0 -hviocMPD [parameters]\n",
         " -h, --help help\n",
         " -v, --version display version\n",
         " -i, --info display info\n",
         " -o, --originaltitle concatenate title and original title\n",
         " -c, --casting with -D option, grab the complete actor list (much slower)\n",
         "\n",
         " -M <query>, --movie <query> get movie list\n",
         " -D <movieid>, --data <movieid> get movie data\n",
         " -P <movieid>, --poster <movieid> get movie poster\n";
   exit(-1);
}
00047
# Strip leading and trailing whitespace from a string.
#   $str - text to clean up
# Returns the trimmed copy (the caller's variable is not modified).
# undef is handed back unchanged.
sub trim {
   my ($str) = @_;
   return $str unless defined $str;
   $str =~ s/^\s+//;
   $str =~ s/\s+$//;
   return $str;
}
00054
# Print one line identifying the program: title, version and author.
# The values come from the file-level $title, $version and $author.
sub version {
   printf "%s (%s) by %s\n", $title, $version, $author;
}
00059
# Print one line describing the kind of query this script performs.
sub info {
   my $summary = "Performs queries using the www.allocine.fr website.\n";
   print $summary;
}
00064
# Show the detailed help: the version line, the info line, then the usage
# text. usage() ends the script, so control does not come back from here.
sub help {
   $_->() for (\&version, \&info, \&usage);
}
00071
# Return the text of $data located between the first occurrence of $beg and
# the next occurrence of $end after it. Both markers are matched
# case-insensitively; the returned text keeps its original letter case.
#   $data - text to search (may be undef, e.g. after a failed download)
#   $beg  - start marker; the result begins right after it
#   $end  - end marker; the result stops right before it
# Returns "" when an argument is undef or when either marker is missing.
# Otherwise the extracted text is passed through removenbsp().
sub parseBetween {
   my ($data, $beg, $end) = @_;
   return "" unless defined $data && defined $beg && defined $end;

   my $haystack = lc($data);

   my $marker = index($haystack, lc($beg));
   return "" if $marker == -1;

   my $start  = $marker + length($beg);
   my $finish = index($haystack, lc($end), $start);
   return "" if $finish == -1;

   # decode_entities() is deliberately not used: the original author saw it
   # produce odd characters (possible HTML::Entities issue), so the result
   # is cleaned with removenbsp() instead.
   return removenbsp(substr($data, $start, $finish - $start));
}
00089
# Replace every "&nbsp;" entity (in any letter case) with a plain space.
# Used instead of decode_entities().
#   $data - text that may contain &nbsp; entities
# Returns the cleaned text; undef is handed back unchanged.
sub removenbsp {
   my ($data) = @_;
   return $data unless defined $data;
   $data =~ s/&nbsp;/ /gi;
   return $data;
}
00103
00104
# Return $data with every "<...>" tag removed.
#   $data - text possibly containing markup
# A "<" that is never closed by a ">" is left in place (looping on it would
# never terminate). undef is handed back unchanged.
sub removeTag {
   my ($data) = @_;
   return $data unless defined $data;
   # [^>] also matches newlines, so tags spanning several lines are removed.
   $data =~ s/<[^>]*>//g;
   return $data;
}
00120
# Download the allocine.fr page of one movie and print its details on STDOUT
# as "Field:value" lines: Title, OriginalTitle (only without
# --originaltitle), Year, Director, Plot, UserRating, MovieRating, Runtime,
# Cast, Genres, Countries.
#   $movieid - allocine movie id, inserted into the page URL
# Reads the option globals $opt_d (debug trace), $opt_originaltitle (append
# the original title to the title) and $opt_casting (fetch the separate
# casting page for the actor list). A failed download is treated as an empty
# page, so the fields are still printed, with empty or zero values.
sub getMovieData {
   my ($movieid) = @_;
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

   # download the movie page
   my $request = "http://www.allocine.fr/film/fichefilm_gen_cfilm=" . $movieid . ".html";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);
   $response = "" unless defined $response;

   # parse title and original title
   my $title = parseBetween($response, "<title>", "</title>");
   my $original_title = parseBetween($response, "<h4>Titre original : <i>", "</i></h4></div>");
   $original_title = removeTag($original_title);
   if (defined $opt_originaltitle && $original_title ne "") {
      $title = $title . " (" . $original_title . ")";
   }
   $title = removeTag($title);

   # parse production year
   my $year = parseBetween($response, "<h4>Année de production : ", "</h4>");

   # parse director
   my $director = removeTag(parseBetween($response, "<h4>Réalisé par ", "</h4>"));

   # parse plot: joined onto a single line, markup removed
   my $plot = parseBetween($response, "<td valign=\"top\" style=\"padding:10 0 0 0\"><div align=\"justify\"><h4>", "</h4></div></td>");
   $plot =~ s/\n//g;
   $plot = removeTag($plot);

   # parse user rating: average of the "etoile_N" values found in the
   # reviews table, multiplied by 2.5 and truncated to an integer
   my $userrating = 0;
   my $tmpratings = parseBetween($response, "Critiques :</b></h5>", "</table>");
   my $rating_pat = qr'class="etoile_(\d+)"';

   # ratings are from one to four stars
   my @ratings = ($tmpratings =~ m/$rating_pat/g);
   $userrating += $_ foreach @ratings;
   if (@ratings)    { $userrating /= @ratings; }
   if ($userrating) { $userrating = int($userrating * 2.5); }

   # parse age rating
   my $movierating = parseBetween($response, "Interdit aux moins de ", "ans");
   if ($movierating ne "") {
      $movierating = "Interdit aux moins de " . $movierating . "ans";
   }
   elsif (parseBetween($response, "Visible ", "enfants") ne "") {
      $movierating = "Visible par des enfants";
   }

   # parse movie length: the first number is taken as hours and the second
   # (if any) as minutes; the result is expressed in minutes
   my $runtime = parseBetween($response, "Durée : ", ". </h4>");
   my ($heure, $minutes) = ($runtime =~ /[^\d]*(\d+)[^\d]*(\d*)/);
   $heure   = 0 unless $heure;
   $minutes = 0 unless $minutes;
   $runtime = $heure * 60 + $minutes;

   # parse cast: with --casting try the dedicated casting page first, then
   # fall back to the short "Avec" list of the movie page
   my $name_link_pat = qr'<a .*?href="/personne/.*?".*?>(.*?)</a>';
   my $castchunk = "";
   if (defined $opt_casting) {
      my $responsecasting = get("http://www.allocine.fr/film/casting_gen_cfilm=" . $movieid . ".html");
      if (defined $responsecasting) {
         $castchunk = parseBetween($responsecasting, "Acteurs", "</table");
      }
   }
   if (!$castchunk) {
      $castchunk = parseBetween($response, "<h4>Avec ", "</h4>");
   }
   my $cast = trim(join(',', ($castchunk =~ m/$name_link_pat/g)));

   # genres
   my $genres = removeTag(parseBetween($response, "<h4>Genre : ", "</h4>"));

   # countries
   my $countries = removeTag(parseBetween($response, "<h4>Film ", ". </h4>"));

   # output fields (these field names must match what MythVideo is looking for)
   print "Title:$title\n";
   if (!(defined $opt_originaltitle)) {
      print "OriginalTitle:$original_title\n";
   }
   print "Year:$year\n";
   print "Director:$director\n";
   print "Plot:$plot\n";
   print "UserRating:$userrating\n";
   print "MovieRating:$movierating\n";
   print "Runtime:$runtime\n";
   print "Cast:$cast\n";
   print "Genres:$genres\n";
   print "Countries:$countries\n";
}
00234
# Print the URL of a poster image for one movie on STDOUT (one line).
#   $movieid - allocine movie id, inserted into the page URLs
# Search order:
#   1. the gallery pages, stopping at the first image whose URL contains
#      "affiche" or "_af";
#   2. otherwise the first image found in the gallery;
#   3. otherwise the small poster of the movie page;
#   4. if that one matches /AffichetteAllocine/, the image of the
#      "Disponible en" section of the same page.
# Returns without printing when step 4 finds nothing. Reads $opt_d (debug
# trace). Failed downloads are treated as empty pages.
sub getMoviePoster {
   my ($movieid) = @_;
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

   # download the first gallery page; it links to the other gallery pages
   my $request = "http://www.allocine.fr/film/galerie_gen_cfilm=" . $movieid . ".html";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);
   $response = "" unless defined $response;

   my $pagelinks = parseBetween($response, "&page=", ".html\" class=\"link1\"><span class=\"text2\">>>");
   my @pages = split("page=", $pagelinks);

   my $uri  = "";
   my $furi = "";   # first non-empty image URL seen, kept as a fallback
   for my $page (@pages) {
      # get only the page number (the text before the first '.')
      my $pagenum = substr($page, 0, index($page, '.'));

      if ($pagenum ne "") {
         $request = "http://www.allocine.fr/film/galerie_gen_cfilm=" . $movieid . "&page=" . $pagenum . ".html";
         my $gallery = get($request);
         $gallery = "" unless defined $gallery;

         $uri = parseBetween($gallery, "<table style=\"padding:0 0 0 0\" border=\"0\" >", "Ko\" />");
         $uri = parseBetween($uri, "<img src=\"", "\" border=\"0\" class=\"galerie\" ");
         if ($furi eq "" && $uri ne "") { $furi = $uri; }
      }

      # stop when we have a poster...
      last if ($uri =~ /affiche/) or ($uri =~ /_af/);
   }

   # if $uri is neither an "affiche" nor an "_af" image, use the first
   # gallery image, if there was one
   if ($uri !~ /affiche/ && $uri !~ /_af/ && $furi ne "") {
      $uri = $furi;
   }

   # in case nothing was found fall back to the little poster...
   if ($uri eq "") {
      $request = "http://www.allocine.fr/film/fichefilm_gen_cfilm=" . $movieid . ".html";
      my $fiche = get($request);
      $fiche = "" unless defined $fiche;

      my $section = parseBetween($fiche, "sousnav_separe_droite2.gif", "sortie");
      $uri = parseBetween($section, "<img src=\"", "\"");

      # in case no little poster was found (presumably AffichetteAllocine
      # is the site's placeholder image - TODO confirm) get the small DVD
      # poster from the same page, if it exists
      if ($uri =~ /AffichetteAllocine/) {
         $section = parseBetween($fiche, "Disponible en", "Zone");
         $uri = parseBetween($section, "<img src=\"", "\"");
         return if ($uri eq "");
      }
   }

   print "$uri\n";
}
00316
# Search allocine.fr for a movie and print the matches on STDOUT, one entry
# per line, each line as 'movieid:Movie Title'.
#   $filename - file name or free text to turn into a search query
#   $options  - only echoed in the debug trace; may be omitted
# The query is derived from $filename by URI-unescaping it, then cutting the
# file extension, anything from the last '(' or '[', anything from the first
# '-', and a trailing ", The". Prints nothing when the site reports
# "Pas de résultats" or when no entry can be parsed. An empty or failed
# download is retried once. Reads $opt_d (debug trace).
sub getMovieList {
   my ($filename, $options) = @_;

   #
   # Convert filename into a query string
   # (use same rules that Metadata::guesTitle does)
   my $query = uri_unescape($filename);   # in case it was escaped

   # Strip off the file extension
   my $cut = rindex($query, '.');
   $query = substr($query, 0, $cut) if $cut != -1;

   # Strip off anything following the last '(' or '[' - people use these
   # for general comments
   for my $opener ('(', '[') {
      $cut = rindex($query, $opener);
      $query = substr($query, 0, $cut) if $cut != -1;
   }

   # Strip off anything following the first '-' - same reason
   $cut = index($query, '-');
   $query = substr($query, 0, $cut) if $cut != -1;

   # IMDB searches do better if any trailing ,The is left off.
   # $1 is only read after a successful match, never a stale capture.
   if ($query =~ /(.*), The$/i && $1) { $query = $1; }

   # prepare the url
   $query = uri_escape($query);
   if (!$options) { $options = ""; }
   if (defined $opt_d) {
      printf("# query: '%s', options: '%s'\n", $query, $options);
   }

   my $request = "http://www.allocine.fr/recherche/?rub=1&motcle=$query";

   # each result entry sits between these two markers
   my $beg = "<h4><a href=\"/film/fichefilm_gen_cfilm=";
   my $end = "</a></h4>";

   # At most two download attempts; the loop always ends after the first
   # page that could be read, whether or not it contained any entry.
   for my $attempt (1 .. 2) {
      if (defined $opt_d) { printf("# request: '%s'\n", $request); }
      my $response = get($request);

      if (!defined $response || $response eq "") {
         if (defined $opt_d) { printf("# no results\n"); }
         next;
      }

      # don't try to invent if it doesn't exist
      return if $response =~ /Pas de résultats/;

      my @movies;
      my $pos = 0;
      while ((my $start = index($response, $beg, $pos)) != -1) {
         $start += length($beg);
         my $finish = index($response, $end, $start);
         last if $finish == -1;   # entry without end marker: stop parsing
         $pos = $finish;          # the next search resumes after this entry

         # the entry looks like: <id>.html" class="link1">title markup
         my $entry = substr($response, $start, $finish - $start);
         my ($movienum, $moviename) = split(".html\" class=\"link1\">", $entry);
         next unless defined $movienum && defined $moviename;

         $moviename = removeTag($moviename);

         # NOTE(review): a "(year)" found in the title is appended once
         # more, as the original code did - confirm this is intended.
         my ($movieyear) = $moviename =~ /\((\d+)\)/;
         if ($movieyear) { $moviename = $moviename . " (" . $movieyear . ")"; }

         push @movies, $movienum . ":" . $moviename;
      }

      # display array of values
      print "$_\n" for @movies;
      last;
   }
}
00414
#
# Main Program
#

# parse command line arguments
# -u/--utf8 is accepted but ignored. The -D/-M/-P flags take no value
# themselves: the movie id or query is read from what remains in @ARGV.
# NOTE(review): nothing here sets $opt_d (debug trace) or $opt_r, so the
# debug output of the subs above stays disabled.
GetOptions( "utf8"          => \$opt_u_dummy,
            "help"          => \$opt_h,
            "version"       => \$opt_v,
            "info"          => \$opt_i,
            "originaltitle" => \$opt_originaltitle,
            "casting"       => \$opt_casting,
            "Data"          => \$opt_D,
            "Movie"         => \$opt_M,
            "Poster"        => \$opt_P
            );

# print out info
if (defined $opt_v) { version(); exit 1; }
if (defined $opt_i) { info(); exit 1; }

# print out usage if needed (help() does not return)
if (defined $opt_h || $#ARGV < 0) { help(); }

if (defined $opt_D) {
   # take movieid from cmdline arg
   my $movieid = shift(@ARGV) || die "Usage : $0 -D <movieid>\n";
   getMovieData($movieid);
}
elsif (defined $opt_P) {
   # take movieid from cmdline arg
   my $movieid = shift(@ARGV) || die "Usage : $0 -P <movieid>\n";
   getMoviePoster($movieid);
}
elsif (defined $opt_M) {
   # take query from cmdline args: all remaining words, joined with single
   # blanks and without a trailing blank, so that getMovieList can still
   # recognise a trailing ", The"
   my $options = '';
   my $query = join(' ', @ARGV);
   getMovieList($query, $options);
}
00461 # vim: set expandtab ts=3 sw=3 :