00001 #!/usr/bin/perl -w
00002
00003 #
# This Perl script performs movie data lookups in Russian,
# based on the www.kinox.ru website.
00006 #
00007 # For more information on MythVideo's external movie lookup mechanism, see
00008 # the README file in this directory.
00009 #
00010 # Author: Denys Dmytriyenko (denis AT denix DOT org)
00011 # Based on the allocine script by Xavier Hervy
00012 #
00013
00014 # Note:
00015 # Encoding on the Web page is cp1251
00016 # Internal encoding of this script is koi8-r
00017 # The output of this script is in utf8 (set by "outcp" below)
00018
00019 use LWP::Simple; # libwww-perl providing simple HTML get actions
00020 use HTML::Entities;
00021 use URI::Escape qw(uri_unescape uri_escape uri_escape_utf8);
00022
00023 no encoding;
00024
00025 use Encode;
00026 use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_M $opt_P $opt_originaltitle $opt_casting $opt_u_dummy);
00027 use Getopt::Long;
00028
# Identification strings; version() prints them on one line.
our ($title, $version, $author) = ("KinoX Query", "v0.03", "Denys Dmytriyenko");

# Target encoding handed to Encode::from_to for text taken from the site.
our $outcp = "utf8";
00035
00036 # binmode() does not work for some reason
00037 # The output ends up being in the wrong encoding
00038 #binmode(STDOUT, ":utf8");
00039
# display usage
#
# Prints the command-line synopsis and then terminates the script with
# exit status -1, so control never returns to the caller.
sub usage {
    print "usage: $0 -hviocMPD [parameters]\n",
          " -h, --help help\n",
          " -v, --version display version\n",
          " -i, --info display info\n",
          " -o, --originaltitle concatenate title and original title\n",
          " -c, --casting with -D option, grab the complete actor list (much slower)\n",
          "\n",
          " -M <query>, --movie <query> get movie list\n",
          " -D <movieid>, --data <movieid> get movie data\n";
    # print " -P <movieid>, --poster <movieid> get movie poster\n";
    exit(-1);
}
00054
# One-line banner: program title, version and author, taken from the
# script-level $title, $version and $author variables.
sub version {
    printf("%s (%s) by %s\n", $title, $version, $author);
}
00059
# One-line description of the kind of query this script performs.
sub info {
    my $blurb = "Performs queries using the www.kinox.ru website.\n";
    print $blurb;
}
00064
# Detailed help: the version banner, the description and the usage text,
# in that order.  usage() ends the script, so this does not return.
sub help {
    for my $step (\&version, \&info, \&usage) {
        $step->();
    }
}
00071
# returns text within 'data' between 'beg' and 'end' matching strings
#
# Parameters:
#   $data - text to search (typically a downloaded page)
#   $beg  - marker that precedes the wanted text
#   $end  - marker that follows the wanted text
#
# Both markers are located case-insensitively; the first occurrence of
# $beg is used, then the first $end after it.  Returns the text between
# them with its original letter case, passed through removenbsp().
# Returns "" when either marker is missing or any argument is undefined
# (for example when the download failed and $data is undef).
sub parseBetween {
    my ($data, $beg, $end) = @_;    # grab parameters
    return "" unless defined $data && defined $beg && defined $end;

    # Search a lower-cased copy so that offsets still apply to $data.
    my $ldata = lc($data);

    my $found = index($ldata, lc($beg));
    return "" if $found == -1;

    my $start  = $found + length($beg);
    my $finish = index($ldata, lc($end), $start);
    return "" if $finish == -1;

    # decode_entities() is deliberately not used here (the original author
    # suspected it of mangling special characters); only the non-breaking
    # space entity is replaced, via removenbsp().
    return removenbsp(substr($data, $start, $finish - $start));
}
00089
# Replaces every "&nbsp;" entity (in any letter case) with a plain space.
# Used instead of decode_entities.
#
# Parameters:
#   $data - text that may contain the entity
#
# Returns the text with the entities replaced; an undefined argument is
# returned unchanged.
sub removenbsp {
    my ($data) = @_;    # grab parameters

    # The entity is six characters long and is swapped for one space.
    $data =~ s/&nbsp;/ /gi if defined $data;
    return $data;
}
00103
00104
# returns text within 'data' without tag
#
# Every "<...>" span is deleted, including spans that contain line breaks.
# A "<" that is never closed by ">" is left in place.
#
# Parameters:
#   $data - text that may contain markup
#
# Returns the stripped text, or "" when $data is undefined.
sub removeTag {
    my ($data) = @_;    # grab parameters
    return "" unless defined $data;

    # [^>] also matches newlines, so multi-line tags are removed as well.
    $data =~ s/<[^>]*>//g;
    return $data;
}
00120
# Collapse every run of whitespace into one space and trim both ends.
# Returns "" for an undefined argument.
sub _cleanText {
    my ($text) = @_;
    return "" unless defined $text;
    $text =~ s/\s+/ /g;
    $text =~ s/^ //;
    $text =~ s/ $//;
    return $text;
}

# get Movie Data
#
# Downloads the kinox.ru details page for one movie and prints its fields
# as "Field:value" lines: Title, OriginalTitle (unless $opt_originaltitle
# is set, in which case it is appended to Title), Year, Director, Plot,
# Runtime, Cast, Genres and Countries.  Page text is converted from
# windows-1251 to $outcp before printing.
#
# Parameters:
#   $movieid - value appended to the "num=" parameter of the details URL
#
# Returns nothing.  If the page cannot be downloaded a warning is issued
# and no fields are printed.
#
# NOTE(review): several clean-up substitutions in this routine ended right
# after their pattern in the copy this revision was made from.  They are
# restored as "collapse whitespace and trim" (director, plot, cast), "drop
# parenthesised notes" and "drop the trailing full stop" (cast), and
# "remove line breaks and tabs" (the details cell) - confirm against the
# upstream script.
sub getMovieData {
    my ($movieid) = @_;    # grab movieid parameter
    if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

    # get the movie details page
    my $request = "http://www.kinox.ru/index.asp?comm=4&num=" . $movieid;
    if (defined $opt_d) { printf("# request: '%s'\n", $request); }
    my $response = get($request);
    if (!defined $response) {
        warn "No response for request '$request'\n";
        return;
    }

    # The <h1> heading holds "title / original title", then <br>, then
    # the countries.
    my $heading = parseBetween($response, "<h1>", "</h1>");
    my ($names, $countries) = split("<br>", $heading);
    $names     = "" unless defined $names;
    $countries = "" unless defined $countries;

    $countries = removeTag($countries);
    $countries =~ s/[\n\r]/ /g;
    Encode::from_to($countries, "windows-1251", $outcp);

    my ($title, $original_title) = split("<font size=4 color=#000000> / </font>", $names);
    $title          = "" unless defined $title;
    $original_title = "" unless defined $original_title;
    $title          = removeTag($title);
    $original_title = removeTag($original_title);

    Encode::from_to($title, "windows-1251", $outcp);
    Encode::from_to($original_title, "windows-1251", $outcp);

    if (defined $opt_originaltitle && $original_title ne "") {
        $title = $title . " (" . $original_title . ")";
    }

    # parse director
    # (labels are written in the script's encoding and converted to the
    # page's encoding before they are searched for)
    my $dirq = "<b>Режиссер:</b>";
    Encode::from_to($dirq, "koi8-r", "windows-1251");
    my $director = _cleanText(removeTag(parseBetween($response, $dirq, "</a>")));
    Encode::from_to($director, "windows-1251", $outcp);

    # parse plot
    my $plotq = "<b>Краткое содержание:</b>";
    Encode::from_to($plotq, "koi8-r", "windows-1251");
    my $plot = _cleanText(removeTag(parseBetween($response, $plotq, "</p>")));
    Encode::from_to($plot, "windows-1251", $outcp);

    # parse cast: a comma separated list without notes in parentheses
    my $castq = "<b>В ролях:</b>";
    Encode::from_to($castq, "koi8-r", "windows-1251");
    my $cast = _cleanText(removeTag(parseBetween($response, $castq, "<b>")));
    $cast =~ s/\s\(.*?\)//g;
    $cast =~ s/\s*,\s*/,/g;
    $cast =~ s/\.$//;
    Encode::from_to($cast, "windows-1251", $outcp);

    # studio, year, genres, runtime: the first four green <font> spans of
    # the details cell, in that order
    my $details = parseBetween($response, "<td colspan=2 bgcolor=f8f8f8 align=\"center\" valign=\"top\">", "</td></tr></table>");
    $details =~ s/[\r\n\t]//g;
    Encode::from_to($details, "windows-1251", $outcp);

    my $beg = "<font color=\"#008000\">";
    my $end = "</font>";
    my @spans = $details =~ /\Q$beg\E(.*?)\Q$end\E/gs;

    # The studio comes first but is not part of the output.
    my (undef, $year, $genres, $runtime) =
        map { defined $_ ? removeTag($_) : "" } @spans[0 .. 3];
    $genres =~ s|\s*/\s*|,|g;

    # output fields (these field names must match what MythVideo is looking for)
    print "Title:$title\n";
    if (!(defined $opt_originaltitle)) {
        print "OriginalTitle:$original_title\n";
    }
    print "Year:$year\n";
    print "Director:$director\n";
    print "Plot:$plot\n";
    print "Runtime:$runtime\n";
    print "Cast:$cast\n";
    print "Genres:$genres\n";
    print "Countries:$countries\n";
}
00237
# dump Movie list: 1 entry per line, each line as 'movieid:Movie Title'
#
# Turns a file name (or free-text title) into a search query, downloads
# the kinox.ru search page for it and prints every result as
# "movieid:Title (year)".  Titles are converted from windows-1251 to
# $outcp.
#
# Parameters:
#   $filename - file name or title to search for; may be URI-escaped
#   $options  - only echoed in the debug output
#
# Returns nothing.  Nothing is printed when the download fails, when the
# page says that nothing was found, or when it contains no result links.
#
# The search page is requested exactly once: the request URL does not
# vary, so asking again after an empty result could only repeat the same
# download.
sub getMovieList {
    my ($filename, $options) = @_;    # grab parameters

    #
    # Convert filename into a query string
    # (use same rules that Metadata::guesTitle does)
    my $query = defined $filename ? $filename : "";
    $query = uri_unescape($query);    # in case it was escaped

    # Strip off the file extension
    my $cut = rindex($query, '.');
    if ($cut != -1) { $query = substr($query, 0, $cut); }
    # Strip off anything following '(' - people use this for general comments
    $cut = rindex($query, '(');
    if ($cut != -1) { $query = substr($query, 0, $cut); }
    # Strip off anything following '[' - people use this for general comments
    $cut = rindex($query, '[');
    if ($cut != -1) { $query = substr($query, 0, $cut); }
    # Strip off anything following '-' - people use this for general comments
    $cut = index($query, '-');
    if ($cut != -1) { $query = substr($query, 0, $cut); }

    # Surrounding blanks would end up in the URL and would also keep the
    # ", The" rule below from matching at the end of the query.
    $query =~ s/^\s+//;
    $query =~ s/\s+$//;

    # Searches do better if any trailing ", The" is left off
    if ($query =~ /^(.+), The$/i) { $query = $1; }
    Encode::from_to($query, "koi8-r", "windows-1251");

    # prepare the url
    $query = uri_escape($query);
    if (!$options) { $options = ""; }
    if (defined $opt_d) {
        printf("# query: '%s', options: '%s'\n", $query, $options);
    }

    # get the search results page
    my $request = "http://www.kinox.ru/index.asp?comm=1&fop=false&pack=0&kw=$query";
    if (defined $opt_d) { printf("# request: '%s'\n", $request); }
    my $response = get($request);
    if (!defined $response || $response eq "") {
        if (defined $opt_d) { printf("# no results\n"); }
        return;
    }
    if (defined $opt_d) { printf("# response: '%s'\n", $response); }

    #
    # don't try to invent if it doesn't exist
    #
    my $notfnd = "ничего небыло найдено";
    Encode::from_to($notfnd, "koi8-r", "windows-1251");
    return if $response =~ /\Q$notfnd\E/;

    # Each result is a link '<a class=l2 href="index.asp?comm=4&num=ID">Title</a>'
    # followed later by a table cell holding the year.
    my $beg  = "<a class=l2 href=\"index.asp?comm=4&num=";
    my $end  = "</a>";
    my $begy = "colspan=2 align=center>";
    my $endy = "</td>";

    my @movies;
    my $pos = index($response, $beg);
    while ($pos != -1) {
        my $from = $pos + length($beg);
        my $to   = index($response, $end, $from);
        last if $to == -1;    # link is never closed: nothing more to parse

        # The link text is 'ID">Title'.
        my ($movienum, $moviename) = split(/">/, substr($response, $from, $to - $from), 2);

        my $title = removeTag(defined $moviename ? $moviename : "");
        $title =~ s/\s+/ /g;
        $title =~ s/^ //;
        $title =~ s/ $//;
        Encode::from_to($title, "windows-1251", $outcp);

        # The year cell is looked up after the link; scanning for the
        # next movie then resumes behind it.
        my $ypos = index($response, $begy, $to);
        if ($ypos != -1) {
            my $yfrom = $ypos + length($begy);
            my $yto   = index($response, $endy, $yfrom);
            if ($yto != -1) {
                my $movieyear = removeTag(substr($response, $yfrom, $yto - $yfrom));
                if ($movieyear) { $title = $title . " (" . $movieyear . ")"; }
                $to = $yto;
            }
        }

        # add to array (links without an id cannot be looked up later)
        if (defined $movienum && $movienum ne "") {
            push @movies, $movienum . ":" . $title;
        }

        # advance to next movie
        $pos = index($response, $beg, $to);
    }

    # display array of values
    for my $movie (@movies) {
        print "$movie\n";
    }
    return;
}
00355
00356 #
00357 # Main Program
00358 #
00359
00360 # parse command line arguments
00361
# Every option is a plain flag; the movie id or the query words are taken
# from what remains in @ARGV afterwards.  "help" is registered so that -h
# sets $opt_h instead of being rejected as an unknown option.
GetOptions( "utf8"          => \$opt_u_dummy,
            "help"          => \$opt_h,
            "version"       => \$opt_v,
            "info"          => \$opt_i,
            "originaltitle" => \$opt_originaltitle,
            "casting"       => \$opt_casting,
            "Data"          => \$opt_D,
            "Movie"         => \$opt_M,
            "Poster"        => \$opt_P
          );

#$opt_d = 1;

# print out info
if (defined $opt_v) { version(); exit 1; }
if (defined $opt_i) { info(); exit 1; }

# print out usage if needed (help() does not return)
if (defined $opt_h || $#ARGV < 0) { help(); }

if (defined $opt_D) {
    # take movieid from cmdline arg
    my $movieid = shift(@ARGV) or die "Usage : $0 -D <movieid>\n";
    getMovieData($movieid);
}
elsif (defined $opt_M) {
    # take query from cmdline args: all remaining words, separated by
    # single blanks and without a trailing blank
    my $query   = join(' ', @ARGV);
    my $options = '';
    getMovieList($query, $options);
}
00396 # vim: ts=4 sw=4: