00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 import os
00017 import sys
00018 import re
00019 import urlparse
00020 import urllib
00021 import urllib2
00022 import cgi
00023 import traceback
00024 import codecs
00025 from optparse import OptionParser
00026
00027
# If a per-version "oldxml" site-packages directory exists, add it to the
# module search path (presumably so the legacy xml.dom.ext / xml.xpath
# imports that follow can be resolved -- confirm against the target distro).
#
# The version is taken from sys.version_info rather than sys.version[:3]:
# slicing the version string truncates two-digit minor versions
# (e.g. "3.10" would become "3.1").
alt_path = ('/usr/lib/python%d.%d/site-packages/oldxml'
            % sys.version_info[:2])
if os.path.exists(alt_path):
    sys.path.append(alt_path)
00031
00032 from xml.dom.ext.reader import HtmlLib
00033 from xml.dom import EMPTY_NAMESPACE
00034 from xml import xpath
00035
00036
00037
00038
# Runtime switches; main() overwrites both from the command line options.
VERBOSE = False
DUMP_RESPONSE = False

# Root of the OFDb site; page paths are joined onto this with urljoin().
URL_BASE = "http://www.ofdb.de"

# Version strings: both are sent in the HTTP User-Agent header, and
# ofdb_version is also what --version reports.
ofdb_version = "0.3"
mythtv_version = "0.21"
00045
def comment_out(str):
    """Print ``str`` to stdout as a comment line prefixed with ``# ``.

    The value is first decoded as UTF-8 when that is possible; if decoding
    fails for any reason (already-decoded text, invalid bytes, ...) the value
    is printed exactly as it was passed in.
    """
    # NOTE: the parameter shadows the ``str`` builtin; the name is kept so
    # that the function signature stays unchanged for existing callers.
    text = str
    try:
        text = unicode(str, "utf8")
    except Exception:
        # Best effort only.  ``Exception`` (rather than a bare ``except``)
        # keeps KeyboardInterrupt / SystemExit from being swallowed here.
        pass

    print("# %s" % (text,))
00054
def debug_out(str):
    """Forward ``str`` to comment_out(), but only while VERBOSE is enabled."""
    if not VERBOSE:
        return
    comment_out(str)
00058
def response_out(str):
    """Print the response body ``str`` to stdout when DUMP_RESPONSE is set.

    The value is decoded as UTF-8 when possible and printed unchanged
    otherwise.  Nothing is printed while DUMP_RESPONSE is false.
    """
    if not DUMP_RESPONSE:
        return

    text = str
    try:
        text = unicode(str, "utf8")
    except Exception:
        # Best effort only.  ``Exception`` (rather than a bare ``except``)
        # keeps KeyboardInterrupt / SystemExit from being swallowed here.
        pass

    print(text)
00068
def print_exception(str):
    """Emit each line of the text ``str`` as a comment via comment_out()."""
    lines = str.splitlines()
    for entry in lines:
        comment_out(entry)
00072
def _xmlprep(content):
    """Removes any HTML tags that just confuse the parser.

    Every ``<meta ...>`` tag and every ``<script>...</script>`` element is
    deleted from ``content``; the remaining markup is returned.
    """
    # HTML tag names are case-insensitive and a tag may be split across
    # lines, so both patterns ignore case and let '.' match newlines.
    flags = re.I | re.S
    meta_tag = re.compile(r'<\s*meta.*?>', flags)
    script_element = re.compile(r'<\s*script.*?<\s*/script\s*>', flags)

    without_meta = meta_tag.sub('', content)
    return script_element.sub('', without_meta)
00080
00081
def _myth_url_get(url, data = None, as_post = False):
    """Fetch ``url`` and return a ``(response, body)`` tuple.

    ``data`` is an optional mapping of request parameters.  For a GET
    request (``as_post`` false) the parameters are merged into the query
    string already present in ``url``, with ``data`` taking precedence on
    duplicate names.  For a POST request they are sent url-encoded as the
    request body and ``url`` is used unchanged.

    Returns ``(None, None)`` when the request fails; the traceback is then
    printed as comment lines through print_exception().
    """
    extras = ['ofdb', ofdb_version]

    debug_out("_myth_url_get(%s, %s, %s)" % (url, data, as_post))
    dest_url = url
    post_fields = None

    if as_post:
        if data:
            post_fields = dict(data)
    else:
        (scheme, netloc, path, query, frag) = urlparse.urlsplit(url)

        # NOTE: cgi.parse_qs is called with its defaults, so query parameters
        # without a value are dropped from the rebuilt URL.
        params = cgi.parse_qs(query)
        if data:
            params.update(data)

        # parse_qs maps every name to a *list* of values.  doseq=True makes
        # urlencode emit one name=value pair per list element instead of
        # encoding the repr() of the list.
        query = urllib.urlencode(params, True)
        dest_url = urlparse.urlunsplit((scheme, netloc, path, query, frag))

    user_agent = "MythTV/%s (%s)" % (mythtv_version, "; ".join(extras))
    req = urllib2.Request(url = dest_url,
                          headers = { 'User-Agent' : user_agent })

    if post_fields:
        req.add_data(urllib.urlencode(post_fields))

    try:
        debug_out("Get URL '%s:%s'" % (req.get_full_url(), req.get_data()))
        res = urllib2.urlopen(req)
        try:
            content = res.read()
        finally:
            # Release the connection even when reading the body fails.
            res.close()
        return (res, content)
    except Exception:
        print_exception(traceback.format_exc())
        return (None, None)
00124
def ofdb_url_get(url, data = None, as_post = False):
    """Fetch a page and return ``(response, content)`` ready for parsing.

    The body returned by _myth_url_get() is decoded using the charset named
    in the page's ``<meta>`` tag (falling back to the default codec when no
    charset is declared or the declared one is unknown), passed through
    _xmlprep(), and re-encoded as UTF-8.  Undecodable bytes are replaced.
    The cleaned content is also handed to response_out().

    Raises IOError when no content could be retrieved.
    """
    (rc, content) = _myth_url_get(url, data, as_post)
    if content is None:
        # _myth_url_get signals a failed request with (None, None).  Fail
        # with a clear message instead of a TypeError from re.search below.
        raise IOError("No content received from '%s'" % (url,))

    decoded = None
    m = re.search(r'<\s*meta[^>]*charset\s*=\s*([^" ]+)', content, re.I)
    if m:
        charset = m.group(1)
        debug_out("Page charset reported as %s" % (charset))
        try:
            decoded = unicode(content, charset, 'replace')
        except LookupError:
            # The page advertised a codec Python does not know (or the
            # pattern captured trailing garbage); use the default instead.
            debug_out("Unknown charset %s, using the default codec" %
                    (charset,))

    if decoded is None:
        decoded = unicode(content, errors='replace')

    content = _xmlprep(decoded).encode("utf8")

    response_out(content)
    return (rc, content)
00141
def search_title(title):
    """Search OFDb for ``title`` and print one ``uid:title`` line per hit.

    The cleaned title is POSTed to ``view.php`` as a "DTitel" search.  Every
    anchor in the result page whose href starts with ``film/`` and contains
    a ``/<digits>,...`` part is reported.  Any error is printed as comment
    lines through print_exception() instead of being raised.
    """
    def clean_title(t):
        """Return ``t`` reduced to a bare search string, encoded as UTF-8.

        URL quoting is undone, the file extension is dropped, and the text
        is cut at the last '(', '|' or '[' or at a trailing ", The".
        """
        t = urllib.unquote(t)
        t = os.path.splitext(t)[0]
        m = re.match(r"(.*)(?:[(|\[]|, The$)", t, re.I)
        ret = t
        if m:
            ret = m.group(1)
        return ret.strip().encode("utf8")

    try:
        data = {
            "page" : "suchergebnis",
            "Kat" : "DTitel",
            "SText" : clean_title(title)
        }

        debug_out("Starting search for title '%s'" % (title,))

        (rc, content) = ofdb_url_get(urlparse.urljoin(URL_BASE, "view.php"),
                data, True)

        reader = HtmlLib.Reader()
        doc = reader.fromString(content, charset='utf8')

        nodes = xpath.Evaluate("//A[starts-with(@href, 'film/')]",
                doc.documentElement)

        title_matches = []
        uid_match = re.compile(r'/(\d+,.*)', re.I)
        # Distinct loop names: the search argument ``title`` and the ``id``
        # builtin are no longer rebound while iterating.
        for node in nodes:
            rm = uid_match.search(node.getAttributeNS(EMPTY_NAMESPACE, 'href'))
            if not rm:
                continue
            # Skip anchors whose first child carries no text (nodeValue is
            # None for non-text nodes); they would otherwise abort the
            # whole search with an AttributeError.
            first = node.firstChild
            if first is None or first.nodeValue is None:
                continue
            title_matches.append((rm.group(1), first.nodeValue))

        for uid, found_title in title_matches:
            print("%s:%s" % (uid, found_title.strip()))
    except Exception:
        print_exception(traceback.format_exc())
00181
def get_ofdb_doc(uid, context):
    """Returns the OFDb film page as an XML document.

    ``uid`` selects the page below ``film/``; ``context`` is only used in
    the debug message describing what is being looked up.
    """
    debug_out("Starting search for %s '%s'" % (context, uid))

    film_url = urlparse.urljoin(URL_BASE, "film/%s" % (uid.encode("utf8"),))
    (rc, content) = ofdb_url_get(film_url)

    return HtmlLib.Reader().fromString(content, charset='utf8')
00191
class NoIMDBURL(Exception):
    """Exception type declared by this script; it adds nothing to Exception.

    NOTE(review): nothing in the code visible here raises or catches it.
    """
00194
def search_data(uid, rating_country):
    """Print the metadata block for the OFDb film identified by ``uid``.

    The film page is fetched and scraped with fixed XPath expressions for
    title, countries, year, directors, cast, genres and user rating.  When
    the page links to a plot page, that page is fetched as well and its text
    becomes the plot; otherwise the plot is left empty.  The result is
    printed as ``Key:value`` lines.

    ``rating_country`` is accepted but not used by this function.

    Any error is printed as comment lines through print_exception() instead
    of being raised.
    """
    def possible_error(path):
        """Warn that nothing was found at the XPath ``path``."""
        comment_out("Warning: expected to find content at '%s', site format " \
                "may have changed, look for a new version of this script." %
                (path,))

    def first_child_text(node):
        """Return the stripped text of ``node``'s first child, or None."""
        child = node.firstChild
        if child is None or child.nodeValue is None:
            return None
        return child.nodeValue.strip()

    def single_value(doc, path):
        """Return the text of the first node matching ``path``, or ""."""
        nodes = xpath.Evaluate(path, doc)
        if nodes:
            text = first_child_text(nodes[0])
            if text is not None:
                return text
        possible_error(path)
        return ""

    def attr_value(doc, path, attrname):
        """Return attribute ``attrname`` of the first match, or ""."""
        nodes = xpath.Evaluate(path, doc)
        if nodes:
            return nodes[0].getAttributeNS(EMPTY_NAMESPACE, attrname).strip()
        possible_error(path)
        return ""

    def multi_value(doc, path):
        """Return the texts of all nodes matching ``path`` as a list."""
        nodes = xpath.Evaluate(path, doc)
        if nodes:
            texts = [first_child_text(n) for n in nodes]
            return [t for t in texts if t is not None]
        possible_error(path)
        # Always a list, so callers can join the result unconditionally.
        return []

    def direct_value(doc, path):
        """Return the node value of the first match, or "" (currently unused)."""
        nodes = xpath.Evaluate(path, doc)
        if nodes:
            return nodes[0].nodeValue.strip()
        possible_error(path)
        return ""

    def all_text_children(doc, path):
        """Return the text children of every match, joined by spaces."""
        nodes = xpath.Evaluate(path, doc)
        if nodes:
            ret = []
            for n in nodes:
                for c in n.childNodes:
                    if c.nodeType == c.TEXT_NODE:
                        ret.append(c.nodeValue.strip())
            return " ".join(ret)
        possible_error(path)
        return ""

    try:
        doc = get_ofdb_doc(uid, "data")
        root = doc.documentElement

        data = {'title' : '',
                'countries' : '',
                'year' : '',
                'directors' : '',
                'cast' : '',
                'genre' : '',
                'user_rating' : '',
                'plot' : '',
                }

        data['title'] = single_value(root,
                "//TD[@width='99%']/H2/FONT[@size='3']/B")
        data['countries'] = ",".join(multi_value(root,
                "//A[starts-with(@href, 'view.php?page=blaettern&Kat=Land&')]"))
        data['year'] = single_value(root,
                "//A[starts-with(@href, 'view.php?page=blaettern&Kat=Jahr&')]")
        data['directors'] = ",".join(multi_value(root,
                "//TD[@width='99%']/TABLE/TR[4]/TD[3]//A[starts-with(@href, " \
                "'view.php?page=liste')]"))
        data['cast'] = ",".join(multi_value(root,
                "//TD[@width='99%']/TABLE/TR[5]/TD[3]//A[starts-with(@href, " \
                "'view.php?page=liste')]"))
        data['genre'] = ",".join(multi_value(root,
                "//A[starts-with(@href, 'view.php?page=genre&Genre=')]"))
        data['user_rating'] = attr_value(root,
                "//IMG[@src='images/design3/notenspalte.png']", "alt")

        tmp_sid = attr_value(root,
                "//A[starts-with(@href, 'plot/')]", "href")

        sid_match = re.search(r"/(\d+,\d+,.*)", tmp_sid, re.I)
        if sid_match:
            sid = sid_match.group(1)

            debug_out("Looking for plot...")
            (rc, content) = ofdb_url_get(urlparse.urljoin(URL_BASE,
                    "plot/%s" % sid.encode("utf8")))

            reader = HtmlLib.Reader()
            plot_doc = reader.fromString(content, charset='utf8')

            data['plot'] = unicode(all_text_children(plot_doc.documentElement,
                    "//FONT[@class='Blocksatz']"))
        else:
            # Without a plot link there is no plot id to request.  Keep the
            # metadata gathered so far and leave the plot empty instead of
            # failing on a missing id.
            debug_out("No plot link found, leaving plot empty")

        print("""\
Title:%(title)s
Year:%(year)s
Director:%(directors)s
Plot:%(plot)s
UserRating:%(user_rating)s
Cast:%(cast)s
Genres:%(genre)s
Countries:%(countries)s
""" % data)

    except Exception:
        print_exception(traceback.format_exc())
00314
def search_poster(uid):
    """Print the URL of every poster image on the film page for ``uid``.

    Poster images are the ``IMG`` elements whose ``src`` starts with
    ``http://img.ofdb.de/film/``; one URL is printed per line.  Any error is
    printed as comment lines through print_exception() instead of being
    raised.
    """
    try:
        debug_out("Looking for posters...")
        ofdoc = get_ofdb_doc(uid, "poster")

        nodes = xpath.Evaluate("//IMG[starts-with(@src, 'http://img.ofdb.de/film/')]",
                ofdoc.documentElement)
        # Collect every URL before printing, so a failure while reading the
        # attributes never leaves a partial list on stdout.
        poster_urls = [node.getAttributeNS(EMPTY_NAMESPACE, 'src')
                       for node in nodes]

        for poster_url in poster_urls:
            print(poster_url)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
        # SystemExit are not reported as scraping failures.
        print_exception(traceback.format_exc())
00330
def main():
    """Parse the command line and run the requested OFDb lookup.

    Exactly one of -M (title search), -D (data lookup) or -P (poster lookup)
    is acted upon, checked in that order.  Without any of them the usage
    text is printed and the function exits through sys.exit(1).
    """
    global VERBOSE, DUMP_RESPONSE

    parser = OptionParser(usage="""\
Usage: %prog [-M TITLE | -D UID [-R COUNTRY[,COUNTRY]] | -P UID]
""", version="%%prog %s" % (ofdb_version))

    # Lookup selectors.
    parser.add_option("-M", "--title", dest="title_search", type="string",
                      metavar="TITLE", help="Search for TITLE")
    parser.add_option("-D", "--data", dest="data_search", type="string",
                      metavar="UID", help="Search for video data for UID")
    parser.add_option("-R", "--rating-country", dest="ratings_from",
                      type="string", metavar="COUNTRY",
                      help="When retrieving data, use ratings from COUNTRY")
    parser.add_option("-P", "--poster", dest="poster_search", type="string",
                      metavar="UID",
                      help="Search for images associated with UID")

    # Diagnostics.
    parser.add_option("-d", "--debug", dest="verbose", action="store_true",
                      default=False, help="Display debug information")
    parser.add_option("-r", "--dump-response", dest="dump_response",
                      action="store_true", default=False,
                      help="Output the raw response")

    (opts, _positional) = parser.parse_args()

    VERBOSE = opts.verbose
    DUMP_RESPONSE = opts.dump_response

    if opts.title_search:
        search_title(unicode(opts.title_search, "utf8"))
        return

    if opts.data_search:
        country = opts.ratings_from
        if country:
            country = unicode(country, "utf8")
        search_data(unicode(opts.data_search, "utf8"), country)
        return

    if opts.poster_search:
        search_poster(unicode(opts.poster_search, "utf8"))
        return

    # No lookup was requested.
    parser.print_usage()
    sys.exit(1)
00368
if __name__ == '__main__':
    try:
        # Replace stdout with a UTF-8 stream writer so that unicode text
        # printed by the script is encoded on the way out.
        utf8_codec = codecs.lookup('utf8')
        sys.stdout = utf8_codec.streamwriter(sys.stdout)

        main()
    except SystemExit:
        # sys.exit() requests are swallowed here, so the status passed to
        # sys.exit() is not propagated as the process exit status.
        pass
    except:
        print_exception(traceback.format_exc())
00381
00382