#!/usr/bin/python
# -*- coding: utf-8 -*-
#
#     This program is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import os
import re
import time
from BeautifulSoup import BeautifulStoneSoup as BS
from urllib import unquote, urlencode
from urllib2 import urlopen, Request, HTTPError, URLError

from exceptions import NetworkError


class AppleTrailers(object):
    """Scraper for the trailers.apple.com XML feeds.

    On construction the current movie list is downloaded from ``MAIN_URL``
    and parsed into a list of dicts (``self.movies``).  Every HTTP response
    is cached as a file below ``cache_path`` and reused until it is older
    than ``CACHE_MAX_AGE`` seconds.
    """

    SOURCE_ID = 'apple'

    MAIN_URL = 'http://trailers.apple.com/trailers/home/xml/current.xml'
    MOVIE_URL = 'http://trailers.apple.com/moviesxml/s/%s/index.xml'
    BACKUP_MOVIE_URL = ('http://trailers.apple.com/trailers/%s/'
                        'includes/playlists/web.inc')
    BACKUP_MOVIE_BASE = 'http://trailers.apple.com/trailers/%s/'

    # Qualities that used to be offered as well but are currently disabled:
    #   Small  -> 'h320.mov'
    #   Medium -> 'h480.mov'
    #   Large  -> 'h640w.mov'
    TRAILER_QUALITIES = [{'title': 'iPod',
                          'id': 'i320.m4v'},
                         {'title': 'HD480p',
                          'id': 'h480p.mov'},
                         {'title': 'HD720p',
                          'id': 'h720p.mov'},
                         {'title': 'HD1080p',
                          'id': 'h1080p.mov'}, ]

    FILTER_CRITERIA = [
        # {'title': 'year',
        #  'id': 'year'},
        {'title': 'genre',
         'id': 'genre'}, ]

    # Sent as the User-Agent header of every request and appended to the
    # trailer URLs returned by get_trailer().
    UA = 'QuickTime/7.6.5 (qtver=7.6.5;os=Windows NT 5.1Service Pack 3)'

    # Maximum age (in seconds) of a cache file before it is downloaded again.
    CACHE_MAX_AGE = 3600

    # quality id -> numeric prefix of the matching 'itsxml/<prefix>-<type>.xml'
    # playlist used by the fallback lookup in get_trailer().
    BACKUP_QUALITY_CODES = {'h480p.mov': '25',
                            'h720p.mov': '26',
                            'h1080p.mov': '27'}

    def __init__(self, cache_path):
        """Create the cache directory if needed and load the movie list.

        :param cache_path: directory in which downloaded files are cached.
        :raises NetworkError: if the movie list cannot be downloaded.
        """
        self.cache_path = cache_path
        if not os.path.isdir(self.cache_path):
            os.makedirs(self.cache_path)
        self.movies = self.__get_movies()

    def get_movies(self, filters=None):
        """Return the movies, optionally restricted by *filters*.

        :param filters: optional dict mapping a movie field name to a value
            that must be contained in that field (``in`` test, so list
            membership for list fields and substring match for strings).
            All entries must match.  ``None`` or an empty dict disables
            filtering.
        :returns: ``self.movies`` itself when no filter is given, otherwise
            a new list with the matching movie dicts.
        """
        if not filters:
            return self.movies
        return [movie for movie in self.movies
                if all(self.__field_matches(movie.get(field), wanted)
                       for field, wanted in filters.items())]

    def __field_matches(self, value, wanted):
        """Return True if *wanted* is contained in the field *value*."""
        if value is None:
            # The movie does not have this field at all (e.g. no genre).
            return False
        try:
            return wanted in value
        except TypeError:
            # Not a container (e.g. 'year' is 0 for a missing release date)
            # or incompatible operand types: fall back to equality.
            return wanted == value

    def get_single_movie(self, movie_title):
        """Return the one movie whose title equals *movie_title*.

        :raises Exception: if no movie or more than one movie matches.
        """
        movies = [m for m in self.movies if m['title'] == movie_title]
        if len(movies) != 1:
            raise Exception('Multiple or 0 matches in get_single_movie!')
        return movies[0]

    def get_filter_criteria(self):
        """Return the list of available filter criteria dicts."""
        self.__log('get_filter_criteria')
        return self.FILTER_CRITERIA

    def get_filter_content(self, criteria):
        """Return all distinct values of one filter criteria.

        :param criteria: the ``'id'`` of an entry of ``FILTER_CRITERIA``
            (e.g. ``'genre'``) or the entry dict itself.
        :returns: sorted list of ``{'title': value, 'id': value}`` dicts.
        :raises ValueError: if *criteria* is not a known filter criteria.
        """
        self.__log('get_filter_content started with criteria: %s' % criteria)
        if isinstance(criteria, dict):
            criteria_id = criteria.get('id')
        else:
            criteria_id = criteria
        # FILTER_CRITERIA holds dicts, so the ids have to be compared, not
        # the entries themselves.
        known_ids = [c['id'] for c in self.FILTER_CRITERIA]
        if criteria_id not in known_ids:
            raise ValueError('Unknown filter criteria: %r' % (criteria, ))
        return [{'title': content, 'id': content}
                for content in self.__filter(self.movies, criteria_id)]

    def get_trailer_types(self, movie_title):
        """Return the trailer types (trailer, clip, ...) of a movie.

        The movie's index XML is tried first; if anything goes wrong there
        the ``web.inc`` playlist (``BACKUP_MOVIE_URL``) is parsed instead.

        :returns: list of ``{'title': ..., 'id': ...}`` dicts.
        :raises Exception: propagated from get_single_movie() or from the
            fallback lookup.
        """
        self.__log('get_trailer_types started with movie_title: %s'
                   % movie_title)
        movie = self.get_single_movie(movie_title)
        movie_string = movie['movie_string']
        url = self.MOVIE_URL % movie_string
        trailer_types = []
        try:
            cache_filename = '%s.xml' % movie_string.split('/')[1]
            tree = self.__get_tree(url, cache_filename=cache_filename)
            r_type = re.compile('/moviesxml/s/.+?/.+?/(.+?).xml')
            for t in tree.findAll('gotourl', {'target': 'main'}):
                if t.find('b'):
                    type_string = re.search(r_type, t['url']).group(1)
                    trailer_types.append({'title': t['draggingname'],
                                          'id': type_string})
        except Exception:
            # Start over so half-collected entries of the failed primary
            # lookup are not mixed with the fallback results.
            trailer_types = []
            t_url = self.BACKUP_MOVIE_URL % movie_string
            cache_filename = '%swebinc.xml' % movie_string.split('/')[1]
            tree = self.__get_tree(t_url, cache_filename=cache_filename)
            for t in tree.findAll('div', {'class': 'column first'}):
                heading = t.find('h3')
                if heading:
                    title = heading.getText()
                    trailer_types.append(
                        {'title': title,
                         'id': title.lower().replace(" ", "")})
        return trailer_types

    def get_trailer_qualities(self, movie_title):
        """Return the available qualities (the same list for every movie)."""
        self.__log('get_trailer_qualities started with movie_title: %s'
                   % movie_title)
        return self.TRAILER_QUALITIES

    def get_trailer(self, movie_title, quality_id, trailer_type='trailer'):
        """Return the playable URL of a trailer, or None if there is none.

        :param movie_title: exact title of the movie.
        :param quality_id: one of the ``'id'`` values of TRAILER_QUALITIES.
        :param trailer_type: an ``'id'`` as returned by get_trailer_types().
        :returns: ``'<url>?|User-Agent=<UA>'`` for the first URL containing
            *quality_id*, or ``None`` when no such URL is found.  The
            suffix is presumably the player's header-passing convention
            -- TODO confirm.
        """
        self.__log(('get_trailer started with movie_title: %s '
                    'trailer_type: %s quality_id: %s')
                   % (movie_title, trailer_type, quality_id))
        movie = self.get_single_movie(movie_title)
        movie_string = movie['movie_string']
        movie_key = movie_string.split('/')[1]
        try:
            url = self.MOVIE_URL % movie_string
            if trailer_type != 'trailer':
                # Only swap the trailing 'index' of the template, never an
                # 'index' that happens to be part of the movie path.
                head, tail = url.rsplit('index', 1)
                url = head + trailer_type + tail
            cache_filename = '%s-%s.xml' % (movie_key, trailer_type)
            html = self.__get_url(url, cache_filename=cache_filename)
            # NOTE(review): this pattern matches the empty string at
            # position 0, so `section` is always ''.  It looks like markup
            # surrounding the group was lost from the literal -- confirm
            # against the upstream source before relying on this path.
            r_section = re.compile('(.*?)', re.DOTALL)
            section = re.search(r_section, html).group(1)
            tree = BS(section, convertEntities=BS.XML_ENTITIES)
            return self.__find_trailer_url(tree, 'previewURL', quality_id)
        except Exception:
            code = self.BACKUP_QUALITY_CODES.get(quality_id)
            if code is None:
                # No backup playlist exists for this quality (e.g. iPod).
                return None
            base_url = self.BACKUP_MOVIE_BASE % movie_string
            cache = movie_key + trailer_type + quality_id + '.xml'
            tree = self.__get_tree(
                base_url + 'itsxml/' + code + '-' + trailer_type + '.xml',
                cache_filename=cache)
            return self.__find_trailer_url(tree, 'URL', quality_id)

    def __find_trailer_url(self, tree, key_name, quality_id):
        """Return the first *key_name* value in *tree* containing *quality_id*.

        Scans every ``<key>`` below every ``<dict>`` element; the value is
        taken from the element following the matching key.  Returns None if
        nothing matches.
        """
        for entry in tree.findAll('dict'):
            for key in entry.findAll('key'):
                if key.string != key_name:
                    continue
                candidate = key.nextSibling.string
                if quality_id in candidate:
                    return '%s?|User-Agent=%s' % (candidate, self.UA)
        return None

    def __get_movies(self):
        """Download MAIN_URL and return one dict per ``movieinfo`` element."""
        self.__log('__get_movies started')
        url = self.MAIN_URL
        r_movie_string = re.compile('/trailers/(.+?)/images/')
        tree = self.__get_tree(url)
        movies = []
        for m in tree.findAll('movieinfo'):
            movie = {'movie_id': m.get('id'),
                     'title': m.title.string.replace(u'\u2019', '\''),
                     'duration': m.runtime.string,
                     'mpaa': m.rating.string,
                     'studio': m.studio.string,
                     'post_date': self.__format_date(m.postdate.string),
                     'release_date': self.__format_date(
                         m.releasedate.string),
                     'year': self.__format_year(m.releasedate.string),
                     'copyright': m.copyright.string,
                     'director': m.director.string,
                     'plot': m.description.string,
                     'thumb': m.poster.xlarge.string, }
            if m.genre:
                movie['genre'] = [g.string.strip() for g in m.genre.contents]
            if m.cast:
                movie['cast'] = [c.string.strip() for c in m.cast.contents]
            # The part of the poster URL between '/trailers/' and '/images/'
            # is what the per-movie URL templates are filled with.
            movie_string = re.search(r_movie_string,
                                     m.poster.location.string).group(1)
            movie['movie_string'] = movie_string
            movies.append(movie)
        self.__log('get_movies finished with %d elements' % len(movies))
        return movies

    def __format_date(self, date_str):
        """Convert ``'YYYY-MM-DD'`` to ``'DD.MM.YYYY'``; '' for empty input."""
        if not date_str:
            return ''
        y, m, d = date_str.split('-')
        return '.'.join((d, m, y, ))

    def __format_year(self, date_str):
        """Return the part before the first '-' of *date_str*, 0 if empty."""
        if not date_str:
            return 0
        return date_str.split('-', 1)[0]

    def __filter(self, ld, f):
        """Return the sorted distinct values of field *f* in the dicts *ld*.

        List-valued fields are flattened, so every list item counts as one
        value.  Dicts without a (truthy) *f* are ignored; an empty list is
        returned when no dict carries the field.
        """
        values = [d[f] for d in ld if d.get(f)]
        if not values:
            return []
        if isinstance(values[0], list):
            unique = set(item for sub in values for item in sub)
        else:
            unique = set(values)
        return sorted(unique)

    def __get_tree(self, url, referer=None, cache_filename=None):
        """Fetch *url* (see __get_url) and parse it with BeautifulStoneSoup."""
        html = self.__get_url(url, referer, cache_filename)
        tree = BS(html, convertEntities=BS.XML_ENTITIES)
        return tree

    def __get_url(self, url, referer=None, cache_filename=None):
        """Return the content of *url*, using the file cache when fresh.

        :param url: address to download.
        :param referer: optional value for the ``Referer`` header.
        :param cache_filename: name of the cache file below ``cache_path``;
            defaults to the last path segment of *url*.
        :raises NetworkError: if the download fails with a URLError (which
            includes HTTPError).
        """
        self.__log('__get_url started with url=%s, cache_filename=%s'
                   % (url, cache_filename))
        filename = cache_filename or url.rsplit('/', 1)[-1]
        cache_file = os.path.join(self.cache_path, filename)
        try:
            cache_file_date = os.path.getmtime(cache_file)
        except OSError:
            # No cache file yet: treat it as infinitely old.
            cache_file_date = 0
        if time.time() - self.CACHE_MAX_AGE > cache_file_date:
            self.__log('__get_url opening url: %s' % url)
            req = Request(url)
            if referer:
                req.add_header('Referer', referer)
            req.add_header('Accept', ('text/html,application/xhtml+xml,'
                                      'application/xml;q=0.9,*/*;q=0.8'))
            req.add_header('User-Agent', self.UA)
            try:
                response = urlopen(req)
                try:
                    html = response.read()
                finally:
                    response.close()
            except URLError as error:
                # HTTPError is a subclass of URLError; hand the caught
                # instance on so the cause is not lost.
                raise NetworkError(error)
            with open(cache_file, 'w') as cache_handle:
                cache_handle.write(html)
        else:
            self.__log('__get_url using cachefile: %s' % cache_file)
            with open(cache_file, 'r') as cache_handle:
                html = cache_handle.read()
        self.__log('__get_url got %d bytes' % len(html))
        return html

    def __log(self, msg):
        """Print *msg* prefixed with the scraper name."""
        print('Apple scraper: %s' % msg)