Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import web, os, sys, json, time, pickle, re
- import urllib
- from bs4 import BeautifulSoup as bs
# Upper bound used by get_movies() when enumerating arenabg listing pages.
pages_to_scan = 13

# web.py routing table: a flat tuple of (URL regex, handler class name) pairs.
urls = (
    '/', 'index',
    '/show/(.*)', 'show',
    '/movies/', 'movies',
    '/generate/', 'generate',
)

# Location of the text documents, relative to the script directory.
path_to_docs = '/static/docs'
def get_script_path():
    """Return the absolute directory that contains the running script.

    The location is derived from ``sys.argv[0]`` with symlinks resolved, so
    it does not depend on the current working directory.
    """
    script_file = os.path.realpath(sys.argv[0])
    return os.path.dirname(script_file)
def list_docs():
    """Collect the paths of all ``.txt`` files below the docs directory.

    Walks ``get_script_path() + path_to_docs`` recursively and returns a list
    of joined directory/file paths, in the order ``os.walk`` yields them.
    """
    docs_root = get_script_path() + path_to_docs
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(docs_root)
        for name in names
        if name.endswith(".txt")
    ]
def get_movies():
    """Scrape arenabg.com listing pages and return torrent detail-page URLs.

    Fetches listing pages 1 through ``pages_to_scan`` (inclusive) of the
    ``subtitles:1`` torrent listing and collects the ``href`` of every
    ``<a class="torrent-link">`` anchor, prefixed with the site root.

    Returns:
        set: absolute detail-page URLs; duplicates across pages collapse.

    Raises:
        IOError: if a listing page cannot be downloaded.
    """
    # range() excludes its stop value, so "+ 1" is required to really visit
    # ``pages_to_scan`` pages; without it the last page is silently skipped.
    page_urls = [
        "http://arenabg.com/torrents/subtitles:1/page:" + str(page)
        for page in range(1, pages_to_scan + 1)
    ]
    for page_url in page_urls:
        print(page_url)

    movie_urls = set()
    for page_url in page_urls:
        response = urllib.urlopen(page_url)
        try:
            soup = bs(response, "html5lib")
        finally:
            # Release the HTTP connection even if parsing fails.
            response.close()
        print(page_url)
        # Each matching anchor points at a torrent details page.
        for link in soup.findAll('a', attrs={'class': 'torrent-link'}):
            movie_urls.add('http://arenabg.com' + link['href'])
        # Throttle: one short pause per listing page.
        time.sleep(0.5)
    return movie_urls
def get_movie_imdb_id(url):
    """Return the IMDb id linked from an arenabg torrent details page.

    Results are cached on disk under ``<script dir>/arenabg/`` in a pickle
    file named after the URL with every non-alphanumeric character removed.
    On a cache miss the page is downloaded and the first
    ``<a class="download-button">`` whose href contains "imdb" is used; the
    id is taken as the fifth "/"-separated component of that href.

    Args:
        url: absolute URL of the torrent details page.

    Returns:
        The cached or freshly scraped id, or the string ``"not found"`` when
        the page has no usable IMDb link (that result is not cached).
    """
    file_name = re.sub('[^A-Za-z0-9]+', '', url)
    print(file_name)
    cache_path = get_script_path() + "/arenabg/" + file_name

    # NOTE: unpickling is only acceptable because these cache files are
    # written by this module; never point it at files from another source.
    try:
        with open(cache_path, "rb") as cache_file:
            data = pickle.load(cache_file)
    except Exception:
        # Missing or unreadable cache entry: fall through and scrape.
        # (Exception, not a bare except, so Ctrl-C / SystemExit still work.)
        pass
    else:
        print(file_name + " - found")
        return data

    response = urllib.urlopen(url)
    try:
        soup = bs(response, "html5lib")
    finally:
        response.close()

    for link in soup.findAll('a', attrs={'class': 'download-button'}):
        href = link.get('href', '')
        if "imdb" not in href:
            continue
        parts = href.split('/')
        if len(parts) <= 4:
            # Malformed IMDb link: keep looking instead of raising IndexError.
            continue
        imdb_id = parts[4]
        with open(cache_path, "wb") as cache_file:
            pickle.dump(imdb_id, cache_file)
        return imdb_id
    return "not found"
def get_movie_data(imdb_id):
    """Return the omdbapi.com record for *imdb_id*, using an on-disk cache.

    The cache lives in ``<script dir>/imdb/<imdb_id>`` as a pickle file. On a
    cache miss the record is requested from omdbapi.com as JSON (full plot),
    decoded, written to the cache and returned.

    Args:
        imdb_id: id string used both as the API ``i`` parameter and as the
            cache file name.

    Returns:
        The decoded JSON response (whatever ``json.loads`` produced).
    """
    cache_path = get_script_path() + "/imdb/" + imdb_id

    # NOTE: unpickling is only acceptable because these cache files are
    # written by this module; never point it at files from another source.
    try:
        with open(cache_path, "rb") as cache_file:
            data = pickle.load(cache_file)
    except Exception:
        # Missing or unreadable cache entry: fall through and download.
        # (Exception, not a bare except, so Ctrl-C / SystemExit still work.)
        pass
    else:
        print(imdb_id + " - found")
        return data

    print(imdb_id)
    url = "http://www.omdbapi.com/?i=" + imdb_id + "&plot=full&r=json"
    response = urllib.urlopen(url)
    try:
        data = json.loads(response.read())
    finally:
        # Release the HTTP connection even if decoding fails.
        response.close()
    with open(cache_path, "wb") as cache_file:
        pickle.dump(data, cache_file)
    return data
def getKey(item):
    """Sort-key helper: return the first element of *item*.

    Intended as the ``key=`` callable when ordering a list of lists by their
    leading entry.
    """
    leading = item[0]
    return leading
def _rating_sort_key(row):
    """Return the numeric rating stored as text in ``row[0]``.

    Non-numeric values (e.g. "N/A") map to -1.0 so they sort below every
    real rating.
    """
    try:
        return float(row[0])
    except (TypeError, ValueError):
        return -1.0


def build_movie_data():
    """Scrape, enrich and persist the movie and serial listings.

    For every torrent page returned by ``get_movies()`` the IMDb id is
    resolved and the matching record is loaded via ``get_movie_data()``. Each
    IMDb id is processed once and titles whose genre contains "Horror" are
    dropped. Records with ``Type == 'movie'`` go to ``save.p``; every other
    type goes to ``save_serial.p``. Both lists are sorted by rating, highest
    first, and pickled next to the script.

    Each stored row is
    ``[imdbRating, Title padded to 45 chars, Genre, Year, torrent URL,
    Poster, imdb id]``.

    Returns:
        True once both files have been written.
    """
    movie_rows = []
    serial_rows = []
    seen_ids = set()
    for movie_url in get_movies():
        imdb_id = get_movie_imdb_id(movie_url)
        # "not found" is the sentinel for pages without an IMDb link; looking
        # it up would only yield an error payload.
        if imdb_id == "not found" or imdb_id in seen_ids:
            continue
        seen_ids.add(imdb_id)

        data = get_movie_data(imdb_id)
        if 'Type' not in data:
            # Not a title record (e.g. an API error response): skip it
            # instead of failing the whole run with a KeyError.
            continue
        if 'Horror' in data['Genre']:
            continue

        row = [
            data['imdbRating'],
            data['Title'].ljust(45),
            data['Genre'],
            data['Year'],
            movie_url,
            data['Poster'],
            imdb_id,
        ]
        if data['Type'] == 'movie':
            movie_rows.append(row)
        else:
            serial_rows.append(row)

    # Ratings are strings; comparing them as text ranks "10.0" below "9.5"
    # and "N/A" above everything, so sort on the numeric value instead.
    movie_rows = sorted(movie_rows, key=_rating_sort_key, reverse=True)
    serial_rows = sorted(serial_rows, key=_rating_sort_key, reverse=True)

    with open(get_script_path() + "/save_serial.p", "wb") as serial_file:
        pickle.dump(serial_rows, serial_file)
    with open(get_script_path() + "/save.p", "wb") as movie_file:
        pickle.dump(movie_rows, movie_file)
    return True
class movies:
    """web.py handler for ``/movies/``: renders ``static/index.txt``."""

    def __init__(self):
        # Template loader rooted at the "templates" directory next to the script.
        self.render = web.template.render(get_script_path() + "/templates")

    def GET(self):
        """Render the ``index`` template with the lines of ``static/index.txt``.

        The file is read inside a ``with`` block so the handle is closed on
        every request, including when reading fails.
        """
        docs = list_docs()
        index_path = get_script_path() + '/static/' + 'index.txt'
        with open(index_path, 'r') as index_file:
            content = index_file.readlines()
        return self.render.index("My first site", docs, content, 1)
class generate:
    """web.py handler for ``/generate/``: rebuilds the saved movie lists."""

    def __init__(self):
        template_dir = get_script_path() + "/templates"
        self.render = web.template.render(template_dir)

    def GET(self):
        """Run ``build_movie_data()`` and report how long it took.

        The elapsed wall-clock time in seconds is passed to the ``index``
        template as its last argument.
        """
        docs = list_docs()
        started_at = time.time()
        build_movie_data()
        elapsed = time.time() - started_at
        return self.render.index(
            "Arenabg movies list generation",
            docs,
            "Generation Done",
            elapsed,
        )
class index:
    """web.py handler for ``/``: shows the saved movie and serial lists."""

    def __init__(self):
        # Template loader rooted at the "templates" directory next to the script.
        self.render = web.template.render(get_script_path() + "/templates")

    def GET(self):
        """Render the ``movies1`` template from the pickled listings.

        Loads ``save.p`` (movies) and ``save_serial.p`` (serials) from the
        script directory and passes them to the template together with the
        modification time of ``save_serial.p`` formatted as
        ``MM/DD/YYYY HH:MM:SS AM/PM``.

        Raises:
            IOError: if either pickle file does not exist yet.
        """
        docs = list_docs()
        movies_path = get_script_path() + "/save.p"
        serials_path = get_script_path() + "/save_serial.p"

        # NOTE: unpickling is only acceptable because both files are written
        # by build_movie_data(); never load pickles from another source.
        # ``with`` guarantees the handles are closed on every request.
        with open(movies_path, "rb") as movies_file:
            text = pickle.load(movies_file)
        with open(serials_path, "rb") as serials_file:
            serials = pickle.load(serials_file)

        date = time.strftime(
            "%m/%d/%Y %I:%M:%S %p",
            time.localtime(os.path.getmtime(serials_path)),
        )
        return self.render.movies1(
            "Movies with subs from arenabg", docs, text, date, serials
        )
class show:
    """web.py handler for ``/show/<name>``: displays one text document."""

    def __init__(self):
        # Template loader rooted at the "templates" directory next to the script.
        self.render = web.template.render(get_script_path() + "/templates")

    def GET(self, content):
        """Render the document ``static/docs/<content>`` via the ``index`` template.

        Args:
            content: path captured from the URL, relative to ``static/docs``.

        Returns:
            The rendered page; its title is ``content`` up to the first ".".

        Raises:
            web.notfound: if ``content`` resolves outside ``static/docs`` or
                does not name an existing file.
        """
        docs = list_docs()

        # ``content`` comes straight from the request URL, so resolve it and
        # refuse anything (e.g. "../../etc/passwd") that escapes the docs
        # directory instead of concatenating it into a path blindly.
        docs_root = os.path.realpath(get_script_path() + '/static/docs')
        target = os.path.realpath(
            os.path.join(docs_root, content.lstrip('/'))
        )
        inside_docs = target.startswith(docs_root + os.sep)
        if not inside_docs or not os.path.isfile(target):
            raise web.notfound()

        # ``with`` closes the handle even if reading raises.
        with open(target, 'r') as doc_file:
            text = doc_file.readlines()
        return self.render.index(content.split('.')[0], docs, text, 1)
if __name__ == "__main__":
    # Switch to the script's own directory first; presumably so that paths
    # web.py resolves relative to the working directory are found there.
    script_dir = get_script_path()
    os.chdir(script_dir)

    # Debug mode is switched off before the application object is created.
    web.config.debug = False

    # Handler class names in ``urls`` are looked up in this module's globals.
    app = web.application(urls, globals())
    app.run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement