Advertisement
Masoko

Arenabg.com py parser

Nov 23rd, 2016
137
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.83 KB | None | 0 0
  1. import web, os, sys, json, time, pickle, re
  2. import urllib
  3. from bs4 import BeautifulSoup  as bs
  4.  
  5. pages_to_scan = 13
  6.  
  7. urls = (
  8.     '/', 'index',
  9.     '/show/(.*)', 'show',
  10.     '/movies/', 'movies',
  11.     '/generate/','generate'
  12. )
  13.  
  14. path_to_docs = '/static/docs'
  15.  
  16. def get_script_path():
  17.     return os.path.dirname(os.path.realpath(sys.argv[0]))
  18.    
  19. def list_docs():
  20.     docs = []
  21.     for root, dirs, files in os.walk(get_script_path() + path_to_docs):
  22.         for file in files:
  23.             if file.endswith(".txt"):
  24. #               print(os.path.join(root, file))
  25.                 docs.append(os.path.join(root, file))
  26.     return docs
  27.  
  28. #get the latest movies from arenabg.com and return their urls
  29. def get_movies():
  30.     pageurls = []
  31.     for i in range(1, pages_to_scan , 1):
  32.         pageurls.append("http://arenabg.com/torrents/subtitles:1/page:" + str(i))
  33.         print "http://arenabg.com/torrents/subtitles:1/page:" + str(i)
  34.    
  35.     urls = set()
  36.     for url in pageurls:
  37.         soup = bs(urllib.urlopen(url), "html5lib")
  38.         print url
  39.         for link in soup.findAll('a',attrs={'class':'torrent-link'}): # get torrent details page urls
  40.             urls.add('http://arenabg.com'+link['href'])
  41.         time.sleep(0.5)
  42.     return urls
  43.  
  44. #return the imdbid of arenabg url
  45. def get_movie_imdb_id(url):
  46.     file_name = re.sub('[^A-Za-z0-9]+', '', url)
  47.     print file_name
  48.     try:
  49.         data = pickle.load( open( get_script_path()+"/arenabg/"+file_name, "rb" ) )
  50.         print file_name + " - found"
  51.         return data
  52.     except:
  53.         soup = bs(urllib.urlopen(url), "html5lib")
  54.         for link in soup.findAll('a',attrs={'class':'download-button'}):
  55.             if "imdb" in link['href']:
  56.                 pickle.dump( link['href'].split('/')[4], open(get_script_path()+ "/arenabg/"+file_name, "wb" ) )
  57.                 return link['href'].split('/')[4]
  58.     return "not found"
  59.  
  60. #read movie data from omdbapi based on imdb_id         
  61. def get_movie_data(imdb_id):
  62.     try:
  63.         data = pickle.load( open( get_script_path()+"/imdb/"+imdb_id, "rb" ) )
  64.         print imdb_id + " - found"
  65.     except:
  66.         print imdb_id
  67.         url = "http://www.omdbapi.com/?i="+imdb_id+"&plot=full&r=json"
  68.         response = urllib.urlopen(url)
  69.         data = json.loads(response.read())
  70.         pickle.dump( data, open(get_script_path()+ "/imdb/"+imdb_id, "wb" ) )
  71.    
  72.     return data
  73.    
  74. #used for sorting a list of lists by first item
  75. def getKey(item):
  76.     return item[0]
  77.  
  78.  
  79. def build_movie_data():
  80.     text = []
  81.     serials = []
  82.     movie_imdb_ids = set()
  83.     for movie in get_movies():
  84.         movie_imdb_id = get_movie_imdb_id(movie)
  85.         if movie_imdb_id in movie_imdb_ids:
  86.             pass
  87.         else:
  88.             data = get_movie_data(movie_imdb_id)
  89.             if data['Type'] == 'movie': # check if its a movie
  90.                 if 'Horror' in data['Genre']:
  91.                     pass
  92.                 else:
  93.                     text.append([data['imdbRating'] ,data['Title'].ljust(45),data['Genre'],data['Year'], movie,data['Poster'],movie_imdb_id])
  94.             else:
  95.                 if 'Horror' in data['Genre']:
  96.                     pass
  97.                 else:
  98.                     serials.append([data['imdbRating'] ,data['Title'].ljust(45),data['Genre'],data['Year'], movie,data['Poster'],movie_imdb_id])
  99.                
  100.             movie_imdb_ids.add(movie_imdb_id)
  101.     texts = sorted(text, key=getKey, reverse=True)
  102.     serial = sorted(serials, key=getKey, reverse=True)
  103.     pickle.dump( serial, open(get_script_path()+ "/save_serial.p", "wb" ) )
  104.     pickle.dump( texts, open( get_script_path() +"/save.p", "wb" ) )
  105.     return True
  106.    
  107. class movies:
  108.     def __init__(self):
  109.         self.render = web.template.render(get_script_path()+"/templates")
  110.        
  111.     def GET(self):
  112.         docs = list_docs()
  113.         content = 'index.txt'
  114.         file = open(get_script_path()+'/static/'+content, 'r')
  115.         content = file.readlines()
  116.         return self.render.index("My first site",docs,content,1)
  117.  
  118. class generate:
  119.     def __init__(self):
  120.         self.render = web.template.render(get_script_path()+"/templates")
  121.        
  122.     def GET(self):
  123.         docs = list_docs()
  124.         start_time = time.time()
  125.         build_movie_data()
  126.         time_sec = time.time() - start_time
  127.         return self.render.index("Arenabg movies list generation",docs,"Generation Done",time_sec)
  128.        
  129. class index:
  130.     def __init__(self):
  131.         self.render = web.template.render(get_script_path()+"/templates")
  132.        
  133.     def GET(self):
  134.         docs = list_docs()
  135.         text = pickle.load( open( get_script_path()+"/save.p", "rb" ) ) #reads movies data from file
  136.         serials = pickle.load( open( get_script_path()+"/save_serial.p", "rb" ) )
  137.        
  138.         date = time.strftime("%m/%d/%Y %I:%M:%S %p",time.localtime(os.path.getmtime(get_script_path()+"/save_serial.p")))
  139.  
  140.        
  141.         return self.render.movies1("Movies with subs from arenabg",docs,text,date,serials)     
  142.        
  143. class show:
  144.     def __init__(self):
  145.         self.render = web.template.render(get_script_path()+"/templates")
  146.        
  147.     def GET(self,content):
  148.         docs = list_docs()
  149.         file = open(get_script_path()+'/static/docs/'+content, 'r')
  150.  
  151.         text = file.readlines()
  152.         file.close()
  153.         return self.render.index(content.split('.')[0],docs,text,1)
  154.        
  155.        
  156. if __name__ == "__main__":
  157.     os.chdir(get_script_path())
  158.     web.config.debug = False
  159.     app = web.application(urls, globals())
  160.     app.run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement