Advertisement
zkid18

Untitled

Apr 19th, 2018
177
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.61 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. import urllib, requests
  3. import pandas as pd
  4.  
  5. user = "user"
  6. password = "pass"
  7. albums = pd.DataFrame(columns=["Link", "Album", "Artist", "Country", "Label", "Genre"], index=range(10000))
  8.  
  9. start_year = 2009
  10. start_month = 9
  11. end_year = 2018
  12. end_month = 1
  13. albums_count = 0
  14.  
  15. for year in range(start_year, end_year+1):
  16.     start_month_default = 1
  17.     end_month_default = 12
  18.    
  19.     if year == start_year:
  20.         start_month_default = start_month
  21.     elif year == end_year:
  22.         end_month_default = end_month
  23.  
  24.     for month in xrange(start_month_default,end_month_default):
  25.         page_exist = True
  26.         page_index = 1
  27.         while page_exist == True:
  28.             count = 1
  29.            
  30.             url = "http://www.whitenoiserecords.org/archives/date/{0}/{1:0>2}/page/{2}".format(str(year), str(month), str(page_index))
  31.             print url
  32.             response = requests.get(url,verify=False, auth=(user, password))
  33.             soup = BeautifulSoup(response.content, "html.parser")
  34.             stuff = soup.find_all("div", class_="post-alt blog")
  35.            
  36.             if len(stuff) == 0:
  37.                 page_exist = False
  38.                 break
  39.  
  40.             for experiment in stuff:
  41.                 if count < 11:
  42.                     data = []
  43.                     data.append(str(experiment.find('a')['href']))
  44.                     isTicket = False
  45.  
  46.                     for div in experiment.findAll('a'):
  47.                         string = str(div.contents[0].encode("utf-8"))
  48.                         data.append(string.strip())
  49.  
  50.                     if len(data) == 2:
  51.                         isTicket = True
  52.  
  53.                     print(count)
  54.                     if isTicket == False:
  55.                         print("Link is ", data[0])
  56.                         albums.iloc[albums_count]["Link"] = data[0]
  57.                         print("Album ", data[1])
  58.                         albums.iloc[albums_count]["Album"] = data[1]
  59.                         print("Artist ", data[2])
  60.                         albums.iloc[albums_count]["Artist"] = data[2]
  61.                         print("Country ", data[3])
  62.                         albums.iloc[albums_count]["Country"] = data[3]
  63.                         print("Label ", data[4])
  64.                         albums.iloc[albums_count]["Label"] = data[4]
  65.                         genre = ",".join(data[5:])
  66.                         print("Genre ", genre)
  67.                         albums.iloc[albums_count]["Genre"] = genre
  68.                         albums_count += 1
  69.                     count += 1
  70.                    
  71.             page_index += 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement