Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Scrape album posts from the whitenoiserecords.org monthly archive pages.
#
# Every archive month from start_year/start_month through end_year/end_month
# (both ends inclusive) is visited.  For each month the numbered archive pages
# are requested one after another until a page contains no post <div>s.  At
# most the first 10 posts of a page are examined, and every post that looks
# like an album is stored as one row of the module-level ``albums`` DataFrame.
# ``albums_count`` holds the number of rows written so far.
#
# The code is written in the subset of Python that runs on both 2 and 3.

from bs4 import BeautifulSoup
import urllib
import requests
import pandas as pd

# HTTP basic-auth credentials sent with every request.
user = "user"
password = "pass"

# Result table, pre-allocated with 10000 empty rows; rows are filled in order.
albums = pd.DataFrame(
    columns=["Link", "Album", "Artist", "Country", "Label", "Genre"],
    index=range(10000),
)

# Archive range to scrape (inclusive on both ends).
start_year = 2009
start_month = 9
end_year = 2018
end_month = 1

# Maximum number of posts examined per archive page.
posts_per_page = 10

albums_count = 0


def _anchor_text(anchor):
    """Return the stripped text of the first child of *anchor* as a native str.

    Returns an empty string when the anchor has no children.
    """
    if not anchor.contents:
        return ""
    text = anchor.contents[0].encode("utf-8")
    # On Python 2 the encoded value already is a ``str``; on Python 3 it is
    # ``bytes`` and calling str() on it would yield "b'...'", so decode.
    if not isinstance(text, str):
        text = text.decode("utf-8")
    return text.strip()


for year in range(start_year, end_year + 1):
    # The two bounds are decided independently so that a range that starts and
    # ends in the same year honours both start_month and end_month.
    first_month = start_month if year == start_year else 1
    last_month = end_month if year == end_year else 12

    # range() excludes its stop value, hence the "+ 1" to include last_month.
    for month in range(first_month, last_month + 1):
        page_index = 1
        while True:
            url = (
                "http://www.whitenoiserecords.org/archives/date/{0}/{1:0>2}/page/{2}"
                .format(year, month, page_index)
            )
            print(url)
            # NOTE(review): verify=False turns off TLS certificate checks; it
            # is kept from the original although the URL is plain http.
            response = requests.get(url, verify=False, auth=(user, password))
            soup = BeautifulSoup(response.content, "html.parser")
            posts = soup.find_all("div", class_="post-alt blog")
            if not posts:
                # A page without posts marks the end of this month's archive.
                break

            for count, post in enumerate(posts[:posts_per_page], 1):
                print(count)
                anchors = post.find_all("a")
                # An album row needs album, artist, country and label anchors.
                # Posts with fewer anchors (the original called the one-anchor
                # case a "ticket") are skipped instead of raising IndexError.
                if len(anchors) < 4:
                    continue

                link = str(anchors[0]["href"])
                texts = [_anchor_text(anchor) for anchor in anchors]
                # Assumes the anchors appear in the order album, artist,
                # country, label, then any number of genres -- TODO confirm
                # against the site's markup.
                row = (
                    ("Link", "Link is ", link),
                    ("Album", "Album ", texts[0]),
                    ("Artist", "Artist ", texts[1]),
                    ("Country", "Country ", texts[2]),
                    ("Label", "Label ", texts[3]),
                    ("Genre", "Genre ", ",".join(texts[4:])),
                )
                for column, label, value in row:
                    print("%s %s" % (label, value))
                    # A single .loc call writes into the frame itself (chained
                    # iloc[...][...] may write to a temporary copy) and also
                    # appends a row once the pre-allocated ones are used up.
                    albums.loc[albums_count, column] = value
                albums_count += 1

            page_index += 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement