Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import time
- import requests, bs4, re
- from urllib.parse import urljoin
- import json
- import os
# Output location for the scraped data (plain-text dump).
target_dir = r"D:\00Coding\Js\WebScraper" #Yes, I do know that storing this in my Javascript folder is filthy
fullname = os.path.join(target_dir,'TsumData.txt')
# Entry page listing every Tsum, and the wiki root used to absolutize relative links.
StartURL = 'http://disneytsumtsum.wikia.com/wiki/Skill_Upgrade_Chart'
URLPrefix = 'http://disneytsumtsum.wikia.com'
def make_soup(url):
    """Fetch *url* over HTTP and return its body parsed as a BeautifulSoup tree."""
    response = requests.get(url)
    return bs4.BeautifulSoup(response.text, 'lxml')
def get_links(url):
    """Return absolute URLs for every on-site ``/wiki/`` link found at *url*."""
    soup = make_soup(url)
    anchors = soup.find_all('a', href=re.compile(r"^/wiki/"))
    # Wikia hrefs are site-relative; join each against the wiki root.
    return [urljoin(URLPrefix, anchor['href']) for anchor in anchors]
def _collect_cell_text(row, tag_name, out):
    """Append the whitespace-normalised text of each *tag_name* cell in *row* to *out*.

    Cells whose text collapses to the empty string are skipped.
    """
    for cell in row.findChildren(tag_name):
        # Raw string for the regex: '\s+' without r'' is an invalid escape
        # sequence and emits a DeprecationWarning on modern Python.
        text = re.sub(r'\s+', ' ', cell.getText()).strip()
        if text:
            out.append(text)


def get_tds(link):
    """Scrape the wikia infobox table(s) on the page at *link*.

    Returns a tuple ``(RowArray, HeaderArray)`` where RowArray holds the
    cleaned text of every <td> cell and HeaderArray the cleaned text of every
    <th> cell, in document order across all infobox tables on the page.
    Both lists are empty when the page has no ``wikia-infobox`` table.
    """
    soup = make_soup(link)
    tables = soup.find_all('table', class_="wikia-infobox")
    RowArray = []
    HeaderArray = []
    for table in tables:
        rows = table.findChildren('tr')
        for row in rows:
            _collect_cell_text(row, 'td', RowArray)
        for row in rows:
            _collect_cell_text(row, 'th', HeaderArray)
    print(HeaderArray)
    print(RowArray)
    return (RowArray, HeaderArray)
if __name__ == '__main__':
    links = get_links(StartURL)
    MainHeaderArray = []   # one HeaderArray (list of <th> texts) per scraped page
    MainRowArray = []      # one RowArray (list of <td> texts) per scraped page
    MaxIterations = 60     # cap on pages scraped — don't crawl the whole wiki for a prototype
    Iterations = 0
    for link in links:
        # The first 39 links are navigation/boilerplate pages without infoboxes.
        if Iterations > 38:
            TempRA, TempHA = get_tds(link)
            MainHeaderArray.append(TempHA)
            MainRowArray.append(TempRA)
            MaxIterations -= 1
        Iterations += 1
        if MaxIterations <= 0:
            break
    print(MainHeaderArray)

    TsumName = []
    TsumSeries = []
    TsumBoxType = []
    TsumSkillDescription = []
    TsumFullCharge = []
    TsumMinScore = []
    TsumScoreIncreasePerLevel = []
    TsumMaxScore = []
    TsumFullUpgrade = []

    # One (headers, rows) pair per scraped page. The original code strode by 9
    # through the OUTER per-page list and index-assigned into empty lists, which
    # raised IndexError immediately (the author's own comment notes the crash):
    # the field values live INSIDE each page's row list, not across pages.
    for headers, rows in zip(MainHeaderArray, MainRowArray):
        # Skip pages whose infobox didn't yield the expected field count.
        if not headers or len(rows) < 9:
            continue
        TsumName.append(headers[0])  # first <th> is the character name
        # NOTE(review): field offsets 1-8 mirror the original code's intended
        # Series..FullUpgrade ordering — TODO confirm against a live infobox.
        TsumSeries.append(rows[1])
        TsumBoxType.append(rows[2])
        TsumSkillDescription.append(rows[3])
        TsumFullCharge.append(rows[4])
        TsumMinScore.append(rows[5])
        TsumScoreIncreasePerLevel.append(rows[6])
        TsumMaxScore.append(rows[7])
        TsumFullUpgrade.append(rows[8])

    print("It's Over")
    time.sleep(3)
    print(TsumName)
    print(TsumSkillDescription)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement