ATravisFoster

Web Scraper

Jan 12th, 2018
import time
import requests, bs4, re
from urllib.parse import urljoin
import json
import os

target_dir = r"D:\00Coding\Js\WebScraper" #Yes, I do know that storing this in my Javascript folder is filthy
fullname = os.path.join(target_dir, 'TsumData.txt')

StartURL = 'http://disneytsumtsum.wikia.com/wiki/Skill_Upgrade_Chart'
URLPrefix = 'http://disneytsumtsum.wikia.com'

def make_soup(url):
    r = requests.get(url)
    soup = bs4.BeautifulSoup(r.text, 'lxml')
    return soup

def get_links(url):
    soup = make_soup(url)
    a_tags = soup.find_all('a', href=re.compile(r"^/wiki/"))
    links = [urljoin(URLPrefix, a['href']) for a in a_tags] #convert relative url to absolute url
    return links

def get_tds(link):
    soup = make_soup(link)
    #tds = soup.find_all('li', class_="category normal") #This will give me the attributes / tags of each character
    tds = soup.find_all('table', class_="wikia-infobox") #The infobox table holds the per-Tsum data
    RowArray = []
    HeaderArray = []

    if tds:
        for td in tds: #Usually one infobox per page; collect cells from every matching table
            #print(td.text.strip()) #This is everything
            rows = td.findChildren('tr')

            for row in rows: #td cells hold the field values
                cells = row.findChildren('td')
                for cell in cells:
                    cell_content = cell.getText()
                    clean_content = re.sub(r'\s+', ' ', cell_content).strip()
                    if clean_content:
                        RowArray.append(clean_content)

            for row in rows: #th cells hold the Tsum name and the field labels
                cells = row.findChildren('th')
                for cell in cells:
                    cell_content = cell.getText()
                    clean_content = re.sub(r'\s+', ' ', cell_content).strip()
                    if clean_content:
                        HeaderArray.append(clean_content)

        print(HeaderArray)
        print(RowArray)
    return RowArray, HeaderArray

    #Output = json.dumps([dict(zip(RowArray, row_2)) for row_2 in HeaderArray], indent=1)
    #print(json.dumps([dict(zip(RowArray, row_2)) for row_2 in HeaderArray], indent=1))
    #TempFile = open(fullname, 'w') #Read only, Write Only, Append
    #TempFile.write("EHLLO")
    #TempFile.close()

    #print(td.tbody.Series)
    #print(td.tbody[Series])
    #print(td.tbody["Series"])
    #print(td.data-name)
    #time.sleep(1)

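#A minimal sketch of the pairing the commented-out json.dumps lines above are reaching for,
#assuming HeaderArray[0] is the Tsum name and the remaining headers line up one-to-one with
#the values in RowArray. This helper is an illustration, not part of the original scrape flow.
def header_row_pairs(HeaderArray, RowArray):
    return dict(zip(HeaderArray[1:], RowArray)) #e.g. {'Series': ..., 'Box Type': ..., ...}
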
if __name__ == '__main__':
    links = get_links(StartURL)
    MainHeaderArray = []
    MainRowArray = []
    MaxIterations = 60
    Iterations = 0
    for link in links: #get_tds builds fresh arrays for every page, so the results have to be returned and appended here
        #print("Getting tds calling")
        if Iterations > 38: #Roughly the first 38 links point to pages that don't have the data I need, so skip them
            TempRA, TempHA = get_tds(link)
            MainHeaderArray.append(TempHA)
            MainRowArray.append(TempRA)
        MaxIterations -= 1
        Iterations += 1
        #print(MaxIterations)
        if MaxIterations <= 0: #I don't want to scrape the entire website for a prototype
            break

    #print("This is the end ??")
    #time.sleep(3)
    #jsonized = map(lambda item: {'Name':item[0], 'Series':item[1]}, zip())
    print(MainHeaderArray)
    #time.sleep(2.5)
    #print(MainRowArray)
    #time.sleep(2.5)
    #print(zip())
    TsumName = []
    TsumSeries = []
    TsumBoxType = []
    TsumSkillDescription = []
    TsumFullCharge = []
    TsumMinScore = []
    TsumScoreIncreasePerLevel = []
    TsumMaxScore = []
    TsumFullUpgrade = []
    #MainHeaderArray and MainRowArray are lists of lists (one entry per scraped page), not the
    #flat arrays the first draft assumed, so walk them one record at a time and append the fields.
    for HeaderList, RowList in zip(MainHeaderArray, MainRowArray): #This will fire 1 time per Tsum
        if not HeaderList or len(RowList) < 8: #Skip pages whose infobox didn't parse into the expected shape
            continue
        print(HeaderList[0]) #Holy this gives us Mickey ;
        TsumName.append(HeaderList[0]) #The first header cell is the Tsum's name
        #Field order below is assumed from the infobox layout; adjust the indices if the wiki page changes
        TsumSeries.append(RowList[0])
        TsumBoxType.append(RowList[1])
        TsumSkillDescription.append(RowList[2])
        TsumFullCharge.append(RowList[3])
        TsumMinScore.append(RowList[4])
        TsumScoreIncreasePerLevel.append(RowList[5])
        TsumMaxScore.append(RowList[6])
        TsumFullUpgrade.append(RowList[7])
    print("It's Over")
    time.sleep(3)
    print(TsumName)
    print(TsumSkillDescription)
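
    #A hedged sketch of the write-out step the commented TempFile / json.dumps lines were building
    #toward: assuming the parallel Tsum* lists above are populated, this dumps them to TsumData.txt.
    #(Illustration only; the record keys mirror the list names, and fullname comes from the top of the script.)
    TsumRecords = [
        {
            'Name': name,
            'Series': series,
            'BoxType': box,
            'SkillDescription': skill,
            'FullCharge': charge,
            'MinScore': min_score,
            'ScoreIncreasePerLevel': increase,
            'MaxScore': max_score,
            'FullUpgrade': upgrade,
        }
        for name, series, box, skill, charge, min_score, increase, max_score, upgrade in zip(
            TsumName, TsumSeries, TsumBoxType, TsumSkillDescription, TsumFullCharge,
            TsumMinScore, TsumScoreIncreasePerLevel, TsumMaxScore, TsumFullUpgrade)
    ]
    with open(fullname, 'w') as TempFile:
        json.dump(TsumRecords, TempFile, indent=1)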