Web Scraper

ATravisFoster, Jan 12th, 2018
import time
import requests, bs4, re
from urllib.parse import urljoin
import json
import os

target_dir = r"D:\00Coding\Js\WebScraper"  # Yes, I do know that storing this in my Javascript folder is filthy
fullname = os.path.join(target_dir, 'TsumData.txt')

StartURL = 'http://disneytsumtsum.wikia.com/wiki/Skill_Upgrade_Chart'
URLPrefix = 'http://disneytsumtsum.wikia.com'
def make_soup(url):
    r = requests.get(url)
    soup = bs4.BeautifulSoup(r.text, 'lxml')
    return soup

def get_links(url):
    soup = make_soup(url)
    a_tags = soup.find_all('a', href=re.compile(r"^/wiki/"))
    links = [urljoin(URLPrefix, a['href']) for a in a_tags]  # convert relative URLs to absolute URLs
    return links
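
# Hedged sketch (not in the original paste): get_links returns every /wiki/ anchor on the page,
# including the same navigation links many times over, so an order-preserving de-duplication pass
# is cheap. This helper is purely illustrative; nothing in the main block below calls it.
def get_unique_links(url):
    links = get_links(url)
    return list(dict.fromkeys(links))  # dict.fromkeys keeps first-seen order (Python 3.7+)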

def get_tds(link):
    soup = make_soup(link)
    #tds = soup.find_all('li', class_="category normal") #This will give me the attributes / tags of each character
    tds = soup.find_all('table', class_="wikia-infobox")
    RowArray = []
    HeaderArray = []

    for td in tds:
        #print(td.text.strip()) #This is everything
        rows = td.findChildren('tr')

        # Data cells (<td>) in document order
        for row in rows:
            cells = row.findChildren('td')
            for cell in cells:
                cell_content = cell.getText()
                clean_content = re.sub(r'\s+', ' ', cell_content).strip()
                if clean_content:
                    RowArray.append(clean_content)

        # Header cells (<th>) in document order; the first one is the infobox title, i.e. the Tsum's name
        for row in rows:
            cells = row.findChildren('th')
            for cell in cells:
                cell_content = cell.getText()
                clean_content = re.sub(r'\s+', ' ', cell_content).strip()
                if clean_content:
                    HeaderArray.append(clean_content)

    if tds:
        print(HeaderArray)
        print(RowArray)
    return RowArray, HeaderArray

    #Output = json.dumps([dict(zip(RowArray, row_2)) for row_2 in HeaderArray], indent=1)
    #print(json.dumps([dict(zip(RowArray, row_2)) for row_2 in HeaderArray], indent=1))
    #TempFile = open(fullname, 'w') #Read only, Write Only, Append
    #TempFile.write("EHLLO")
    #TempFile.close()

    #print(td.tbody.Series)
    #print(td.tbody[Series])
    #print(td.tbody["Series"])
    #print(td.data-name)
    #time.sleep(1)
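
# Hedged sketch of the commented-out JSON idea above (not in the original paste): pair each infobox
# label from HeaderArray with the matching value from RowArray and serialise the result. It assumes
# HeaderArray[0] is the infobox title (the Tsum's name) and that every remaining <th> label lines up
# one-to-one with a <td> value, which may not hold on every page.
def infobox_to_json(link):
    RowArray, HeaderArray = get_tds(link)
    record = {'Name': HeaderArray[0] if HeaderArray else ''}
    record.update(dict(zip(HeaderArray[1:], RowArray)))
    return json.dumps(record, indent=1)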

if __name__ == '__main__':
    links = get_links(StartURL)
    MainHeaderArray = []
    MainRowArray = []
    MaxIterations = 60
    Iterations = 0
    for link in links:  # get_tds builds fresh arrays on every call, so return them and append here
        #print("Getting tds calling")
        if Iterations > 38:  # The crawl hits this many pages first that don't have the data I need
            TempRA, TempHA = get_tds(link)
            MainHeaderArray.append(TempHA)
            MainRowArray.append(TempRA)
        MaxIterations -= 1
        Iterations += 1
        #print(MaxIterations)
        if MaxIterations <= 0:  # I don't want to scrape the entire website for a prototype
            break

    #print("This is the end ??")
    #time.sleep(3)
    #jsonized = map(lambda item: {'Name':item[0], 'Series':item[1]}, zip())
    print(MainHeaderArray)
    #time.sleep(2.5)
    #print(MainRowArray)
    #time.sleep(2.5)
    #print(zip())
    TsumName = []
    TsumSeries = []
    TsumBoxType = []
    TsumSkillDescription = []
    TsumFullCharge = []
    TsumMinScore = []
    TsumScoreIncreasePerLevel = []
    TsumMaxScore = []
    TsumFullUpgrade = []
    # Each entry of MainHeaderArray / MainRowArray is the header / data list scraped from one Tsum's
    # infobox, so walk them page by page rather than as one flat list.
    for Iterations in range(len(MainRowArray)):  # This fires once per Tsum
        print(Iterations)
        print(MainHeaderArray[Iterations][0])  # Holy, this gives us Mickey
        TsumName.append(MainHeaderArray[Iterations][0])
        #print(MainRowArray[Iterations])
        # The infobox data rows are assumed to appear in this fixed order on every character page.
        TsumSeries.append(MainRowArray[Iterations][0])
        TsumBoxType.append(MainRowArray[Iterations][1])
        TsumSkillDescription.append(MainRowArray[Iterations][2])
        TsumFullCharge.append(MainRowArray[Iterations][3])
        TsumMinScore.append(MainRowArray[Iterations][4])
        TsumScoreIncreasePerLevel.append(MainRowArray[Iterations][5])
        TsumMaxScore.append(MainRowArray[Iterations][6])
        TsumFullUpgrade.append(MainRowArray[Iterations][7])
    print("It's Over")
    time.sleep(3)
    print(TsumName)
    print(TsumSkillDescription)
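
    # Hedged sketch of the output step that the commented-out TempFile / jsonized lines gesture at
    # (not in the original paste): zip the parallel Tsum* lists into one record per character and
    # write them to fullname as JSON. The field names mirror the list names above and are my own
    # labels, not the wiki's; it also assumes target_dir already exists.
    records = [
        {'Name': n, 'Series': s, 'BoxType': b, 'SkillDescription': d, 'FullCharge': f,
         'MinScore': mn, 'ScoreIncreasePerLevel': inc, 'MaxScore': mx, 'FullUpgrade': fu}
        for n, s, b, d, f, mn, inc, mx, fu in zip(
            TsumName, TsumSeries, TsumBoxType, TsumSkillDescription, TsumFullCharge,
            TsumMinScore, TsumScoreIncreasePerLevel, TsumMaxScore, TsumFullUpgrade)
    ]
    with open(fullname, 'w') as TempFile:
        TempFile.write(json.dumps(records, indent=1))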