skip420

LinkScraperV2

Aug 29th, 2022
#!/usr/bin/python
# http://docs.python-requests.org/en/latest/user/quickstart/
# http://www.crummy.com/software/BeautifulSoup/bs4/doc/

import re
import requests
import time
from bs4 import BeautifulSoup

# scrapes the channel title
def getTitle():
    d = soup.find_all("h1", "branded-page-header-title")
    for i in d:
        name = i.text.strip().replace('\n', ' ').replace(',', '')
        f.write(name + ',')
        print(f'\t\t{name}')

# scrapes the subscriber and view counts
def getStats():
    b = soup.find_all("li", "about-stat ")  # trailing space is required
    for i in b:
        value = i.b.text.strip().replace(',', '')
        name = i.b.next_sibling.strip().replace(',', '')
        f.write(value + ',')
        print(f'\t\t{name} = {value}')

# scrapes the channel description
def getDescription():
    c = soup.find_all("div", "about-description")
    for i in c:
        description = i.text.strip().replace('\n', ' ').replace(',', '')
        f.write(description + ',')
        # print(f'\t\t{description}')

# scrapes all the external links
def getLinks():
    a = soup.find_all("a", "about-channel-link ")  # trailing space is required
    for i in a:
        url = i.get('href')
        f.write(url + ',')
        print(f'\t\t{url}')

# scrapes the related channels
def getRelated():
    s = soup.find_all("h3", "yt-lockup-title")
    for heading in s:
        t = heading.find_all(href=re.compile("user"))
        for link in t:
            url = 'https://www.youtube.com' + link.get('href')
            rCSV.write(url + '\n')
            print(f'\t\t{link.text}, {url}')

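# Note: the helper functions above read the module-level names `soup`, `f`,
# and `rCSV`, which are bound in the crawl loop below; passing them in as
# parameters would be the more idiomatic structure.
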
f = open("youtube-scrape-data.csv", "w+", encoding="utf-8")
rCSV = open("related-channels.csv", "w+", encoding="utf-8")
visited = []
base = "https://www.youtube.com/results?search_query="
q = ['search+query+here']
page = "&page="
features = "html.parser"
count = 1
pagesToScrape = 20

for query in q:
    while count <= pagesToScrape:
        scrapeURL = base + str(query) + page + str(count)
        print(f'Scraping {scrapeURL} \n')
        r = requests.get(scrapeURL)
        soup = BeautifulSoup(r.text, features)
        users = soup.find_all("div", "yt-lockup-byline")
        for each in users:
            a = each.find_all(href=re.compile("user"))
            for i in a:
                url = 'https://www.youtube.com' + i.get('href') + '/about'
                if url in visited:
                    print(f'\t{url} has already been scraped\n\n')
                else:
                    r = requests.get(url)
                    soup = BeautifulSoup(r.text, features)
                    f.write(url + ',')
                    print(f'\t{url}')
                    getTitle()
                    getStats()
                    getDescription()
                    getLinks()
                    getRelated()
                    f.write('\n')
                    print('\n')
                    visited.append(url)
                    time.sleep(3)  # be polite between channel pages
        count += 1
        time.sleep(3)  # and between result pages
        print('\n')
    count = 1
    print('\n')

f.close()
rCSV.close()
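
The script builds its CSV lines by hand, stripping commas out of titles and descriptions so the file stays parseable (the original paste even imports csv without using it). A minimal sketch of the alternative, letting the standard csv module quote fields so commas in the data survive; the column names and sample values here are illustrative, not from the original script:

import csv

with open("youtube-scrape-data.csv", "w", newline="", encoding="utf-8") as out:
    writer = csv.writer(out)
    # one header row, then one row per scraped channel
    writer.writerow(["url", "title", "subscribers", "views", "description"])
    writer.writerow([
        "https://www.youtube.com/user/example/about",  # hypothetical channel
        "Example, Channel",                            # comma is quoted, not stripped
        "12345",
        "67890",
        "A description, with commas",
    ])

Each call to getTitle(), getStats(), and getDescription() would then collect its values into a list and hand them to writer.writerow() instead of writing raw strings with trailing commas.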