spider

#!/usr/bin/env python3
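"""Minimal 4chan image scraper.

Crawls a board's index pages with requests + BeautifulSoup and downloads
images into ./images/<board>/, optionally sorted into per-thread folders.
"""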

import os
import urllib.request

import requests
from bs4 import BeautifulSoup


class Spider:
    def __init__(self):
        self.board = None      # board short name, e.g. 'wg'
        self.page = 0          # index page currently being crawled
        self.index = []        # links collected from index pages
        self.max_pages = None  # how many index pages to crawl
        self.result = []       # deduplicated thread URLs
        self.output = []       # image URLs tagged with their thread slug
        self.threads = []      # (unused)

    def chan_spider(self):
        """Collect full-size image links from the board's index pages."""
        self.index = []
        self.page = 0
        while self.page <= self.max_pages:
            url = 'http://boards.4chan.org/' \
                  + str(self.board) + '/' + str(self.page)
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            # Thumbnail anchors carry protocol-relative hrefs ('//i.4cdn.org/...').
            for img in soup.find_all('a', {'class': "fileThumb"}):
                href = 'http:' + img.get('href')
                self.index.append(href)
            self.page += 1
        return self.index

    def search_board(self):
        """Download every image found by chan_spider() into images/<board>/."""
        self.chan_spider()
        for address in self.index:
            name = address.split('/')[-1]
            d = 'images/' + str(self.board) + '/'
            full_name = d + name
            if not os.path.exists(d):
                os.makedirs(d)
            urllib.request.urlretrieve(address, full_name)
            print(full_name)

    def nchan_spider(self):
        """Collect reply links ('thread/<no>/<slug>') from the board's index pages."""
        self.index = []
        self.page = 0
        while self.page <= self.max_pages:
            url = 'http://boards.4chan.org/' + str(self.board) + '/' + str(self.page)
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for reply in soup.find_all('a', {'class': "replylink"}):
                href = 'http://boards.4chan.org/' + str(self.board) + '/' + reply.get('href')
                self.index.append(href)
            self.page += 1
        return self.index

    def threads_get(self):
        """Deduplicate the collected reply links into a list of thread URLs."""
        self.result = []
        self.nchan_spider()
        for link in self.index:
            # A full thread URL splits into exactly 7 components:
            # http://boards.4chan.org/<board>/thread/<no>/<slug>
            if len(link.split("/")) == 7 and link not in self.result:
                self.result.append(link)
        return self.result

    def deepthread_spider(self):
        """Visit every thread on the board and collect its image links."""
        self.output = []
        self.threads_get()
        for url in self.result:
            thrd = url.split("/")[-1]  # thread slug, used later as folder name
            index = []
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for s in soup.find_all('a', {'class': "fileThumb"}):
                # Tag each image URL with its thread slug, comma-separated.
                href = 'http:' + s.get('href') + "," + thrd
                index.append(href)
            self.output += index
        return self.output

    def deepsearch_thread(self):
        """Download every collected image into images/<board>/<thread title>/."""
        print("Digging...")
        self.deepthread_spider()
        for address in self.output:
            thr = address.split(",")              # 'http://...jpg,<thread-slug>'
            url, thrd = thr[0], thr[-1]
            final_title = thrd.replace("-", " ")  # slug with dashes as spaces
            name = url.split("/")[-1]             # image filename
            d = 'images/' + str(self.board) + '/' + final_title + '/'
            full_name = d + name
            if not os.path.exists(d):
                os.makedirs(d)
            urllib.request.urlretrieve(url, full_name)
            print(full_name)

    def select_thread(self):
        """List discovered threads and let the user pick one by ID or URL."""
        self.threads_get()
        self.result = list(enumerate(self.result))
        for i in self.result:
            print('ID No {}: {}'.format(i[0], i[1].split('/')[-1].replace('-', ' ')))
        th = input('Enter the ID No of the desired thread, or'
                   '\nmanually enter the URL of the desired thread: ')
        try:
            thr = int(th)
            thread = self.result[thr][-1]
        except ValueError:
            print('Manual entry detected, starting...')
            thread = th
        return thread

    def thread_spider(self):
        """Collect every full-size image link from the selected thread."""
        thread = self.select_thread()
        inx = []
        source_code = requests.get(thread)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for img in soup.find_all('a', {'class': "fileThumb"}):
            href = 'http:' + img.get('href')
            inx.append(href)
        return inx, thread

    def search_thread(self):
        """Download the selected thread's images into images/<board>/<thread title>/."""
        thread, pre_folder = self.thread_spider()
        folder = pre_folder.split('/')[-1].replace('-', ' ')
        for address in thread:
            name = address.split("/")[-1]
            d = 'images/' + str(self.board) + '/' + folder + '/'
            full_name = d + name
            if not os.path.exists(d):
                os.makedirs(d)
            urllib.request.urlretrieve(address, full_name)
            print(full_name)

    def menu(self):
        while True:
            cmd = input(
                '1 for board surface, 2 for a specific thread, '
                'and 3 for every thread on a board, or q to exit: ')
            if cmd == 'q':
                break
            # Reject bad choices before prompting for the board and page count.
            if cmd not in ('1', '2', '3'):
                print('Invalid choice, try again.')
                continue
            self.board = input(
                'What board would you like to scrape?: ')
            self.max_pages = int(input(
                'How many pages would you like to scrape?: '))
            if cmd == '1':
                self.search_board()
            elif cmd == '2':
                self.search_thread()
            else:
                self.deepsearch_thread()


if __name__ == '__main__':
    spider = Spider()
    spider.menu()
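# Example session (hypothetical input; actual filenames depend on the board):
#   1 for board surface, 2 for a specific thread, and 3 for every thread on a board, or q to exit: 1
#   What board would you like to scrape?: wg
#   How many pages would you like to scrape?: 2
#   images/wg/<filename>.jpg
#   ...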