#!/usr/bin/python3.5
import os
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
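# Third-party dependencies: requests and beautifulsoup4
# (pip install requests beautifulsoup4); urlretrieve is standard library.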
class Spider:
    def __init__(self):
        self.board = None      # board slug, e.g. 'wg'
        self.page = 0          # current index page
        self.index = []        # links collected from index pages
        self.max_pages = None  # last index page to visit
        self.result = []       # deduplicated thread URLs
        self.output = []       # image links tagged with their thread slug
        self.threads = []
    def chan_spider(self):
        # Walk the board's index pages and collect direct image links.
        self.index = []
        self.page = 0
        while self.page <= self.max_pages:
            url = 'http://boards.4chan.org/' \
                  + str(self.board) + '/' + str(self.page)
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for img in soup.find_all('a', {'class': "fileThumb"}):
                # Thumbnail hrefs are protocol-relative ('//...'), so add a scheme.
                href = 'http:' + img.get('href')
                self.index.append(href)
            self.page += 1
        return self.index
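    # Each collected entry is a full-size image URL. On 4chan the thumbnail
    # anchors point at the image CDN, so entries should look roughly like
    # 'http://i.4cdn.org/<board>/1234567890123.jpg' (the host is an
    # assumption about the site, not something this script hard-codes).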
    def search_board(self):
        # Download every image linked from the board's index pages.
        self.chan_spider()
        for address in self.index:
            name = address.split('/')[-1]
            d = 'images/' + str(self.board) + '/'
            full_name = d + name
            os.makedirs(d, exist_ok=True)
            urlretrieve(address, full_name)
            print(full_name)
    def nchan_spider(self):
        # Walk the board's index pages and collect links to individual threads.
        self.index = []
        self.page = 0
        while self.page <= self.max_pages:
            url = 'http://boards.4chan.org/' + str(self.board) + '/' + str(self.page)
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for link in soup.find_all('a', {'class': "replylink"}):
                href = 'http://boards.4chan.org/' + str(self.board) + '/' + link.get('href')
                self.index.append(href)
            self.page += 1
        return self.index
    def threads_get(self):
        # Keep only well-formed thread URLs and drop duplicates.
        self.result = []
        self.nchan_spider()
        for i in self.index:
            ilist = i.split("/")
            # A full thread URL splits into exactly 7 parts:
            # 'http:', '', 'boards.4chan.org', board, 'thread', id, slug
            if len(ilist) == 7:
                xlist = "/".join(ilist)
                if xlist not in self.result:
                    self.result.append(xlist)
        return self.result
    def deepthread_spider(self):
        # Visit each thread and collect its image links, appending the
        # thread's title slug so downloads can be sorted into folders.
        self.output = []
        self.threads_get()
        for url in self.result:
            thrd = url.split("/")[-1]  # thread title slug
            index = []
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for s in soup.find_all('a', {'class': "fileThumb"}):
                href = 'http:' + s.get('href') + "," + thrd
                index.append(href)
            self.output += index
        return self.output
    def deepsearch_thread(self):
        # Download every image from every thread into per-thread folders.
        print("Digging...")
        self.deepthread_spider()
        for address in self.output:
            thr = address.split(",")       # [image_url, title_slug]
            title = thr[-1].split("-")
            name = thr[-2].split("/")[-1]  # image file name
            final_title = " ".join(title)
            url = thr[0]
            d = 'images/' + str(self.board) + '/' + str(final_title) + '/'
            full_name = d + str(name)
            os.makedirs(d, exist_ok=True)
            urlretrieve(url, full_name)
            print(full_name)
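    # The resulting layout, derived from the path built above:
    #   images/<board>/<thread title with spaces>/<image file name>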
    def select_thread(self):
        # List the threads found and let the user pick one by ID or URL.
        self.threads_get()
        self.result = list(enumerate(self.result))
        for i in self.result:
            print('ID No {}: {}'.format(i[0], i[1].split('/')[-1].replace('-', ' ')))
        th = input('Enter ID No of desired thread, or'
                   '\n manually enter the url of the desired thread: ')
        try:
            thr = int(th)
            thread = self.result[thr][-1]
        except ValueError:
            print('Manual entry detected, starting...')
            thread = th
        return thread
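    # Example interaction (output is illustrative, not captured from a run):
    #   ID No 0: cozy winter wallpapers
    #   ID No 1: minimalist desktops
    #   Enter ID No of desired thread, or
    #    manually enter the url of the desired thread: 0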
    def thread_spider(self):
        # Collect every image link from the selected thread.
        thread = self.select_thread()
        inx = []
        source_code = requests.get(thread)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for img in soup.find_all('a', {'class': "fileThumb"}):
            href = 'http:' + img.get('href')
            inx.append(href)
        return inx, thread
    def search_thread(self):
        # Download all images from one thread into a folder named
        # after the thread's title.
        links, thread_url = self.thread_spider()
        folder = thread_url.split('/')[-1].replace('-', ' ')
        for address in links:
            name = address.split("/")[-1]
            d = 'images/' + str(self.board) + '/' + str(folder) + '/'
            full_name = d + str(name)
            os.makedirs(d, exist_ok=True)
            urlretrieve(address, full_name)
            print(full_name)
    def menu(self):
        # Simple interactive loop: pick a mode, a board, and a page count.
        while True:
            cmd = input(
                '1 for board surface, 2 for a specific thread, '
                'and 3 for every thread on a board, or q to exit: ')
            if cmd == 'q':
                break
            self.board = input(
                'What board would you like to scrape?: ')
            self.max_pages = int(input(
                'How many pages would you like to scrape?: '))
            if cmd == '1':
                self.search_board()
            elif cmd == '2':
                self.search_thread()
            elif cmd == '3':
                self.deepsearch_thread()
            else:
                print('Invalid choice, try again.')
if __name__ == '__main__':
    spider = Spider()
    spider.menu()
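# Minimal scripted usage, bypassing the interactive menu (a sketch; the
# module name 'chan_spider' is hypothetical):
#
#     from chan_spider import Spider
#
#     s = Spider()
#     s.board = 'wg'    # board slug
#     s.max_pages = 2   # visits index pages 0 through 2
#     s.search_board()  # saves images under images/wg/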