Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from urllib. request import urlopen
- from bs4 import BeautifulSoup
- import requests
- import urllib
- import bs4
- import random
- import os
# Work from the drive root so the images\<board>\ output tree is created at C:\.
os.chdir("C:\\")
print(os.getcwd())
# Default board abbreviation; reassigned from user input in the interactive section below.
board = "b"
def chan_spider(max_pages):
    """Collect full-size image URLs from pages 0..max_pages of the global `board`.

    Each board index page is fetched and every 'fileThumb' anchor's href
    (protocol-relative, e.g. //i.4cdn.org/...) is prefixed with 'http:'.

    Returns a list of absolute image URLs.
    """
    page = 0
    index = []
    while page <= max_pages:
        url = 'http://boards.4chan.org/' + str(board) + '/' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for img in soup.findAll('a', {'class': "fileThumb"}):
            href = 'http:' + img.get('href')
            index.append(href)
        page += 1
        # FIX: the original ended the loop body with `if page is 1: continue`
        # -- an identity (not equality) test on an int literal, followed by a
        # `continue` that was the last statement anyway. Pure no-op; removed.
    return index
def search_board(index):
    """Download every image URL in `index` into images\\<board>\\.

    Files are saved under a random 6-digit name with the URL's original
    extension; the target directory is created on first use.
    """
    for address in index:
        # Keep the extension from the URL, pick a random 6-digit base name.
        extension = str("." + address.split(".")[-1])
        base = random.randrange(100000, 1000000)
        target_dir = 'images\\' + (str(board)) + "\\"
        destination = target_dir + (str(base) + extension)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        urllib.request.urlretrieve(address, destination)
        print(destination)
def thread_name(threads):
    """Return thread titles: the text before the first '-', quotes stripped.

    FIX: the original appended into the module-level `listy`, so every call
    accumulated entries from all previous calls. A local list is used instead;
    the return value (a list of cleaned titles) is unchanged for a first call.
    """
    names = []
    for t in threads:
        name_list = t.split("-")
        name = name_list[0]
        names.append(name.replace("\"", ""))
    return names
def thread_spider(board, thread):
    """Return the full-size image URLs found in one thread of `board`.

    Fetches the thread page and collects every 'fileThumb' anchor href,
    prefixed with 'http:' (the hrefs are protocol-relative).
    """
    thread_url = 'http://boards.4chan.org/' + str(board) + '/thread/' + thread
    page_html = requests.get(thread_url).text
    parsed = bs4.BeautifulSoup(page_html, "html.parser")
    links = []
    for anchor in parsed.findAll('a', {'class': "fileThumb"}):
        links.append('http:' + anchor.get('href'))
    return links
def search_thread(index):
    """Download every image URL in `index` into images\\<board>\\<folder>\\.

    Prompts for a sub-folder name and keeps each file's original name
    (the last path component of its URL).
    """
    folder = input("Please enter a name for the folder that will hold these images: ")
    for address in index:
        url_list = address.split("/")
        name = url_list[-1]
        d = ('images\\' + (str(board)) + "\\" + str(folder)) + "\\"
        full_name = d + (str(name))
        # BUG FIX: the original tested os.path.exists(full_name) before calling
        # os.makedirs(d). Once the directory existed, the second image's
        # (nonexistent) full_name made it call makedirs on an existing
        # directory -> FileExistsError. Test the directory itself, matching
        # search_board.
        if not os.path.exists(d):
            os.makedirs(d)
        urllib.request.urlretrieve(address, full_name)
        print(full_name)
def nchan_spider(max_pages):
    """Collect thread URLs ('replylink' anchors) from pages 0..max_pages of `board`.

    The board-relative hrefs (e.g. 'thread/123/slug') are joined onto the
    board's base URL. Returns a list of absolute thread URLs (may contain
    duplicates; threads_get() deduplicates).
    """
    page = 0
    index = []
    while page <= max_pages:
        url = 'http://boards.4chan.org/' + str(board) + '/' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for img in soup.findAll('a', {'class': "replylink"}):
            href = 'http://boards.4chan.org/' + str(board) + "/" + img.get('href')
            index.append(href)
        page += 1
        # FIX: removed the original trailing `if page is 1: continue` -- an
        # identity comparison on an int literal plus a `continue` that was the
        # last statement of the loop anyway. Pure no-op.
    return index
# Accumulator that threads_get() appends to; entries persist across calls.
result = []
def threads_get(index):
    """Return the deduplicated entries of `index` that have exactly 7 '/'-parts.

    A reply link joined onto the board URL has seven slash-separated pieces
    ('http:', '', host, board, 'thread', id, slug); anything else is dropped.

    FIX: the original appended into (and returned) the module-level `result`,
    so results leaked across calls; it also rebuilt each URL via
    "/".join(url.split("/")) -- an identity round-trip. Both removed; a fresh
    local list is returned.
    """
    found = []
    for link in index:
        if len(link.split("/")) == 7 and link not in found:
            found.append(link)
    return found
# Accumulator that threads_list() appends to; entries persist across calls.
threads = []
def threads_list(index):
    """Format each thread URL as '"<title words>" - (ID = <thread id>)'.

    The URL's last path component is the hyphenated slug (turned into words)
    and the second-to-last is the numeric thread id.

    FIX: the original appended into the module-level `threads`, so each call
    returned every label produced since import. A local list is used instead.
    """
    labels = []
    for link in index:
        parts = link.split("/")
        title = " ".join(parts[-1].split("-"))
        labels.append("\"" + title + "\" " + "- (ID = " + parts[-2] + ")")
    return labels
# Accumulator that thread_name() appends to; entries persist across calls.
listy = []
def deepthread_spider(urls):
    """Fetch each thread URL and return 'http:<img-href>,<thread-slug>' strings.

    The trailing ',<slug>' lets deepsearch_thread() recover the thread title
    for its per-thread output directory.
    """
    collected = []
    for url in urls:
        slug = url.split("/")[-1]
        page_html = requests.get(url).text
        parsed = BeautifulSoup(page_html, "html.parser")
        for anchor in parsed.findAll('a', {'class': "fileThumb"}):
            collected.append('http:' + anchor.get('href') + "," + slug)
    return collected
def deepsearch_thread(index):
    """Download each 'url,slug' entry into images\\<board>\\<title words>\\.

    The slug (text after the comma) becomes the directory name with hyphens
    replaced by spaces; the file keeps its original name from the URL.
    """
    print("Digging...")
    for address in index:
        pieces = address.split(",")
        slug = pieces[-1]
        # Hyphenated slug -> space-separated title for the folder name.
        final_title = " ".join(slug.split("-"))
        # File name is the last path component of the image URL.
        name = str(pieces[-2]).split("/")[-1]
        url = pieces[0]
        d = ('images\\' + (str(board)) + "\\" + str(final_title)) + "\\"
        full_name = d + (str(name))
        if not os.path.exists(d):
            os.makedirs(d)
        urllib.request.urlretrieve(url, full_name)
        print(full_name)
# --- Interactive entry point ----------------------------------------------
# FIX: every choice comparison used `c is "1"` etc. -- identity, not equality,
# which only matches by accident of CPython's string caching. All replaced
# with `==`. Loop structure (including the re-prompt-forever behavior after a
# successful scrape, and the final while/else fallback message) is preserved.
c = (input("Would you like to scrape a board, or a specific thread?\nPlease enter 1 for thread,"
           " 2 for the surface of a board, or 3 to scrape every thread on a board:\n"))
while c == "1":
    board = input("What board would you like to search?: \n")
    max_p = int(input("How many pages would you like to search?:\n"))
    # A valid board yields more than one reply link on its first pages.
    if len(nchan_spider(1)) > 1:
        while True:
            try:
                for i in threads_list(threads_get(nchan_spider(max_p))):
                    print(i + "\n")
                print("The above is a list of threads available to select from.")
                thread = input("Please enter the thread ID: \n")
                search_thread((thread_spider(board, thread)))
                break
            except ValueError:
                print("Not a valid input. Please enter a number (e.g. 1, 2, 3, 150, etc.)")
                continue
    else:
        print("Not a valid board abbreviation, try again.\n\n")
        continue
while c == "2":
    board = input("What board do you want to scrape from?\nPlease enter just the letter for the board: ")
    if len(nchan_spider(1)) > 1:
        while True:
            try:
                # FIX: was float(); int() matches mode 3 and still raises
                # ValueError on non-numeric input for the handler below.
                search_board(chan_spider(int(input("How many pages would you like to scrape from?\n: "))))
                break
            except ValueError:
                print("Not a valid input. Please enter a number (e.g. 1, 2, 3, 150, etc.)")
                continue
    elif len(nchan_spider(1)) < 1:
        print("Not a valid board abbreviation, try again.\n")
        continue
while c == "3":
    board = input("What board do you want to scrape from?\nPlease enter just the letter for the board: ")
    if len(nchan_spider(1)) > 1:
        while True:
            try:
                deepsearch_thread(deepthread_spider(threads_get(
                    nchan_spider(int(input("How many pages would you like to scrape from?\n: "))))))
                break
            except ValueError:
                print("Not a valid input. Please enter a number (e.g. 1, 2, 3, 150, etc.)")
                continue
    elif len(nchan_spider(1)) < 1:
        print("Not a valid board abbreviation, try again.\n")
else:
    # while/else: runs when c was never "3" -- i.e. the catch-all for any
    # choice other than 1/2 (modes 1 and 2 loop forever on success).
    print("Please enter 1, 2, or 3\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement