# 4chan image scraper (Python 3): grabs the full-size images linked from
# thumbnails on a board's index pages or inside individual threads.
# Requires the third-party packages: requests, beautifulsoup4.
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import requests
import random
import os

# Work from the drive root; downloads land under C:\images\<board>\.
os.chdir("C:\\")
print(os.getcwd())
board = "b"

# Module-level accumulators shared by the helpers below.
result = []
threads = []
listy = []

def chan_spider(max_pages):
    """Collect full-image links from the board's index pages 0..max_pages."""
    page = 0
    index = []
    while page <= max_pages:
        url = 'http://boards.4chan.org/' + str(board) + '/' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        # Thumbnail anchors carry protocol-relative hrefs (//i.4cdn.org/...).
        for img in soup.findAll('a', {'class': "fileThumb"}):
            href = 'http:' + img.get('href')
            index.append(href)
        page += 1
    return index

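# Quick smoke test (illustrative; assumes the board set above is reachable):
#   print(len(chan_spider(0)))   # non-zero for a valid board
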
def search_board(index):
    """Download each image URL in `index` under images\<board>\ with a random name."""
    for address in index:
        url_list = address.split(".")
        ext = "." + url_list[-1]
        name = random.randrange(100000, 1000000)  # random six-digit filename
        d = 'images\\' + str(board) + "\\"
        full_name = d + str(name) + ext
        if not os.path.exists(d):
            os.makedirs(d)
        urlretrieve(address, full_name)
        print(full_name)

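# Typical pairing, as the driver at the bottom does:
#   search_board(chan_spider(2))   # download everything linked on pages 0-2
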
def thread_name(threads):
    """Strip quotes and trailing IDs from formatted thread titles.

    Never called by the driver code below; kept as-is.
    """
    for t in threads:
        name_list = t.split("-")
        name = name_list[0]
        newname = name.replace("\"", "")
        listy.append(newname)
    return listy

def thread_spider(board, thread):
    """Collect full-image links from a single thread."""
    index = []
    url = 'http://boards.4chan.org/' + str(board) + '/thread/' + thread
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for img in soup.findAll('a', {'class': "fileThumb"}):
        href = 'http:' + img.get('href')
        index.append(href)
    return index

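# Usage sketch (hypothetical thread ID):
#   links = thread_spider("b", "123456789")
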
def search_thread(index):
    """Download thread images into images\<board>\<folder>\, keeping original filenames."""
    folder = input("Please enter a name for the folder that will hold these images: ")
    for address in index:
        url_list = address.split("/")
        name = url_list[-1]
        d = 'images\\' + str(board) + "\\" + str(folder) + "\\"
        full_name = d + str(name)
        # Create the target directory once, before the first download.
        if not os.path.exists(d):
            os.makedirs(d)
        urlretrieve(address, full_name)
        print(full_name)

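# The urlretrieve() calls in this script run with no timeout and save whatever
# the server returns. A minimal sketch of a sturdier replacement, taking the
# same URL/destination arguments (this helper is not part of the original):
def fetch_image(url, dest, timeout=30):
    r = requests.get(url, stream=True, timeout=timeout)
    r.raise_for_status()  # surface HTTP errors instead of saving an error page
    with open(dest, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
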
def nchan_spider(max_pages):
    """Collect thread URLs ("replylink" anchors) from index pages 0..max_pages."""
    page = 0
    index = []
    while page <= max_pages:
        url = 'http://boards.4chan.org/' + str(board) + '/' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        # Reply links are board-relative, e.g. thread/<id>/<slug>.
        for img in soup.findAll('a', {'class': "replylink"}):
            href = 'http://boards.4chan.org/' + str(board) + "/" + img.get('href')
            index.append(href)
        page += 1
    return index

def threads_get(index):
    """Deduplicate thread URLs; only full thread links split into 7 slash-parts."""
    for i in index:
        ilist = i.split("/")
        if len(ilist) == 7:
            xlist = "/".join(ilist)
            if xlist not in result:
                result.append(xlist)
    return result

def threads_list(index):
    """Format each thread URL as '"title words" - (ID = <thread id>)' for display."""
    for i in index:
        ilist = i.split("/")
        xlist = ilist[-1].split("-")
        zlist = " ".join(xlist)
        x = "\"" + zlist + "\" " + "- (ID = " + ilist[-2] + ")"
        threads.append(x)
    return threads

def deepthread_spider(urls):
    """For each thread URL, collect image links tagged with the thread's title slug."""
    output = []
    for url in urls:
        thread = url.split("/")
        thrd = thread[-1]
        index = []
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for s in soup.findAll('a', {'class': "fileThumb"}):
            # Pack "<image url>,<thread slug>" into one string; the comma is
            # split back out in deepsearch_thread() below.
            href = 'http:' + s.get('href') + "," + thrd
            index.append(href)
        output += index
    return output

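# Each returned entry packs the image URL and thread slug into one string,
# e.g. (illustrative values):
#   "http://i.4cdn.org/b/1477190000000.jpg,cool-thread-title"
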
def deepsearch_thread(index):
    """Download every tagged image into images\<board>\<thread title>\."""
    print("Digging...")
    for address in index:
        thr = address.split(",")
        thread = thr[-1]               # thread slug packed on by deepthread_spider()
        title = thread.split("-")
        first_name = str(thr[-2])
        sec_name = first_name.split("/")
        name = sec_name[-1]            # original image filename
        final_title = " ".join(title)  # folder named after the thread title
        url = thr[0]
        d = 'images\\' + str(board) + "\\" + str(final_title) + "\\"
        full_name = d + str(name)
        if not os.path.exists(d):
            os.makedirs(d)
        urlretrieve(url, full_name)
        print(full_name)

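# Full "option 3" pipeline, mirroring the driver below (one index page assumed):
#   deepsearch_thread(deepthread_spider(threads_get(nchan_spider(1))))
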
c = input("Would you like to scrape a board, or a specific thread?\nPlease enter 1 for thread,"
          " 2 for the surface of a board, or 3 to scrape every thread on a board:\n")

while c == "1":
    board = input("What board would you like to search?: \n")
    max_p = int(input("How many pages would you like to search?:\n"))

    # A valid board abbreviation yields at least some thread links on pages 0-1.
    if len(nchan_spider(1)) > 1:
        while True:
            try:
                for i in threads_list(threads_get(nchan_spider(max_p))):
                    print(i + "\n")
                print("The above is a list of threads available to select from.")
                thread = input("Please enter the thread ID: \n")
                search_thread(thread_spider(board, thread))
                break
            except ValueError:
                print("Not a valid input. Please enter a number (e.g. 1, 2, 3, 150, etc.)")
                continue
    else:
        print("Not a valid board abbreviation, try again.\n\n")
        continue

while c == "2":
    board = input("What board do you want to scrape from?\nPlease enter just the letter for the board: ")
    if len(nchan_spider(1)) > 1:
        while True:
            try:
                search_board(chan_spider(int(input("How many pages would you like to scrape from?\n: "))))
                break
            except ValueError:
                print("Not a valid input. Please enter a number (e.g. 1, 2, 3, 150, etc.)")
                continue
    else:
        print("Not a valid board abbreviation, try again.\n")
        continue

while c == "3":
    board = input("What board do you want to scrape from?\nPlease enter just the letter for the board: ")
    if len(nchan_spider(1)) > 1:
        while True:
            try:
                deepsearch_thread(deepthread_spider(threads_get(
                    nchan_spider(int(input("How many pages would you like to scrape from?\n: "))))))
                break
            except ValueError:
                print("Not a valid input. Please enter a number (e.g. 1, 2, 3, 150, etc.)")
                continue
    else:
        print("Not a valid board abbreviation, try again.\n")
else:
    # while/else: reached only when no option above matched, i.e. invalid input.
    print("Please enter 1, 2, or 3\n")