# 4chan image scraper (Python 3): grabs the full-size images linked from
# thumbnails on a board's index pages or inside individual threads.
# Requires the third-party packages: requests, beautifulsoup4.
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import requests
import random
import os

# Work from the drive root; downloads land under C:\images\<board>\.
os.chdir("C:\\")
print(os.getcwd())
board = "b"

# Module-level accumulators shared by the helpers below.
result = []
threads = []
listy = []

def chan_spider(max_pages):
    """Collect full-image links from the board's index pages 0..max_pages."""
    page = 0
    index = []
    while page <= max_pages:
        url = 'http://boards.4chan.org/' + str(board) + '/' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        # Thumbnail anchors carry protocol-relative hrefs (//i.4cdn.org/...).
        for img in soup.findAll('a', {'class': "fileThumb"}):
            href = 'http:' + img.get('href')
            index.append(href)
        page += 1
    return index

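# Quick smoke test (illustrative; assumes the board set above is reachable):
#   print(len(chan_spider(0)))   # non-zero for a valid board
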
def search_board(index):
    """Download each image URL in `index` under images\<board>\ with a random name."""
    for address in index:
        url_list = address.split(".")
        ext = "." + url_list[-1]
        name = random.randrange(100000, 1000000)  # random six-digit filename
        d = 'images\\' + str(board) + "\\"
        full_name = d + str(name) + ext
        if not os.path.exists(d):
            os.makedirs(d)
        urlretrieve(address, full_name)
        print(full_name)

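# Typical pairing, as the driver at the bottom does:
#   search_board(chan_spider(2))   # download everything linked on pages 0-2
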
def thread_name(threads):
    """Strip quotes and trailing IDs from formatted thread titles.

    Never called by the driver code below; kept as-is.
    """
    for t in threads:
        name_list = t.split("-")
        name = name_list[0]
        newname = name.replace("\"", "")
        listy.append(newname)
    return listy

def thread_spider(board, thread):
    """Collect full-image links from a single thread."""
    index = []
    url = 'http://boards.4chan.org/' + str(board) + '/thread/' + thread
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for img in soup.findAll('a', {'class': "fileThumb"}):
        href = 'http:' + img.get('href')
        index.append(href)
    return index

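# Usage sketch (hypothetical thread ID):
#   links = thread_spider("b", "123456789")
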
def search_thread(index):
    """Download thread images into images\<board>\<folder>\, keeping original filenames."""
    folder = input("Please enter a name for the folder that will hold these images: ")
    for address in index:
        url_list = address.split("/")
        name = url_list[-1]
        d = 'images\\' + str(board) + "\\" + str(folder) + "\\"
        full_name = d + str(name)
        # Create the target directory once, before the first download.
        if not os.path.exists(d):
            os.makedirs(d)
        urlretrieve(address, full_name)
        print(full_name)

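# The urlretrieve() calls in this script run with no timeout and save whatever
# the server returns. A minimal sketch of a sturdier replacement, taking the
# same URL/destination arguments (this helper is not part of the original):
def fetch_image(url, dest, timeout=30):
    r = requests.get(url, stream=True, timeout=timeout)
    r.raise_for_status()  # surface HTTP errors instead of saving an error page
    with open(dest, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
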
def nchan_spider(max_pages):
    """Collect thread URLs ("replylink" anchors) from index pages 0..max_pages."""
    page = 0
    index = []
    while page <= max_pages:
        url = 'http://boards.4chan.org/' + str(board) + '/' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        # Reply links are board-relative, e.g. thread/<id>/<slug>.
        for img in soup.findAll('a', {'class': "replylink"}):
            href = 'http://boards.4chan.org/' + str(board) + "/" + img.get('href')
            index.append(href)
        page += 1
    return index

def threads_get(index):
    """Deduplicate thread URLs; only full thread links split into 7 slash-parts."""
    for i in index:
        ilist = i.split("/")
        if len(ilist) == 7:
            xlist = "/".join(ilist)
            if xlist not in result:
                result.append(xlist)
    return result

def threads_list(index):
    """Format each thread URL as '"title words" - (ID = <thread id>)' for display."""
    for i in index:
        ilist = i.split("/")
        xlist = ilist[-1].split("-")
        zlist = " ".join(xlist)
        x = "\"" + zlist + "\" " + "- (ID = " + ilist[-2] + ")"
        threads.append(x)
    return threads

def deepthread_spider(urls):
    """For each thread URL, collect image links tagged with the thread's title slug."""
    output = []
    for url in urls:
        thread = url.split("/")
        thrd = thread[-1]
        index = []
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for s in soup.findAll('a', {'class': "fileThumb"}):
            # Pack "<image url>,<thread slug>" into one string; the comma is
            # split back out in deepsearch_thread() below.
            href = 'http:' + s.get('href') + "," + thrd
            index.append(href)
        output += index
    return output

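# Each returned entry packs the image URL and thread slug into one string,
# e.g. (illustrative values):
#   "http://i.4cdn.org/b/1477190000000.jpg,cool-thread-title"
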
def deepsearch_thread(index):
    """Download every tagged image into images\<board>\<thread title>\."""
    print("Digging...")
    for address in index:
        thr = address.split(",")
        thread = thr[-1]               # thread slug packed on by deepthread_spider()
        title = thread.split("-")
        first_name = str(thr[-2])
        sec_name = first_name.split("/")
        name = sec_name[-1]            # original image filename
        final_title = " ".join(title)  # folder named after the thread title
        url = thr[0]
        d = 'images\\' + str(board) + "\\" + str(final_title) + "\\"
        full_name = d + str(name)
        if not os.path.exists(d):
            os.makedirs(d)
        urlretrieve(url, full_name)
        print(full_name)

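# Full "option 3" pipeline, mirroring the driver below (one index page assumed):
#   deepsearch_thread(deepthread_spider(threads_get(nchan_spider(1))))
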
c = input("Would you like to scrape a board, or a specific thread?\nPlease enter 1 for thread,"
          " 2 for the surface of a board, or 3 to scrape every thread on a board:\n")

while c == "1":
    board = input("What board would you like to search?: \n")
    max_p = int(input("How many pages would you like to search?:\n"))

    # A valid board abbreviation yields at least some thread links on pages 0-1.
    if len(nchan_spider(1)) > 1:
        while True:
            try:
                for i in threads_list(threads_get(nchan_spider(max_p))):
                    print(i + "\n")
                print("The above is a list of threads available to select from.")
                thread = input("Please enter the thread ID: \n")
                search_thread(thread_spider(board, thread))
                break
            except ValueError:
                print("Not a valid input. Please enter a number (e.g. 1, 2, 3, 150, etc.)")
                continue
    else:
        print("Not a valid board abbreviation, try again.\n\n")
        continue

while c == "2":
    board = input("What board do you want to scrape from?\nPlease enter just the letter for the board: ")
    if len(nchan_spider(1)) > 1:
        while True:
            try:
                search_board(chan_spider(int(input("How many pages would you like to scrape from?\n: "))))
                break
            except ValueError:
                print("Not a valid input. Please enter a number (e.g. 1, 2, 3, 150, etc.)")
                continue
    else:
        print("Not a valid board abbreviation, try again.\n")
        continue

while c == "3":
    board = input("What board do you want to scrape from?\nPlease enter just the letter for the board: ")
    if len(nchan_spider(1)) > 1:
        while True:
            try:
                deepsearch_thread(deepthread_spider(threads_get(
                    nchan_spider(int(input("How many pages would you like to scrape from?\n: "))))))
                break
            except ValueError:
                print("Not a valid input. Please enter a number (e.g. 1, 2, 3, 150, etc.)")
                continue
    else:
        print("Not a valid board abbreviation, try again.\n")
else:
    # while/else: reached only when no option above matched, i.e. invalid input.
    print("Please enter 1, 2, or 3\n")