spider

#!/usr/bin/env python3
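"""Minimal 4chan image scraper.

Crawls a board's index pages with requests + BeautifulSoup and downloads
images into ./images/<board>/, optionally sorted into per-thread folders.
"""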

import os
import urllib.request

import requests
from bs4 import BeautifulSoup


class Spider:
    def __init__(self):
        self.board = None      # board short name, e.g. 'wg'
        self.page = 0          # index page currently being crawled
        self.index = []        # links collected from index pages
        self.max_pages = None  # how many index pages to crawl
        self.result = []       # deduplicated thread URLs
        self.output = []       # image URLs tagged with their thread slug
        self.threads = []      # (unused)

    def chan_spider(self):
        """Collect full-size image links from the board's index pages."""
        self.index = []
        self.page = 0
        while self.page <= self.max_pages:
            url = 'http://boards.4chan.org/' \
                  + str(self.board) + '/' + str(self.page)
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            # Thumbnail anchors carry protocol-relative hrefs ('//i.4cdn.org/...').
            for img in soup.find_all('a', {'class': "fileThumb"}):
                href = 'http:' + img.get('href')
                self.index.append(href)
            self.page += 1
        return self.index

    def search_board(self):
        """Download every image found by chan_spider() into images/<board>/."""
        self.chan_spider()
        for address in self.index:
            name = address.split('/')[-1]
            d = 'images/' + str(self.board) + '/'
            full_name = d + name
            if not os.path.exists(d):
                os.makedirs(d)
            urllib.request.urlretrieve(address, full_name)
            print(full_name)

    def nchan_spider(self):
        """Collect reply links ('thread/<no>/<slug>') from the board's index pages."""
        self.index = []
        self.page = 0
        while self.page <= self.max_pages:
            url = 'http://boards.4chan.org/' + str(self.board) + '/' + str(self.page)
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for reply in soup.find_all('a', {'class': "replylink"}):
                href = 'http://boards.4chan.org/' + str(self.board) + '/' + reply.get('href')
                self.index.append(href)
            self.page += 1
        return self.index

    def threads_get(self):
        """Deduplicate the collected reply links into a list of thread URLs."""
        self.result = []
        self.nchan_spider()
        for link in self.index:
            # A full thread URL splits into exactly 7 components:
            # http://boards.4chan.org/<board>/thread/<no>/<slug>
            if len(link.split("/")) == 7 and link not in self.result:
                self.result.append(link)
        return self.result

    def deepthread_spider(self):
        """Visit every thread on the board and collect its image links."""
        self.output = []
        self.threads_get()
        for url in self.result:
            thrd = url.split("/")[-1]  # thread slug, used later as folder name
            index = []
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for s in soup.find_all('a', {'class': "fileThumb"}):
                # Tag each image URL with its thread slug, comma-separated.
                href = 'http:' + s.get('href') + "," + thrd
                index.append(href)
            self.output += index
        return self.output

    def deepsearch_thread(self):
        """Download every collected image into images/<board>/<thread title>/."""
        print("Digging...")
        self.deepthread_spider()
        for address in self.output:
            thr = address.split(",")              # 'http://...jpg,<thread-slug>'
            url, thrd = thr[0], thr[-1]
            final_title = thrd.replace("-", " ")  # slug with dashes as spaces
            name = url.split("/")[-1]             # image filename
            d = 'images/' + str(self.board) + '/' + final_title + '/'
            full_name = d + name
            if not os.path.exists(d):
                os.makedirs(d)
            urllib.request.urlretrieve(url, full_name)
            print(full_name)

    def select_thread(self):
        """List discovered threads and let the user pick one by ID or URL."""
        self.threads_get()
        self.result = list(enumerate(self.result))
        for i in self.result:
            print('ID No {}: {}'.format(i[0], i[1].split('/')[-1].replace('-', ' ')))
        th = input('Enter the ID No of the desired thread, or'
                   '\nmanually enter the URL of the desired thread: ')
        try:
            thr = int(th)
            thread = self.result[thr][-1]
        except ValueError:
            print('Manual entry detected, starting...')
            thread = th
        return thread

    def thread_spider(self):
        """Collect every full-size image link from the selected thread."""
        thread = self.select_thread()
        inx = []
        source_code = requests.get(thread)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for img in soup.find_all('a', {'class': "fileThumb"}):
            href = 'http:' + img.get('href')
            inx.append(href)
        return inx, thread

    def search_thread(self):
        """Download the selected thread's images into images/<board>/<thread title>/."""
        thread, pre_folder = self.thread_spider()
        folder = pre_folder.split('/')[-1].replace('-', ' ')
        for address in thread:
            name = address.split("/")[-1]
            d = 'images/' + str(self.board) + '/' + folder + '/'
            full_name = d + name
            if not os.path.exists(d):
                os.makedirs(d)
            urllib.request.urlretrieve(address, full_name)
            print(full_name)

    def menu(self):
        while True:
            cmd = input(
                '1 for board surface, 2 for a specific thread, '
                'and 3 for every thread on a board, or q to exit: ')
            if cmd == 'q':
                break
            # Reject bad choices before prompting for the board and page count.
            if cmd not in ('1', '2', '3'):
                print('Invalid choice, try again.')
                continue
            self.board = input(
                'What board would you like to scrape?: ')
            self.max_pages = int(input(
                'How many pages would you like to scrape?: '))
            if cmd == '1':
                self.search_board()
            elif cmd == '2':
                self.search_thread()
            else:
                self.deepsearch_thread()


if __name__ == '__main__':
    spider = Spider()
    spider.menu()
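# Example session (hypothetical input; actual filenames depend on the board):
#   1 for board surface, 2 for a specific thread, and 3 for every thread on a board, or q to exit: 1
#   What board would you like to scrape?: wg
#   How many pages would you like to scrape?: 2
#   images/wg/<filename>.jpg
#   ...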