Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os
import random
import re
import sys
import time
import urllib
import urllib.request as urllib2
import xml.etree.ElementTree

import requests
from bs4 import BeautifulSoup
from yattag import Doc, indent
# 8ch.net /v/ scraper: polls the board index forever, mirrors the
# thread/post/image structure into myfile.xml, downloads images attached to
# posts not seen before, and records processed post numbers in saved.txt so
# the next cycle skips them.

BOARD_URL = 'https://www.8ch.net/v'
MEDIA_URL = 'https://media.8ch.net/' + 'v' + '/src/'
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36')


def clean(text):
    """Return *text* with every non-ASCII character replaced by a space."""
    return re.sub(r'[^\x00-\x7F]', ' ', text)


def fetch(url):
    """Fetch *url* and return the raw response bytes.

    Sends a browser User-Agent — presumably the site rejects the default
    urllib agent (NOTE(review): confirm; the original did the same).
    """
    req = urllib2.Request(url, headers={'User-Agent': USER_AGENT})
    return urllib2.urlopen(req).read()


def load_saved_posts(path='saved.txt'):
    """Return the post numbers recorded in *path* as a list of ints.

    Fix over the original: a missing file (first ever run) yields an empty
    list instead of raising, and blank lines are skipped instead of crashing
    int().
    """
    try:
        with open(path, 'r') as fh:
            return [int(line) for line in fh if line.strip()]
    except FileNotFoundError:
        return []


def save_posts(numbers, path='saved.txt'):
    """Overwrite *path* with *numbers* (strings), one per line."""
    # Fix: the original wrote `newfile.close` without parentheses, so the
    # handle was never closed; the context manager guarantees it.
    with open(path, 'w+') as fh:
        fh.write('\n'.join(numbers))


def op_images(soup, index):
    """Return cleaned image file names attached to the OP of thread *index*."""
    names = []
    thread = soup.findAll('div', {'class': 'thread'})[index]
    files_divs = thread.findAll('div', {'class': 'files'})
    if not files_divs:
        # OP without attachments: the original indexed [0] and crashed here.
        return names
    for file_div in files_divs[0].findAll('div', {'class': 'file'}):
        for anchor in file_div.findAll('a'):
            if anchor.text:
                names.append(clean(str(anchor.text)))
    return names


def op_post_numbers(soup):
    """Return the post number text of the first (OP) post of every thread."""
    numbers = []
    for thread in soup.findAll('div', {'class': 'thread'}):
        posts = thread.findAll('div', {'class': 'post'})
        if posts:
            # Second post_no anchor carries the visible number, as upstream.
            numbers.append(posts[0].findAll('a', {'class': 'post_no'})[1].text)
    return numbers


def build_board_xml(soup, title):
    """Serialize the board into <board><thread><Post><Image>... XML text."""
    doc, tag, text = Doc().tagtext()
    threads = soup.findAll('div', {'class': 'thread'})
    with tag('board', name=title):
        # enumerate() replaces the original's separate, desync-prone `j`
        # counter; it matched `num` whenever every thread had posts.
        for num, op_num in enumerate(op_post_numbers(soup)):
            with tag('thread', name=op_num):
                first = True
                for post in threads[num].findAll('div', {'class': 'post'}):
                    if first:
                        first = False
                        with tag('Post', name=op_num):
                            for image in op_images(soup, num):
                                with tag('Image'):
                                    text(image)
                    else:
                        # Iterating the second post_no anchor yields its text
                        # children (the visible post number), as the original did.
                        for number in post.findAll('a', {'class': 'post_no'})[1]:
                            with tag('Post', name=str(number)):
                                for file_div in post.findAll('div', {'class': 'file'}):
                                    for info in file_div.findAll('p', {'class': 'fileinfo'}):
                                        with tag('Image'):
                                            text(info.find('a').text)
    return indent(doc.getvalue(), indentation=' ' * 4, newline='\r\n')


def download(link, dest_dir):
    """Download the media file named *link* into *dest_dir*."""
    url = MEDIA_URL + link
    print(url)
    data = fetch(url)
    with open(os.path.join(dest_dir, link), 'wb') as output:
        output.write(data)
    time.sleep(2)  # throttle between media downloads


def main():
    """Scrape the board in an endless cycle with a randomized pause between runs."""
    while True:
        print(BOARD_URL)
        soup = BeautifulSoup(fetch(BOARD_URL), 'html.parser')
        title = soup.html.head.title.text

        board_xml = build_board_xml(soup, title)
        print(board_xml)
        with open('myfile.xml', 'w+') as fh:  # context manager: always closed
            fh.write(board_xml)
        time.sleep(5)

        # Re-read the XML we just built and collect every post number in it.
        root = xml.etree.ElementTree.parse('myfile.xml').getroot()
        posts_active = [int(post.get('name'))
                        for thread in root.findall('thread')
                        for post in thread.findall('Post')]
        posts_saved = load_saved_posts()

        # Drop posts already processed: O(1) set membership replaces the
        # original O(n^2) nested loops over two sets.
        saved = set(posts_saved)
        posts_active = [p for p in posts_active if p not in saved]
        print(posts_active, " is what's left")
        print('Hello?')

        # Download images belonging to the still-active (new) posts.
        active_names = {str(p) for p in posts_active}
        for thread in root.findall('thread'):
            thread_dir = './' + thread.get('name')
            for post in thread.findall('Post'):
                if post.get('name') in active_names:
                    for image in post.findall('Image'):
                        print(image.text, '; Post Number: ', post.get('name'))
                        print('OP Post Number: ', thread.get('name'))
                        if not os.path.exists(thread_dir):
                            os.makedirs(thread_dir)
                        download(image.text, thread_dir)

        # Persist everything we have now processed for the next cycle.
        outputa = [str(p) for p in posts_active + posts_saved]
        print(outputa, ' is our output\n\n\n')
        save_posts(outputa)
        print(outputa)
        print('Finished, waiting now')
        time.sleep(random.randint(60, 160))
        print('Finished, going to next')


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment