Guest User

8chan image scraper

a guest
Apr 30th, 2016
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.29 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3. import urllib
  4. import sys
  5. import re
  6. import time
  7. from yattag import Doc, indent
  8. import xml.etree.ElementTree
  9. import os
  10. import urllib.request as urllib2
  11. import random
  12.  
  13.  
  14. while(True):
  15.     #read from text file the html input
  16.     #f = open('input.txt','r', encoding='utf-8')
  17.     #z = f.readline()
  18.     url='https://www.8ch.net/v'
  19.     print(url)
  20.     headers='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36'
  21.  
  22.     req = urllib2.Request(url, headers={ 'User-Agent': headers })
  23.     html = urllib2.urlopen(req).read()
  24.  
  25.     soup = BeautifulSoup(html, "html.parser")
  26.  
  27.     title = soup.html.head.title.text
  28.  
  29.     #removes non-valid characters
  30.     def clean(text):
  31.         return re.sub(r'[^\x00-\x7F]',' ', text)
  32.  
  33.     #used in isolating the images for the OP
  34.     def OPimgs(index):
  35.         tmparr=[]
  36.         for images in soup.findAll(('div'),{"class":"thread"})[index].findAll('div',{"class":"files"})[0].findAll('div',{"class":"file"}):
  37.             for answers in images.findAll('a'):
  38.                 if (str)(answers.text):
  39.                     tmparr.append(clean((str)(answers.text)))
  40.         return tmparr
  41.  
  42.     #filling OP posts into array for indexing
  43.     OPnums=[]
  44.     for threads in soup.findAll(('div'),{"class":"thread"}):
  45.         op=True
  46.         for posts in threads.findAll(('div'),{"class":"post"}):
  47.             if op==True:
  48.                 op=False
  49.                 OPnums.append(posts.findAll('a',{"class":"post_no"})[1].text)
  50.  
  51.  
  52.     #setting up xml
  53.     doc, tag, text = Doc().tagtext()
  54.     i=True
  55.     j=0
  56.     with tag('board', name=title):
  57.         for num in range(0,len(OPnums)):
  58.             i=True
  59.             with tag('thread', name=OPnums[num]):
  60.                 for threads in soup.findAll(('div'),{"class":"thread"})[num].findAll(('div'),{"class":"post"}):
  61.                     if i==True:
  62.                         i=False
  63.                         with tag('Post',name=OPnums[num]):
  64.                             for image in OPimgs(j):
  65.                                 with tag('Image'):
  66.                                     text(image)
  67.                         j+=1
  68.                     else:
  69.                         for number in threads.findAll('a',{"class":"post_no"})[1]:
  70.                             with tag('Post', name=(str)(number)):
  71.                                 for images in threads.findAll('div',{"class":"file"}):
  72.                                     for pclass in images.findAll('p',{"class":"fileinfo"}):
  73.                                         with tag('Image'):
  74.                                             text(pclass.find('a').text)
  75.  
  76.     result = indent(
  77.         doc.getvalue(),
  78.         indentation = ' '*4,
  79.         newline = '\r\n'
  80.     )
  81.  
  82.     print(result)
  83.  
  84.     #write xml to file
  85.     file = open('myfile.xml', 'w+')
  86.     file.write(result)
  87.     file.close()
  88.  
  89.     time.sleep(5)
  90.  
  91.     #reading myfile(the xml tree just built)
  92.     e = xml.etree.ElementTree.parse('myfile.xml').getroot()
  93.     postsActive=[]
  94.     postsSaved=[]
  95.  
  96.     #returns active posts in an array
  97.     for atype in e.findall('thread'):
  98.         for find in atype.findall('Post'):
  99.             postsActive.append((int)(find.get('name')))
  100.  
  101.     #return posts we've already saved
  102.     file = open('saved.txt', 'r')
  103.     postsSaved = file.readlines()
  104.     postsSaved = list(map(int, postsSaved))
  105.  
  106.     #converting to a set for filtering
  107.     s1=set(postsSaved)
  108.     s2=set(postsActive)
  109.  
  110.     #remove posts found in saved.csv array
  111.     for x in s1:
  112.         for y in s2:
  113.             if x == y:
  114.                 postsActive.remove(x)
  115.                
  116.     print(postsActive,  ' is what\'s left')
  117.     print('Hello?')
  118.    
  119.     #downloading function
  120.     def download(link, newDir):
  121.         url='https://media.8ch.net/' + 'v' + '/src/' + link
  122.         print(url)
  123.         headers='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36'
  124.         req = urllib2.Request(url, headers={ 'User-Agent': headers })
  125.         html = urllib2.urlopen(req).read()
  126.         with open(os.path.join(newDir, link),'wb') as output:
  127.             output.write(html)
  128.         time.sleep(2)
  129.  
  130.     for threads in e.findall('thread'):
  131.         for posts in threads.findall('Post'):
  132.             for result in postsActive:
  133.                 if posts.get('name')==(str)(result):
  134.                     for images in posts.findall('Image'):
  135.                         print(images.text, '; Post Number: ', result)
  136.                         print('OP Post Number: ', threads.get('name'))
  137.                         if not os.path.exists('./' + threads.get('name')):
  138.                             os.makedirs('./' + threads.get('name'))
  139.                         download(images.text, './' + threads.get('name'))
  140.  
  141.     #saving csv
  142.     outputa=postsActive + postsSaved
  143.     outputa = list(map(str, outputa))
  144.     newfile = open('saved.txt', 'w+')
  145.     print(outputa, ' is our output\n\n\n')
  146.     newfile.write("\n".join(outputa))
  147.     newfile.close
  148.  
  149.  
  150.     print(outputa)
  151.     print('Finished, waiting now')
  152.     time.sleep(random.randint(60, 160))
  153.     print('Finished, going to next')
Advertisement
Add Comment
Please, Sign In to add comment