Advertisement
Guest User

Untitled

a guest
Sep 27th, 2016
51
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.69 KB | None | 0 0
  # Board media scraper (flat script; everything below runs at module level).
"""Download the media attached to every thread of every listed board.

Board names and links are read from the anchors in ``boardnames.html``.
For each board the catalog page (board link + ``c``) is fetched, thread
links are collected from its ``catalogMedia`` blocks, and every file
referenced by a ``mediaFileAttrb`` span inside a thread is written to
``boards/<board name>/<thread name>/<file name>``.
"""

#imports
import codecs
import os
import re

import requests
from bs4 import BeautifulSoup

BOARD_INDEX_FILE = 'boardnames.html'
OUTPUT_ROOT = 'boards'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36'
# Seconds to wait on any single HTTP request before giving up, so one
# stalled server cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Patterns are unchanged from the original script so that board, thread and
# file names (and therefore already-created directories) stay the same.
_SPACE_RUN = re.compile('( ){2,20}')
_THREAD_PREFIX = re.compile('.+(thread/)')
_IMAGE_PREFIX = re.compile('.+(images/)')
_VIDEO_PREFIX = re.compile('.+(videos/)')


def _load_boards(path):
    """Return a ``[name, link]`` pair for every anchor with an href in *path*.

    The name is the anchor text with newlines removed and runs of spaces
    collapsed; the link is the anchor's ``href`` value.
    """
    with open(path, 'r') as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')

    # href=True skips anchors without a link instead of raising KeyError.
    return [
        [_SPACE_RUN.sub(' ', anchor.text.replace('\n', '')), anchor['href']]
        for anchor in soup.find_all('a', href=True)
    ]


def _thread_links(session, board_link):
    """Return the thread links found on the catalog page of *board_link*."""
    response = session.get(board_link + 'c', timeout=REQUEST_TIMEOUT)
    catalog = BeautifulSoup(response.content, 'html.parser')

    links = []
    for block in catalog.find_all('div', class_='catalogMedia'):
        for anchor in block.find_all('a', href=True):
            if 'global' not in anchor['href']:
                links.append(anchor['href'])
    return links


def _media_links(session, thread_link):
    """Return the media links of *thread_link*, skipping 'filegone' entries."""
    response = session.get(thread_link, timeout=REQUEST_TIMEOUT)
    page = BeautifulSoup(response.content, 'html.parser')

    links = []
    for span in page.find_all('span', class_='mediaFileAttrb'):
        anchor = span.find('a', href=True)
        # A span without a usable anchor is ignored rather than crashing.
        if anchor is not None and 'filegone' not in anchor['href']:
            links.append(anchor['href'])
    return links


def _save_media(session, board_name, thread_link, media_link):
    """Download *media_link* into the directory of its board and thread."""
    print(media_link)
    print('Board Name:', board_name)
    thread_name = _THREAD_PREFIX.sub('', thread_link)
    print('Thread Name:', thread_name)
    image_name = _VIDEO_PREFIX.sub('', _IMAGE_PREFIX.sub('', media_link))
    print('Image Name:', image_name)

    response = session.get(media_link, timeout=REQUEST_TIMEOUT)
    if not response.ok:
        # Do not store an HTTP error page under a media file name.
        print('Skipping (HTTP %d):' % response.status_code, media_link)
        return

    # NOTE(review): the path components come straight from remote hrefs and
    # are not sanitised; confirm the site never serves names containing
    # path separators or '..'.
    target_dir = os.path.join(OUTPUT_ROOT, board_name, thread_name)
    # Creates the board directory too when it is missing; existing
    # directories are fine, real failures (e.g. permissions) now surface.
    os.makedirs(target_dir, exist_ok=True)

    with open(os.path.join(target_dir, image_name), 'wb') as out_file:
        out_file.write(response.content)


#boards
boardlist = _load_boards(BOARD_INDEX_FILE)

print(boardlist)

with requests.Session() as session:
    # Set once on the session instead of repeating it on every request.
    session.headers['User-Agent'] = USER_AGENT

    for name, link in boardlist:
        print(name)

        for thread_link in _thread_links(session, link):
            print('Thread Link:', thread_link)

            for media_link in _media_links(session, thread_link):
                _save_media(session, name, thread_link, media_link)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement