Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #imports
- import codecs
- from bs4 import BeautifulSoup
- import re
- import os
- import requests
# Board scraper: reads board links from a local HTML index, walks every
# board's catalog page, and saves each thread's media files under
# boards/<board name>/<thread name>/.

BOARD_INDEX_FILE = 'boardnames.html'
OUTPUT_ROOT = 'boards'
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36'
)
REQUEST_HEADERS = {'User-Agent': USER_AGENT}


def clean_board_name(text):
    """Return anchor text with newlines removed and space runs collapsed.

    Runs of 2 to 20 consecutive spaces are replaced by a single space.
    """
    return re.sub('( ){2,20}', ' ', text.replace('\n', ''))


def thread_name_from_url(thread_url):
    """Return the part of *thread_url* that follows its last 'thread/'."""
    return re.sub('.+(thread/)', '', thread_url)


def media_name_from_url(media_url):
    """Return the part of *media_url* after its 'images/' or 'videos/' segment."""
    without_images = re.sub('.+(images/)', '', media_url)
    return re.sub('.+(videos/)', '', without_images)


def media_path(board_name, thread_name, media_name):
    """Build the output path boards/<board>/<thread>/<media file>.

    Uses os.path.join so the directory tree is created correctly on every
    platform, not only where the backslash is the path separator.
    """
    return os.path.join(OUTPUT_ROOT, board_name, thread_name, media_name)


def load_boards(index_path=BOARD_INDEX_FILE):
    """Parse the local board index and return a list of [name, link] pairs.

    Every <a> element in the file contributes one pair: its cleaned text and
    its href attribute.
    """
    # The context manager closes the file even if reading fails.
    with open(index_path, 'r') as index_file:
        markup = index_file.read()
    soup = BeautifulSoup(markup, 'html.parser')
    return [
        [clean_board_name(anchor.text), anchor['href']]
        for anchor in soup.find_all('a')
    ]


def thread_links(session, board_link):
    """Return the thread links found on a board's catalog page.

    The catalog is requested at ``board_link + 'c'``. Anchors inside
    ``div.catalogMedia`` elements are collected, skipping any href that
    contains 'global'.
    """
    response = session.get(board_link + 'c', headers=REQUEST_HEADERS)
    catalog = BeautifulSoup(response.content, 'html.parser')
    links = []
    # Attribute filters are passed as a dict; the original passed a set
    # literal here, which is not the documented filter form.
    for cell in catalog.find_all('div', {'class': 'catalogMedia'}):
        for anchor in cell.find_all('a'):
            if 'global' not in anchor['href']:
                links.append(anchor['href'])
    return links


def media_links(session, thread_url):
    """Return the media file links listed on a thread page.

    Links are read from the first anchor of every ``span.mediaFileAttrb``;
    hrefs containing 'filegone' are skipped (presumably placeholders for
    removed files -- TODO confirm).
    """
    response = session.get(thread_url, headers=REQUEST_HEADERS)
    page = BeautifulSoup(response.content, 'html.parser')
    links = []
    for attributes in page.find_all('span', {'class': 'mediaFileAttrb'}):
        anchor = attributes.find('a')
        if anchor is None:
            # A span without a link has nothing to download.
            continue
        href = anchor['href']
        if 'filegone' not in href:
            links.append(href)
    return links


def save_media(session, board_name, thread_url, media_url):
    """Download one media file and write it into its thread directory."""
    print(media_url)
    print('Board Name:', board_name)
    thread_name = thread_name_from_url(thread_url)
    print('Thread Name:', thread_name)
    media_name = media_name_from_url(media_url)
    print('Image Name:', media_name)

    response = session.get(media_url, headers=REQUEST_HEADERS)
    if not response.ok:
        # Do not store an HTTP error page under the media file's name.
        print('Download failed with status:', response.status_code)
        return

    target = media_path(board_name, thread_name, media_name)
    # exist_ok tolerates an already-created directory while still surfacing
    # real failures such as permission errors.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, 'wb') as media_file:
        media_file.write(response.content)


def main():
    """Scrape every board listed in the local index file."""
    boardlist = load_boards()
    print(boardlist)
    with requests.Session() as session:
        for name, link in boardlist:
            print(name)
            for thread_url in thread_links(session, link):
                print('Thread Link:', thread_url)
                for media_url in media_links(session, thread_url):
                    save_media(session, name, thread_url, media_url)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement