Not a member of Pastebin yet? Sign up — it unlocks many cool features!
import os
from datetime import date, timedelta

import ray
import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from slugify import slugify
from tqdm import tqdm

# The scrapers below hit hosts with broken TLS (verify=False), so silence
# the InsecureRequestWarning spam once, up front.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def download_progess(url, dir_loc, file_name=''):
    """Download *url* into directory *dir_loc* with a tqdm progress bar.

    Parameters
    ----------
    url : str
        Direct download URL.
    dir_loc : str
        Destination directory (assumed to exist).
    file_name : str, optional
        '' (default): use the last path segment of *url* as the file name.
        A single character: treat it as a delimiter and name the file after
        the last piece of *url* split on it (e.g. '_' → tail after the last
        underscore).
        Anything longer: use it verbatim as the file name.
    """
    if file_name == '':
        file = url.split('/')[-1]
    elif len(file_name) == 1:
        # A single-character argument is a delimiter, not a literal name.
        file = url.split(file_name)[-1]
    else:
        file = file_name
    # Fix: close the response when done (streamed connections otherwise
    # linger), and default the Content-Length to 0 — headers.get() returns
    # None when the server omits it, which crashed int(). total=0 just makes
    # tqdm show an indeterminate total.
    with requests.get(url, stream=True, allow_redirects=True) as r:
        total_size = int(r.headers.get('content-length', 0))
        dest = os.path.join(dir_loc, file)
        with open(dest, 'wb') as f, tqdm(total=total_size, unit='B',
                                         unit_scale=True, desc=file,
                                         initial=0, ascii=True) as pbar:
            for ch in r.iter_content(chunk_size=1024):
                if ch:  # skip keep-alive chunks
                    f.write(ch)
                    pbar.update(len(ch))
def infowars_com_crawl(url):
    """Return the string content of every <guid> element in the feed at *url*."""
    response = requests.get(url, verify=False)
    parsed = BeautifulSoup(response.content, "html.parser")
    return [item.string for item in parsed.find_all("guid")]
@ray.remote
def infowars_com():
    """Download the newest item from the Infowars daily RSS feed."""
    latest = infowars_com_crawl('https://www.infowars.com/rss/daily')[0]
    print(latest)
    download_progess(latest, '/media/ramdisk')
def daterange(date1, date2):
    """Yield each date from *date1* through *date2*, inclusive."""
    current = date1
    while current <= date2:
        yield current
        current = current + timedelta(days=1)
def date_generator():
    """Return the last 8 days (today included) as 'MM-DD-YYYY' strings, newest first."""
    today = date.today()
    week_ago = today - timedelta(days=7)
    span = (today - week_ago).days + 1
    stamps = [(week_ago + timedelta(offset)).strftime('%m-%d-%Y')
              for offset in range(span)]
    stamps.reverse()
    return stamps
@ray.remote
def savage_nation_conservativestream_com():
    """Probe the last week's dates for a Savage Nation episode; download the first hit."""
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/83.0.4103.97 Safari/537.36"}
    for stamp in date_generator():
        candidate = 'http://maurocorporation.com/podcasts/michael-savage/Savage_' + stamp + '_WCB_FULL.mp3'
        try:
            probe = requests.get(candidate, headers=headers, verify=False)
        except Exception as exc:
            # Best-effort: log the failure and try the next date.
            print('%r generated an exception: %s' % (candidate, exc))
            continue
        if probe.status_code == 200:
            print(candidate)
            download_progess(candidate, '/media/ramdisk')
            break
def savage_nation_podbean_crawl(url, class_name, class_name2=''):
    """Scrape *url* for the newest podcast download link.

    Returns the href of the first element matching *class_name*. When
    *class_name2* is supplied, additionally builds a slugified file name
    from that element's text, prefixed with the date found in the next
    <span>, and returns a (link, file_name) tuple instead.
    """
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.content, "html.parser")
    last_podast_link = soup.find_all(class_=class_name)[0]['href']

    name_part = ''
    date_part = ''
    if class_name2 != '':
        name_elements = soup.find_all(class_=class_name2)
        name_part = slugify(name_elements[0].text)
        # The publish date sits in the <span> immediately after the title.
        date_part = soup.find(class_=class_name2).find_next('span').text

    if name_part == '':
        print(last_podast_link)
        return last_podast_link
    if date_part != '':
        name_part = str(date_part) + '_' + str(name_part)
    print(name_part)
    print(last_podast_link)
    return last_podast_link, name_part
@ray.remote
def savage_nation_podbean_com():
    """Two-stage Podbean scrape: show page → episode page → download."""
    detail_url = ('https://www.podbean.com/podcast-detail/gn5mk-377ef/Savage-Nation-with-Michael-Savage'
                  '-Podcast')
    episode_page = savage_nation_podbean_crawl(detail_url, 'download')
    link, base_file_name = savage_nation_podbean_crawl(episode_page, 'btn btn-ios download-btn', 'pod-name')
    download_progess(link, '/media/ramdisk', str(base_file_name) + '.mp3')
def string_between(search_string, first_str, second_str):
    """Return the slice of *search_string* between the first occurrences of
    *first_str* and *second_str* (exclusive of both markers)."""
    start = search_string.find(first_str) + len(first_str)
    end = search_string.find(second_str)
    return search_string[start:end]
def rss_gcnlive_com_crawl(url, class_name):
    """Collect download links for the newest show date on a GCN page.

    First pass: pull the date stamp out of the first hosted link
    ('filename=<date>_...'). Second pass: keep every link that carries
    the same stamp.
    """
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.content, "html.parser")
    elements = soup.find_all(class_=class_name)

    date_str = ''
    for candidate in elements:
        if 'http' in str(candidate):
            date_str = string_between(str(candidate), 'filename=', '_')
            break

    links = []
    for candidate in elements:
        if 'http' in str(candidate) and date_str in str(candidate):
            links.append(candidate['href'].strip())
    return links
@ray.remote
def rss_gcnlive_com():
    """Download every segment of the latest Alex Jones show from GCN."""
    for link in rss_gcnlive_com_crawl('http://rss.gcnlive.com/alexJones', 'btn btn-default'):
        # '_' tells download_progess to name the file after the URL tail
        # following the last underscore.
        download_progess(link, '/media/ramdisk', '_')
def main():
    """Start Ray and run all four downloaders in parallel."""
    ray.init()
    ray.get([
        infowars_com.remote(),
        savage_nation_conservativestream_com.remote(),
        savage_nation_podbean_com.remote(),
        rss_gcnlive_com.remote(),
    ])


# Fix: guard the entry point so importing this module no longer starts Ray
# and kicks off downloads as a side effect. (Dead commented-out serial
# calls removed.)
if __name__ == '__main__':
    main()
Add Comment
Please sign in to add a comment.