from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import os
import ray
from datetime import date, timedelta
from slugify import slugify
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Several feeds below are fetched with verify=False, so silence the
# per-request InsecureRequestWarning noise up front.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


def download_progress(url, dir_loc, file_name=''):
    # Pick the local file name: by default the last URL path segment; a
    # single-character file_name acts as a delimiter to split the URL on
    # instead; anything longer is used verbatim as the file name.
    if file_name == '':
        file = url.split('/')[-1]
    elif len(file_name) == 1:
        file = url.split(file_name)[-1]
    else:
        file = file_name
    r = requests.get(url, stream=True, allow_redirects=True)
    # Fall back to 0 if the server omits Content-Length; tqdm then shows
    # progress without a known total.
    total_size = int(r.headers.get('content-length', 0))
    with open(os.path.join(dir_loc, file), 'wb') as f:
        with tqdm(total=total_size, unit='B', unit_scale=True, desc=file, ascii=True) as pbar:
            for ch in r.iter_content(chunk_size=1024):
                if ch:
                    f.write(ch)
                    pbar.update(len(ch))
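# Usage sketch (hypothetical URL and directory, not one of the feeds below):
#   download_progress('https://example.com/shows/episode.mp3', '/tmp')
# saves /tmp/episode.mp3 and renders a byte-level progress bar while writing.

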
def infowars_com_crawl(url):
    # Collect the text of every <guid> element in the RSS feed; the script
    # relies on each guid being a direct download URL.
    urls = []
    page = requests.get(url, verify=False)
    soup = BeautifulSoup(page.content, "html.parser")
    for link in soup.find_all("guid"):
        urls.append(link.string)
    return urls


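# RSS feeds conventionally list the newest item first, so infowars_com below
# takes index 0 on the assumption that it is the latest daily episode.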
@ray.remote
def infowars_com():
    url = infowars_com_crawl('https://www.infowars.com/rss/daily')[0]
    print(url)
    download_progress(url, '/media/ramdisk')


def daterange(date1, date2):
    for n in range(int((date2 - date1).days) + 1):
        yield date1 + timedelta(n)
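# Example: daterange(date(2020, 8, 13), date(2020, 8, 15)) yields the three
# dates 2020-08-13, 2020-08-14 and 2020-08-15 (both endpoints included).

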
def date_generator():
    # Build the last eight days (today plus the seven before it) as
    # MM-DD-YYYY strings, newest first.
    dates = []
    current_date = date.today()
    days_before = date.today() - timedelta(days=7)
    for dt in daterange(days_before, current_date):
        dates.append(dt.strftime('%m-%d-%Y'))
    dates.reverse()
    return dates
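# Run on 2020-08-15, date_generator() returns eight strings, newest first:
# ['08-15-2020', '08-14-2020', ..., '08-08-2020'].

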
@ray.remote
def savage_nation_conservativestream_com():
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/83.0.4103.97 Safari/537.36"}
    # Probe the predictable episode URL for each of the last eight days,
    # newest first, and download the first one that exists.
    for date_tmp in date_generator():
        url = 'http://maurocorporation.com/podcasts/michael-savage/Savage_' + date_tmp + '_WCB_FULL.mp3'
        try:
            r = requests.get(url, headers=headers, verify=False)
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            if r.status_code == 200:
                print(url)
                download_progress(url, '/media/ramdisk')
                break


def savage_nation_podbean_crawl(url, class_name, class_name2=''):
    valid_file_name = ''
    valid_date = ''
    page = requests.get(url, verify=False)
    soup = BeautifulSoup(page.content, "html.parser")
    page_element = soup.find_all(class_=class_name)
    last_podcast_link = page_element[0]['href']
    if class_name2 != '':
        # Derive a file name from the episode title and its date stamp.
        page_element2 = soup.find_all(class_=class_name2)
        valid_file_name = slugify(page_element2[0].text)
        page_element3 = soup.find(class_=class_name2)
        valid_date = page_element3.find_next('span').text
    if valid_file_name == '':
        print(last_podcast_link)
        return last_podcast_link
    if valid_date != '':
        valid_file_name = str(valid_date) + '_' + str(valid_file_name)
    print(valid_file_name)
    print(last_podcast_link)
    return last_podcast_link, valid_file_name


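# savage_nation_podbean_crawl returns two shapes: a bare link when only
# class_name is given, and a (link, file_name) tuple when class_name2 is
# supplied. The downloader below leans on that, crawling once for the newest
# episode's page and once more for its direct link plus a dated file name.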
@ray.remote
def savage_nation_podbean_com():
    link = savage_nation_podbean_crawl('https://www.podbean.com/podcast-detail/gn5mk-377ef/Savage-Nation-with-Michael-Savage'
                                       '-Podcast', 'download')
    link, base_file_name = savage_nation_podbean_crawl(link, 'btn btn-ios download-btn', 'pod-name')
    base_file_name = str(base_file_name) + '.mp3'
    download_progress(link, '/media/ramdisk', base_file_name)


def string_between(search_string, first_str, second_str):
    # Return the substring between the first occurrence of first_str and
    # the first occurrence of second_str.
    return search_string[search_string.find(first_str) + len(first_str):search_string.find(second_str)]
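# Worked example:
#   string_between('filename=08-15-2020_h1.mp3', 'filename=', '_')
# returns '08-15-2020'.

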
def rss_gcnlive_com_crawl(url, class_name):
    urls = []
    date_str = ''
    page = requests.get(url, verify=False)
    soup = BeautifulSoup(page.content, "html.parser")
    page_element = soup.find_all(class_=class_name)
    # First pass: pull the date token out of the newest download link (the
    # part of its URL between 'filename=' and the first '_').
    for element in page_element:
        if 'http' in str(element):
            date_str = string_between(str(element), 'filename=', '_')
            break
    # Second pass: keep every download link carrying that same date, i.e.
    # everything published under the latest date.
    for element in page_element:
        if 'http' in str(element):
            if date_str in str(element):
                download_url = element['href'].strip()
                urls.append(download_url)
    return urls


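# rss_gcnlive_com below passes '_' as file_name, so download_progress names
# each file after whatever follows the last underscore in its URL; given the
# 'filename=<date>_...' pattern assumed above, that drops the date prefix and
# keeps the per-segment remainder as the local name.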
@ray.remote
def rss_gcnlive_com():
    urls = rss_gcnlive_com_crawl('http://rss.gcnlive.com/alexJones', 'btn btn-default')
    for url in urls:
        download_progress(url, '/media/ramdisk', '_')


ray.init()
ray.get([infowars_com.remote(), savage_nation_conservativestream_com.remote(), savage_nation_podbean_com.remote(), rss_gcnlive_com.remote()])
# infowars_com()
# savage_nation_conservativestream_com()
# savage_nation_podbean_com()
# rss_gcnlive_com()
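# The commented-out calls above are a serial fallback for debugging a single
# scraper; they only work if the @ray.remote decorators are removed, since
# Ray remote functions cannot be called directly.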