Not a member of Pastebin yet? Sign up — it unlocks many cool features!
import os
from datetime import date, timedelta

import ray
import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from slugify import slugify
from tqdm import tqdm

# The scrapers below hit hosts with broken TLS (verify=False), so silence
# the InsecureRequestWarning spam once, up front.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def download_progess(url, dir_loc, file_name=''):
    """Download *url* into directory *dir_loc* with a tqdm progress bar.

    Parameters
    ----------
    url : str
        Direct download URL.
    dir_loc : str
        Destination directory (assumed to exist).
    file_name : str, optional
        '' (default): use the last path segment of *url* as the file name.
        A single character: treat it as a delimiter and name the file after
        the last piece of *url* split on it (e.g. '_' → tail after the last
        underscore).
        Anything longer: use it verbatim as the file name.
    """
    if file_name == '':
        file = url.split('/')[-1]
    elif len(file_name) == 1:
        # A single-character argument is a delimiter, not a literal name.
        file = url.split(file_name)[-1]
    else:
        file = file_name
    # Fix: close the response when done (streamed connections otherwise
    # linger), and default the Content-Length to 0 — headers.get() returns
    # None when the server omits it, which crashed int(). total=0 just makes
    # tqdm show an indeterminate total.
    with requests.get(url, stream=True, allow_redirects=True) as r:
        total_size = int(r.headers.get('content-length', 0))
        dest = os.path.join(dir_loc, file)
        with open(dest, 'wb') as f, tqdm(total=total_size, unit='B',
                                         unit_scale=True, desc=file,
                                         initial=0, ascii=True) as pbar:
            for ch in r.iter_content(chunk_size=1024):
                if ch:  # skip keep-alive chunks
                    f.write(ch)
                    pbar.update(len(ch))
def infowars_com_crawl(url):
    """Return the string content of every <guid> element in the feed at *url*."""
    response = requests.get(url, verify=False)
    parsed = BeautifulSoup(response.content, "html.parser")
    return [item.string for item in parsed.find_all("guid")]
@ray.remote
def infowars_com():
    """Download the newest item from the Infowars daily RSS feed."""
    latest = infowars_com_crawl('https://www.infowars.com/rss/daily')[0]
    print(latest)
    download_progess(latest, '/media/ramdisk')
def daterange(date1, date2):
    """Yield each date from *date1* through *date2*, inclusive."""
    current = date1
    while current <= date2:
        yield current
        current = current + timedelta(days=1)
def date_generator():
    """Return the last 8 days (today included) as 'MM-DD-YYYY' strings, newest first."""
    today = date.today()
    week_ago = today - timedelta(days=7)
    span = (today - week_ago).days + 1
    stamps = [(week_ago + timedelta(offset)).strftime('%m-%d-%Y')
              for offset in range(span)]
    stamps.reverse()
    return stamps
@ray.remote
def savage_nation_conservativestream_com():
    """Probe the last week's dates for a Savage Nation episode; download the first hit."""
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/83.0.4103.97 Safari/537.36"}
    for stamp in date_generator():
        candidate = 'http://maurocorporation.com/podcasts/michael-savage/Savage_' + stamp + '_WCB_FULL.mp3'
        try:
            probe = requests.get(candidate, headers=headers, verify=False)
        except Exception as exc:
            # Best-effort: log the failure and try the next date.
            print('%r generated an exception: %s' % (candidate, exc))
            continue
        if probe.status_code == 200:
            print(candidate)
            download_progess(candidate, '/media/ramdisk')
            break
def savage_nation_podbean_crawl(url, class_name, class_name2=''):
    """Scrape *url* for the newest podcast download link.

    Returns the href of the first element matching *class_name*. When
    *class_name2* is supplied, additionally builds a slugified file name
    from that element's text, prefixed with the date found in the next
    <span>, and returns a (link, file_name) tuple instead.
    """
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.content, "html.parser")
    last_podast_link = soup.find_all(class_=class_name)[0]['href']

    name_part = ''
    date_part = ''
    if class_name2 != '':
        name_elements = soup.find_all(class_=class_name2)
        name_part = slugify(name_elements[0].text)
        # The publish date sits in the <span> immediately after the title.
        date_part = soup.find(class_=class_name2).find_next('span').text

    if name_part == '':
        print(last_podast_link)
        return last_podast_link
    if date_part != '':
        name_part = str(date_part) + '_' + str(name_part)
    print(name_part)
    print(last_podast_link)
    return last_podast_link, name_part
@ray.remote
def savage_nation_podbean_com():
    """Two-stage Podbean scrape: show page → episode page → download."""
    detail_url = ('https://www.podbean.com/podcast-detail/gn5mk-377ef/Savage-Nation-with-Michael-Savage'
                  '-Podcast')
    episode_page = savage_nation_podbean_crawl(detail_url, 'download')
    link, base_file_name = savage_nation_podbean_crawl(episode_page, 'btn btn-ios download-btn', 'pod-name')
    download_progess(link, '/media/ramdisk', str(base_file_name) + '.mp3')
def string_between(search_string, first_str, second_str):
    """Return the slice of *search_string* between the first occurrences of
    *first_str* and *second_str* (exclusive of both markers)."""
    start = search_string.find(first_str) + len(first_str)
    end = search_string.find(second_str)
    return search_string[start:end]
def rss_gcnlive_com_crawl(url, class_name):
    """Collect download links for the newest show date on a GCN page.

    First pass: pull the date stamp out of the first hosted link
    ('filename=<date>_...'). Second pass: keep every link that carries
    the same stamp.
    """
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.content, "html.parser")
    elements = soup.find_all(class_=class_name)

    date_str = ''
    for candidate in elements:
        if 'http' in str(candidate):
            date_str = string_between(str(candidate), 'filename=', '_')
            break

    links = []
    for candidate in elements:
        if 'http' in str(candidate) and date_str in str(candidate):
            links.append(candidate['href'].strip())
    return links
@ray.remote
def rss_gcnlive_com():
    """Download every segment of the latest Alex Jones show from GCN."""
    for link in rss_gcnlive_com_crawl('http://rss.gcnlive.com/alexJones', 'btn btn-default'):
        # '_' tells download_progess to name the file after the URL tail
        # following the last underscore.
        download_progess(link, '/media/ramdisk', '_')
def main():
    """Start Ray and run all four downloaders in parallel."""
    ray.init()
    ray.get([
        infowars_com.remote(),
        savage_nation_conservativestream_com.remote(),
        savage_nation_podbean_com.remote(),
        rss_gcnlive_com.remote(),
    ])


# Fix: guard the entry point so importing this module no longer starts Ray
# and kicks off downloads as a side effect. (Dead commented-out serial
# calls removed.)
if __name__ == '__main__':
    main()
Add Comment
Please sign in to add a comment.