Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import random
- import time
- import json
- import zlib
- import urllib.parse
- import urllib.request
- import http.client
- from argparse import ArgumentParser
- from bs4 import BeautifulSoup
# Turn on verbose wire-level logging for both plain and TLS HTTP connections.
http.client.HTTPConnection.debuglevel = 1
http.client.HTTPSConnection.debuglevel = 1

# Command-line interface: -t/--throttle enables polite pacing between requests.
arg_parser = ArgumentParser()
arg_parser.add_argument(
    '-t', '--throttle',
    dest='throttle',
    action='store_true',
    help='enable http request throttling',
)
args = arg_parser.parse_args()
def throttle(min_seconds=15, max_seconds=16):
    """Sleep for a random whole number of seconds in [min_seconds, max_seconds].

    Generalized from the original hard-coded 15-16 second pause: callers may
    now tune the pacing, while a bare ``throttle()`` behaves exactly as before.

    Args:
        min_seconds: inclusive lower bound of the sleep duration (int).
        max_seconds: inclusive upper bound of the sleep duration (int).
    """
    time.sleep(random.randint(min_seconds, max_seconds))
def decode_response(response):
    """Return the response body as bytes, decompressing it if needed.

    BUG FIX: the original tested for a Content-Encoding of 'zlib', which is
    not a token servers actually send -- the standard token is 'deflate'
    (RFC 7231). 'zlib' is kept for backward compatibility. The body is also
    read exactly once, up front, since HTTP responses are not re-readable.

    Args:
        response: an HTTP response object exposing ``headers`` and ``read()``.

    Returns:
        The (decompressed) body as bytes.
    """
    # .get() mirrors email.message.Message semantics (missing header -> None)
    # and is also safe if headers is a plain dict.
    encoding = response.headers.get('Content-Encoding')
    body = response.read()
    if encoding in ('gzip', 'deflate', 'zlib'):
        try:
            # wbits = 32 + MAX_WBITS auto-detects gzip and zlib wrappers.
            return zlib.decompress(body, zlib.MAX_WBITS | 32)
        except zlib.error:
            # Some servers send raw deflate streams with no zlib header.
            return zlib.decompress(body, -zlib.MAX_WBITS)
    return body
def add_common_headers(request):
    """Attach browser-like headers to *request* (a urllib.request.Request).

    Mutates the request in place; returns None.

    BUG FIX: the original advertised ``br`` (brotli) in Accept-Encoding, but
    ``decode_response`` can only decompress gzip/deflate via zlib -- a server
    honoring ``br`` would return a body this script cannot decode. Only the
    encodings we can actually handle are advertised now.
    """
    request.add_header('Accept', 'application/json, text/plain, */*')
    request.add_header('Accept-Encoding', 'gzip, deflate')
    request.add_header('Accept-Language', 'en-US,en;q=0.5')
    request.add_header('Cache-Control', 'no-cache')
    request.add_header('Connection', 'keep-alive')
    request.add_header('Pragma', 'no-cache')
    # Impersonate a desktop Firefox so the site serves the normal pages.
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0')
    request.add_header('X-Requested-With', 'XMLHttpRequest')
def _fetch_page(page_url):
    """GET *page_url* with the common browser-like headers and Referer,
    and return the decoded response body. Shared request plumbing for
    fetch_list/fetch_transcript (the originals were byte-for-byte duplicates
    apart from the URL)."""
    request = urllib.request.Request(page_url)
    add_common_headers(request)
    request.add_header('Referer', 'https://www.seekingalpha.com/')
    response = urllib.request.urlopen(request)
    return decode_response(response)


def fetch_list(page):
    """Fetch one page of the earnings-call-transcripts index listing.

    Args:
        page: 1-based index-page number.

    Returns:
        The decoded response body for that listing page.
    """
    return _fetch_page('https://seekingalpha.com/earnings/earnings-call-transcripts/' + str(page))


def fetch_transcript(partial_url):
    """Fetch a full transcript as a single page.

    Args:
        partial_url: site-relative article path harvested from a listing page.

    Returns:
        The decoded response body for the transcript page.

    NOTE(review): partial_url typically starts with '/', so the concatenation
    yields 'https://seekingalpha.com//...' -- servers tolerate the double
    slash, but urllib.parse.urljoin would be cleaner. Left as-is to preserve
    the exact URLs requested.
    """
    return _fetch_page('https://seekingalpha.com/' + str(partial_url) + '?part=single')
# Crawl the transcript index pages, harvest the article links, then download
# each transcript into its own numbered .txt file.
page = ''
page_count = 4500

# Collect the HTML of every index page, throttling every other request when
# -t/--throttle is set. Chunks are joined once at the end instead of the
# original quadratic ``page += ...`` concatenation.
index_chunks = []
for x in range(1, page_count):
    if x % 2 == 0 and args.throttle:
        throttle()
    # BUG FIX: fetch_list returns bytes; the original wrapped them in str(),
    # producing a "b'...'" repr string full of escape sequences that was then
    # fed to BeautifulSoup. Decode the bytes to text instead.
    index_chunks.append(fetch_list(x).decode('utf-8', errors='replace'))
page = ''.join(index_chunks)
print(page)

parsed_html = BeautifulSoup(page, 'html.parser')
url_list = parsed_html.find_all('a', attrs={'class': 'dashboard-article-link'})

for i, url in enumerate(url_list):
    # Pace the article downloads: a long 25 s pause every 15th request,
    # otherwise a normal throttle every 5th (same schedule as the original,
    # including the pause before the very first article at i == 0).
    if args.throttle:
        if i % 15 == 0:
            time.sleep(25)
        elif i % 5 == 0:
            throttle()
    transcript = fetch_transcript(url.get('href'))
    parsed_html = BeautifulSoup(transcript, 'html.parser')
    transcript_text = parsed_html.body.find('article').text
    # BUG FIX: use a context manager so each file handle is closed; the
    # original leaked one open handle per transcript. An explicit encoding
    # avoids platform-dependent defaults.
    with open(str(i) + ".txt", "w", encoding='utf-8') as f:
        f.write(transcript_text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement