Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import random
- import time
- import json
- import zlib
- import urllib.parse
- import urllib.request
- import http.client
- from argparse import ArgumentParser
- from bs4 import BeautifulSoup
# Turn on verbose wire-level logging for both plain and TLS HTTP connections.
http.client.HTTPConnection.debuglevel = 1
http.client.HTTPSConnection.debuglevel = 1

# Command-line interface: -t/--throttle enables polite pacing between requests.
arg_parser = ArgumentParser()
arg_parser.add_argument(
    '-t', '--throttle',
    dest='throttle',
    action='store_true',
    help='enable http request throttling',
)
args = arg_parser.parse_args()
def throttle(min_seconds=15, max_seconds=16):
    """Sleep for a random whole number of seconds in [min_seconds, max_seconds].

    Generalized from the original hard-coded 15-16 second pause: callers may
    now tune the pacing, while a bare ``throttle()`` behaves exactly as before.

    Args:
        min_seconds: inclusive lower bound of the sleep duration (int).
        max_seconds: inclusive upper bound of the sleep duration (int).
    """
    time.sleep(random.randint(min_seconds, max_seconds))
def decode_response(response):
    """Return the response body as bytes, decompressing it if needed.

    BUG FIX: the original tested for a Content-Encoding of 'zlib', which is
    not a token servers actually send -- the standard token is 'deflate'
    (RFC 7231). 'zlib' is kept for backward compatibility. The body is also
    read exactly once, up front, since HTTP responses are not re-readable.

    Args:
        response: an HTTP response object exposing ``headers`` and ``read()``.

    Returns:
        The (decompressed) body as bytes.
    """
    # .get() mirrors email.message.Message semantics (missing header -> None)
    # and is also safe if headers is a plain dict.
    encoding = response.headers.get('Content-Encoding')
    body = response.read()
    if encoding in ('gzip', 'deflate', 'zlib'):
        try:
            # wbits = 32 + MAX_WBITS auto-detects gzip and zlib wrappers.
            return zlib.decompress(body, zlib.MAX_WBITS | 32)
        except zlib.error:
            # Some servers send raw deflate streams with no zlib header.
            return zlib.decompress(body, -zlib.MAX_WBITS)
    return body
def add_common_headers(request):
    """Attach browser-like headers to *request* (a urllib.request.Request).

    Mutates the request in place; returns None.

    BUG FIX: the original advertised ``br`` (brotli) in Accept-Encoding, but
    ``decode_response`` can only decompress gzip/deflate via zlib -- a server
    honoring ``br`` would return a body this script cannot decode. Only the
    encodings we can actually handle are advertised now.
    """
    request.add_header('Accept', 'application/json, text/plain, */*')
    request.add_header('Accept-Encoding', 'gzip, deflate')
    request.add_header('Accept-Language', 'en-US,en;q=0.5')
    request.add_header('Cache-Control', 'no-cache')
    request.add_header('Connection', 'keep-alive')
    request.add_header('Pragma', 'no-cache')
    # Impersonate a desktop Firefox so the site serves the normal pages.
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0')
    request.add_header('X-Requested-With', 'XMLHttpRequest')
def _fetch_page(page_url):
    """GET *page_url* with the common browser-like headers and Referer,
    and return the decoded response body. Shared request plumbing for
    fetch_list/fetch_transcript (the originals were byte-for-byte duplicates
    apart from the URL)."""
    request = urllib.request.Request(page_url)
    add_common_headers(request)
    request.add_header('Referer', 'https://www.seekingalpha.com/')
    response = urllib.request.urlopen(request)
    return decode_response(response)


def fetch_list(page):
    """Fetch one page of the earnings-call-transcripts index listing.

    Args:
        page: 1-based index-page number.

    Returns:
        The decoded response body for that listing page.
    """
    return _fetch_page('https://seekingalpha.com/earnings/earnings-call-transcripts/' + str(page))


def fetch_transcript(partial_url):
    """Fetch a full transcript as a single page.

    Args:
        partial_url: site-relative article path harvested from a listing page.

    Returns:
        The decoded response body for the transcript page.

    NOTE(review): partial_url typically starts with '/', so the concatenation
    yields 'https://seekingalpha.com//...' -- servers tolerate the double
    slash, but urllib.parse.urljoin would be cleaner. Left as-is to preserve
    the exact URLs requested.
    """
    return _fetch_page('https://seekingalpha.com/' + str(partial_url) + '?part=single')
# Crawl the transcript index pages, harvest the article links, then download
# each transcript into its own numbered .txt file.
page = ''
page_count = 4500

# Collect the HTML of every index page, throttling every other request when
# -t/--throttle is set. Chunks are joined once at the end instead of the
# original quadratic ``page += ...`` concatenation.
index_chunks = []
for x in range(1, page_count):
    if x % 2 == 0 and args.throttle:
        throttle()
    # BUG FIX: fetch_list returns bytes; the original wrapped them in str(),
    # producing a "b'...'" repr string full of escape sequences that was then
    # fed to BeautifulSoup. Decode the bytes to text instead.
    index_chunks.append(fetch_list(x).decode('utf-8', errors='replace'))
page = ''.join(index_chunks)
print(page)

parsed_html = BeautifulSoup(page, 'html.parser')
url_list = parsed_html.find_all('a', attrs={'class': 'dashboard-article-link'})

for i, url in enumerate(url_list):
    # Pace the article downloads: a long 25 s pause every 15th request,
    # otherwise a normal throttle every 5th (same schedule as the original,
    # including the pause before the very first article at i == 0).
    if args.throttle:
        if i % 15 == 0:
            time.sleep(25)
        elif i % 5 == 0:
            throttle()
    transcript = fetch_transcript(url.get('href'))
    parsed_html = BeautifulSoup(transcript, 'html.parser')
    transcript_text = parsed_html.body.find('article').text
    # BUG FIX: use a context manager so each file handle is closed; the
    # original leaked one open handle per transcript. An explicit encoding
    # avoids platform-dependent defaults.
    with open(str(i) + ".txt", "w", encoding='utf-8') as f:
        f.write(transcript_text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement