Advertisement
Guest User

Untitled

a guest
Jul 22nd, 2018
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.71 KB | None | 0 0
  1. import random
  2. import time
  3. import json
  4. import zlib
  5. import urllib.parse
  6. import urllib.request
  7. import http.client
  8. from argparse import ArgumentParser
  9. from bs4 import BeautifulSoup
  10.  
  11. http.client.HTTPConnection.debuglevel = 1
  12. http.client.HTTPSConnection.debuglevel = 1
  13.  
  14. parser = ArgumentParser()
  15. parser.add_argument('-t', '--throttle', dest='throttle', action='store_true',
  16.                     help='enable http request throttling')
  17. args = parser.parse_args()
  18.  
  19. def throttle():
  20.   throttle_amount = random.randint(15, 16)
  21.   time.sleep(throttle_amount)
  22.  
  23. def decode_response(response):
  24.   if response.headers['Content-Encoding'] == 'gzip' or response.headers['Content-Encoding'] == 'zlib':
  25.     return zlib.decompress(response.read(), 15+32)
  26.   return response.read()
  27.  
  28. def add_common_headers(request):
  29.   request.add_header('Accept', 'application/json, text/plain, */*')
  30.   request.add_header('Accept-Encoding', 'gzip, deflate, br')
  31.   request.add_header('Accept-Language', 'en-US,en;q=0.5')
  32.   request.add_header('Cache-Control', 'no-cache')
  33.   request.add_header('Connection', 'keep-alive')
  34.   request.add_header('Pragma', 'no-cache')
  35.   request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0')
  36.   request.add_header('X-Requested-With', 'XMLHttpRequest')
  37.  
  38. def fetch_list(page):
  39.   page_url = 'https://seekingalpha.com/earnings/earnings-call-transcripts/' + str(page)
  40.   request = urllib.request.Request(page_url)
  41.   add_common_headers(request)
  42.   request.add_header('Referer', 'https://www.seekingalpha.com/')
  43.   response = urllib.request.urlopen(request)
  44.   response_body = decode_response(response)
  45.   return response_body
  46.  
  47. def fetch_transcript(partial_url):
  48.   page_url = 'https://seekingalpha.com/' + str(partial_url) + '?part=single'
  49.   request = urllib.request.Request(page_url)
  50.   add_common_headers(request)
  51.   request.add_header('Referer', 'https://www.seekingalpha.com/')
  52.   response = urllib.request.urlopen(request)
  53.   response_body = decode_response(response)
  54.   return response_body
  55.  
  56. page = ''
  57. page_count = 4500
  58.  
  59. for x in range(1, page_count):
  60.   if x % 2 == 0:
  61.     if (args.throttle):
  62.       throttle()
  63.   page += str(fetch_list(x))
  64.  
  65. print(page)
  66.  
  67. parsed_html = BeautifulSoup(page, 'html.parser')
  68. url_list = parsed_html.find_all('a', attrs={'class':'dashboard-article-link'})
  69.  
  70. i = 0
  71. for url in url_list:
  72.   if i % 15 == 0:
  73.     if (args.throttle):
  74.       time.sleep(25)
  75.   elif i % 5 == 0:
  76.     if (args.throttle):
  77.       throttle()
  78.   transcript = fetch_transcript(url.get('href'))
  79.   parsed_html = BeautifulSoup(transcript, 'html.parser')
  80.   transcript_text = parsed_html.body.find('article').text
  81.  
  82.   f = open(str(i) + ".txt", "w")
  83.   f.write(transcript_text)
  84.   i += 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement