Advertisement
Guest User

Untitled

a guest
Aug 15th, 2017
92
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.52 KB | None | 0 0
  1. import json
  2. import requests
  3. import pandas as pd
  4. import matplotlib.pyplot as plt
  5. from dateutil.relativedelta import relativedelta
  6. from datetime import date
  7. from flatten_json import flatten
  8. from tqdm import tnrange as trange
  9. from time import sleep
  10.  
  11. class CrimsonHexagonClient(object):
  12. """Interacts with the Crimson Hexagon API to retrieve post data (twitter ids
  13. etc.) from a configured monitor.
  14.  
  15. Docs:
  16. https://apidocs.crimsonhexagon.com/v1.0/reference
  17.  
  18. Args:
  19. username (str): Username on website.
  20. password (str): Password on website.
  21. monitor_id (str): id of crimson monitor.
  22. """
  23.  
  24. def __init__(self, username, password, monitor_id):
  25. self.username = username
  26. self.password = password
  27. self.monitor_id = monitor_id
  28. self.base = 'https://api.crimsonhexagon.com/api/monitor'
  29. self.session = requests.Session()
  30. self.ratelimit_refresh = 60
  31. self._auth()
  32.  
  33. def _auth(self):
  34. """Authenticates a user using their username and password through the
  35. authenticate endpoint.
  36. """
  37. url = 'https://forsight.crimsonhexagon.com/api/authenticate?'
  38.  
  39. payload = {
  40. 'username': self.username,
  41. 'password': self.password
  42. }
  43.  
  44. r = self.session.get(url, params=payload)
  45. j_result = r.json()
  46. self.auth_token = j_result["auth"]
  47. print('-- Authenticated --')
  48. return
  49.  
  50. def make_endpoint(self, endpoint):
  51. return '{}/{}?'.format(self.base, endpoint)
  52.  
  53. def get_data_from_endpoint(self, from_, to_, endpoint):
  54. """Hits the designated endpoint (volume/posts) for a specified time period.
  55. The ratelimit is burned through ASAP and then backed off for one minute.
  56. """
  57. endpoint = self.make_endpoint(endpoint)
  58. from_, to_ = str(from_), str(to_)
  59. payload = {
  60. 'auth': self.auth_token,
  61. 'id': self.monitor_id,
  62. 'start': from_,
  63. 'end': to_,
  64. 'extendLimit': 'true',
  65. 'fullContents': 'true'
  66. }
  67.  
  68. r = self.session.get(endpoint, params=payload)
  69. self.last_response = r
  70.  
  71. ratelimit_remaining = r.headers['X-RateLimit-Remaining']
  72.  
  73. # If the header is empty or 0 then wait for a ratelimit refresh.
  74. if (not ratelimit_remaining) or (float(ratelimit_remaining) < 1):
  75. print('Waiting for ratelimit refresh...')
  76. sleep(self.ratelimit_refresh)
  77.  
  78. return r
  79.  
  80. def get_dates_from_timespan(self, r_volume, max_documents=10000):
  81. """Divides the time period into chunks of less than 10k where possible.
  82. """
  83. # If the count is less than max, just return the original time span.
  84. if r_volume.json()['numberOfDocuments'] <= max_documents:
  85. l_dates = [[pd.to_datetime(r_volume.json()['startDate']).date(),
  86. pd.to_datetime(r_volume.json()['endDate']).date()]]
  87. return l_dates
  88.  
  89. # Convert json to df for easier subsetting & to calculate cumulative sum.
  90. df = pd.DataFrame(r_volume.json()['volume'])
  91. df['startDate'] = pd.to_datetime(df['startDate'])
  92. df['endDate'] = pd.to_datetime(df['endDate'])
  93.  
  94. l_dates = []
  95.  
  96. while True:
  97. df['cumulative_sum'] = df['numberOfDocuments'].cumsum()
  98.  
  99. # Find the span whose cumulative sum is below the threshold.
  100. df_below = df[df['cumulative_sum'] <= max_documents]
  101.  
  102. # If there are 0 rows under threshold.
  103. if (df_below.empty):
  104. # If there are still rows left, use the first row.
  105. if len(df) > 0:
  106. # This entry will have over 10k, but we can't go more
  107. # granular than one day.
  108. df_below = df.iloc[0:1]
  109. else:
  110. break
  111.  
  112. # Take the first row's start date and last row's end date.
  113. from_ = df_below['startDate'].iloc[0].date()
  114. to_ = df_below['endDate'].iloc[-1].date()
  115.  
  116. l_dates.append([from_, to_])
  117.  
  118. # Reassign df to remaining portion.
  119. df = df[df['startDate'] >= to_]
  120.  
  121. return l_dates
  122.  
  123. def plot_volume(self, r_volume):
  124. """Plots a time-series chart with two axes to show the daily and cumulative
  125. document count.
  126. """
  127. # Convert r to df, fix datetime, add cumulative sum.
  128. df_volume = pd.DataFrame(r_volume.json()['volume'])
  129. df_volume['startDate'] = pd.to_datetime(df_volume['startDate'])
  130. df_volume['endDate'] = pd.to_datetime(df_volume['endDate'])
  131. df_volume['cumulative_sum'] = df_volume['numberOfDocuments'].cumsum()
  132.  
  133. fig, ax1 = plt.subplots()
  134. ax2 = ax1.twinx()
  135.  
  136. df_volume['numberOfDocuments'].plot(ax=ax1, style='b-')
  137. df_volume['cumulative_sum'].plot(ax=ax2, style='r-')
  138.  
  139. ax1.set_ylabel('Number of Documents')
  140. ax2.set_ylabel('Cumulative Sum')
  141.  
  142. h1, l1 = ax1.get_legend_handles_labels()
  143. h2, l2 = ax2.get_legend_handles_labels()
  144. ax1.legend(h1+h2, l1+l2, loc=2)
  145.  
  146. plt.show()
  147.  
  148. return
  149.  
  150. def make_data_pipeline(self, from_, to_):
  151. """Combines the functionsin this class to make a robust pipeline, that
  152. loops through each day in a time period. Data is returned as a dataframe.
  153. """
  154.  
  155. # Get the volume over time data.
  156. r_volume = self.get_data_from_endpoint(from_, to_, 'volume')
  157. print('There are approximately {} documents.'.format(r_volume.json()['numberOfDocuments']))
  158. self.plot_volume(r_volume)
  159.  
  160. # Carve up time into buckets of volume <10k.
  161. l_dates = self.get_dates_from_timespan(r_volume)
  162.  
  163. data = []
  164.  
  165. for i in trange(len(l_dates), leave=False):
  166. from_, to_ = l_dates[i]
  167.  
  168. # Pull posts.
  169. r_posts = self.get_data_from_endpoint(from_, to_, 'posts')
  170. if r_posts.ok and (r_posts.json()['status'] != 'error'):
  171. j_result = json.loads(r_posts.content.decode('utf8'))
  172. data.extend(j_result['posts'])
  173.  
  174. l_flat= [flatten(d) for d in data]
  175. df = pd.DataFrame(l_flat)
  176.  
  177. return df
  178.  
  179. if __name__ == "__main__":
  180.  
  181.  
  182. # Credentials.
  183. username = 'xxxxx'
  184. password = 'xxxxx'
  185.  
  186. # Monitor id - taken from URL on website.
  187. monitor_id = '123'
  188.  
  189. # Instantiate client.
  190. crimson_api = CrimsonHexagonClient(username, password, monitor_id)
  191.  
  192. from_ = date(2017, 1, 1)
  193. to_ = date(2017, 6, 30)
  194.  
  195. # Combine class functions into a typical workflow.
  196. df = crimson_api.make_data_pipeline(from_, to_)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement