Advertisement
Guest User

Untitled

a guest
Apr 21st, 2017
234
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.82 KB | None | 0 0
  1. from __future__ import absolute_import, print_function, unicode_literals
  2. import sys
  3. import requests
  4. import json
  5. import pandas as pd
  6. from bs4 import BeautifulSoup
  7. if sys.version_info[0] == 2: # Python 2
  8. from urllib import quote
  9. else: # Python 3
  10. from urllib.parse import quote
  11.  
  12.  
  13. class TrendReq(object):
  14. """
  15. Google Trends API
  16. """
  17. def __init__(self, google_username, google_password, hl='en-US', tz=360, geo='', custom_useragent=None):
  18. """
  19. Initialize hard-coded URLs, HTTP headers, and login parameters
  20. needed to connect to Google Trends, then connect.
  21. """
  22. self.username = google_username
  23. self.password = google_password
  24. # google rate limit
  25. self.google_rl = 'You have reached your quota limit. Please try again later.'
  26. self.url_login = "https://accounts.google.com/ServiceLogin"
  27. self.url_auth = "https://accounts.google.com/ServiceLoginAuth"
  28. # custom user agent so users know what "new account signin for Google" is
  29. if custom_useragent is None:
  30. self.custom_useragent = {'User-Agent': 'PyTrends'}
  31. else:
  32. self.custom_useragent = {'User-Agent': custom_useragent}
  33. self._connect()
  34. self.results = None
  35.  
  36. # set user defined options used globally
  37. self.tz = tz
  38. self.hl = hl
  39. self.geo = ''
  40. self.kw_list = list()
  41.  
  42. # intialize widget payloads
  43. self.interest_overtime_widget = dict()
  44. self.interest_by_region_widget = dict()
  45. self.related_queries_widget_list = list()
  46.  
  47. def _connect(self):
  48. """
  49. Connect to Google.
  50. Go to login page GALX hidden input value and send it back to google + login and password.
  51. http://stackoverflow.com/questions/6754709/logging-in-to-google-using-python
  52. """
  53. self.ses = requests.session()
  54. login_html = self.ses.get(self.url_login, headers=self.custom_useragent)
  55. soup_login = BeautifulSoup(login_html.content, "lxml").find('form').find_all('input')
  56. form_data = dict()
  57. for u in soup_login:
  58. if u.has_attr('value') and u.has_attr('name'):
  59. form_data[u['name']] = u['value']
  60. # override the inputs with out login and pwd:
  61. form_data['Email'] = self.username
  62. form_data['Passwd'] = self.password
  63. self.ses.post(self.url_auth, data=form_data)
  64.  
  65. def build_payload(self, kw_list, cat=0, timeframe='today 5-y', geo='', gprop=''):
  66. """Create the payload for related queries, interest over time and interest by region"""
  67. token_payload = dict()
  68. self.kw_list = kw_list
  69. self.geo = geo
  70. token_payload['hl'] = self.hl
  71. token_payload['tz'] = self.tz
  72. token_payload['req'] = {'comparisonItem': [], 'category': cat}
  73. token_payload['property'] = gprop
  74. # build out json for each keyword
  75. for kw in self.kw_list:
  76. keyword_payload = {'keyword': kw, 'time': timeframe, 'geo': self.geo}
  77. token_payload['req']['comparisonItem'].append(keyword_payload)
  78. # requests will mangle this if it is not a string
  79. token_payload['req'] = json.dumps(token_payload['req'])
  80. # get tokens
  81. self._tokens(token_payload)
  82. return
  83.  
  84. def _tokens(self, token_payload):
  85. """Makes request to Google to get API tokens for interest over time, interest by region and related queries"""
  86.  
  87. # make the request
  88. req_url = "https://www.google.com/trends/api/explore"
  89. req = self.ses.get(req_url, params=token_payload)
  90.  
  91. # parse the returned json
  92. # strip off garbage characters that break json parser
  93. widget_json = req.text[4:]
  94. widget_dict = json.loads(widget_json)['widgets']
  95. # order of the json matters...
  96. first_region_token = True
  97. # assign requests
  98. for widget in widget_dict:
  99. if widget['title'] == 'Interest over time':
  100. self.interest_over_time_widget = widget
  101. if widget['title'] == 'Interest by region' and first_region_token:
  102. self.interest_by_region_widget = widget
  103. first_region_token = False
  104. if widget['title'] == 'Interest by subregion' and first_region_token:
  105. self.interest_by_region_widget = widget
  106. first_region_token = False
  107. # response for each term, put into a list
  108. if widget['title'] == 'Related queries':
  109. self.related_queries_widget_list.append(widget)
  110. return
  111.  
  112. def interest_over_time(self):
  113. """Request data from Google's Interest Over Time section and return a dataframe"""
  114.  
  115. # make the request
  116. req_url = "https://www.google.com/trends/api/widgetdata/multiline"
  117. over_time_payload = dict()
  118. # convert to string as requests will mangle
  119. over_time_payload['req'] = json.dumps(self.interest_over_time_widget['request'])
  120. over_time_payload['token'] = self.interest_over_time_widget['token']
  121. over_time_payload['tz'] = self.tz
  122. req = self.ses.get(req_url, params=over_time_payload)
  123.  
  124. # parse the returned json
  125. # strip off garbage characters that break json parser
  126. req_json = json.loads(req.text[5:])
  127. df = pd.DataFrame(req_json['default']['timelineData'])
  128. df['date'] = pd.to_datetime(df['time'], unit='s')
  129. df = df.set_index(['date']).sort_index()
  130. # split list columns into seperate ones, remove brackets and split on comma
  131. result_df = df['value'].apply(lambda x: pd.Series(str(x).replace('[', '').replace(']', '').split(',')))
  132. # rename each column with its search term, relying on order that google provides...
  133. for idx, kw in enumerate(self.kw_list):
  134. result_df[kw] = result_df[idx].astype('int')
  135. del result_df[idx]
  136. return result_df
  137.  
  138. def interest_by_region(self, resolution='COUNTRY'):
  139. """Request data from Google's Interest by Region section and return a dataframe"""
  140.  
  141. # make the request
  142. req_url = "https://www.google.com/trends/api/widgetdata/comparedgeo"
  143. region_payload = dict()
  144. if self.geo == '':
  145. self.interest_by_region_widget['request']['resolution'] = resolution
  146. # convert to string as requests will mangle
  147. region_payload['req'] = json.dumps(self.interest_by_region_widget['request'])
  148. region_payload['token'] = self.interest_by_region_widget['token']
  149. region_payload['tz'] = self.tz
  150. req = self.ses.get(req_url, params=region_payload)
  151.  
  152. # parse returned json
  153. # strip off garbage characters that break json parser
  154. req_json = json.loads(req.text[5:])
  155. df = pd.DataFrame(req_json['default']['geoMapData'])
  156. # rename the column with the search keyword
  157. df = df[['geoName', 'value']].set_index(['geoName']).sort_index()
  158. # split list columns into seperate ones, remove brackets and split on comma
  159. result_df = df['value'].apply(lambda x: pd.Series(str(x).replace('[', '').replace(']', '').split(',')))
  160. # rename each column with its search term
  161. for idx, kw in enumerate(self.kw_list):
  162. result_df[kw] = result_df[idx].astype('int')
  163. del result_df[idx]
  164. return result_df
  165.  
  166. def related_queries(self):
  167. """Request data from Google's Related Queries section and return a dictionary of dataframes"""
  168.  
  169. # make the request
  170. req_url = "https://www.google.com/trends/api/widgetdata/relatedsearches"
  171. related_payload = dict()
  172. result_dict = dict()
  173. for request_json in self.related_queries_widget_list:
  174. # ensure we know which keyword we are looking at rather than relying on order
  175. kw = request_json['request']['restriction']['complexKeywordsRestriction']['keyword'][0]['value']
  176. # convert to string as requests will mangle
  177. related_payload['req'] = json.dumps(request_json['request'])
  178. related_payload['token'] = request_json['token']
  179. related_payload['tz'] = self.tz
  180. req = self.ses.get(req_url, params=related_payload)
  181.  
  182. # parse the returned json
  183. # strip off garbage characters that break json parser
  184. req_json = json.loads(req.text[5:])
  185. # top queries
  186. top_df = pd.DataFrame(req_json['default']['rankedList'][0]['rankedKeyword'])
  187. top_df = top_df[['query', 'value']]
  188. # rising queries
  189. rising_df = pd.DataFrame(req_json['default']['rankedList'][1]['rankedKeyword'])
  190. rising_df = rising_df[['query', 'value']]
  191. result_dict[kw] = {'top': top_df, 'rising': rising_df}
  192. return result_dict
  193.  
  194. def trending_searches(self, pn_value):
  195. """Request data from Google's Trending Searches section and return a dataframe"""
  196.  
  197. # make the request
  198. req_url = "https://trends.google.com/trends/hottrends/hotItems"
  199. forms = {'ajax': 1, 'pn': pn_value, 'htd': '', 'htv': 'l'}
  200. req = self.ses.post(req_url, data=forms)
  201. try:
  202. req_json = json.loads(req.text)['trendsByDateList']
  203. except Exception:
  204. return None
  205. result_df = pd.DataFrame()
  206.  
  207. # parse the returned json
  208. for trenddate in req_json:
  209. sub_df = pd.DataFrame()
  210. sub_df['date'] = trenddate['date']
  211. for trend in trenddate['trendsList']:
  212. sub_df = sub_df.append(trend, ignore_index=True)
  213. result_df = pd.concat([result_df, sub_df])
  214. return result_df
  215.  
  216. def top_charts(self, date, cid, geo='US', cat=''):
  217. """Request data from Google's Top Charts section and return a dataframe"""
  218.  
  219. # make the request
  220. # create the payload
  221. chart_payload = {'ajax': 1, 'lp': 1}
  222. chart_payload['geo'] = geo
  223. chart_payload['date'] = date
  224. chart_payload['cat'] = cat
  225. chart_payload['cid'] = cid
  226. req_url = "https://www.google.com/trends/topcharts/chart"
  227. req = self.ses.post(req_url, params=chart_payload)
  228.  
  229. # parse the returned json
  230. req_json = json.loads(req.text)['data']['entityList']
  231. df = pd.DataFrame(req_json)
  232. return df
  233.  
  234. def suggestions(self, keyword):
  235. """Request data from Google's Keyword Suggestion dropdown and return a dictionary"""
  236.  
  237. # make the request
  238. kw_param = quote(keyword)
  239. req = self.ses.get("https://www.google.com/trends/api/autocomplete/" + kw_param)
  240.  
  241. # parse the returned json
  242. # response is invalid json but if you strip off ")]}'," from the front it is then valid
  243. req_json = json.loads(req.text[5:])['default']['topics']
  244. return req_json
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement