SHARE
TWEET

Export android app reviews

chrisbailey Nov 19th, 2012 455 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from HTMLParser import HTMLParser
  2. import json, sys, re
  3. import requests # requires pip install requests
  4.  
  5. # Following thanks to
  6. # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
  7. class MLStripper(HTMLParser):
  8.     def __init__(self):
  9.         self.reset()
  10.         self.fed = []
  11.     def handle_data(self, d):
  12.         self.fed.append(d)
  13.     def get_data(self):
  14.         return ' '.join(self.fed)
  15.  
  16. def strip_tags(html):
  17.     s = MLStripper()
  18.     s.feed(html)
  19.     return s.get_data()
  20.  
  21. url = "https://play.google.com/store/getreviews?id=%s&reviewSortOrder=2&reviewType=1&pageNum="
  22. url = url % sys.argv[1]
  23.  
  24. headers = {'Content-Type':'application/x-www-form-urlencoded;charset=utf-8','Content-Length':'5'}
  25.  
  26. reviews_list = []
  27. max_pages = -1
  28. i = 0
  29. while (i == 0 or i < max_pages):
  30.         r = requests.post(url+str(i), headers=headers, data=json.dumps({'xhr':1}))
  31.         content = json.loads(r.text[r.text.find('{'):])
  32.         if max_pages == -1:
  33.                 max_pages = content['numPages']
  34.         reviews = content['htmlContent']
  35.         reviews = reviews.replace('\u003C','<')
  36.         reviews = reviews.replace('&quot;',"")
  37.         reviews = reviews.replace('&#39;',"'")
  38.         for review in reviews.split('<div class="doc-review">'):
  39.                 match = re.search(r'<strong>(.*)</strong>.*doc-review-date"> - (.*)<.* - ([^<]*)<.*Rating: ([0-9\.]*) ', review)
  40.                 if match:
  41.                         author = match.group(1)
  42.                         date = match.group(2)
  43.                         device = match.group(3)
  44.                         rating = match.group(4)
  45.                         text = title = ''
  46.                         match = re.search(r'review-text">(.*?)</p>', review)
  47.                         if match:
  48.                                 text = match.group(1)
  49.                         match = re.search(r'review-title">(.*?)</h4>', review)
  50.                         if match:
  51.                                 title = match.group(1)
  52.  
  53.                         reviews_list.append({'date':date,
  54.                                              'author':author,
  55.                                              'device':device,
  56.                                              'rating':rating,
  57.                                              'title':title,
  58.                                              'review':text} )
  59.         i = i + 1
  60.  
  61. print json.dumps(reviews_list);
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Top