Advertisement
chrisbailey

Export android app reviews

Nov 19th, 2012
598
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.93 KB | None | 0 0
  1. from HTMLParser import HTMLParser
  2. import json, sys, re
  3. import requests # requires pip install requests
  4.  
  5. # Following thanks to
  6. # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
  7. class MLStripper(HTMLParser):
  8.     def __init__(self):
  9.         self.reset()
  10.         self.fed = []
  11.     def handle_data(self, d):
  12.         self.fed.append(d)
  13.     def get_data(self):
  14.         return ' '.join(self.fed)
  15.  
  16. def strip_tags(html):
  17.     s = MLStripper()
  18.     s.feed(html)
  19.     return s.get_data()
  20.  
  21. url = "https://play.google.com/store/getreviews?id=%s&reviewSortOrder=2&reviewType=1&pageNum="
  22. url = url % sys.argv[1]
  23.  
  24. headers = {'Content-Type':'application/x-www-form-urlencoded;charset=utf-8','Content-Length':'5'}
  25.  
  26. reviews_list = []
  27. max_pages = -1
  28. i = 0
  29. while (i == 0 or i < max_pages):
  30.     r = requests.post(url+str(i), headers=headers, data=json.dumps({'xhr':1}))
  31.     content = json.loads(r.text[r.text.find('{'):])
  32.     if max_pages == -1:
  33.         max_pages = content['numPages']
  34.     reviews = content['htmlContent']
  35.     reviews = reviews.replace('\u003C','<')
  36.     reviews = reviews.replace('&quot;',"")
  37.     reviews = reviews.replace('&#39;',"'")
  38.     for review in reviews.split('<div class="doc-review">'):
  39.         match = re.search(r'<strong>(.*)</strong>.*doc-review-date"> - (.*)<.* - ([^<]*)<.*Rating: ([0-9\.]*) ', review)
  40.         if match:
  41.             author = match.group(1)
  42.             date = match.group(2)
  43.             device = match.group(3)
  44.             rating = match.group(4)
  45.             text = title = ''
  46.             match = re.search(r'review-text">(.*?)</p>', review)
  47.             if match:
  48.                 text = match.group(1)
  49.             match = re.search(r'review-title">(.*?)</h4>', review)
  50.             if match:
  51.                 title = match.group(1)
  52.  
  53.             reviews_list.append({'date':date,
  54.                                  'author':author,
  55.                                  'device':device,
  56.                                  'rating':rating,
  57.                                  'title':title,
  58.                                  'review':text} )
  59.     i = i + 1
  60.  
  61. print json.dumps(reviews_list);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement