from HTMLParser import HTMLParser
import json, sys, re
import requests # requires pip install requests
# The HTML tag-stripping helper below is adapted from:
# http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text nodes.

    Feed it markup with ``feed()``; every run of character data reported by
    the parser is appended to ``self.fed``. ``get_data()`` returns those
    runs joined with single spaces.
    """

    def __init__(self):
        # Run the real base-class initializer instead of calling reset()
        # directly: the base __init__ calls reset() itself and may set up
        # additional parser state that feed() depends on. The explicit
        # HTMLParser.__init__ form (rather than super()) also works when
        # HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text fragments collected so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Parser callback: remember one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return every collected text fragment joined by a single space."""
        return ' '.join(self.fed)
def strip_tags(html):
    """Return the text content of ``html`` with all tags removed.

    The markup is run through an ``MLStripper``; the text fragments it
    collects are returned joined by single spaces.
    """
    stripper = MLStripper()
    stripper.feed(html)
    plain_text = stripper.get_data()
    return plain_text
# Review-listing endpoint; %s is the application id and the page number is
# appended to the end of the URL for each request.
REVIEWS_URL_TEMPLATE = ("https://play.google.com/store/getreviews"
                        "?id=%s&reviewSortOrder=2&reviewType=1&pageNum=")

# Content-Length is deliberately not set by hand: requests computes it from
# the encoded body, so it can never disagree with what is actually sent.
REQUEST_HEADERS = {
    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
}

# Marker that starts each individual review inside the returned markup.
_REVIEW_SEPARATOR = '<div class="doc-review">'

# Compiled once here instead of on every loop iteration.
# Groups: 1 = author, 2 = date, 3 = device, 4 = numeric rating.
_HEADER_RE = re.compile(
    r'<strong>(.*)</strong>.*doc-review-date"> - (.*)<.* - ([^<]*)<.*Rating: ([0-9\.]*) ')
_TEXT_RE = re.compile(r'review-text">(.*?)</p>')
_TITLE_RE = re.compile(r'review-title">(.*?)</h4>')


def _decode_markup(html):
    """Undo the escaping applied to the review markup before it is parsed."""
    # A literal backslash-u003C sequence (six characters) stands for '<'.
    html = html.replace('\\u003C', '<')
    # Encoded double quotes are dropped entirely, not turned back into '"'.
    html = html.replace('&quot;', "")
    # Encoded apostrophes become real ones; both common spellings are handled.
    html = html.replace('&#39;', "'")
    html = html.replace('&apos;', "'")
    return html


def parse_reviews(html):
    """Extract the reviews contained in one page of review markup.

    ``html`` is the ``htmlContent`` string of one response page. Returns a
    list of dicts with the keys ``date``, ``author``, ``device``,
    ``rating``, ``title`` and ``review`` (all strings). Chunks whose header
    does not match the expected layout are skipped; a missing title or
    body yields an empty string for that field.
    """
    parsed = []
    for chunk in _decode_markup(html).split(_REVIEW_SEPARATOR):
        header = _HEADER_RE.search(chunk)
        if not header:
            # Not a review (e.g. whatever precedes the first separator).
            continue
        body = _TEXT_RE.search(chunk)
        heading = _TITLE_RE.search(chunk)
        parsed.append({
            'date': header.group(2),
            'author': header.group(1),
            'device': header.group(3),
            'rating': header.group(4),
            'title': heading.group(1) if heading else '',
            'review': body.group(1) if body else '',
        })
    return parsed


def fetch_reviews(app_id):
    """Download every review page for ``app_id`` and return the parsed reviews.

    Page 0 is always requested; its ``numPages`` field decides how many
    further pages are fetched. Raises ``ValueError`` when a response
    contains no JSON object.
    """
    base_url = REVIEWS_URL_TEMPLATE % app_id
    collected = []
    num_pages = None  # unknown until the first response has been read
    page = 0
    while num_pages is None or page < num_pages:
        # Pass a dict so requests form-encodes the body as "xhr=1", which is
        # what the declared form-urlencoded Content-Type promises.
        response = requests.post(base_url + str(page),
                                 headers=REQUEST_HEADERS,
                                 data={'xhr': 1})
        raw = response.text
        # Anything before the first '{' is skipped; the JSON object is
        # parsed from there on.
        start = raw.find('{')
        if start < 0:
            raise ValueError("no JSON object in response for page %d" % page)
        payload = json.loads(raw[start:])
        if num_pages is None:
            num_pages = payload['numPages']
        collected.extend(parse_reviews(payload['htmlContent']))
        page += 1
    return collected


def main(argv=None):
    """Command-line entry point: print all reviews of one app as JSON.

    ``argv`` defaults to ``sys.argv``; ``argv[1]`` is the application id.
    Returns the process exit status (0 on success, 2 on a usage error).
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        program = args[0] if args else 'getreviews'
        sys.stderr.write("usage: %s <application-id>\n" % program)
        return 2
    print(json.dumps(fetch_reviews(args[1])))
    return 0


if __name__ == "__main__":
    sys.exit(main())