Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import json
import re
import sys

try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3

import requests  # requires pip install requests
# The HTML-stripping helpers below are adapted from
# http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class MLStripper(HTMLParser):
    """Collect the text nodes of an HTML fragment, discarding all markup.

    Feed markup with ``feed()``; read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the base initialiser instead of only reset(): on Python 3 it
        # sets parser state (e.g. ``convert_charrefs``) that feed() relies
        # on. The explicit base-class call (rather than super()) also works
        # with Python 2's old-style HTMLParser class.
        HTMLParser.__init__(self)
        self.fed = []  # text chunks, in the order the parser reported them

    def handle_data(self, d):
        """Record one run of character data found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all recorded text chunks joined by single spaces."""
        return ' '.join(self.fed)
def strip_tags(html):
    """Return the text content of *html* with every tag removed.

    The text runs found between tags are joined with single spaces (see
    ``MLStripper.get_data``).
    """
    s = MLStripper()
    s.feed(html)
    # Flush anything the parser is still buffering (for example trailing
    # text that ends in an unterminated "&") so it is not silently dropped.
    s.close()
    return s.get_data()
# Review listing endpoint. "%s" is filled with the app id taken from the
# command line; the page number (starting at 0) is appended to the end.
url = "https://play.google.com/store/getreviews?id=%s&reviewSortOrder=2&reviewType=1&pageNum="
headers = {'Content-Type':'application/x-www-form-urlencoded;charset=utf-8','Content-Length':'5'}

# Patterns are compiled once here instead of on every loop iteration.
_HEADER_RE = re.compile(r'<strong>(.*)</strong>.*doc-review-date"> - (.*)<.* - ([^<]*)<.*Rating: ([0-9\.]*) ')
_TEXT_RE = re.compile(r'review-text">(.*?)</p>')
_TITLE_RE = re.compile(r'review-title">(.*?)</h4>')


def parse_reviews(html_content):
    """Extract review records from one page of review markup.

    html_content: the ``htmlContent`` string of one response page.

    Returns a list of dicts with the keys ``date``, ``author``, ``device``,
    ``rating``, ``title`` and ``review``. All values are strings; ``title``
    and ``review`` are ``''`` when the corresponding element is absent.
    Chunks whose header does not match the expected layout are skipped.
    """
    # The payload carries "<" as the literal six characters \u003C. The
    # doubled backslash keeps that meaning on both Python 2 and Python 3.
    markup = html_content.replace('\\u003C', '<')
    # NOTE(review): the paste this script was recovered from had its HTML
    # entities decoded, which left these two literals as a bare quote
    # character (one of them a syntax error). They are restored here as the
    # entities; the replacement values are the ones shown in the paste --
    # confirm that "&quot;" is really meant to be dropped rather than
    # turned into '"'.
    markup = markup.replace('&quot;', "")
    markup = markup.replace('&#39;', "'")

    found = []
    for chunk in markup.split('<div class="doc-review">'):
        header = _HEADER_RE.search(chunk)
        if not header:
            continue
        author, date, device, rating = header.groups()
        text_match = _TEXT_RE.search(chunk)
        title_match = _TITLE_RE.search(chunk)
        found.append({'date': date,
                      'author': author,
                      'device': device,
                      'rating': rating,
                      'title': title_match.group(1) if title_match else '',
                      'review': text_match.group(1) if text_match else ''})
    return found


def fetch_reviews(app_id):
    """Download every review page for *app_id* and return the parsed records.

    Page 0 is always requested; its ``numPages`` field decides how many
    further pages are fetched.

    Raises ValueError when a response contains no JSON object.
    """
    page_url = url % app_id
    reviews_list = []
    max_pages = -1
    page = 0
    while page == 0 or page < max_pages:
        # NOTE(review): the headers announce a 5-byte form-encoded body
        # (which would fit "xhr=1"), yet a JSON string is sent. Left as in
        # the original; confirm what the endpoint expects.
        r = requests.post(page_url + str(page), headers=headers,
                          data=json.dumps({'xhr': 1}))
        body = r.text
        start = body.find('{')
        if start == -1:
            # Without this check the slice below would silently become the
            # last character of the body and fail with a confusing error.
            raise ValueError('no JSON object in response for page %d' % page)
        # Anything before the first "{" is skipped before decoding.
        content = json.loads(body[start:])
        if max_pages == -1:
            max_pages = content['numPages']
        reviews_list.extend(parse_reviews(content['htmlContent']))
        page += 1
    return reviews_list


def main(argv=None):
    """Print all reviews for the app id given in ``argv[1]`` as JSON.

    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write('usage: %s <app-id>\n' % (argv[0] if argv else 'script'))
        return 2
    # The single-argument call form works on both Python 2 and Python 3.
    print(json.dumps(fetch_reviews(argv[1])))
    return 0


if __name__ == '__main__':
    sys.exit(main())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement