Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import cookielib
import json
import re
import urllib
import urllib2
from HTMLParser import HTMLParser
from urllib import urlopen

import bson
import numpy as np
# Browser-like request headers sent with every page fetch.
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
class MyHTMLParser(HTMLParser):
    """Collect the fields of one shot page while its HTML is being parsed.

    Feed the page markup with ``feed()``; afterwards the ``json_*``
    attributes hold what was found:

    * ``json_name``         -- first text chunk after an ``<h1>``
    * ``json_date``         -- text following ``/shots?date=`` in a link href
    * ``json_attachments``  -- absolute URLs built from attachment link hrefs
    * ``json_small_link`` / ``json_big_link`` -- ``srcset`` of the 4th / 3rd
      ``<source>`` tag
    * ``json_likes_count`` / ``json_views_count`` -- integer counters
    * ``json_colors``       -- text of links styled with a background color
    * ``json_tags``         -- text of ``<strong>`` tags in the tags section
    * ``json_description``  -- first text chunk after the second ``<p>``

    The remaining attributes are flags/counters used between callbacks:
    a start tag arms a flag and the next ``handle_data`` call consumes it.
    """

    # A run of digits, optionally with thousands separators ("1,234").
    _COUNT_RE = re.compile(r'\d[\d,]*')
    # Marker that precedes the date in a link href; it is followed by "=".
    _DATE_MARKER = '/shots?date'

    def __init__(self):
        HTMLParser.__init__(self)
        # Parsing state (flags are 0/1, ``source`` and ``p`` are counters).
        self.source = 0
        self.name = 0
        self.likes_count = 0
        self.views_count = 0
        self.attach = 0
        self.color = 0
        self.tags = 0
        self.tag = 0
        self.p = 0
        # Extracted results.
        self.json_name = ''
        self.json_date = ''
        self.json_attachments = []
        self.json_small_link = ''
        self.json_big_link = ''
        self.json_likes_count = 0
        self.json_views_count = 0
        self.json_colors = []
        self.json_tags = []
        self.json_description = ''

    @classmethod
    def _parse_count(cls, data, default):
        """Return the first number found in *data* as an int.

        Thousands separators are dropped.  *default* is returned when the
        text contains no digits, so a whitespace-only text node no longer
        aborts the whole parse.
        """
        found = cls._COUNT_RE.search(data)
        if found is None:
            return default
        return int(found.group(0).replace(',', ''))

    def handle_starttag(self, tag, attrs):
        """Arm the flag matching *tag*, or read values stored in *attrs*."""
        if tag == 'source':
            self.source += 1
            for name, value in attrs:
                if name != 'srcset':
                    continue
                if self.source == 3:
                    self.json_big_link = value
                elif self.source == 4:
                    self.json_small_link = value
        elif tag == 'h1':
            self.name = 1
        elif tag == 'span':
            for name, value in attrs:
                if name != 'class':
                    continue
                if value == 'stats-label likes-count':
                    self.likes_count = 1
                if value == 'views-count stats-num':
                    self.views_count = 1
        elif tag == 'a':
            for name, value in attrs:
                if name == 'data-title':
                    # NOTE(review): the flag is consumed by the next href
                    # seen, so this relies on data-title preceding href.
                    self.attach = 1
                if value is None:
                    # Bare attribute such as <a href>: nothing to read.
                    continue
                if name == 'href':
                    start = value.find(self._DATE_MARKER)
                    if start != -1:
                        # Skip the marker and the "=" that follows it,
                        # wherever the marker sits inside the href.
                        skip = start + len(self._DATE_MARKER) + 1
                        self.json_date = value[skip:]
                    if self.attach == 1:
                        self.attach = 0
                        self.json_attachments.append(
                            'https://dribbble.com' + value)
                if name == 'style' and 'background-color' in value:
                    self.color = 1
        elif tag == 'div':
            for name, value in attrs:
                if name == 'class' and value == 'tags-section':
                    self.tags = 1
        elif tag == 'strong':
            if self.tags == 1:
                self.tag = 1
        elif tag == 'p':
            self.p += 1

    def handle_endtag(self, tag):
        """Leave the tags section on the first closing ``</div>``."""
        if tag == 'div' and self.tags == 1:
            self.tags = 0

    def handle_data(self, data):
        """Store *data* into every field whose flag is currently armed."""
        if self.name == 1:
            self.name = 0
            self.json_name = data
        if self.likes_count == 1:
            self.likes_count = 0
            self.json_likes_count = self._parse_count(
                data, self.json_likes_count)
        if self.views_count == 1:
            self.views_count = 0
            self.json_views_count = self._parse_count(
                data, self.json_views_count)
        if self.color == 1:
            self.color = 0
            self.json_colors.append(data)
        if self.tag == 1:
            self.tag = 0
            self.json_tags.append(data)
        if self.p == 2:
            # Bump the counter so only the first chunk is taken.
            self.p += 1
            self.json_description = data
- for i in range(1):
- json_id = 3590389 + i
- site= "https://dribbble.com/shots/"+str(json_id)
- req = urllib2.Request(site, headers=hdr)
- try:
- page = urllib2.urlopen(req)
- except urllib2.HTTPError, e:
- print site
- content = page.read()
- parser = MyHTMLParser()
- parser.feed(content)
- print 'url: ', site
- print 'id: ', json_id
- print 'small_link: ', parser.json_small_link
- print 'big_link: ', parser.json_big_link
- print 'name: ', parser.json_name
- print 'likes: ', parser.json_likes_count
- print 'views: ', parser.json_views_count
- print 'date: ', parser.json_date
- print 'attach: ', parser.json_attachments
- print 'colors: ', parser.json_colors
- print 'tag: ', parser.json_tags
- print 'description: ', parser.json_description
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement