Advertisement
Guest User

Dribbble Parser

a guest
Jun 23rd, 2017
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.54 KB | None | 0 0
  1. import numpy as np
  2. import json
  3. import bson
  4. import urllib
  5. from HTMLParser import HTMLParser
  6. from urllib import urlopen
  7. import re
  8. import urllib2,cookielib
  9.  
  10. hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
  11.        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  12.        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
  13.        'Accept-Encoding': 'none',
  14.        'Accept-Language': 'en-US,en;q=0.8',
  15.        'Connection': 'keep-alive'}
  16.  
  17. class MyHTMLParser(HTMLParser):
  18.     def __init__(self):
  19.         HTMLParser.__init__(self)
  20.         self.source = 0
  21.         self.name = 0
  22.         self.likes_count = 0
  23.         self.views_count = 0
  24.         self.attach = 0
  25.         self.color = 0
  26.         self.tags = 0
  27.         self.tag = 0
  28.         self.p = 0
  29.        
  30.         self.json_name = ''
  31.         self.json_date = ''
  32.         self.json_attachments = []
  33.         self.json_small_link = ''
  34.         self.json_big_link = ''
  35.         self.json_likes_count = 0
  36.         self.json_views_count = 0
  37.         self.json_colors = []
  38.         self.json_tags = []
  39.         self.json_description = ''
  40.        
  41.     def handle_starttag(self, tag, attrs):
  42.         if tag == 'source':
  43.             self.source += 1
  44.             for name, value in attrs:
  45.                 if self.source == 3 and name == 'srcset':
  46.                     self.json_big_link = value
  47.                 if self.source == 4 and name == 'srcset':
  48.                     self.json_small_link = value
  49.         if tag == 'h1':
  50.             self.name = 1
  51.         if tag == 'span':
  52.             for name, value in attrs:
  53.                 if name == 'class' and value == 'stats-label likes-count':
  54.                     self.likes_count = 1
  55.                 if name == 'class' and value == 'views-count stats-num':
  56.                     self.views_count = 1
  57.                    
  58.         if tag == 'a':
  59.             for name, value in attrs:
  60.                 if name == 'href' and '/shots?date' in value:
  61.                     self.json_date = value[12:]
  62.                 if name == 'data-title':
  63.                     self.attach = 1
  64.                 if name == 'href' and self.attach == 1:
  65.                     self.attach = 0
  66.                     self.json_attachments.append('https://dribbble.com' + value)
  67.                 if name == 'style' and 'background-color' in value:
  68.                     self.color = 1
  69.         if tag == 'div':
  70.             for name, value in attrs:
  71.                 if name == 'class' and value == 'tags-section':
  72.                     self.tags = 1
  73.         if tag == 'strong' and self.tags == 1:
  74.             self.tag = 1
  75.         if tag == 'p':
  76.             self.p += 1
  77.                
  78.            
  79.            
  80.    
  81.     def handle_endtag(self, tag):
  82.         if tag == 'div' and self.tags == 1:
  83.             self.tags = 0
  84.            
  85.    
  86.     def handle_data(self, data):
  87.         if self.name == 1:
  88.             self.name = 0
  89.             self.json_name = data
  90.         if self.likes_count == 1:
  91.             self.likes_count= 0
  92.             lines = data.split('\n')
  93.             likes = lines[1].replace(',', '').replace(lines[1][0], '')
  94.             self.json_likes_count = int(likes)
  95.         if self.views_count == 1:
  96.             self.views_count = 0
  97.             lines = data.split('\n')
  98.             views = lines[1].replace(',', '').replace(lines[1][0], '')
  99.             self.json_views_count = int(views)
  100.         if self.color == 1:
  101.             self.color = 0
  102.             self.json_colors.append(data)
  103.         if self.tag == 1:
  104.             self.tag = 0
  105.             self.json_tags.append(data)
  106.         if self.p == 2:
  107.             self.p += 1
  108.             self.json_description = data
  109.  
  110. for i in range(1):
  111.     json_id = 3590389 + i
  112.     site= "https://dribbble.com/shots/"+str(json_id)
  113.     req = urllib2.Request(site, headers=hdr)
  114.     try:
  115.         page = urllib2.urlopen(req)
  116.     except urllib2.HTTPError, e:
  117.         print site
  118.     content = page.read()
  119.     parser = MyHTMLParser()
  120.     parser.feed(content)
  121.     print 'url: ', site
  122.     print 'id: ', json_id
  123.     print 'small_link: ', parser.json_small_link
  124.     print 'big_link: ', parser.json_big_link
  125.     print 'name: ', parser.json_name
  126.     print 'likes: ', parser.json_likes_count
  127.     print 'views: ', parser.json_views_count
  128.     print 'date: ', parser.json_date
  129.     print 'attach: ', parser.json_attachments
  130.     print 'colors: ', parser.json_colors
  131.     print 'tag: ', parser.json_tags
  132.     print 'description: ', parser.json_description
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement