Advertisement
Guest User

Untitled

a guest
Apr 29th, 2016
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.24 KB | None | 0 0
  1. Приложение-скрапер продуктов
  2.  
  3. #models
  4.  
  5. # -*- coding: utf-8 -*-
  6. from django.db import models
  7. from mptt.models import MPTTModel, TreeForeignKey
  8.  
  9.  
  10. class Category(MPTTModel):
  11.  
  12.     name = models.CharField(max_length=255, unique=True)
  13.     parent = TreeForeignKey('self', null=True, blank=True,
  14.                             related_name='children', db_index=True)
  15.  
  16.     def __str__(self):
  17.         return self.name
  18.  
  19.  
class Product(models.Model):
    """A product snapshot saved by the scraper.

    ``site_id`` is not unique: the scraper stores a new row whenever the
    price of an already known product changes, so several rows may share
    one ``site_id`` and differ in ``cost`` / ``created_at``.
    """

    # Categories the product was found in; reachable from a category as
    # ``category.products``.
    categories = models.ManyToManyField(Category, related_name='products')
    # Product identifier as reported by the scraped site.
    site_id = models.CharField(max_length=10)
    name = models.CharField(max_length=255)
    # Up to 7 digits in total, 2 of them after the decimal point.
    cost = models.DecimalField(max_digits=7, decimal_places=2,
                               help_text="Price in euros (€)")
    # Filled in automatically when the row is first created; shown in the
    # admin under the verbose name 'date published'.
    created_at = models.DateTimeField('date published', auto_now_add=True)

    def __str__(self):
        return self.name
  31.  
  32. # admin
  33.  
  34. # -*- coding: utf-8 -*-
  35. from django.contrib import admin
  36. from mptt.admin import MPTTModelAdmin
  37. from .models import Category, Product
  38.  
  39.  
class CategoryMPTTModelAdmin(MPTTModelAdmin):
    """Admin options for ``Category``, based on ``MPTTModelAdmin``."""

    # Single 'Category' section on the edit form with the two editable fields.
    fieldsets = [
        ('Category', {
         'fields': ['name', 'parent']
         }),
    ]
    list_display = ('name', )
    search_fields = ['name', ]
    list_per_page = 100
    # NOTE(review): presumably the per-level indentation of the tree in the
    # change list -- confirm the unit against the django-mptt documentation.
    mptt_level_indent = 10
  50.  
  51.  
class ProductAdmin(admin.ModelAdmin):
    """Admin options for ``Product``."""

    # Columns of the change list.
    list_display = ('site_id', 'name', 'cost', 'created_at', )
    # Fields matched by the admin search box.
    search_fields = ['site_id', 'name', 'cost']
    list_per_page = 100
    # Use the horizontal two-pane selector for the many-to-many field.
    filter_horizontal = ('categories', )
  57.  
  58.  
# Expose both models in the Django admin with their custom options.
admin.site.register(Category, CategoryMPTTModelAdmin)
admin.site.register(Product, ProductAdmin)
  61.  
  62. # utils
  63.  
  64. # -*- coding: utf-8 -*-
  65. import requests
  66. import time
  67. import sys
  68.  
  69. from django.conf import settings
  70. from django.db import IntegrityError
  71.  
  72. from .models import Category, Product
  73.  
  74.  
  75. def get_children(category):
  76.     """Return subcategories of argument category"""
  77.  
  78.     sub_categories = []
  79.     # go to category page
  80.     try:
  81.         resp = requests.get(settings.FULL_API_URL, params={'url': category, })
  82.     except requests.ConnectionError as err:
  83.         print(err)
  84.     for _ in resp.json()['_embedded']['lanes'][3]['_embedded']['items'][0]['_embedded']['filters']:
  85.         for i in _['_embedded']['filterItems']:
  86.             sub_categories.append(i['navItem']['link']['href'])
  87.     return sub_categories
  88.  
  89.  
  90. def has_sub_categories(category):
  91.     """Return True if category contains subcategories, and False otherwise."""
  92.  
  93.     # go to category page
  94.     try:
  95.         resp = requests.get(settings.FULL_API_URL, params={'url': category, })
  96.     except requests.ConnectionError as err:
  97.         print(err)
  98.     return resp.json()['_embedded']['lanes'][3]['_embedded']['items'][0]['_embedded']['filters'] != []
  99.  
  100.  
  101. def get_products(category):
  102.     """Return products of argument category"""
  103.  
  104.     products = []
  105.     # go to category page
  106.     try:
  107.         resp = requests.get(settings.FULL_API_URL, params={'url': category})
  108.     except requests.ConnectionError as err:
  109.         print(err)
  110.     category_obj = Category.objects.get(name=category)
  111.     for lane in resp.json()['_embedded']['lanes']:
  112.         if lane['type'] == "ProductLane":
  113.             for product in lane['_embedded']['items']:
  114.                 if product['type'] == "Product":
  115.                     site_id = product['_embedded']['productCard'][
  116.                         '_embedded']['product']['id']
  117.                     name = product['_embedded']['productCard'][
  118.                         '_embedded']['product']['description']
  119.                     cost = product['_embedded']['productCard'][
  120.                         '_embedded']['product']['priceLabel']['now']
  121.                     if Product.objects.filter(site_id=site_id).exists():
  122.                         result_msg = 'Exists product: '
  123.                         # taking last saved product in DB with this site_id for
  124.                         # checking, was his cost updated or not?
  125.                         product_obj = Product.objects.filter(
  126.                             site_id=site_id).order_by('-created_at')[0]
  127.                         # if was updated (cost not still the same)
  128.                         if cost != float(product_obj.cost):
  129.                             result_msg = 'Updated product: '
  130.                             product_obj = Product.objects.create(
  131.                                 site_id=site_id, name=name, cost=cost,)
  132.                     else:
  133.                         result_msg = 'Created product: '
  134.                         product_obj = Product.objects.create(
  135.                             site_id=site_id, name=name, cost=cost,)
  136.                     print('{0} {1}'.format(result_msg, product_obj))
  137.                     products.append(product_obj)
  138.     return products
  139.  
  140.  
  141. def get_sub_categories(category, parent):
  142.     """Append category list, with all subcategories, by recursive method"""
  143.  
  144.     if time.time() > settings.TIME_END:
  145.         print('Scraping products completed')
  146.         sys.exit()
  147.     try:
  148.         result_msg = 'Added category: '
  149.         category_obj = Category.objects.create(name=category, parent=parent)
  150.     except IntegrityError:
  151.         result_msg = 'Go to category: '
  152.         category_obj = Category.objects.get(name=category)
  153.     print('{0} {1}'.format(result_msg, category))
  154.     if has_sub_categories(category):
  155.         parent = Category.objects.get(name=category)
  156.         sub_categories = get_children(category)
  157.         for cat in sub_categories:
  158.             get_sub_categories(cat, parent)
  159.     else:
  160.         products = get_products(category)
  161.         category_obj.products.add(*products)
  162.  
  163.  
  164. def scrap_content():
  165.     # go to sart page (main category that contains 18 categories into)
  166.     resp = requests.get(settings.FULL_API_URL, params={'url': '/producten'})
  167.     all_main_categories = []
  168.     for category in resp.json()['_embedded']['lanes'][0]['_embedded']['items']:
  169.         all_main_categories.append(category['navItem']['link']['href'])
  170.     try:
  171.         parent = Category.objects.create(name='/producten')
  172.     except IntegrityError:
  173.         parent = Category.objects.get(name='/producten')
  174.     # run appending category list with all subcategories
  175.     for cat in all_main_categories:
  176.         get_sub_categories(cat, parent)
  177.  
# Start the crawl when the module is executed as a script.
# NOTE(review): this module uses a relative import (``from .models``) and
# Django settings, so running the file directly presumably needs a
# configured Django environment / package context -- confirm how it is run.
if __name__ == '__main__':
    scrap_content()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement