Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Приложение-скрапер продуктов
- #models
- # -*- coding: utf-8 -*-
- from django.db import models
- from mptt.models import MPTTModel, TreeForeignKey
class Category(MPTTModel):
    """A node in the MPTT-managed category tree.

    ``name`` is unique across all categories. ``parent`` points at the
    enclosing category and is empty for a root node; a category's direct
    descendants are reachable through the ``children`` reverse accessor.
    """

    name = models.CharField(max_length=255, unique=True)
    # ``on_delete`` is spelled out because it is a required argument from
    # Django 2.0 onwards; CASCADE matches the implicit default of older
    # versions, so removing a category still removes its sub-tree.
    parent = TreeForeignKey(
        'self',
        null=True,
        blank=True,
        related_name='children',
        db_index=True,
        on_delete=models.CASCADE,
    )

    def __str__(self):
        """Return the category name as its display text."""
        return self.name
class Product(models.Model):
    """A product record together with the price stored when it was saved.

    ``site_id`` is not unique: several rows may share it, each with its own
    ``cost`` and ``created_at`` timestamp.
    """

    # Categories this product is attached to (reverse accessor: ``products``).
    categories = models.ManyToManyField(Category, related_name='products')
    # Product identifier, limited to 10 characters.
    site_id = models.CharField(max_length=10)
    name = models.CharField(max_length=255)
    cost = models.DecimalField(
        max_digits=7,
        decimal_places=2,
        help_text="Price in euros (€)",
    )
    # Filled in automatically when the row is first saved.
    created_at = models.DateTimeField(
        verbose_name='date published',
        auto_now_add=True,
    )

    def __str__(self):
        """Return the product name as its display text."""
        return self.name
- # admin
- # -*- coding: utf-8 -*-
- from django.contrib import admin
- from mptt.admin import MPTTModelAdmin
- from .models import Category, Product
class CategoryMPTTModelAdmin(MPTTModelAdmin):
    """Admin configuration for ``Category`` built on the MPTT admin class."""

    # Single form section exposing the two editable fields.
    fieldsets = [
        ('Category', {'fields': ['name', 'parent']}),
    ]

    # Change-list options: one column, searchable by name, 100 rows a page.
    list_display = ('name',)
    search_fields = ['name']
    list_per_page = 100

    # Indentation setting consumed by ``MPTTModelAdmin``.
    mptt_level_indent = 10
class ProductAdmin(admin.ModelAdmin):
    """Admin configuration for ``Product``."""

    # Change-list columns and the fields covered by the search box.
    list_display = ('site_id', 'name', 'cost', 'created_at')
    search_fields = ['site_id', 'name', 'cost']
    list_per_page = 100

    # Use the two-pane horizontal selector for the many-to-many field.
    filter_horizontal = ('categories',)
# Expose both models in the Django admin with their custom admin classes.
admin.site.register(Category, admin_class=CategoryMPTTModelAdmin)
admin.site.register(Product, admin_class=ProductAdmin)
- # utils
- # -*- coding: utf-8 -*-
- import requests
- import time
- import sys
- from django.conf import settings
- from django.db import IntegrityError
- from .models import Category, Product
def get_children(category):
    """Return the sub-category links listed on a category page.

    Requests ``settings.FULL_API_URL`` with ``category`` as the ``url`` query
    parameter and collects the ``navItem`` link ``href`` of every filter item
    found under the first item of the fourth lane of the JSON response.

    Args:
        category: Category link passed to the API as the ``url`` parameter.

    Returns:
        A list of ``href`` strings. The list is empty when the page has no
        filter items or when the request fails with a connection error.

    Raises:
        KeyError, IndexError: If the response does not have the expected
            nested layout.
    """
    try:
        resp = requests.get(settings.FULL_API_URL, params={'url': category})
    except requests.ConnectionError as err:
        # Without a response there is nothing to parse. Falling through here
        # used to crash on the unbound ``resp`` variable.
        print(err)
        return []

    filters = resp.json()['_embedded']['lanes'][3]['_embedded']['items'][0]['_embedded']['filters']
    return [
        filter_item['navItem']['link']['href']
        for filter_group in filters
        for filter_item in filter_group['_embedded']['filterItems']
    ]
def has_sub_categories(category):
    """Tell whether a category page lists any filters (sub-categories).

    Requests ``settings.FULL_API_URL`` with ``category`` as the ``url`` query
    parameter and inspects the ``filters`` entry under the first item of the
    fourth lane of the JSON response.

    Args:
        category: Category link passed to the API as the ``url`` parameter.

    Returns:
        True when the ``filters`` entry is not an empty list, False when it
        is empty or when the request fails with a connection error.

    Raises:
        KeyError, IndexError: If the response does not have the expected
            nested layout.
    """
    try:
        resp = requests.get(settings.FULL_API_URL, params={'url': category})
    except requests.ConnectionError as err:
        # No response means no known sub-categories. Falling through here
        # used to crash on the unbound ``resp`` variable.
        print(err)
        return False

    filters = resp.json()['_embedded']['lanes'][3]['_embedded']['items'][0]['_embedded']['filters']
    return filters != []
def get_products(category):
    """Fetch the products of a category page and store them.

    Requests ``settings.FULL_API_URL`` with ``category`` as the ``url`` query
    parameter, walks every lane of type ``"ProductLane"`` and handles each
    item of type ``"Product"``:

    * no stored row with that ``site_id``: a new ``Product`` is created;
    * the newest stored row has a different cost: a new ``Product`` row is
      created, so earlier rows are kept alongside it;
    * otherwise the newest stored row is reused unchanged.

    Args:
        category: Category link passed to the API as the ``url`` parameter.

    Returns:
        A list of ``Product`` objects, one per product item found. The list
        is empty when the request fails with a connection error.

    Raises:
        KeyError: If the response does not have the expected nested layout.
    """
    try:
        resp = requests.get(settings.FULL_API_URL, params={'url': category})
    except requests.ConnectionError as err:
        # Without a response there is nothing to parse. Falling through here
        # used to crash on the unbound ``resp`` variable.
        print(err)
        return []

    products = []
    for lane in resp.json()['_embedded']['lanes']:
        if lane['type'] != "ProductLane":
            continue
        for item in lane['_embedded']['items']:
            if item['type'] != "Product":
                continue

            data = item['_embedded']['productCard']['_embedded']['product']
            site_id = data['id']
            name = data['description']
            cost = data['priceLabel']['now']

            # One query returns the newest stored row for this site_id, or
            # None if there is none, replacing exists() plus a second lookup.
            latest = Product.objects.filter(
                site_id=site_id).order_by('-created_at').first()

            if latest is None:
                result_msg = 'Created product: '
                product_obj = Product.objects.create(
                    site_id=site_id, name=name, cost=cost)
            elif cost != float(latest.cost):
                # The price changed: store a new row instead of overwriting.
                result_msg = 'Updated product: '
                product_obj = Product.objects.create(
                    site_id=site_id, name=name, cost=cost)
            else:
                result_msg = 'Exists product: '
                product_obj = latest

            print('{0} {1}'.format(result_msg, product_obj))
            products.append(product_obj)
    return products
def get_sub_categories(category, parent):
    """Store a category and recursively process everything beneath it.

    The category is fetched from the database, or created under ``parent``
    when it does not exist yet. If ``has_sub_categories`` reports children,
    each link returned by ``get_children`` is processed recursively with this
    category as its parent; otherwise the products returned by
    ``get_products`` are attached to this category.

    The process is terminated through ``sys.exit()`` as soon as the current
    time is past ``settings.TIME_END``.

    Args:
        category: Category link, used both as the API ``url`` parameter and
            as the stored ``Category.name``.
        parent: ``Category`` instance to use as parent when the category has
            to be created.
    """
    if time.time() > settings.TIME_END:
        print('Scraping products completed')
        sys.exit()

    # get_or_create avoids relying on IntegrityError for control flow, which
    # leaves the connection unusable when running inside a transaction.
    category_obj, created = Category.objects.get_or_create(
        name=category, defaults={'parent': parent})
    result_msg = 'Added category: ' if created else 'Go to category: '
    print('{0} {1}'.format(result_msg, category))

    if has_sub_categories(category):
        # category_obj is already the row for this name, so it is passed on
        # directly as the parent instead of being queried again.
        for cat in get_children(category):
            get_sub_categories(cat, category_obj)
    else:
        products = get_products(category)
        category_obj.products.add(*products)
def scrap_content():
    """Scrape the whole catalogue starting from the ``/producten`` page.

    Reads the main category links from the first lane of the start page,
    makes sure a root ``Category`` named ``'/producten'`` exists, and then
    processes every main category (with all of its descendants) through
    ``get_sub_categories``.
    """
    # Go to the start page (the main category that lists the top-level ones).
    resp = requests.get(settings.FULL_API_URL, params={'url': '/producten'})
    all_main_categories = [
        item['navItem']['link']['href']
        for item in resp.json()['_embedded']['lanes'][0]['_embedded']['items']
    ]

    # get_or_create avoids relying on IntegrityError for control flow, which
    # leaves the connection unusable when running inside a transaction.
    parent, _created = Category.objects.get_or_create(name='/producten')

    # Walk each main category together with all of its sub-categories.
    for cat in all_main_categories:
        get_sub_categories(cat, parent)
# Run the scraper when this module is executed directly.
if __name__ == "__main__":
    scrap_content()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement