Guest User

Untitled

a guest
Apr 17th, 2020
13
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.28 KB | None | 0 0
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
import re

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from w3lib import html

from .models import Base, Book, Author, Genre, Interpreter, Isbn, Image
  13.  
  14.  
  15. class ParserPipeline(object):
  16.  
  17. def process_item(self, item, spider):
  18. if item.get('info_block'):
  19. info_block_list = []
  20. for i in item.get('info_block'):
  21. i = html.remove_tags(i)
  22. i = re.sub(r' \.\.\.', ' ', i)
  23. info_block_list.append(i)
  24. item['info_block'] = info_block_list
  25. for i in item.get('info_block'):
  26. i = i.partition(':')
  27. el = {i[0]: i[2].strip() for x in i}
  28. if el.get('Язык оригинальной книги'):
  29. item['original_language'] = el['Язык оригинальной книги']
  30. if el.get('Переводчик(и)'):
  31. item['interpreter'] = el['Переводчик(и)'].split(', ')
  32. if el.get('Издатель'):
  33. item['publisher'] = el['Издатель']
  34. if el.get('Город печати'):
  35. item['city_publish'] = el['Город печати']
  36. if el.get('Год печати'):
  37. item['year_publish'] = el['Год печати']
  38. if el.get('Название печатной книги'):
  39. item['printed_name'] = el['Название печатной книги']
  40. if el.get('ISBN'):
  41. item['isbn'] = el['ISBN'].split(', ')
  42. if item.get('name'):
  43. item['name'] = item['name'].strip()
  44. return item
  45.  
  46.  
  47. class DataBasePipeline(object):
  48. Session = sessionmaker()
  49.  
  50. def __init__(self, db_name):
  51. self.db_name = db_name
  52. self.engine = create_engine('sqlite:///%s' % self.db_name, echo=False)
  53. if not os.path.exists(db_name):
  54. Base.metadata.create_all(self.engine)
  55.  
  56. @classmethod
  57. def from_crawler(cls, crawler):
  58. db_name = crawler.settings.get('DB_NAME')
  59. return cls(db_name)
  60.  
  61. def open_spider(self, spider):
  62. self.session = self.Session(bind=self.engine)
  63.  
  64. def process_item(self, item, spider):
  65. if item.get('book_id'):
  66. book = Book(
  67. id=item.get('book_id'),
  68. title=item.get('title'),
  69. number_of_pages=item.get('number_of_pages'),
  70. printed_name=item.get('printed_name'),
  71. description=item.get('description'),
  72. year_publish=item.get('year_publish'),
  73. city_publish=item.get('city_publish'),
  74. publisher=item.get('publisher'),
  75. in_language=item.get('in_language'),
  76. original_language=item.get('original_language')
  77. )
  78.  
  79. if item.get('genre'):
  80. for genre in item['genre']:
  81. book.genres.append(Genre(name=genre))
  82.  
  83. if item.get('isbn'):
  84. book.isbn = [Isbn(name=el) for el in item['isbn']]
  85.  
  86. if item.get('images'):
  87. book.image = Image(checksum=item['images'][0].get('checksum'),
  88. path=item['images'][0].get('path'))
  89.  
  90. if item.get('interpreter'):
  91. pass
  92.  
  93. self.session.add(book)
  94. self.session.commit()
  95.  
  96. if item.get('author_id'):
  97. author = Author(
  98. id=item.get('author_id'),
  99. name=item.get('name'),
  100. gender=item.get('gender'),
  101. birth_date=item.get('birth_date'),
  102. birth_place=item.get('birth_place'),
  103. death_date=item.get('death_date'),
  104. death_place=item.get('death_place'),
  105. author_bio=item.get('author_bio'),
  106.  
  107. )
  108.  
  109. self.session.add(author)
  110. self.session.commit()
  111.  
  112. def close_spider(self, spider):
  113. self.session.commit()
  114. self.session.close()
Advertisement
Add Comment
Please, Sign In to add comment