Advertisement
Guest User

Untitled

a guest
Dec 11th, 2017
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.38 KB | None | 0 0
  1. from scrapy.spiders import SitemapSpider
  2. from apkmirror_scraper.items import ApkmirrorScraperItem, ApkmirrorItemLoader
  3.  
  4. class ApkmirrorSitemapSpider(SitemapSpider):
  5. name = 'apkmirror-spider'
  6. sitemap_urls = ['http://www.apkmirror.com/sitemap_index.xml']
  7. sitemap_rules = [(r'.*-android-apk-download/$', 'parse')]
  8.  
  9. def parse(self, response):
  10. loader = ApkmirrorItemLoader(item=ApkmirrorScraperItem(), response=response)
  11.  
  12. loader.add_value('url', response.url)
  13. loader.add_xpath(field_name='title', xpath='//h1[@title]/text()')
  14. loader.add_xpath(field_name='developer', xpath='//h3[@title]/a/text()')
  15. loader.add_xpath(field_name='app', xpath='//*[contains(@data-channel-name, "App Updates")]/@data-channel-name')
  16.  
  17. apk_details_loader = loader.nested_xpath('//*[@title="APK details"]/following-sibling::*[@class="appspec-value"]')
  18.  
  19. apk_details_loader.add_xpath(field_name="version_name", xpath=".//text()")
  20. apk_details_loader.add_xpath(field_name="version_code", xpath=".//text()")
  21.  
  22. return loader.load_item()
  23.  
  24. import re
  25.  
  26. import scrapy
  27. import scrapy.loader
  28.  
  29. from scrapy.loader.processors import Compose, MapCompose, TakeFirst
  30.  
  31. class ApkmirrorScraperItem(scrapy.Item):
  32. url = scrapy.Field()
  33. title = scrapy.Field()
  34. developer = scrapy.Field()
  35. app = scrapy.Field()
  36. version_name = scrapy.Field()
  37. version_code = scrapy.Field()
  38. architectures = scrapy.Field()
  39. package = scrapy.Field()
  40. apk_file_size = scrapy.Field()
  41. android_min_version = scrapy.Field()
  42. android_target_version = scrapy.Field()
  43. supported_dpis = scrapy.Field()
  44. md5_signature = scrapy.Field()
  45. time_uploaded = scrapy.Field()
  46. time_scraped = scrapy.Field()
  47. download_link = scrapy.Field()
  48.  
  49.  
  50. def parse_app(data_channel_name):
  51. '''Parse the name of the app from the "data-channel-name" attribute of the button named "Follow [app_name] Updates".'''
  52. pattern = re.compile(r'(?P<app>.+) App Updates')
  53. return pattern.search(data_channel_name).groupdict().get("app")
  54.  
  55. def get_version_line(apk_details):
  56. '''Get the line containing the version from the 'APK details' section.'''
  57. return next(line for line in apk_details if line.startswith("Version:"))
  58.  
  59. def get_architectures_line(apk_details):
  60. '''Get the line containing the supported architectures (e.g. "arm", "x64") from the 'APK details' section, if present.'''
  61. return apk_details[1] if not apk_details[1].startswith("Package:") else None # The line does not contain any keywords and may not be present, in which case None is returned
  62.  
  63. def get_package_line(apk_details):
  64. return next(line for line in apk_details if line.startswith("Package:")) # The 'package line' is always present and starts with "Package:"
  65.  
  66. def parse_version_line(version_line):
  67. '''Parse the 'versionName' and 'versionCode' from the relevant line in 'APK details'.'''
  68. PATTERN = r"^Version: (?P<version_name>.+) ((?P<version_code>d+))s*$" # Note that the pattern includes the end-of-line character ($). This is necessary because some package names (e.g. Google Play) themselves contain brackets.
  69. return re.match(PATTERN, version_line).groupdict()
  70.  
  71.  
  72. class ApkmirrorItemLoader(scrapy.loader.ItemLoader):
  73.  
  74. url_out = TakeFirst()
  75.  
  76. title_in = MapCompose(unicode.strip)
  77. title_out = TakeFirst()
  78.  
  79. developer_in = MapCompose(unicode.strip)
  80. developer_out = TakeFirst()
  81.  
  82. app_in = MapCompose(parse_app)
  83. app_out = TakeFirst()
  84.  
  85. version_name_in = Compose(get_version_line, parse_version_line, lambda d: d.get("version_name"))
  86. version_name_out = TakeFirst()
  87.  
  88. version_code_in = Compose(get_version_line, parse_version_line, lambda d: d.get("version_code"))
  89. version_code_out = TakeFirst()
  90.  
  91. scrapy parse --spider=apkmirror-spider http://www.apkmirror.com/apk/google-inc/sheets/sheets-1-7-152-06-release/google-sheets-1-7-152-06-30-android-apk-download/
  92.  
  93. # Scraped Items ------------------------------------------------------------
  94. [{'app': u'Google Sheets',
  95. 'developer': u'Google Inc.',
  96. 'title': u'Google Sheets 1.7.152.06.30 (arm) (nodpi)',
  97. 'url': 'http://www.apkmirror.com/apk/google-inc/sheets/sheets-1-7-152-06-release/google-sheets-1-7-152-06-30-android-apk-download/',
  98. 'version_code': u'71520630',
  99. 'version_name': u'1.7.152.06.30'}]
  100.  
  101. apk_details_loader = loader.nested_xpath('//*[@title="APK details"]/following-sibling::*[@class="appspec-value"]//text()')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement