Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from scrapy.spiders import SitemapSpider
- from apkmirror_scraper.items import ApkmirrorScraperItem, ApkmirrorItemLoader
class ApkmirrorSitemapSpider(SitemapSpider):
    """Spider that walks the APKMirror sitemap index and scrapes APK download pages.

    Only sitemap URLs ending in ``-android-apk-download/`` are dispatched to
    :meth:`parse`; each such page produces one ``ApkmirrorScraperItem``.
    """

    name = 'apkmirror-spider'
    sitemap_urls = ['http://www.apkmirror.com/sitemap_index.xml']
    sitemap_rules = [(r'.*-android-apk-download/$', 'parse')]

    def parse(self, response):
        """Populate and return an item from a single APK download page.

        :param response: the downloaded page for one APK.
        :returns: the item built by ``ApkmirrorItemLoader.load_item()``.
        """
        item_loader = ApkmirrorItemLoader(item=ApkmirrorScraperItem(), response=response)
        item_loader.add_value('url', response.url)

        # Fields read straight from the page, as (field name, XPath) pairs.
        page_queries = (
            ('title', '//h1[@title]/text()'),
            ('developer', '//h3[@title]/a/text()'),
            ('app', '//*[contains(@data-channel-name, "App Updates")]/@data-channel-name'),
        )
        for field, query in page_queries:
            item_loader.add_xpath(field_name=field, xpath=query)

        # Both version fields come from the same text nodes of the value cell
        # that follows the "APK details" element; the loader's input
        # processors pick the relevant part for each field.
        details_loader = item_loader.nested_xpath('//*[@title="APK details"]/following-sibling::*[@class="appspec-value"]')
        for field in ("version_name", "version_code"):
            details_loader.add_xpath(field_name=field, xpath=".//text()")

        return item_loader.load_item()
- import re
- import scrapy
- import scrapy.loader
- from scrapy.loader.processors import Compose, MapCompose, TakeFirst
class ApkmirrorScraperItem(scrapy.Item):
    """Container for the data scraped from one APKMirror download page."""

    # Page-level fields.
    url = scrapy.Field()
    title = scrapy.Field()
    developer = scrapy.Field()
    app = scrapy.Field()

    # Fields taken from the "APK details" section.
    version_name = scrapy.Field()
    version_code = scrapy.Field()

    # NOTE(review): the remaining fields are declared here, but no code
    # visible alongside this class populates them yet.
    architectures = scrapy.Field()
    package = scrapy.Field()
    apk_file_size = scrapy.Field()
    android_min_version = scrapy.Field()
    android_target_version = scrapy.Field()
    supported_dpis = scrapy.Field()
    md5_signature = scrapy.Field()
    time_uploaded = scrapy.Field()
    time_scraped = scrapy.Field()
    download_link = scrapy.Field()
def parse_app(data_channel_name):
    """Extract the app name from a "data-channel-name" attribute value.

    The attribute belongs to the button named "Follow [app_name] Updates" and
    has the form "<app name> App Updates".

    :param data_channel_name: the raw attribute value.
    :returns: the app name, or ``None`` when the value does not have the
        expected "<app name> App Updates" shape (previously this raised
        ``AttributeError`` on the failed match).
    """
    match = re.search(r'(?P<app>.+) App Updates', data_channel_name)
    if match is None:
        return None
    return match.group("app")
def get_version_line(apk_details):
    """Get the line containing the version from the 'APK details' section.

    :param apk_details: iterable of text lines from the 'APK details' section.
    :returns: the first line starting with "Version:", or ``None`` if there
        is no such line.
    """
    # A default is passed to next() so that a missing line yields None rather
    # than letting StopIteration escape from a plain function call.
    return next((line for line in apk_details if line.startswith("Version:")), None)
def get_architectures_line(apk_details):
    """Get the supported-architectures line (e.g. "arm", "x64") from 'APK details'.

    The line carries no keyword of its own; when present it is the second
    line of the section, directly before the "Package:" line.

    :param apk_details: sequence of text lines from the 'APK details' section.
    :returns: the second line, or ``None`` when that line is the "Package:"
        line or when the section has fewer than two lines.
    """
    # Guard against short input: indexing [1] unconditionally raised
    # IndexError on sections with fewer than two lines.
    if len(apk_details) < 2:
        return None
    candidate = apk_details[1]
    if candidate.startswith("Package:"):
        return None
    return candidate
def get_package_line(apk_details):
    """Get the line containing the package name from the 'APK details' section.

    The original author notes that this line is always present and starts
    with "Package:".

    :param apk_details: iterable of text lines from the 'APK details' section.
    :returns: the first line starting with "Package:", or ``None`` if the
        line is unexpectedly missing.
    """
    # A default is passed to next() so that unexpected input yields None
    # rather than letting StopIteration escape from a plain function call.
    return next((line for line in apk_details if line.startswith("Package:")), None)
def parse_version_line(version_line):
    """Parse the 'versionName' and 'versionCode' from the version line of 'APK details'.

    The expected form is ``Version: <version name> (<version code>)``, for
    example ``Version: 1.7.152.06.30 (71520630)``.

    :param version_line: the line starting with "Version:".
    :returns: a dict with the keys ``"version_name"`` and ``"version_code"``
        (both strings), or ``None`` when the line does not have the expected
        form.
    """
    # The pattern is anchored at the end of the line ($) and the name group
    # is greedy, so only the LAST parenthesised group is taken as the version
    # code. This matters because the name part can itself contain brackets.
    # The backslash escapes (\( \d \) \s) are essential: without them the
    # parentheses become regex groups and "d"/"s" match literal letters.
    pattern = r"^Version: (?P<version_name>.+) \((?P<version_code>\d+)\)\s*$"
    match = re.match(pattern, version_line)
    if match is None:
        return None
    return match.groupdict()
class ApkmirrorItemLoader(scrapy.loader.ItemLoader):
    """Item loader holding the per-field input/output processors for APKMirror items.

    Every ``*_out`` processor is ``TakeFirst()``, so each loaded field ends up
    as a single value instead of a list.
    """

    url_out = TakeFirst()

    # Strip surrounding whitespace from each extracted text node. Calling the
    # value's own strip() (instead of ``unicode.strip``) keeps this working on
    # both Python 2 and Python 3, where the ``unicode`` builtin does not exist.
    title_in = MapCompose(lambda text: text.strip())
    title_out = TakeFirst()

    developer_in = MapCompose(lambda text: text.strip())
    developer_out = TakeFirst()

    # The raw value is the "data-channel-name" attribute; parse_app reduces
    # it to the app name.
    app_in = MapCompose(parse_app)
    app_out = TakeFirst()

    # Both version fields receive the full list of 'APK details' lines:
    # locate the "Version:" line, parse it, then pick the relevant key.
    version_name_in = Compose(get_version_line, parse_version_line, lambda d: d.get("version_name"))
    version_name_out = TakeFirst()

    version_code_in = Compose(get_version_line, parse_version_line, lambda d: d.get("version_code"))
    version_code_out = TakeFirst()
- scrapy parse --spider=apkmirror-spider http://www.apkmirror.com/apk/google-inc/sheets/sheets-1-7-152-06-release/google-sheets-1-7-152-06-30-android-apk-download/
- # Scraped Items ------------------------------------------------------------
- [{'app': u'Google Sheets',
- 'developer': u'Google Inc.',
- 'title': u'Google Sheets 1.7.152.06.30 (arm) (nodpi)',
- 'url': 'http://www.apkmirror.com/apk/google-inc/sheets/sheets-1-7-152-06-release/google-sheets-1-7-152-06-30-android-apk-download/',
- 'version_code': u'71520630',
- 'version_name': u'1.7.152.06.30'}]
# NOTE(review): this is the same nested-loader call as in
# ApkmirrorSitemapSpider.parse, except that the XPath ends in "//text()",
# so the nested selector targets the text nodes themselves rather than the
# "appspec-value" element. Presumably an alternative the author tried --
# confirm which variant is intended; `loader` is not defined at this level.
- apk_details_loader = loader.nested_xpath('//*[@title="APK details"]/following-sibling::*[@class="appspec-value"]//text()')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement