Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- import json
- from bs4 import BeautifulSoup, Comment
- import time
- import random
- import dataset
- from datetime import date
# Multipliers for the one-letter magnitude suffixes of humanized figures.
_MAGNITUDE_SUFFIXES = {'k': 1000, 'm': 1000000, 'b': 1000000000}

# Keys removed from the record before it is returned for storage.
_DISCARDED_KEYS = (
    'id',
    'short_description',
    'game_intel_data',
    'publisher_address',
    'publisher_email',
    'publisher_apps',
    'unified_app',
    'contains_ads',
    'installs',
    'downloads_and_revenue',
    'humanized_worldwide_last_month_downloads',
    'humanized_worldwide_last_month_revenue',
    'top_in_app_purchases',
    'related_apps',
    'appId',
)


def _humanized_to_int(text):
    """Convert a humanized figure such as ``'< 5k'`` or ``'$1.5m'`` to an int.

    Only decimal digits and ``.`` are read as the number; a trailing
    ``k``/``m``/``b`` (any case) scales it. Returns ``None`` for empty input
    or when no usable number is present.
    """
    if not text:
        return None
    text = str(text).strip()
    numeric = ''.join(ch for ch in text if ch.isdecimal() or ch == '.').strip('.')
    if not numeric:
        return None
    multiplier = _MAGNITUDE_SUFFIXES.get(text[-1].lower(), 1)
    try:
        if '.' in numeric:
            # round() guards against float artefacts such as 2.3 * 1000.
            return int(round(float(numeric) * multiplier))
        return int(numeric) * multiplier
    except ValueError:
        # e.g. more than one decimal point
        return None


def _comparison_sign(text):
    """Return a leading ``'<'`` or ``'>'`` of *text*, or ``None`` if absent."""
    if isinstance(text, str) and text and text[0] in ('<', '>'):
        return text[0]
    return None


def _iso_prefix_to_date(text):
    """Build a ``date`` from the ``YYYY?MM?DD`` prefix of *text* (``None`` if empty)."""
    if not text:
        return None
    return date(year=int(text[0:4]), month=int(text[5:7]), day=int(text[8:10]))


def _epoch_to_date(stamp):
    """Build a local ``date`` from the first ten characters of *stamp*.

    NOTE(review): presumably the value is an epoch timestamp in seconds or
    milliseconds and the ten-character cut reduces it to seconds -- confirm
    against the data source.
    """
    if not stamp:
        return None
    return date.fromtimestamp(int(str(stamp)[:10]))


def restruct(item):
    """
    Flatten the nested fields of an app record so it can be stored as one row.

    The record is modified in place: download/revenue figures are converted to
    numbers, date fields to ``date`` objects, the keys listed in
    ``_DISCARDED_KEYS`` are dropped, ``app_id`` becomes ``id``, ``is_parsed``
    is set to 1 and any remaining dict/list value is serialized with
    ``json.dumps``.

    :param item: dict with the scraped app data
    :return: the same dict, restructured
    :raises KeyError: if one of the fields read below is missing
    """
    figures = item['downloads_and_revenue']
    dow = figures['downloads']
    rev = figures['revenue']
    rev_break = figures['revenueBreakdown']
    dow_break = figures['downloadBreakdown']

    release_date = _iso_prefix_to_date(item['release_date'])
    updated_date = _iso_prefix_to_date(item['updated_date'])
    downloads_revenue_date = _iso_prefix_to_date(item['downloads_revenue_date'])
    worldwide_release_date = _epoch_to_date(item['worldwide_release_date'])
    recent_release_date = _epoch_to_date(item['recent_release_date'])

    # The last character of the rating is dropped before conversion (e.g. the
    # '+' of '12+'). An empty rating is stored as None instead of leaving the
    # variable unbound.
    raw_rating = item['content_rating']
    content_rating = int(raw_rating[:-1]) if raw_rating else None

    for key in _DISCARDED_KEYS:
        item.pop(key, None)

    item.update(
        {
            'id': item['app_id'],
            'dow_sign': _comparison_sign(dow),
            'downloads': _humanized_to_int(dow),
            'rev_sign': _comparison_sign(rev),
            'revenue': _humanized_to_int(rev),
            'revenue_breakdown': rev_break,
            'download_breakdown': dow_break,
            'is_parsed': 1,
            'release_date': release_date,
            'updated_date': updated_date,
            'downloads_revenue_date': downloads_revenue_date,
            'recent_release_date': recent_release_date,
            'worldwide_release_date': worldwide_release_date,
            'content_rating': content_rating,
        })
    item.pop('app_id')

    # Only existing keys are reassigned, so the dict size stays constant
    # while it is being iterated.
    for key, value in item.items():
        if isinstance(value, (dict, list)):
            item[key] = json.dumps(value)
    return item
def _split_price(price):
    """Split a displayed purchase price into ``(amount, currency_symbol)``.

    ``'Free'`` yields ``(0, None)``; a price whose first character is not a
    digit is read as ``<symbol><number>``; any other price is read as a plain
    number without a currency symbol. An empty price yields ``(None, None)``.
    """
    if not price:
        return None, None
    if price == 'Free':
        return 0, None
    if price[0] not in '0123456789':
        return float(price[1:]), price[0]
    return float(price), None


def get_info_from_item(url, db):
    """
    Fetch an app page and extract the app profile embedded in an HTML comment.

    Besides returning the profile, the function writes to three tables of
    *db*: every entry of ``top_in_app_purchases`` is upserted into
    ``sensortower_iap``, every related app not yet known is added to
    ``sensortower_apps`` with ``is_parsed=0``, and each (related app, app)
    pair is upserted into ``sensortower_related_apps``.

    :param url: str, address of the app page
    :param db: database handle indexable by table name (the script creates it
        with ``dataset.connect``)
    :return: dict with the parsed profile, or None when the page contains no
        comment carrying the ``'app_profile',`` token
    """
    req_html = requests.get(url)
    soup = BeautifulSoup(req_html.text, 'html.parser')
    # The profile is read from a comment node of the page.
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        tokens = str(comment).split()
        if "'app_profile'," not in tokens:
            continue

        # Everything after the sixth whitespace-separated token is glued
        # together; the first character and the last two are cut off and the
        # remainder is wrapped in braces to form the JSON object.
        # NOTE(review): gluing tokens without a separator also removes the
        # whitespace inside JSON string values (e.g. multi-word names). Left
        # unchanged because the exact comment layout is not visible here.
        payload = ''.join(tokens[6:])[1:-2]
        # No quote/bracket rewriting is applied: the payload is parsed as
        # JSON as-is (rewriting '{'/'}' to '['/']' would corrupt objects).
        info_json = json.loads('{' + payload + '}')

        table_sensortower_apps = db['sensortower_apps']
        table_sensortower_iap = db['sensortower_iap']
        table_sensortower_related_apps = db['sensortower_related_apps']

        for app_purchases in info_json['top_in_app_purchases']:
            # Computed per entry so a price never leaks from the previous one.
            amount, currency = _split_price(app_purchases['price'])
            app_purchases.update({
                'id': app_purchases['iap_id'],
                'price': amount,
                'price_currency': currency,
                'sensortower_app_id': info_json['app_id']
            })
            app_purchases.pop('iap_id')
            table_sensortower_iap.upsert(app_purchases, ['id'])

        for related_app in info_json['related_apps']:
            if not table_sensortower_apps.find_one(id=related_app['app_id']):
                table_sensortower_apps.upsert(
                    {"id": related_app["app_id"], "app_view_url": related_app['app_view_url'], "is_parsed": 0},
                    ['id'])
            # Keyed on the pair itself: the row carries no 'id' value, so an
            # 'id' key could never match an existing link row.
            table_sensortower_related_apps.upsert(
                {"related_app_id": related_app["app_id"], "sensortower_app_id": info_json["app_id"]},
                ['related_app_id', 'sensortower_app_id'])
        return info_json
    return None
def parse(app_list, db, is_pars_all=1):
    """
    Parse every app of *app_list* and store the result in ``sensortower_apps``.

    For each entry the app page is fetched with ``get_info_from_item`` (which
    also records the related apps it finds), the entry is merged with the
    fetched profile, flattened with ``restruct`` and upserted by ``id``.
    Entries already stored with ``is_parsed=1`` are skipped. When no profile
    can be extracted from the page, the function sleeps 15-60 seconds and
    moves on to the next entry.

    :param app_list: iterable of dicts; each needs ``app_view_url`` and either
        ``app_id`` or ``id``
    :param db: database handle indexable by table name
    :param is_pars_all: kept for backward compatibility. It used to rebind
        ``app_list`` to the unparsed rows during the loop, which cannot change
        a running iteration, so it has no effect on which entries are visited.
        NOTE(review): if the intent is to keep crawling rows discovered
        during this run, the caller has to call ``parse`` again.
    :return: None
    """
    table_sensortower_apps = db['sensortower_apps']
    for app_item in app_list:
        # Entries read back from the table carry the identifier under 'id',
        # freshly listed ones under 'app_id'.
        app_id = app_item.get('app_id', app_item.get('id'))
        if table_sensortower_apps.find_one(id=app_id, is_parsed=1):
            continue

        item_for_update = get_info_from_item('https://sensortower.com' + app_item['app_view_url'], db)
        if item_for_update is None:
            # No profile on the page: back off for a while, then continue.
            print('Ждем разбана')
            time.sleep(random.randint(15, 60))
            continue

        app_item.update(item_for_update)
        app_item = restruct(app_item)
        table_sensortower_apps.upsert(app_item, ['id'])
if __name__ == '__main__':
    # Script entry point: parse the apps of the category rankings first, then
    # every app stored in the database that is still marked as not parsed.
    db = dataset.connect('sqlite:///mydatabase.db')
    table_sensortower_apps = db['sensortower_apps']
    url_top = 'https://sensortower.com/api/ios/rankings/get_category_rankings?category=0&country=US&date=2021-05-11T00%3A00%3A00.000Z&device=IPHONE&limit=200&offset=0'

    # One request is enough: each returned row is indexed at positions 0..2,
    # and the three columns are concatenated column by column.
    rankings = requests.get(url_top).json()
    top_app_list = []
    for index in range(3):
        top_app_list += [row[index] for row in rankings]

    # Parse the rankings unless every listed app is already stored as parsed.
    # Comparing id sets stays correct when the columns overlap or the
    # endpoint returns fewer rows than requested.
    top_ids = {app['id'] for app in top_app_list}
    parsed_ids = {app['id'] for app in table_sensortower_apps.find(is_parsed=1)}
    if top_ids - parsed_ids:
        parse(top_app_list, db, 0)

    # Materialize the rows so that the emptiness check below is meaningful
    # (an iterator object is truthy even when it yields nothing).
    not_parsed_app_list = list(table_sensortower_apps.find(is_parsed=0))
    if not not_parsed_app_list:
        not_parsed_app_list = list(table_sensortower_apps.find(is_parsed=1))
    parse(not_parsed_app_list, db)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement