KempeR1t

Scraping

Jul 27th, 2021
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import re
import csv
import random
import os
import telebot
from discord_webhook import DiscordWebhook
import threading
from threading import Thread
import itertools


# 'xx' placeholders: fill in your own Discord webhook URL and Telegram bot token.
webhook = DiscordWebhook(url='xx',
                         username="FRESH DUMP OF TOP SBC PLAYERS")

TOKEN = 'xx'

# A single static User-Agent, kept for reference; the rotating list below is
# used instead.
'''
headers = {
   'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
'''

# User-Agent strings rotated at random per request (see get_html below) so
# the scraper's traffic looks less uniform.
user_agent_list = [
    # Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # Internet Explorer (Trident)
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]

players_dict = {}      # squad page URL -> number of SBC squads the player appears in

player_att_dict = {}   # player URL -> attributes (rating, position, type, rarity, prices)

banned_id = ['3', '4', '5', '6', '7', '8', '29', '30']   # SBC set ids to skip

groups = []   # challenge page URLs, filled by collect_challenge_list

links = []    # finished-squad URLs, filled by collect_finish_sbc_list


def get_html(url):
    # Pick a random User-Agent for each request.
    user_agent = random.choice(user_agent_list)
    headers = {'User-Agent': user_agent}
    r = requests.get(url, headers=headers, timeout=5)
    if r.status_code == 200:
        return r

    if r.status_code == 404:
        print('Page does not exist!')
    # Implicitly returns None on any non-200 status; callers that immediately
    # access .text then raise AttributeError, which their retry loops catch.

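# Every scraper below uses the same retry idiom: loop until the fetch and the
# parse both succeed, swallowing any exception and trying again. A generic
# wrapper for that idiom could look like this (a sketch; the `delay` parameter
# is illustrative and not part of the original script):
def retry(func, *args, delay=1.0):
    """Call func(*args) until it returns without raising."""
    while True:
        try:
            return func(*args)
        except Exception as exc:
            print(f'retrying {func.__name__}: {exc}')
            time.sleep(delay)

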
def get_all_sbc(html):
    while True:
        try:
            soup = BeautifulSoup(html, 'lxml')
            sbcs = soup.find('div', class_='row col-12 d-flex')
        except Exception:
            print('retrying get_all_sbc')
            continue
        else:
            break
    return sbcs


def collect_sbc_id(sbcs):
    sbc_id = []
    while True:
        try:
            sbcs2 = sbcs.findAll('div', class_='col-md-3 col-xs-6 set_col d-none mb-5')
        except Exception:
            print('retrying collect_sbc_id')
            continue
        else:
            break
    for sbc in sbcs2:
        text = sbc.find('div', class_='set_desc').text
        if 'Icon Swaps' not in text:
            id = convert_id(sbc.find('a').get('href'))
            if id not in banned_id:
                sbc_id.append(id)
    return sbc_id


def convert_id(text):
    # Extract the first run of digits from an SBC href.
    id = re.search(r'\d+', text)
    return id.group(0)

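# e.g. convert_id('/squad-building-challenges/ALL/12') -> '12' (href shape
# assumed from the URLs built in collect_challenge_list below).
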
def collect_challenge_list(sbc_id):
    for sbc in sbc_id:
        while True:
            try:
                soup = BeautifulSoup(get_html(f'https://www.futbin.com/squad-building-challenges/ALL/{sbc}').text, 'lxml')
                time.sleep(1)
                challenge_groups = soup.findAll('div', class_='btn_holder')
            except Exception:
                print('retrying collect_challenge_list')
                continue
            else:
                break
        for group in challenge_groups:
            groups.append(group.find('a').get('href'))
    return True


def collect_finish_sbc_list(groups):
    length = len(groups)
    for group in groups:
        # Count down once per group, outside the retry loop, so retries do not
        # skew the progress counter.
        length -= 1
        print(f'{group}: {length} remaining')
        while True:
            try:
                soup = BeautifulSoup(get_html(f'https://www.futbin.com{group}').text, 'lxml')
                tds = soup.findAll('a', class_='squad_url')
            except Exception:
                print('retrying collect_finish_sbc_list')
                continue
            else:
                break
        for td in tds:
            links.append(td.get('href'))
    return True

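# Both collectors append to the module-level lists (groups, links) from two
# threads at once; list.append is effectively atomic under CPython's GIL,
# which is what this script relies on instead of explicit locks.
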
def collect_players(start_links_index, end_links_index):
    length = len(links[start_links_index:end_links_index])
    for link in links[start_links_index:end_links_index]:
        length -= 1
        #print(f'{link}, {threading.current_thread().name}: {length} remaining')
        players = get_player_list(link)
        for player in players:
            try:
                if not players_dict.get(player):
                    players_dict[player] = 1
                else:
                    players_dict[player] = players_dict[player] + 1
            except TypeError:
                print(f'skipped adding player to dict: {player}')
                pass
    #max_player_index = sorted(list(players_dict.values()))[-200]
    #write_to_csv(max_player_index)
    return True


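# players_dict is a plain dict used as a tally; collections.Counter would
# express the same counting more directly (a sketch, not used by the script):
#   from collections import Counter
#   players_dict = Counter()
#   for player in players:
#       players_dict[player] += 1
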
def get_player_list(link):
    while True:
        try:
            soup = BeautifulSoup(get_html(f'https://www.futbin.com{link}').text, 'lxml')
            divs = soup.find('div', id='area').findAll('div', class_='cardetails')
        except Exception:  # Replace Exception with something more specific.
            #print(f'retrying get_player_list - {link}')
            continue
        else:
            break
    cards = []
    for div in divs:
        try:
            # Reset per card; a list shared across iterations would report the
            # first card's type and rarity for every player on the page.
            card_info = []
            try:
                card_info_ = str(div)
                card_info.append('non-rare' if 'non-rare' in card_info_ else 'rare')
                card_info.append('gold' if 'gold' in card_info_ else 'silver' if 'silver' in card_info_ else 'bronze')
            except Exception:
                card_info = ['Unknown', link]
            if not player_att_dict.get(div.find('a').get('href')):
                player_att_dict[div.find('a').get('href')] = \
                    {'rating': div.find('div', class_='pcdisplay-rat').text,
                     'position': div.find('div', class_='pcdisplay-pos').text,
                     'type': card_info[0],
                     'rarity': card_info[1],
                     'price_ps': int(div.find('div', class_='ps-price-hover').text.replace(',', '').replace('\n', '')),
                     'price_pc': int(div.find('div', class_='pc-price-hover').text.replace(',', '').replace('\n', '')),
                     'price_xbox': int(div.find('div', class_='xbox-price-hover').text.replace(',', '').replace('\n', ''))
                     }
                     # 'price_ps_upd': 'no_data',
                     # 'price_pc_upd': 'no_data',
                     # 'price_xbox_upd': 'no_data'
            cards.append(div.find('a').get('href'))
        except AttributeError:
            print(f'skipped parsing a player on {link}')
            pass
    return cards


def get_price_update(key):
    # Fetch the player's page to resolve his resource id, then read the
    # last-updated timestamps for all three platforms from the price API.
    html = 'http://futbin.com' + key
    while True:
        try:
            soup = BeautifulSoup(get_html(html).text, 'lxml')
            player_id = \
                soup.find('div', class_='site-player-page').find('div', class_='container').find('div', id='page-info').get('data-player-resource')
            price_update = [
                get_html(f'https://www.futbin.com/20/playerPrices?player={player_id}').json()[player_id]['prices'][
                    'ps']['updated'],
                get_html(f'https://www.futbin.com/20/playerPrices?player={player_id}').json()[player_id]['prices'][
                    'pc']['updated'],
                get_html(f'https://www.futbin.com/20/playerPrices?player={player_id}').json()[player_id]['prices'][
                    'xbox']['updated']]
        except Exception:
            print('retrying get_price_update')
            continue
        else:
            break
    return price_update

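# All three timestamps come from the same JSON document, so a single request
# would do (a sketch reusing the endpoint above):
#   prices = get_html(f'https://www.futbin.com/20/playerPrices?player={player_id}').json()[player_id]['prices']
#   price_update = [prices['ps']['updated'], prices['pc']['updated'], prices['xbox']['updated']]
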
def update_top_players_prices(max_player_index, start_players_index, end_player_index):
    i = iter(players_dict.items())
    for key, value in dict(itertools.islice(i, start_players_index, end_player_index)).items():
        if value >= max_player_index:
            # Same scrape as get_price_update; reuse it rather than repeating
            # its retry loop inline.
            price_update = get_price_update(key)
            player_att_dict[key]['price_ps_upd'] = price_update[0]
            player_att_dict[key]['price_pc_upd'] = price_update[1]
            player_att_dict[key]['price_xbox_upd'] = price_update[2]
    return True


def write_to_csv():
    # max_player_index is read from module scope; the main script sets it
    # before calling this function.
    dt_now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S") + '.csv'
    path = '/home/kemper1t/' + dt_now
    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=';')
        writer.writerow(['url', 'sbc_count', 'rating', 'position', 'type', 'rarity',
                         'PS price', 'PS price Update', 'PC price',
                         'PC price Update', 'XBOX price', 'XBOX price Update'])
        for key, value in players_dict.items():
            if value >= max_player_index:
                #price_update = get_price_update(key)
                writer.writerow([key, value,
                                 player_att_dict[key]['rating'],
                                 player_att_dict[key]['position'],
                                 player_att_dict[key]['type'],
                                 player_att_dict[key]['rarity'],
                                 player_att_dict[key]['price_ps'],
                                 player_att_dict[key]['price_ps_upd'],
                                 player_att_dict[key]['price_pc'],
                                 player_att_dict[key]['price_pc_upd'],
                                 player_att_dict[key]['price_xbox'],
                                 player_att_dict[key]['price_xbox_upd'],
                                 ])
    # Ship the CSV to Discord and to two Telegram chats, then delete it.
    with open(path, 'rb') as f:
        webhook.add_file(file=f.read(), filename=dt_now)
    webhook.execute()
    bot = telebot.TeleBot(TOKEN)
    with open(path, 'rb') as f:
        bot.send_document(5020676, f)
    with open(path, 'rb') as f:
        bot.send_document(432775186, f)
    os.remove(path)

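# Main pipeline: gather SBC set ids, fan each scraping stage out over two
# threads, tally player appearances across all finished squads, refresh price
# timestamps for the top ~200 players, then export and distribute the CSV.
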
start_time = time.time()
print(f'script started at {time.time()}')

list_sbc_id = collect_sbc_id(get_all_sbc(get_html('https://www.futbin.com/squad-building-challenges').text))

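# Each split index below is just ceil(len(x) / 2): the first thread takes the
# larger half of the work and the second thread takes the rest.
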
sbc_id_thread_index = (len(list_sbc_id) // 2) + (1 if len(list_sbc_id) % 2 == 1 else 0)
thread1 = Thread(target=collect_challenge_list, args=(list_sbc_id[:sbc_id_thread_index],))
thread2 = Thread(target=collect_challenge_list, args=(list_sbc_id[sbc_id_thread_index:],))
thread1.start()
thread2.start()
thread1.join()
thread2.join()

challenge_list_index = (len(groups) // 2) + (1 if len(groups) % 2 == 1 else 0)

thread1 = Thread(target=collect_finish_sbc_list, args=(groups[:challenge_list_index],))
thread2 = Thread(target=collect_finish_sbc_list, args=(groups[challenge_list_index:],))
thread1.start()
thread2.start()
thread1.join()
thread2.join()
print("--- %s seconds ---" % (time.time() - start_time))
groups = []  # no longer needed; free the memory

links_index = (len(links) // 2) + (1 if len(links) % 2 == 1 else 0)

# The second slice must end at len(links): an end index of -1 would silently
# drop the last squad link.
thread1 = Thread(target=collect_players, kwargs={'start_links_index': 0, 'end_links_index': links_index})
thread2 = Thread(target=collect_players, kwargs={'start_links_index': links_index, 'end_links_index': len(links)})
thread1.start()
thread2.start()
thread1.join()
thread2.join()
print("--- %s seconds ---" % (time.time() - start_time))

# Export threshold: the appearance count of the 200th most-used player, or the
# minimum count if fewer than 200 players were seen.
max_player_index = sorted(players_dict.values())[-200] if len(players_dict) >= 200 else sorted(players_dict.values())[0]

players_index = (len(players_dict) // 2) + (1 if len(players_dict) % 2 == 1 else 0)
print("--- %s seconds ---" % (time.time() - start_time))
thread1 = Thread(target=update_top_players_prices, kwargs={'max_player_index': max_player_index, 'start_players_index': 0, 'end_player_index': players_index})
thread2 = Thread(target=update_top_players_prices, kwargs={'max_player_index': max_player_index, 'start_players_index': players_index, 'end_player_index': len(players_dict)})
thread1.start()
thread2.start()
thread1.join()
thread2.join()
print("--- %s seconds ---" % (time.time() - start_time))
write_to_csv()

print("--- %s seconds ---" % (time.time() - start_time))