Advertisement
realtalker

Scraper Python - Cron Problem

Feb 11th, 2025 (edited)
44
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 12.72 KB | Source Code | 0 0
  1. from selenium import webdriver
  2. from selenium.webdriver.chrome.service import Service
  3. from selenium.webdriver.chrome.options import Options
  4. from selenium.webdriver.common.by import By
  5. from selenium.webdriver.common.action_chains import ActionChains
  6. from selenium.webdriver.support.ui import WebDriverWait
  7. from selenium.webdriver.support import expected_conditions as EC
  8. from selenium.webdriver.common.keys import Keys
  9. from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException
  10. import time
  11. from datetime import date, datetime, timedelta
  12. from bs4 import BeautifulSoup as bs4
  13. import mysql.connector
  14. import re
  15.  
  16.  
  17. options = Options()
  18. options.add_argument('--no-sandbox')
  19. options.add_argument('--disable-dev-shm-usage')
  20. options.add_argument('--headless=new')
  21. options.add_argument('--disable-gpu')
  22. options.add_argument('--disable-extensions')
  23. options.add_argument('--remote-debugging-port=9222')  # Specify a port
  24. options.add_argument('--disable-setuid-sandbox')
  25.  
  26. options.add_argument("--incognito")
  27. options.add_argument("--disable-application-cache")
  28. options.add_argument("--enable-do-not-track")
  29. options.add_argument("--disable-popup-blocking")
  30.  
  31. options.binary_location = '/snap/bin/chromium'
  32.  
  33. service = Service('/usr/bin/chromedriver')
  34.  
  35.  
  36. def get_entire_source_page(driver):
  37.     try:
  38.         driver.get("https://example.com")
  39.         pagine_source = driver.page_source
  40.  
  41.         with open("entire_source_page.html", "w", encoding="utf-8") as source:
  42.             source.write(pagine_source)
  43.     finally:
  44.         print("got source page!")
  45.  
  46.  
  47. def get_all_link():
  48.     with open("entire_source_page.html") as file:
  49.         soup = bs4(file, "html.parser")
  50.     lista_link = []
  51.     links = soup.select('div.image-box a[href]')
  52.     for link in links:
  53.         if link['href'] != 'https://www.livetantra.net/':  
  54.                 lista_link.append(link['href'])
  55.  
  56.  
  57.     return lista_link
  58.  
  59.  
  60.  
  61. def get_in(driver):
  62.     try:
  63.         driver.get("https:example.com")
  64.         driver.delete_all_cookies()
  65.         time.sleep(1)
  66.         remaining_cookies = driver.get_cookies()
  67.         print(f"Cookies remaining after the removal: {len(remaining_cookies)}")
  68.         if not remaining_cookies:
  69.             print("All cookies have been successfully deleted")
  70.         else:
  71.             print("Still cookies remaining:")
  72.             for cookie in remaining_cookies:
  73.                 print(cookie)
  74.         try:
  75.             entra = WebDriverWait(driver, 10).until(
  76.                 EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), 'abs')]"))
  77.             )
  78.             entra.click()
  79.             time.sleep(2)
  80.        
  81.         except:
  82.             pass
  83.        
  84.     finally:
  85.         print("got in!")
  86.  
  87. def get_source_page(driver, link):
  88.     try:
  89.         driver.get(link)
  90.         pagine_source = driver.page_source
  91.  
  92.         with open("source_page.html", "w", encoding="utf-8") as source:
  93.             source.write(pagine_source)
  94.     finally:
  95.         print("got source page!")
  96.  
  97.  
  98.  
  99. def get_info_and_put(link, connection):
  100.     with open("source_page.html") as file:
  101.         soup = bs4(file, "html.parser")
  102.  
  103.     city = soup.find('span', class_ = 'city').get_text().strip(" -")
  104.     genre = soup.find('span', class_ = 'genere').get_text().strip(" ")
  105.     name = soup.find('label', class_ = 'name d-block').get_text().strip(" ")
  106.     category = soup.find('span', class_ = 'category').get_text().strip(" ")
  107.     phone_nr = soup.find('a', class_ = 'bt-button mb-3')
  108.     recensioni_nr = soup.find('span', class_ = "review-counter")
  109.     visual_1= soup.find('div', class_="faq-title d-flex justify-content-between")
  110.  
  111.    
  112.     followers = soup.find('span', id = "follower-count" ).get_text().strip(" ")
  113.     vip = soup.find('div', class_ = "tag-item vip")
  114.     photo = soup.find_all('div', class_ = "workers-img")
  115.     status = "active"
  116.     pattern = r'/(\d+)-[a-zA-Z]+'
  117.     match = re.search(pattern, link)
  118.  
  119.     if match:
  120.         id_found = match.group(1)
  121.         id = str(id_found)
  122.  
  123.  
  124.  
  125.     if phone_nr:
  126.         phone = phone_nr.get_text().replace(" ", "")
  127.    
  128.     else:
  129.         link_wa = soup.find('div', class_='details-list contact-info-list')
  130.         number_wa = link_wa.find('span')
  131.         phone = numero_wa.get_text().replace(" ", "")
  132.      
  133.  
  134.     if visual_1:
  135.         vis = visual_1.get_text(strip=True).replace('Recensioni', '')
  136.      
  137.  
  138.     else:
  139.         visual_cointainer = soup.find('div', class_="d-flex justify-content-between")
  140.         visual_all = visual_cointainer.find_all('label')
  141.         visual_2 = visual_all[1]
  142.         vis = visual_2
  143.  
  144.     if recensioni_nr:
  145.         recen = recensioni_nr.get_text()
  146.         pattern = fr"\({int(recen)}\)Visto (\d+) volte"
  147.  
  148.  
  149.         match = re.search(pattern, vis)
  150.         if match:
  151.             numero = int(match.group(1))
  152.  
  153.         else:
  154.             pass
  155.  
  156.  
  157.     else:
  158.         recen = "disabilitate"
  159.  
  160.         visu = vis.get_text()
  161.  
  162.         numero = re.search(r'\d+', visu)
  163.  
  164.         if numero:
  165.             numero = numero.group()
  166.  
  167.     if followers:
  168.         pass
  169.  
  170.     else:
  171.         followers = 0
  172.  
  173.     if vip:
  174.         vip_bool = True
  175.  
  176.     else:
  177.         vip_bool = False
  178.  
  179.  
  180.     tempo_attività = datetime.now().strftime('%Y-%m-%d')
  181.  
  182.  
  183.     add_data = ("INSERT INTO example.com_analisi "
  184.         "(id_workers, nome, categoria, vip, link, città, tempo_attività, followers, nr_recensioni, visual, nr_foto, nr_telefono, status) "
  185.         "VALUES (%(id_workers)s, %(nome)s, %(categoria)s, %(vip)s, %(link)s, %(citta)s, %(tempo_attivita)s, %(followers)s, %(nr_recensioni)s, %(visual)s, %(nr_foto)s, %(nr_telefono)s, %(status)s)")
  186.  
  187.     nr_photo = len(photo)
  188.  
  189.     data_workers = {
  190.             'id_workers': id,
  191.             'nome': name,
  192.             'categoria': category,
  193.             'vip': vip_bool,
  194.             'link': link,
  195.             'citta': city,
  196.             'tempo_attivita': tempo_attività,
  197.             'followers': followers,
  198.             'nr_recensioni': recen,
  199.             'visual': numero,
  200.             'nr_foto': nr_photo,
  201.             'nr_telefono' : phone,  
  202.             'status' : status
  203.         }
  204.  
  205.     try:
  206.         cursor = connection.cursor()
  207.         cursor.execute(add_data, data_workers)
  208.         connection.commit()
  209.         return id, name, category, vip_bool, city, followers, recen, numero, nr_photo
  210.     except mysql.connector.Error:
  211.         pass
  212.         return id, name, category, vip_bool, city, followers, recen, numero, nr_photo
  213.     finally:
  214.         cursor.close()
  215.  
  216. def connetti():
  217.     try:
  218.         connection = mysql.connector.connect(
  219.             host = "localhost",
  220.             user = "root",
  221.             password = "Iolamiavitalavivoperte000",
  222.             database = "workers_analisi"
  223.         )
  224.         print("success")
  225.         return connection
  226.     except mysql.connector.Error as err:
  227.         print(f"Error: {err}")
  228.         return None
  229.    
  230. def change_value(id_list, connection):
  231.  
  232.     placeholders = ','.join(['%s'] * len(id_list))
  233.     query_disattivato = f"UPDATE example.com_analisi SET status = 'disattivato' WHERE id_workers NOT IN ({placeholders})"
  234.     query_attivato =  f"UPDATE example.com_analisi SET status = 'attivato' WHERE id_workers IN ({placeholders}) AND status = 'disattivato'"
  235.     try:
  236.         cursor = connection.cursor()
  237.         # Passiamo la lista di ID direttamente come parametro
  238.         cursor.execute(query_disattivato, tuple(id_list))
  239.         connection.commit()
  240.         cursor.execute(query_attivato, tuple(id_list))
  241.         connection.commit()
  242.         return True
  243.     except mysql.connector.Error as err:
  244.         print(f"Error: {err}")
  245.         return False
  246.     finally:
  247.         cursor.close()
  248.  
  249. def put_analisi_live(connection, id, name, category, vip_bool, city, followers, recen, numero, nr_photo):
  250.     add_data = ("INSERT INTO best_analisi_live "
  251.         "(id_workers, nome, categoria, vip, città, followers, nr_recensioni, visual, nr_foto, tempo_analisi) "
  252.         "VALUES (%(id_workers)s, %(nome)s, %(categoria)s, %(vip)s, %(citta)s, %(followers)s, %(nr_recensioni)s, %(visual)s, %(nr_foto)s, %(tempo_analisi)s)")
  253.  
  254.     tempo_analisi = datetime.now().strftime('%Y-%m-%d')
  255.     # Creazione di un dizionario con i dati
  256.     data_workers = {
  257.             'id_workers': id,
  258.             'nome': name,
  259.             'categoria': category,
  260.             'vip': vip_bool,
  261.             'citta': city,
  262.             'followers': followers,
  263.             'nr_recensioni': recen,
  264.             'visual': numero,  # Conversione a stringa per sicurezza
  265.             'nr_foto': nr_photo,
  266.             'tempo_analisi' : tempo_analisi
  267.         }
  268.  
  269.     try:
  270.         cursor = connection.cursor()
  271.         cursor.execute(add_data, data_workers)
  272.         connection.commit()
  273.     except mysql.connector.Error as err:
  274.         print(f"Error: {err}")
  275.     finally:
  276.         cursor.close()
  277.  
  278.  
  279. if __name__ == "__main__":
  280.     while True:  
  281.         try:
  282.          
  283.             driver = webdriver.Chrome(service=service, options=options)
  284.             driver.set_page_load_timeout(180)  # Aumenta il timeout a 180 secondi
  285.             get_in(driver)
  286.             time.sleep(2)
  287.      
  288.             get_entire_source_page(driver)
  289.             list_link = get_all_link()
  290.            
  291.  
  292.             try:
  293.                 with open('failed_links.txt', 'r') as f:
  294.                     failed_only = [line.strip() for line in f.readlines()]
  295.                     if failed_only:  # Se ci sono link falliti, processiamo solo quelli
  296.                         list_link = [link.replace('https://www.example.com.ch/', '') for link in failed_only]
  297.             except FileNotFoundError:
  298.                 pass  
  299.  
  300.             counter = 0
  301.             id_list = []
  302.             failed_links = []
  303.  
  304.             for name in list_link:
  305.                 try:
  306.                     conn = connetti()
  307.                     if not conn:
  308.                         print(f"Error to conn")
  309.                         continue
  310.  
  311.                     link_reale = "https://www.example.com.ch/" + name
  312.                    
  313.                     max_retries = 3
  314.                     success = False
  315.                     for attempt in range(max_retries):
  316.                         try:
  317.                             time.sleep(3)  
  318.                             get_workers_source_page(driver, link_reale)
  319.                             success = True
  320.                             break
  321.                         except Exception as e:
  322.                             print(f"Failed {attempt + 1} attempt for {link_reale}: {str(e)}")
  323.                             if attempt < max_retries - 1:
  324.                                 time.sleep(5)  
  325.                                 continue
  326.                    
  327.                     if not success:
  328.                         print(f"Skip {link_reale} after {max_retries} failed attempts")
  329.                         failed_links.append(link_reale)
  330.                         continue
  331.  
  332.                     time.sleep(3)
  333.                     counter += 1
  334.                     print(f"Successfully processed: {counter}")
  335.  
  336.                     try:
  337.                         id_attivo, name, category, vip_bool, city, followers, recen, numero, nr_photo = get_info_workers_and_put(link_reale, conn)
  338.                         id_list.append(id_attivo)
  339.                         put_analisi_live(conn, id_attivo, name, category, vip_bool, city, followers, recen, numero, nr_photo)
  340.                     except Exception as e:
  341.                         print(f"Error processing data for  {link_reale}: {str(e)}")
  342.                         failed_links.append(link_reale)
  343.                     finally:
  344.                         conn.close()
  345.  
  346.                 except Exception as e:
  347.                     print(f"General error for {name}: {str(e)}")
  348.                     if conn:
  349.                         conn.close()
  350.                     failed_links.append(link_reale)
  351.  
  352.        
  353.             if id_list:
  354.                 try:
  355.                     conn = connetti()
  356.                     if conn:
  357.                         change_value(id_list, conn)
  358.                         conn.close()
  359.                 except Exception as e:
  360.                     print(f"Error in the final change_value: {str(e)}")
  361.  
  362.        
  363.             if failed_links:
  364.                 print(f"Saved {len(failed_links)} failed links in failed_links.txt")
  365.                 with open('failed_links.txt', 'w') as f:
  366.                     for link in failed_links:
  367.                         f.write(f"{link}\n")
  368.                 print("Restart process for failed links...")
  369.                 time.sleep(5)  
  370.             else:
  371.                 print("All links have been successfully processed!")
  372.                 break
  373.         except Exception as e:
  374.             print(f"Critic error in the procession: {str(e)}")
  375.         finally:
  376.             driver.quit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement