Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from selenium import webdriver
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.common.by import By
- from selenium.webdriver.common.action_chains import ActionChains
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.common.keys import Keys
- from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException
- import time
- from datetime import date, datetime, timedelta
- from bs4 import BeautifulSoup as bs4
- import mysql.connector
- import re
# Chromium launch configuration: headless, sandbox-free, cache/popup-free.
options = Options()
_CHROME_FLAGS = [
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--headless=new',
    '--disable-gpu',
    '--disable-extensions',
    '--remote-debugging-port=9222',  # Specify a port
    '--disable-setuid-sandbox',
    "--incognito",
    "--disable-application-cache",
    "--enable-do-not-track",
    "--disable-popup-blocking",
]
for _flag in _CHROME_FLAGS:
    options.add_argument(_flag)
options.binary_location = '/snap/bin/chromium'
service = Service('/usr/bin/chromedriver')
def get_entire_source_page(driver, url="https://example.com", output_path="entire_source_page.html"):
    """Load *url* with *driver* and dump the full page source to *output_path*.

    Args:
        driver: a Selenium WebDriver (anything exposing ``get`` and ``page_source``).
        url: page to load; default keeps the original hard-coded value.
        output_path: destination HTML file, written as UTF-8.

    Raises:
        Whatever ``driver.get`` raises on navigation failure.
    """
    driver.get(url)
    page_source = driver.page_source
    with open(output_path, "w", encoding="utf-8") as source:
        source.write(page_source)
    # BUG FIX: the original printed this from a `finally`, claiming success
    # even when navigation or the file write had failed.
    print("got source page!")
def get_all_link():
    """Collect candidate profile links from the saved full page.

    Reads ``entire_source_page.html`` (written by get_entire_source_page)
    and returns every href found under ``div.image-box a[href]``, skipping
    the site's own homepage link.
    """
    with open("entire_source_page.html") as file:
        soup = bs4(file, "html.parser")
    anchors = soup.select('div.image-box a[href]')
    return [
        anchor['href']
        for anchor in anchors
        if anchor['href'] != 'https://www.livetantra.net/'
    ]
def get_in(driver):
    """Open the site's landing page, clear cookies, and click the entry gate.

    Best-effort: if the entry element never becomes clickable, the click is
    skipped and the caller proceeds anyway.

    Args:
        driver: an open Selenium WebDriver.
    """
    try:
        # BUG FIX: the original URL was the malformed "https:example.com"
        # (missing "//"), which Chrome treats as a relative/invalid address.
        driver.get("https://example.com")
        driver.delete_all_cookies()
        time.sleep(1)
        remaining_cookies = driver.get_cookies()
        print(f"Cookies remaining after the removal: {len(remaining_cookies)}")
        if not remaining_cookies:
            print("All cookies have been successfully deleted")
        else:
            print("Still cookies remaining:")
            for cookie in remaining_cookies:
                print(cookie)
        try:
            entra = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), 'abs')]"))
            )
            entra.click()
            time.sleep(2)
        # BUG FIX: was a bare `except: pass`, which also swallowed
        # KeyboardInterrupt and unrelated driver failures.
        except (TimeoutException, ElementClickInterceptedException):
            pass  # entry overlay absent or unclickable: continue without it
    finally:
        print("got in!")
def get_source_page(driver, link, output_path="source_page.html"):
    """Load *link* with *driver* and dump the page source to *output_path*.

    Args:
        driver: a Selenium WebDriver (anything exposing ``get`` and ``page_source``).
        link: full URL of the profile page to fetch.
        output_path: destination HTML file, written as UTF-8.

    Raises:
        Whatever ``driver.get`` raises on navigation failure.
    """
    driver.get(link)
    page_source = driver.page_source
    with open(output_path, "w", encoding="utf-8") as source:
        source.write(page_source)
    # BUG FIX: the original printed this from a `finally`, claiming success
    # even when navigation or the file write had failed.
    print("got source page!")
def get_info_and_put(link, connection):
    """Extract worker details from the saved profile page and insert a row.

    Parses ``source_page.html`` (written by get_source_page), pulls the
    profile fields, inserts them into ``example.com_analisi``, and returns
    the values needed by the live-analysis table. The insert is best-effort:
    on a MySQL error the scraped values are still returned.

    Args:
        link: profile URL (the numeric worker id is extracted from it).
        connection: open MySQL connection.

    Returns:
        (worker_id, name, category, vip_bool, city, followers, recen,
         numero, nr_photo)
    """
    with open("source_page.html") as file:
        soup = bs4(file, "html.parser")

    city = soup.find('span', class_='city').get_text().strip(" -")
    genre = soup.find('span', class_='genere').get_text().strip(" ")  # parsed but currently unused downstream
    name = soup.find('label', class_='name d-block').get_text().strip(" ")
    category = soup.find('span', class_='category').get_text().strip(" ")
    phone_nr = soup.find('a', class_='bt-button mb-3')
    recensioni_nr = soup.find('span', class_="review-counter")
    visual_1 = soup.find('div', class_="faq-title d-flex justify-content-between")
    followers = soup.find('span', id="follower-count").get_text().strip(" ")
    vip = soup.find('div', class_="tag-item vip")
    photo = soup.find_all('div', class_="workers-img")
    status = "active"

    # Worker id is embedded in the URL as ".../<digits>-<name>".
    # BUG FIX: the original shadowed the `id` builtin and, when the regex did
    # not match, silently used the builtin function as the id value.
    worker_id = None
    match = re.search(r'/(\d+)-[a-zA-Z]+', link)
    if match:
        worker_id = match.group(1)

    # Phone number: primary call button, else the WhatsApp contact entry.
    if phone_nr:
        phone = phone_nr.get_text().replace(" ", "")
    else:
        link_wa = soup.find('div', class_='details-list contact-info-list')
        number_wa = link_wa.find('span')
        # BUG FIX: the original referenced the undefined name `numero_wa`,
        # raising NameError whenever this fallback branch ran.
        phone = number_wa.get_text().replace(" ", "")

    # The view counter lives in one of two layouts; normalise to plain text
    # so the regexes below always receive a string.
    if visual_1:
        vis = visual_1.get_text(strip=True).replace('Recensioni', '')
    else:
        visual_container = soup.find('div', class_="d-flex justify-content-between")
        # BUG FIX: the original kept the bs4 Tag here, which broke re.search
        # (expects str) and the later str/.get_text() mix-up.
        vis = visual_container.find_all('label')[1].get_text()

    # BUG FIX: `numero` was left undefined when neither pattern matched,
    # causing a NameError when building the insert payload.
    numero = None
    if recensioni_nr:
        recen = recensioni_nr.get_text()
        match = re.search(fr"\({int(recen)}\)Visto (\d+) volte", vis)
        if match:
            numero = int(match.group(1))
    else:
        recen = "disabilitate"
        found = re.search(r'\d+', vis)
        if found:
            numero = found.group()

    if not followers:
        followers = 0
    vip_bool = bool(vip)

    tempo_attivita = datetime.now().strftime('%Y-%m-%d')
    add_data = ("INSERT INTO example.com_analisi "
                "(id_workers, nome, categoria, vip, link, città, tempo_attività, followers, nr_recensioni, visual, nr_foto, nr_telefono, status) "
                "VALUES (%(id_workers)s, %(nome)s, %(categoria)s, %(vip)s, %(link)s, %(citta)s, %(tempo_attivita)s, %(followers)s, %(nr_recensioni)s, %(visual)s, %(nr_foto)s, %(nr_telefono)s, %(status)s)")
    nr_photo = len(photo)
    data_workers = {
        'id_workers': worker_id,
        'nome': name,
        'categoria': category,
        'vip': vip_bool,
        'link': link,
        'citta': city,
        'tempo_attivita': tempo_attivita,
        'followers': followers,
        'nr_recensioni': recen,
        'visual': numero,
        'nr_foto': nr_photo,
        'nr_telefono': phone,
        'status': status,
    }

    cursor = None
    try:
        cursor = connection.cursor()
        cursor.execute(add_data, data_workers)
        connection.commit()
    except mysql.connector.Error as err:
        # Best-effort insert: log instead of the original silent `pass`, but
        # still return the scraped values so the caller can proceed.
        print(f"Error: {err}")
    finally:
        # BUG FIX: the original closed `cursor` even when connection.cursor()
        # itself had raised, masking the real error with an UnboundLocalError.
        if cursor is not None:
            cursor.close()
    return worker_id, name, category, vip_bool, city, followers, recen, numero, nr_photo
def connetti():
    """Open a connection to the local ``workers_analisi`` MySQL database.

    Returns:
        The live connection on success, ``None`` when the attempt fails.

    SECURITY NOTE(review): credentials are hard-coded below — they should be
    moved to an environment variable or a config file kept out of version
    control; confirm with the repository owner before rotating.
    """
    try:
        conn = mysql.connector.connect(
            host="localhost",
            user="root",
            password="Iolamiavitalavivoperte000",
            database="workers_analisi",
        )
    except mysql.connector.Error as err:
        print(f"Error: {err}")
        return None
    print("success")
    return conn
def change_value(id_list, connection):
    """Deactivate workers missing from this run; re-activate returning ones.

    Runs two UPDATEs on ``example.com_analisi``: rows whose id is NOT in
    *id_list* get status 'disattivato'; previously-deactivated rows whose id
    IS in *id_list* get status 'attivato'.

    Args:
        id_list: worker ids scraped in the current run (must be non-empty).
        connection: open MySQL connection.

    Returns:
        True when both UPDATEs committed, False otherwise.
    """
    # BUG FIX: an empty list rendered "IN ()" — invalid SQL that would also
    # have deactivated every row via the NOT IN clause had it parsed.
    if not id_list:
        print("Error: empty id_list passed to change_value")
        return False
    placeholders = ','.join(['%s'] * len(id_list))
    query_disattivato = f"UPDATE example.com_analisi SET status = 'disattivato' WHERE id_workers NOT IN ({placeholders})"
    query_attivato = f"UPDATE example.com_analisi SET status = 'attivato' WHERE id_workers IN ({placeholders}) AND status = 'disattivato'"
    cursor = None
    try:
        cursor = connection.cursor()
        # The id list is passed as bound parameters, not interpolated.
        cursor.execute(query_disattivato, tuple(id_list))
        connection.commit()
        cursor.execute(query_attivato, tuple(id_list))
        connection.commit()
        return True
    except mysql.connector.Error as err:
        print(f"Error: {err}")
        return False
    finally:
        # BUG FIX: the original closed `cursor` even when connection.cursor()
        # itself had raised (UnboundLocalError masking the real failure).
        if cursor is not None:
            cursor.close()
def put_analisi_live(connection, id, name, category, vip_bool, city, followers, recen, numero, nr_photo):
    """Insert one snapshot row into ``best_analisi_live``.

    Records today's scraped metrics for a single worker. Best-effort: a
    MySQL error is printed and swallowed (matches the module's style).

    Args:
        connection: open MySQL connection.
        id: worker id (parameter name kept for caller compatibility even
            though it shadows the builtin).
        name, category, vip_bool, city, followers, recen, numero, nr_photo:
            scraped values as returned by get_info_and_put.
    """
    add_data = ("INSERT INTO best_analisi_live "
                "(id_workers, nome, categoria, vip, città, followers, nr_recensioni, visual, nr_foto, tempo_analisi) "
                "VALUES (%(id_workers)s, %(nome)s, %(categoria)s, %(vip)s, %(citta)s, %(followers)s, %(nr_recensioni)s, %(visual)s, %(nr_foto)s, %(tempo_analisi)s)")
    tempo_analisi = datetime.now().strftime('%Y-%m-%d')
    data_workers = {
        'id_workers': id,
        'nome': name,
        'categoria': category,
        'vip': vip_bool,
        'citta': city,
        'followers': followers,
        'nr_recensioni': recen,
        'visual': numero,
        'nr_foto': nr_photo,
        'tempo_analisi': tempo_analisi,
    }
    cursor = None
    try:
        cursor = connection.cursor()
        cursor.execute(add_data, data_workers)
        connection.commit()
    except mysql.connector.Error as err:
        print(f"Error: {err}")
    finally:
        # BUG FIX: the original closed `cursor` even when connection.cursor()
        # itself had raised, masking the real error with an UnboundLocalError.
        if cursor is not None:
            cursor.close()
if __name__ == "__main__":
    # Main loop: scrape every profile link, record results, and keep
    # retrying until no failed links remain.
    while True:
        driver = None
        try:
            driver = webdriver.Chrome(service=service, options=options)
            driver.set_page_load_timeout(180)  # generous timeout: pages are slow
            get_in(driver)
            time.sleep(2)
            get_entire_source_page(driver)
            list_link = get_all_link()
            # If a previous run left failures behind, retry only those.
            try:
                with open('failed_links.txt', 'r') as f:
                    failed_only = [line.strip() for line in f]
                if failed_only:
                    list_link = [link.replace('https://www.example.com.ch/', '') for link in failed_only]
            except FileNotFoundError:
                pass
            counter = 0
            id_list = []
            failed_links = []
            for name in list_link:
                # BUG FIX: bind conn/link_reale before the try so the outer
                # except handler can never hit an unbound name.
                conn = None
                link_reale = "https://www.example.com.ch/" + name
                try:
                    conn = connetti()
                    if not conn:
                        print(f"Error to conn")
                        continue
                    max_retries = 3
                    success = False
                    for attempt in range(max_retries):
                        try:
                            time.sleep(3)
                            # BUG FIX: the original called the undefined
                            # get_workers_source_page(); the defined helper
                            # is get_source_page().
                            get_source_page(driver, link_reale)
                            success = True
                            break
                        except Exception as e:
                            print(f"Failed {attempt + 1} attempt for {link_reale}: {str(e)}")
                            if attempt < max_retries - 1:
                                time.sleep(5)
                    if not success:
                        print(f"Skip {link_reale} after {max_retries} failed attempts")
                        failed_links.append(link_reale)
                        continue
                    time.sleep(3)
                    counter += 1
                    print(f"Successfully processed: {counter}")
                    try:
                        # BUG FIX: the original called the undefined
                        # get_info_workers_and_put(); the defined helper is
                        # get_info_and_put().
                        id_attivo, name, category, vip_bool, city, followers, recen, numero, nr_photo = get_info_and_put(link_reale, conn)
                        id_list.append(id_attivo)
                        put_analisi_live(conn, id_attivo, name, category, vip_bool, city, followers, recen, numero, nr_photo)
                    except Exception as e:
                        print(f"Error processing data for {link_reale}: {str(e)}")
                        failed_links.append(link_reale)
                    finally:
                        conn.close()
                except Exception as e:
                    print(f"General error for {name}: {str(e)}")
                    if conn:
                        conn.close()
                    failed_links.append(link_reale)
            # Reconcile activation status for everything seen this run.
            if id_list:
                try:
                    conn = connetti()
                    if conn:
                        change_value(id_list, conn)
                        conn.close()
                except Exception as e:
                    print(f"Error in the final change_value: {str(e)}")
            if failed_links:
                print(f"Saved {len(failed_links)} failed links in failed_links.txt")
                with open('failed_links.txt', 'w') as f:
                    for link in failed_links:
                        f.write(f"{link}\n")
                print("Restart process for failed links...")
                time.sleep(5)
            else:
                print("All links have been successfully processed!")
                break
        except Exception as e:
            print(f"Critic error in the procession: {str(e)}")
        finally:
            # BUG FIX: the original called driver.quit() unconditionally,
            # raising NameError when webdriver.Chrome() itself had failed.
            if driver is not None:
                driver.quit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement