collinsanele

ben_second_milestone_updated

Jun 4th, 2020
129
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.84 KB | None | 0 0
  1. from extract_emails import ExtractEmails
  2. import re
  3. import cloudscraper
  4. from bs4 import BeautifulSoup
  5. import pandas as pd
  6. import os
  7. import concurrent.futures
  8. import phonenumbers
  9. import sys
  10.  
  11.  
# Country/region code for phone-number parsing ("AU" or "US"), read from the
# command line at import time; consumed by googleScrapePhones below.
try:
    command_arg = sys.argv[1]

    # Disabled validation. NOTE(review): as written the condition is always
    # True (x != "AU" or x != "US" holds for every x); it needs `and`
    # (or `not in {"AU", "US"}`) before being re-enabled.
    #if command_arg != "AU" or command_arg != "US":
        #sys.exit("\nCommand line argument should be 'AU' or 'US'")

except IndexError:
    sys.exit("\nPlease provide a command line argument\nFor example 'python name_of_script.py AU' for Australian domains or 'python name_of_script.py US' for US domains")
  20.    
  21.  
  22.  
  23. RESULTS = []
  24.    
  25.        
  26.            
  27.                
  28. def locate_txt_files(path=os.getcwd):  
  29.     """ A function that locates and returns a list of txt files from a given path """
  30.    
  31.     txt_array = []
  32.     for item in os.listdir(path()):
  33.         if item.endswith(".txt"):
  34.             txt_array.append(item)
  35.     return txt_array                   
  36.                        
  37.                            
  38.                                
  39.                                    
  40. def open_and_prepare_urls2(txt_file_array):
  41.     final_domains = []
  42.    
  43.     for txt_file in txt_file_array:
  44.         with open(txt_file, encoding="utf-8") as f:
  45.             content = f.read()
  46.         content = content.split("au")[0:]
  47.         domain_urls = [item.replace("|", "").replace("\n", "").replace(",", "").strip()+"au" for item in content if "." in item]
  48.         final_domains.extend(domain_urls)
  49.         print(f"Total of {len(domain_urls)} '.au' domains found in {txt_file}")
  50.         print("\n")
  51.    
  52.     return final_domains                                       
  53.                                        
  54.                                                    
  55.                                                        
  56. ###### New Additions ####  
  57. def getEmail(url):
  58.     em = ExtractEmails(url, depth=5, ssl_verify=True)
  59.    
  60.     return em.emails
  61.        
  62.    
  63.  
  64.  
  65.  
  66. def getPhone(source):
  67.     phones_array = []
  68.     phones = re.findall(r'[\+\(]?[0-9][0-9 .\-\(\)]{8,}[0-9]', source)
  69.    
  70.     prefixes = ["1300", "1800",
  71.     "02", "03", "04", "05", "07", "08",
  72.     "612", "613", "614", "615", "617", "618"]
  73.    
  74.     for prefix in prefixes:
  75.         for phone in phones:
  76.             if phone.startswith(str(prefix)) and len(phone.strip().replace(" ", "").strip()) <= 12:
  77.                 phones_array.append(phone)
  78.    
  79.     try:
  80.         phones_array = [phone.replace(" ", "").strip() for phone in phones_array]
  81.        
  82.     except:
  83.         pass
  84.    
  85.     try:
  86.         phones_array = sorted(set(phones_array), key=phones_array.index)   
  87.        
  88.     except:
  89.         pass
  90.                            
  91.     return phones_array
  92.    
  93.    
  94. def googleScrapePhones(text):
  95.     phones_list = []
  96.     for match in phonenumbers.PhoneNumberMatcher(text, command_arg):
  97.         phones_list.append(phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.E164))
  98.    
  99.     phones_set = sorted(set(phones_list), key=phones_list.index)
  100.     return ", ".join(phones_set)
  101.    
  102.  
  103.  
  104.  
  105. scraper = cloudscraper.CloudScraper(browser="chrome")
  106.  
  107. def has_ads(text, keyword="ads"):
  108.     return keyword in text
  109.  
  110.  
  111. def getContent(url):
  112.  
  113.     is_ad = False
  114.     r = scraper.get(url)
  115.  
  116.     soup = BeautifulSoup(r.text, "html.parser")
  117.  
  118.     Source = str(soup)
  119.    
  120.     is_ad = has_ads(text=Source)
  121.    
  122.     try:
  123.         fones = googleScrapePhones(text=Source)
  124.    
  125.        
  126.     except Exception as e:
  127.         #print(e)
  128.        
  129.         pass
  130.    
  131.     if fones:
  132.         fones = fones
  133.         #fones = ", ".join(fones)
  134.        
  135.        
  136.     else:
  137.         fones = ""
  138.    
  139.        
  140.     try:       
  141.         e_mails = getEmail(url)
  142.        
  143.        
  144.     except Exception as e:
  145.         #print(e)
  146.         pass
  147.    
  148.     if e_mails:
  149.         e_mails = ", ".join(e_mails)
  150.        
  151.     else:
  152.         e_mails = ""
  153.        
  154.        
  155.  
  156.     RESULTS.append((url, fones, e_mails, is_ad))
  157.  
  158.  
  159.  
  160.  
  161. def getData(url):
  162.    
  163.     fones = ""
  164.     e_mails = ""
  165.     contact_link = ""
  166.    
  167.     if "http" not in url:
  168.         url  = "http://"+url
  169.        
  170.            
  171.     #perform scraping
  172.     getContent(url)
  173.    
  174.    
  175.        
  176.    
  177.  
  178.  
  179. ##### Apply #####
  180. txt_files_list = locate_txt_files()
  181.  
  182. domains = open_and_prepare_urls2(txt_files_list)
  183.  
  184. total_time = int(len(domains)) * 6.7
  185.  
  186. print("Starting scraper\nPlease wait...\nEstimated time in sec(s):\n"+ str(total_time))
  187.  
  188. with concurrent.futures.ThreadPoolExecutor() as executor:
  189.   executor.map(getData, domains)
  190.  
  191.  
  192.  
  193.  
  194. general_result = []
  195. result_04 = []
  196.  
  197. for item_tuple in RESULTS:
  198.     #Only 04 numbers
  199.     obj_04 = {}
  200.     obj_gen = {}
  201.    
  202.     phone_str = ""
  203.     phone_gen_str = ""
  204.    
  205.     for item in item_tuple[1].split(", "):
  206.         if item.startswith("+614"):
  207.             phone_str+= item + ", "
  208.            
  209.         else:
  210.             phone_gen_str+=item + ", "
  211.    
  212.     if len(phone_str) >=3:
  213.         obj_04["Domain"] = item_tuple[0]
  214.         obj_04["Phone Number"] = phone_str
  215.         obj_04["Email"] = item_tuple[2]
  216.         obj_04["ads"] = item_tuple[3]
  217.         result_04.append(obj_04)
  218.    
  219.     obj_gen["Domain"] = item_tuple[0]
  220.     obj_gen["Phone Number"] = phone_gen_str
  221.     obj_gen["Email"] = item_tuple[2]
  222.     obj_gen["ads"] = item_tuple[3] 
  223.     general_result.append(obj_gen)
  224.    
  225.    
  226.    
  227.    
  228.  
  229.  
  230. df = pd.DataFrame(general_result)
  231. df_04 = pd.DataFrame(result_04)
  232.  
  233.  
  234. #Filter conditions
  235. filt_ads = (df["ads"]== True)
  236. filt_no_ads = (df["ads"]== False)
  237. filt_04_ads = (df_04["ads"] == True)
  238. filt_04_no_ads = (df_04["ads"] == False)
  239.  
  240. #create ads dataframe from df and df_04
  241. df_ads = df[filt_ads]
  242. df_no_ads = df[filt_no_ads]
  243. df_04_ads = df_04[filt_04_ads]
  244. df_04_no_ads = df_04[filt_04_no_ads]
  245.  
  246.  
  247. df_ads.to_excel("result_landline_ads.xlsx", index=False)
  248. df_no_ads.to_excel("result_landline_no_ads.xlsx", index=False)
  249. df_04_ads.to_excel("result_04_mobile_ads.xlsx", index=False)
  250. df_04_no_ads.to_excel("result_04_mobile_no_ads.xlsx", index=False)
Advertisement
Add Comment
Please, Sign In to add comment