collinsanele

ben_second_milestone

Jun 4th, 2020
116
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.33 KB | None | 0 0
  1. from extract_emails import ExtractEmails
  2. import re
  3. import cloudscraper
  4. from bs4 import BeautifulSoup
  5. import pandas as pd
  6. import os
  7. import concurrent.futures
  8.  
  9. RESULTS = []
  10.    
  11.        
  12.            
  13.                
  14. def locate_txt_files(path=os.getcwd):  
  15.     """ A function that locates and returns a list of txt files from a given path """
  16.    
  17.     txt_array = []
  18.     for item in os.listdir(path()):
  19.         if item.endswith(".txt"):
  20.             txt_array.append(item)
  21.     return txt_array                   
  22.                        
  23.                            
  24.                                
  25.                                    
  26. def open_and_prepare_urls2(txt_file_array):
  27.     final_domains = []
  28.    
  29.     for txt_file in txt_file_array:
  30.         with open(txt_file, encoding="utf-8") as f:
  31.             content = f.read()
  32.         content = content.split("au")[0:]
  33.         domain_urls = [item.replace("|", "").replace("\n", "").replace(",", "").strip()+"au" for item in content if "." in item]
  34.         final_domains.extend(domain_urls)
  35.         print(f"Total of {len(domain_urls)} '.au' domains found in {txt_file}")
  36.         print("\n")
  37.    
  38.     return final_domains                                       
  39.                                        
  40.                                                    
  41.                                                        
  42. ###### New Additions ####  
  43. def getEmail(url):
  44.     em = ExtractEmails(url, depth=5, ssl_verify=True)
  45.    
  46.     return em.emails
  47.        
  48.    
  49.  
  50.  
  51.  
  52. def getPhone(source):
  53.     phones_array = []
  54.     phones = re.findall(r'[\+\(]?[0-9][0-9 .\-\(\)]{8,}[0-9]', source)
  55.    
  56.     prefixes = ["1300", "1800",
  57.     "02", "03", "04", "05", "07", "08",
  58.     "612", "613", "614", "615", "617", "618"]
  59.    
  60.     for prefix in prefixes:
  61.         for phone in phones:
  62.             if phone.startswith(str(prefix)) and len(phone.strip().replace(" ", "").strip()) <= 12:
  63.                 phones_array.append(phone)
  64.    
  65.     try:
  66.         phones_array = [phone.replace(" ", "").strip() for phone in phones_array]
  67.        
  68.     except:
  69.         pass
  70.    
  71.     try:
  72.         phones_array = sorted(set(phones_array), key=phones_array.index)   
  73.        
  74.     except:
  75.         pass
  76.                            
  77.     return phones_array
  78.    
  79.  
  80.  
  81.  
  82.  
  83.  
# Shared cloudscraper client; getContent() calls scraper.get() on it to
# download each page.
# NOTE(review): browser="chrome" presumably selects a Chrome-like browser
# profile for the requests — confirm against the cloudscraper documentation.
scraper = cloudscraper.CloudScraper(browser="chrome")
  85.  
  86. def has_ads(text, keyword="ads"):
  87.     return keyword in text
  88.  
  89.  
  90. def getContent(url):
  91.  
  92.     is_ad = False
  93.     r = scraper.get(url)
  94.  
  95.     soup = BeautifulSoup(r.text, "html.parser")
  96.  
  97.     Source = str(soup)
  98.    
  99.     is_ad = has_ads(text=Source)
  100.    
  101.     try:
  102.         fones = getPhone(Source)
  103.    
  104.        
  105.     except Exception as e:
  106.         #print(e)
  107.        
  108.         pass
  109.    
  110.     if fones:
  111.         fones = ", ".join(fones)
  112.        
  113.        
  114.     else:
  115.         fones = ""
  116.    
  117.        
  118.     try:       
  119.         e_mails = getEmail(url)
  120.        
  121.        
  122.     except Exception as e:
  123.         #print(e)
  124.         pass
  125.    
  126.     if e_mails:
  127.         e_mails = ", ".join(e_mails)
  128.        
  129.     else:
  130.         e_mails = ""
  131.        
  132.        
  133.  
  134.     RESULTS.append((url, fones, e_mails, is_ad))
  135.  
  136.  
  137.  
  138.  
  139. def getData(url):
  140.    
  141.     fones = ""
  142.     e_mails = ""
  143.     contact_link = ""
  144.    
  145.     if "http" not in url:
  146.         url  = "http://"+url
  147.        
  148.            
  149.     #perform scraping
  150.     getContent(url)
  151.    
  152.    
  153.        
  154.    
  155.  
  156.  
  157. ##### Apply #####
  158. txt_files_list = locate_txt_files()
  159.  
  160. domains = open_and_prepare_urls2(txt_files_list)
  161.  
  162. total_time = int(len(domains)) * 6.7
  163.  
  164. print("Starting scraper\nPlease wait...\nEstimated time in sec(s):\n"+ str(total_time))
  165.  
  166. with concurrent.futures.ThreadPoolExecutor() as executor:
  167.   executor.map(getData, domains)
  168.  
  169.  
  170.  
  171.  
  172. general_result = []
  173. result_04 = []
  174.  
  175. for item_tuple in RESULTS:
  176.     #Only 04 numbers
  177.     if  item_tuple[1]:
  178.         for item in item_tuple[1].split(", "):
  179.             if item.startswith("04") or item.startswith("4"):
  180.                 obj_04 = {}
  181.                 obj_04["Domain"] = item_tuple[0]
  182.                 obj_04["Phone Number"] = item
  183.                 obj_04["Email"] = item_tuple[2]
  184.                 obj_04["ads"] = item_tuple[3]
  185.                
  186.                 result_04.append(obj_04)
  187.    
  188.     #General numbers
  189.     obj_gen = {}
  190.     obj_gen["Domain"] = item_tuple[0]
  191.     obj_gen["Phone Number"] = item_tuple[1]
  192.     obj_gen["Email"] = item_tuple[2]
  193.     obj_gen["ads"] = item_tuple[3]
  194.    
  195.     general_result.append(obj_gen)
  196.  
  197.  
  198. df = pd.DataFrame(general_result)
  199. df_04 = pd.DataFrame(result_04)
  200.  
  201. #Filter conditions
  202. filt_ads = (df["ads"]== True)
  203. filt_no_ads = (df["ads"]== False)
  204. filt_04_ads = (df_04["ads"] == True)
  205. filt_04_no_ads = (df_04["ads"] == False)
  206.  
  207. #create ads dataframe from df and df_04
  208. df_ads = df[filt_ads]
  209. df_no_ads = df[filt_no_ads]
  210. df_04_ads = df_04[filt_04_ads]
  211. df_04_no_ads = df_04[filt_04_no_ads]
  212.  
  213.  
  214. df_ads.to_excel("result_landline_ads.xlsx", index=False)
  215. df_no_ads.to_excel("result_landline_no_ads.xlsx", index=False)
  216. df_04_ads.to_excel("result_04_mobile_ads.xlsx", index=False)
  217. df_04_no_ads.to_excel("result_04_mobile_no_ads.xlsx", index=False)
Advertisement
Add Comment
Please, Sign In to add comment