# requirements.txt
#   requests
#   lxml
#   beautifulsoup4
# ---------------------------------------------------------
# run with:  python3 X.py
# ---------------------------------------------------------
# output:    companies.csv
# keep all files in the same directory

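# Example setup (a sketch; package names are inferred from the imports below, no versions are pinned):
#   pip3 install requests lxml beautifulsoup4
#   python3 X.py
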
import requests, re, csv
from urllib.parse import urlparse
from bs4 import BeautifulSoup as bs

def getList(page):
    # Fetch one page of the Xero UK advisor directory and return
    # (profile links, company names) for every result card on it.
    links, names = [], []
    r = requests.get('https://www.xero.com/content/xero/uk/advisors/find-advisors/jcr:content/par/advisors_search_6526/advisorsResults.html?type=advisors&orderBy=ADVISOR_RELEVANCE&sort=ASC&pageNumber={0}&view=list'.format(page))
    soup = bs(r.content, 'lxml')
    for a in soup.find_all('a', {'class': 'advisors-result-card-link'}):
        links.append(a.attrs['href'])
    for h3 in soup.find_all('h3', {'class': 'title-3'}):
        names.append(h3.get_text().strip())
    return links, names

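# Example (illustrative) of the call the main loop below makes:
#   links, names = getList(1)            # first directory page
#   print(len(links), 'profiles found')
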
def getFbEmail(link, s=''):
    # Normalise a scraped Facebook link, take the first path segment as the
    # page handle and look for an email address on its public /about page.
    if link[:2] == '//':
        url = 'https:' + link
    elif 'http://' not in link[:10] and 'https://' not in link[:10]:
        url = 'https://' + link
    else:
        url = link
    Link = urlparse(url)
    user = Link.path.split('/')[1].split('?')[0]
    email = getEmail("https://web.facebook.com/{user}/about".format(user=user), search=s)
    if email:
        return email
    return ''

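# Example (illustrative, hypothetical page name):
#   getFbEmail('https://www.facebook.com/SomePractice/?ref=page')
#   builds https://web.facebook.com/SomePractice/about and scans it for an address.
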
def getEmail(link, fb='', search=''):
    # Try to find an email address for a site: scan the landing page for
    # mailto: links and address-like text, then fall back to a contact page
    # and finally to a Facebook page. Returns 'NULL' when nothing is found.
    if 'http://' not in link[:10].lower() and 'https://' not in link[:10].lower():
        link = 'http://' + link
    parsed = urlparse(link)
    aFb = ''
    cPage = ''
    ses = requests.Session()
    if search:
        # Recursive calls (contact page, Facebook) already carry the full URL.
        Url = link
    else:
        Url = "{s}://{d}".format(s=parsed.scheme, d=parsed.netloc)
    try:
        r = ses.get(Url, headers={'User-Agent': 'Chrome'})
    except Exception:
        r = ses.get(Url.replace('http:', 'https:'), headers={'User-Agent': 'Chrome'})
    soup = bs(r.content, 'lxml')
    body = soup.find('body')
    if body is None:
        return 'NULL'
    for s in body.find_all('script'):
        s.extract()
    # Pass 1: inspect every anchor on the page.
    for anchor in body.find_all('a'):
        try:
            url = anchor.attrs['href'].lower()
            if 'mailto' in url:
                return url.split(':')[1]
            elif '@' in anchor.get_text():
                match = re.findall(r'[\w\.-]+@[\w\.-]+', anchor.get_text())
                if match:
                    return match[0]
            elif 'contact' in url and search != 'fb' and search != 'contact':
                cPage = url                     # remember a candidate contact page
            elif 'facebook' in url or 'fb' in url:
                aFb = url.split(':')[1]         # remember a candidate Facebook link
        except Exception:
            pass
    # Pass 2: look for an address anywhere in the visible text.
    text = body.get_text()
    match = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    if match:
        return match[0]
    # Pass 3: follow the contact page, if one was found.
    try:
        if cPage.startswith('/') and not search:
            print("\t[=] Visiting Contact Page")
            email = getEmail(Url + cPage, search='contact')
            if len(email) > 4:
                return email
        elif ('http://' in cPage[:10] or 'https://' in cPage[:10]) and not search:
            print("\t[=] Visiting Contact Page")
            email = getEmail(cPage, search='contact')
            if len(email) > 4:
                return email
    except Exception:
        pass
    # Pass 4: fall back to the Facebook page from the profile or from the site itself.
    if fb and not search:
        print("\t[=] Visiting Facebook Page")
        email = getFbEmail(fb, 'fb')
        return email if len(email) > 4 else 'NULL'
    elif aFb and not search:
        print("\t[=] Visiting Facebook Page")
        email = getFbEmail(aFb, 'aFb')
        return email if len(email) > 4 else 'NULL'
    return 'NULL'

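# Example of the extraction regex used above (illustrative):
#   re.findall(r'[\w\.-]+@[\w\.-]+', 'Write to info@example.co.uk for a quote')
#   -> ['info@example.co.uk']
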
def getData(link, name):
    # Scrape one advisor profile page and return a CSV row for it.
    print("[=] Scraping Company:", name)
    r = requests.get(link)
    fb = ''
    pStatus = ''
    soup = bs(r.content, 'lxml')

    # WEBSITE
    web = soup.find('a', class_='advisors-profile-hero-detailed-contact-website').attrs['href']

    # PHONE NUMBER
    phone = soup.find('a', class_='advisors-profile-hero-detailed-contact-phone').attrs['data-phone']

    # PARTNER STATUS
    for div in soup.find_all('div', class_='jTrtdh'):
        if 'partner status' in div.get_text().lower():
            pStatus = div.find('h6').get_text().replace(' ', '').strip()

    # FACEBOOK PROFILE
    try:
        for social in soup.find_all('a', class_='advisor-profile-practice-social-link'):
            if 'facebook' in social.get_text().lower():
                fb = social.attrs['href']
    except Exception:
        pass

    # INDUSTRIES SERVED
    indServed = []
    for industry in soup.find_all('div', class_='TagContent'):
        if 'industries served' in industry.get_text().lower():
            for item in industry.find_all('li'):
                indServed.append(item.get_text().strip())
            break

    # CONNECTED APPS
    conApps = []
    for item in soup.find('ul', class_='advisors-profile-experience-list').find_all('li'):
        conApps.append(item.a.img.attrs['alt'])

    # DISPLAYING DATA
    print('\t[+] Phone Number:', phone)
    print('\t[+] Website:', web)
    print('\t[+] Partner Status:', pStatus)
    if fb:
        print('\t[+] Facebook:', fb)
    if indServed:
        print('\t[+] Industries Served:', ', '.join(indServed))
    if conApps:
        print("\t[+] Connected Apps:", ', '.join(conApps))

    # EMAIL
    print("\n\t[=] Getting Email from website")
    email = getEmail(web, fb).split('?')[0]
    if email != 'NULL':
        print("\t[+] Email:", email)
    else:
        print("\t[-] Email Not Found")

    print("[=] Scraping Completed for Company:", name)
    return [name, web, pStatus, ', '.join(indServed), ', '.join(conApps), '', '', phone, email]

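# The returned row lines up with the CSV header written by the main loop below:
#   Company Name, URL, Partner status, Industries served, Connected Apps,
#   Contact person name (left blank), Contact person position (left blank),
#   Phone number, Email address
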
print("------------- 4th reich.py_by_Skip420@FreeNode_#Shark ------------\n")
try:
    # Walk the directory pages and write one CSV row per company.
    with open('companies.csv', 'w', newline='', encoding='utf-8') as file:
        cFile = csv.writer(file)
        cFile.writerow(['Company Name', 'URL', 'Partner status', 'Industries served', 'Connected Apps', 'Contact person name', 'Contact person position', 'Phone number', 'Email address'])
        for i in range(1, 249):
            print("[=] Getting Companies on Page", i)
            links, names = getList(i)
            print("[+] {} Companies Retrieved\n".format(len(links)))
            for link, name in zip(links, names):
                data = getData(link, name)
                cFile.writerow(data)
                print('\n')

except KeyboardInterrupt:
    print("\n[-] Program Stopped")
except Exception as e:
    print(e)