Advertisement
browncrown

Untitled

Jul 25th, 2019
333
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.63 KB | None | 0 0
  1. import csv, re, requests, pytesseract
  2. from bs4 import BeautifulSoup
  3. from PIL import Image
  4. from io import BytesIO
  5. import pandas as pd
  6.  
  7. pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
  8.  
  9. countries = ['Albania', 'Algeria', 'Antigua And Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria', 'Cambodia', 'Canada', 'Chile', 'China', 'Colombia', 'Congo, Democractic Republic of the', 'Costa Rica', "Cote D'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia', 'Finland', 'France', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Hong Kong S.A.R.', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Korea, South', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Lithuania', 'Luxembourg', 'Macau S.A.R.', 'Macedonia, Former Yugoslav Republic of', 'Madagascar', 'Malaysia', 'Malta', 'Mauritius', 'Mexico', 'Moldova', 'Mongolia', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands Antilles', 'Netherlands, The', 'New Zealand', 'Nicaragua', 'Nigeria', 'Niue', 'Norway', 'Pakistan', 'Panama', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Reunion', 'Romania', 'Russia', 'Rwanda', 'Saint Kitts And Nevis', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Singapore', 'Slovakia', 'Slovenia', 'South Africa', 'Spain', 'Sri Lanka', 'Swaziland', 'Sweden', 'Switzerland', 'Taiwan', 'Tanzania', 'Thailand', 'Trinidad And Tobago', 'Tunisia', 'Turkey', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United Nations', 'United States', 'United States Minor Outlying Islands', 'Uruguay', 'Uzbekistan', 'Venezuela', 'Vietnam', 'Yemen', 'Yugoslavia', 'Zimbabwe']
  10.  
  11. countries_code = [i.replace(' ', '_') for i in countries]
  12.  
  13. email_list_per_country = []
  14.  
  15. with open(('s.csv'), 'w') as myfile:
  16.     pass
  17.  
  18. for i in range(len(countries)):
  19.     try:
  20.         country = countries_code[i]
  21.         link = requests.get('https://services.creativecow.net/country/' + country)
  22.         soup = BeautifulSoup(link.text, 'html.parser')
  23.  
  24.         companyLinks = [] #a list of all URL's for the companies on the webpage
  25.  
  26.         for link in soup.findAll('a', attrs={'href': re.compile("^https://")}):
  27.             companyLink = link.get('href')
  28.             if '/s/' in companyLink and not companyLink in companyLinks:
  29.                 companyLinks.append(companyLink)
  30.         email_list = []
  31.         email_list.append(countries[i])
  32.  
  33.  
  34.         for f in range(len(companyLinks)):
  35.             link = requests.get(companyLinks[f])
  36.             soup = BeautifulSoup(link.text, 'html.parser')
  37.             soup = soup.find("div", {"class": "vcard"})
  38.             text = soup.find('img')['src']
  39.             r = requests.get('http:' + str(text))
  40.             email_list.append(pytesseract.image_to_string(Image.open(BytesIO(r.content)), lang='eng')[7:].replace(' ', ''))
  41.             print('Finished ' + str(f+1) + ' out of '+  str(len(companyLinks))+ ' for country number ' + str(i+1) +' (' + countries[i] +')' )
  42.  
  43.         print(countries[i] + ' finished successfully!\n')
  44.  
  45.         file_name = "s.csv"
  46.         numbers = [i for i in range(15)]
  47.         email_list_per_country.append(email_list)
  48.         df = pd.DataFrame(email_list_per_country)
  49.         df = df.transpose()
  50.         df.to_csv(file_name, index=False, header=None)
  51.         print(email_list_per_country)
  52.     except:
  53.         pass
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement