Advertisement
browncrown

Untitled

Jul 24th, 2019
160
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.47 KB | None | 0 0
  1. import csv
  2. import re
  3. from bs4 import BeautifulSoup
  4. import requests
  5. from PIL import Image
  6. import pytesseract
  7. from selenium import webdriver
  8. from io import BytesIO
  9. pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
  10.  
  11. countries = ['Albania', 'Algeria', 'Antigua And Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria', 'Cambodia', 'Canada', 'Chile', 'China', 'Colombia', 'Congo, Democractic Republic of the', 'Costa Rica', "Cote D'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia', 'Finland', 'France', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Hong Kong S.A.R.', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Korea, South', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Lithuania', 'Luxembourg', 'Macau S.A.R.', 'Macedonia, Former Yugoslav Republic of', 'Madagascar', 'Malaysia', 'Malta', 'Mauritius', 'Mexico', 'Moldova', 'Mongolia', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands Antilles', 'Netherlands, The', 'New Zealand', 'Nicaragua', 'Nigeria', 'Niue', 'Norway', 'Pakistan', 'Panama', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Reunion', 'Romania', 'Russia', 'Rwanda', 'Saint Kitts And Nevis', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Singapore', 'Slovakia', 'Slovenia', 'South Africa', 'Spain', 'Sri Lanka', 'Swaziland', 'Sweden', 'Switzerland', 'Taiwan', 'Tanzania', 'Thailand', 'Trinidad And Tobago', 'Tunisia', 'Turkey', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United Nations', 'United States', 'United States Minor Outlying Islands', 'Uruguay', 'Uzbekistan', 'Venezuela', 'Vietnam', 'Yemen', 'Yugoslavia', 'Zimbabwe']
  12.  
  13. countries_code = [i.replace(' ', '_') for i in countries]
  14.  
  15. for i in range(len(countries)):
  16.     print(countries[i], i)
  17.  
  18. country = input('Please choose the country: (type the number next to the country from the list above)\n')
  19. country = countries_code[int(country)]
  20.  
  21. link = requests.get('https://services.creativecow.net/country/' + country)
  22. soup = BeautifulSoup(link.text, 'html.parser')
  23.  
  24. companyLinks = [] #a list of all URL's for the companies on the webpage
  25.  
  26. for link in soup.findAll('a', attrs={'href': re.compile("^https://")}):
  27.     companyLink = link.get('href')
  28.     if '/s/' in companyLink and not companyLink in companyLinks:
  29.         companyLinks.append(companyLink)
  30.         # print(companyLink)
  31. email_list = []
  32.  
  33. for i in range(len(companyLinks)):          #for i in range(len(companyLinks)):
  34.     link = requests.get(companyLinks[i])
  35.     soup = BeautifulSoup(link.text, 'html.parser')
  36.     mydivs = soup.find("div", {"class": "vcard"})
  37.     soup = mydivs
  38.     text= soup.find('img')['src']
  39.     r = requests.get('http:' + str(text))
  40.     img = Image.open(BytesIO(r.content))
  41.     imgtext = pytesseract.image_to_string(img, lang='eng')
  42.     imgtext = imgtext[7:]
  43.     imgtext = imgtext.replace(' ', '')
  44.     email_list.append(imgtext)
  45.     print('Finished ' + str(i+1) + ' out of '+  str(len(companyLinks)))
  46. for i in email_list:
  47.     print(i)
  48.  
  49.  
  50. email_list = zip(email_list)
  51. with open((country+'.csv'), 'w') as myfile:
  52.     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
  53.     wr.writerows(email_list)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement