Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #! python3
- # OG Frosh - setting the pace, as usual :)
- #Please do not remove, i'm releasing such a nice gift in exchange for nothing
- #but a request that you leave these 3 consecutive lines of text.
- import requests, bs4, re
# Loose e-mail matcher: local part, "@", domain, then a 2-4 letter TLD.
emailRegex = re.compile(r'''(
    [a-zA-Z0-9._-]+
    @
    [a-zA-Z0-9.-]+
    (\.[a-zA-Z]{2,4})
    )''', re.VERBOSE)

BASE_URL = "https://chinaexporter118.mingluji.com/node/"
FIRST_NODE = 806
LAST_NODE = 29226          # exclusive upper bound, as passed to range()
OUTPUT_PATH = 'Alibaba_leads2.csv'
# The contact name is read from the field this many positions before the
# e-mail field. NOTE(review): layout assumption inherited from the original
# script -- confirm against the live pages.
NAME_OFFSET = 5
REQUEST_TIMEOUT = 30       # seconds; without it a stalled socket blocks forever
PROGRESS_EVERY = 100


def _clean(text):
    """Collapse whitespace runs so one field can never span several lines."""
    return ' '.join(text.split())


def build_lead_row(field_texts):
    """Build one output line from the texts of a page's ``.field-item`` nodes.

    Args:
        field_texts: list of strings, one per ``.field-item`` element, in
            document order. Element 0 is used as the company name.

    Returns:
        ``"name, email, company\\n"`` for the first field (index 1 or later)
        that contains an e-mail address, or ``None`` when no such field
        exists. The name is left empty when there is no field
        ``NAME_OFFSET`` positions before the e-mail field.
    """
    email_index = None
    # Index 0 is the company field and is deliberately not searched.
    for index, text in enumerate(field_texts):
        if index >= 1 and emailRegex.search(text) is not None:
            email_index = index
            break
    if email_index is None:
        return None

    name_index = email_index - NAME_OFFSET
    # A negative index would silently wrap around to the end of the list and
    # pick an unrelated field, so emit an empty name instead.
    name = field_texts[name_index] if name_index >= 0 else ''
    email = field_texts[email_index]
    company = field_texts[0]
    return ', '.join(_clean(part) for part in (name, email, company)) + '\n'


def fetch_field_texts(node_id):
    """Download one node page and return the text of each ``.field-item``.

    Returns an empty list when the page cannot be fetched (connection error,
    timeout, or HTTP error status), so the caller simply skips that node.
    """
    url = BASE_URL + str(node_id)
    try:
        res = requests.get(url, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException:
        # Best-effort crawl: one bad page must not abort the whole run.
        return []
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    page = bs4.BeautifulSoup(res.text, 'html.parser')
    return [item.get_text() for item in page.select('.field-item')]


def main():
    """Crawl every node id in range and append the leads found to the CSV."""
    # The context manager closes the file even if the crawl is interrupted.
    with open(OUTPUT_PATH, 'a', encoding="UTF-8") as resFile:
        print('starting .....')
        for node_id in range(FIRST_NODE, LAST_NODE):
            row = build_lead_row(fetch_field_texts(node_id))
            if row is not None:
                # row already ends with '\n', so print() leaves a blank line
                # between records, exactly as the original output did.
                print(row)
                resFile.write(row)
            # Progress is reported by node id for every node, not only for
            # nodes that yielded a lead.
            if node_id % PROGRESS_EVERY == 0:
                print(node_id)
    print('done')


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement