Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import json
- import openpyxl
- import requests
- from lxml import html
def search_fauna_eu(genus=None, species=None):
    """Look up a taxon on the Fauna Europaea data portal and return its order.

    A plain taxon search ("portal/taxon/find") is tried first using the
    space-joined terms. If it yields no result links, a broader search
    ("portal/taxon/search") that also covers synonyms, misapplied names and
    common names is issued with the terms joined by " AND ".

    Every result page is then fetched in turn, and the text of the link in
    the seventh item of the ``classification-breadcrumbs`` list is returned
    for the first page that has one.
    NOTE(review): the seventh breadcrumb is presumably the order rank; this
    position is inherited from the original XPath - confirm against the site.

    Args:
        genus: Genus name, or None to leave it out of the query.
        species: Species epithet, or None to leave it out of the query.

    Returns:
        The breadcrumb text of the first matching result page, or None when
        no search terms were given or no result page has that breadcrumb.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    search_url = "http://www.fauna-eu.org/cdm_dataportal/search/results/taxon"
    headers = {'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'}
    cookies = {'has_js': '0'}
    # Without a timeout, requests waits forever on a stalled connection.
    timeout = 30

    terms = [term for term in (genus, species) if term]
    if not terms:
        # Nothing to search for; avoid querying the portal with an empty string.
        return None

    def fetch_tree(url, params=None):
        # Download a page and parse it into an lxml HTML tree.
        page = requests.get(
            url,
            headers=headers,
            cookies=cookies,
            params=params,
            timeout=timeout,
        )
        return html.document_fromstring(page.text)

    def find_result_links(params):
        # Run one search and return the result anchors with absolute hrefs.
        tree = fetch_tree(search_url, params)
        tree.make_links_absolute(search_url)
        return tree.xpath("//a[@class='nameAuthorPart']")

    results = find_result_links({
        'query': ' '.join(terms),
        'ws': 'portal/taxon/find',
        'search[doTaxa]': '1',
        'search[doSynonyms]': '',
        'search[doTaxaByCommonNames]': '',
    })
    if not results:
        # Fall back to the broader search across synonyms and common names.
        results = find_result_links({
            'query': " AND ".join(terms),
            'ws': 'portal/taxon/search',
            'search[doTaxa]': '1',
            'search[doSynonyms]': '1',
            'search[doMisappliedNames]': '1',
            'search[doTaxaByCommonNames]': '1',
        })

    for link in results:
        href = link.get('href')
        if not href:
            # An anchor without an href cannot be followed.
            continue
        tree = fetch_tree(href)
        order_results = tree.xpath("//ul[@id='classification-breadcrumbs']/li[7]/a")
        if order_results:
            return order_results[0].text_content()
    return None
def search_col(genus=None, species=None):
    """Look up a taxon on the Catalogue of Life website and return its order.

    Builds a scientific-name search URL from the given genus and/or species,
    follows each result link, and returns the text of the link in the table
    cell that follows a cell whose text is exactly "Order".

    Args:
        genus: Genus name, or None to leave it out of the search.
        species: Species epithet, or None to leave it out of the search.

    Returns:
        The order name from the first result page that lists one, or None
        when no search terms were given or no result page lists an order.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    from urllib.parse import quote

    if not genus and not species:
        # Nothing to search for.
        return None

    headers = {'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'}
    # The original cookie name was 'aci_language=' (stray '='), which is sent
    # as "aci_language==en" and so does not set the language to "en".
    cookies = {
        'aci_language': 'en',
        'treeExtinct': '1',
        'has_js': '0',
    }
    # Without a timeout, requests waits forever on a stalled connection.
    timeout = 30

    def fetch_tree(page_url):
        # Download a page and parse it into an lxml HTML tree.
        page = requests.get(
            page_url,
            headers=headers,
            cookies=cookies,
            timeout=timeout,
        )
        return html.document_fromstring(page.text)

    url = "http://www.catalogueoflife.org/col/search/scientific/"
    # Percent-encode the terms so characters such as '/', '?' or '#' cannot
    # alter the structure of the URL path.
    if genus:
        url += "genus/{}/".format(quote(str(genus), safe=''))
    if species:
        url += "species/{}/".format(quote(str(species), safe=''))
    url += "fossil/1/match/1"

    tree = fetch_tree(url)
    tree.make_links_absolute(url)
    for link in tree.xpath("//td[@class='field_header_black']/a"):
        href = link.get('href')
        if not href:
            # An anchor without an href cannot be followed.
            continue
        detail = fetch_tree(href)
        order_results = detail.xpath("//td[preceding-sibling::td[text()='Order']]/a")
        if order_results:
            return order_results[0].text_content()
    return None
def search_wikipedia(query):
    """Search English Wikipedia and return the order named on a hit's page.

    Runs a full-text search through the MediaWiki API, then downloads the
    article for each hit in turn and looks for markup of the form
    ``<span class="order"><a ...>NAME</a></span>``.

    Args:
        query: Search text (the caller passes a genus name).

    Returns:
        The captured NAME from the first article containing such markup, or
        None when the query is empty, the API response has no search
        results, or no article matches.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If the API response is not valid JSON.
    """
    from urllib.parse import quote

    if not query:
        # An empty search makes the API return an error object, not results.
        return None

    headers = {'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'}
    # Without a timeout, requests waits forever on a stalled connection.
    timeout = 30
    order_regex = re.compile(r'<span class="order"><a.+?>([\w ]+?)</a></span>')

    search = requests.get(
        "https://en.wikipedia.org/w/api.php",
        headers=headers,
        params={
            'action': 'query',
            'list': 'search',
            'format': 'json',
            'srinfo': '',
            'srsearch': query,
        },
        timeout=timeout,
    )
    # Error responses carry no 'query' key; treat them as "no hits" rather
    # than raising KeyError.
    hits = json.loads(search.text).get('query', {}).get('search', [])

    for hit in hits:
        # Percent-encode the title so characters such as '?', '#' or '%' do
        # not truncate or corrupt the article URL.
        title = quote(hit['title'].replace(' ', '_'))
        page = requests.get(
            "https://en.wikipedia.org/wiki/{}".format(title),
            headers=headers,
            timeout=timeout,
        )
        r_match = order_regex.search(page.text)
        if r_match is not None:
            return r_match.group(1)
    return None
def main(reference_path="/Users/mac/Downloads/part2.xlsx", workbook_path="test.xlsx"):
    """Fill in the missing order (column A) for every row of the workbook.

    Column layout of the 'OHNE Subspezies' sheet, as read by this function:
    index 0 holds the order, index 3 the genus and index 5 the species.
    The first row is skipped as a header.

    For each row without an order, sources are tried in this sequence and
    the first hit is written to column A:
      1. genus listed in the 'Tabelle1' sheet of the reference workbook
         (fourth column, first two rows skipped) -> "Coleoptera"
      2. search_fauna_eu(genus, species)
      3. search_col(genus, species)
      4. search_wikipedia(genus)

    Args:
        reference_path: Workbook containing the 'Tabelle1' genus list.
        workbook_path: Workbook that is read, updated and saved in place.
    """
    reference = openpyxl.load_workbook(reference_path)['Tabelle1']
    # A set makes the per-row membership test O(1) instead of a list scan.
    coleoptera = {str(r[3].value) for i, r in enumerate(reference.rows) if i > 1}
    del reference

    wb = openpyxl.load_workbook(workbook_path)
    ws = wb['OHNE Subspezies']
    print("Worksheets loaded.")

    def guess_order(genus, species):
        # Return (order, source label) from the first source that knows the
        # taxon, or (None, None) if none does.
        if genus in coleoptera:
            return "Coleoptera", "Cross-Reference"
        lookups = (
            ("Fauna-EU", lambda: search_fauna_eu(genus, species)),
            ("Catalogue of Life", lambda: search_col(genus, species)),
            ("Wikipedia", lambda: search_wikipedia(genus)),
        )
        for source, lookup in lookups:
            guess = lookup()
            if guess is not None:
                return guess, source
        return None, None

    for row_number, row in enumerate(ws.rows, start=1):
        if row_number == 1:
            continue
        # Index the cells instead of unpacking exactly eight values, so extra
        # columns in the sheet do not raise ValueError.
        order = row[0].value
        genus = row[3].value
        species = row[5].value
        if order is not None:
            continue

        # Empty cells are None; skip them instead of failing on str + None.
        label = " ".join(str(part) for part in (genus, species) if part is not None)
        print("{:40}".format(label), end="")

        guess, source = guess_order(genus, species)
        if guess is None:
            print("Unknown - A{}".format(row_number))
            continue

        ws['A' + str(row_number)] = guess
        print("{:20}{}".format(guess, source))
        # Save right after each change. The original saved at the start of
        # the next iteration, so the last row's result was never written.
        wb.save(workbook_path)
# Run only when executed as a script, so importing this module (e.g. to reuse
# the search functions) does not start the whole scraping run.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement