Advertisement
anden3

Order Finder

Jan 25th, 2017
207
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.13 KB | None | 0 0
  1. import re
  2. import json
  3. import openpyxl
  4. import requests
  5.  
  6. from lxml import html
  7.  
  8.  
  9. def search_fauna_eu(genus=None, species=None):
  10.     query = []
  11.  
  12.     if genus is not None:
  13.         query.append(genus)
  14.  
  15.     if species is not None:
  16.         query.append(species)
  17.  
  18.     page = requests.get("http://www.fauna-eu.org/cdm_dataportal/search/results/taxon",
  19.         headers={
  20.             'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'
  21.         },
  22.         cookies={
  23.             'has_js': '0'
  24.         },
  25.         params={
  26.             'query': ' '.join(query),
  27.             'ws': 'portal/taxon/find',
  28.             'search[doTaxa]': '1',
  29.             'search[doSynonyms]': '',
  30.             'search[doTaxaByCommonNames]': ''
  31.         }
  32.     )
  33.  
  34.     tree = html.document_fromstring(page.text)
  35.     tree.make_links_absolute("http://www.fauna-eu.org/cdm_dataportal/search/results/taxon")
  36.     results = tree.xpath("//a[@class='nameAuthorPart']")
  37.  
  38.     if len(results) == 0:
  39.         page = requests.get("http://www.fauna-eu.org/cdm_dataportal/search/results/taxon",
  40.             headers={
  41.                 'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'
  42.             },
  43.             cookies={
  44.                 'has_js': '0'
  45.             },
  46.             params={
  47.                 'query': " AND ".join(query),
  48.                 'ws': 'portal/taxon/search',
  49.                 'search[doTaxa]': '1',
  50.                 'search[doSynonyms]': '1',
  51.                 'search[doMisappliedNames]': '1',
  52.                 'search[doTaxaByCommonNames]': '1'
  53.             }
  54.         )
  55.  
  56.         tree = html.document_fromstring(page.text)
  57.         tree.make_links_absolute("http://www.fauna-eu.org/cdm_dataportal/search/results/taxon")
  58.  
  59.         results.extend(tree.xpath("//a[@class='nameAuthorPart']"))
  60.  
  61.     for r in results:
  62.         item = r.get('href')
  63.  
  64.         page = requests.get(item,
  65.             headers={
  66.                 'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'
  67.             },
  68.             cookies={
  69.                 'has_js': '0'
  70.             }
  71.         )
  72.  
  73.         tree = html.document_fromstring(page.text)
  74.         order_results = tree.xpath("//ul[@id='classification-breadcrumbs']/li[7]/a")
  75.  
  76.         if len(order_results) > 0:
  77.             return order_results[0].text_content()
  78.  
  79.     return None
  80.  
  81.  
  82. def search_col(genus=None, species=None):
  83.     url = "http://www.catalogueoflife.org/col/search/scientific/"
  84.  
  85.     if genus is not None:
  86.         url += "genus/{}/".format(genus)
  87.  
  88.     if species is not None:
  89.         url += "species/{}/".format(species)
  90.  
  91.     url += "fossil/1/match/1"
  92.  
  93.     page = requests.get(url,
  94.         headers={
  95.             'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'
  96.         },
  97.         cookies={
  98.             'aci_language=': 'en',
  99.             'treeExtinct': '1',
  100.             'has_js': '0'
  101.         }
  102.     )
  103.  
  104.     tree = html.document_fromstring(page.text)
  105.     tree.make_links_absolute(url)
  106.     results = tree.xpath("//td[@class='field_header_black']/a")
  107.  
  108.     for r in results:
  109.         item = r.get('href')
  110.  
  111.         page = requests.get(item,
  112.             headers={
  113.                 'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'
  114.             },
  115.             cookies={
  116.                 'aci_language=': 'en',
  117.                 'treeExtinct': '1',
  118.                 'has_js': '0'
  119.             }
  120.         )
  121.  
  122.         tree = html.document_fromstring(page.text)
  123.         order_results = tree.xpath("//td[preceding-sibling::td[text()='Order']]/a")
  124.  
  125.         if len(order_results) > 0:
  126.             return order_results[0].text_content()
  127.  
  128.     return None
  129.  
  130.  
  131. def search_wikipedia(query):
  132.     search = requests.get("https://en.wikipedia.org/w/api.php",
  133.         headers={
  134.             'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'
  135.         },
  136.         params={
  137.             'action': 'query',
  138.             'list': 'search',
  139.             'format': 'json',
  140.             'srinfo': '',
  141.             'srsearch': query
  142.         }
  143.     )
  144.  
  145.     order_regex = re.compile(r'<span class="order"><a.+?>([\w ]+?)</a></span>')
  146.  
  147.     for r in json.loads(search.text)['query']['search']:
  148.         url = "https://en.wikipedia.org/wiki/{}".format(r['title'].replace(' ', '_'))
  149.         page = requests.get(url,
  150.             headers={
  151.                 'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'
  152.             }
  153.         )
  154.  
  155.         r_match = order_regex.search(page.text)
  156.  
  157.         if r_match is None:
  158.             continue
  159.  
  160.         return r_match.group(1)
  161.  
  162.     return None
  163.  
  164.  
  165. def main():
  166.     temp = openpyxl.load_workbook("/Users/mac/Downloads/part2.xlsx").get_sheet_by_name('Tabelle1')
  167.     coleoptera = [str(r[3].value) for i, r in enumerate(temp.rows) if i > 1]
  168.  
  169.     del temp
  170.  
  171.     wb = openpyxl.load_workbook("test.xlsx")
  172.     ws = wb.get_sheet_by_name('OHNE Subspezies')
  173.  
  174.     print("Worksheets loaded.")
  175.  
  176.     for i, row in enumerate(ws.rows):
  177.         if i == 0:
  178.             continue
  179.  
  180.         wb.save('test.xlsx')
  181.  
  182.         order, match, status, genus, subgenus, species, subspecies, author_year = tuple(c.value for c in row)
  183.  
  184.         if order is None:
  185.             print("{:40}".format(genus + " " + species), end="")
  186.  
  187.             if genus in coleoptera:
  188.                 ws['A' + str(i + 1)] = "Coleoptera"
  189.                 print("{:20}{}".format("Coleoptera", "Cross-Reference"))
  190.                 continue
  191.  
  192.             feu_guess = search_fauna_eu(genus, species)
  193.  
  194.             if feu_guess is not None:
  195.                 ws['A' + str(i + 1)] = feu_guess
  196.                 print("{:20}{}".format(feu_guess, "Fauna-EU"))
  197.                 continue
  198.  
  199.             col_guess = search_col(genus, species)
  200.  
  201.             if col_guess is not None:
  202.                 ws['A' + str(i + 1)] = col_guess
  203.                 print("{:20}{}".format(col_guess, "Catalogue of Life"))
  204.                 continue
  205.  
  206.             wiki_guess = search_wikipedia(genus)
  207.  
  208.             if wiki_guess is not None:
  209.                 ws['A' + str(i + 1)] = wiki_guess
  210.                 print("{:20}{}".format(wiki_guess, "Wikipedia"))
  211.                 continue
  212.  
  213.             print("Unknown - A{}".format(i + 1))
  214.  
  215. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement