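# Order Finder: fills in missing taxonomic orders in a species spreadsheet
# using Fauna Europaea, Catalogue of Life and Wikipedia lookups.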

import re
import json
import openpyxl
import requests

from lxml import html


def search_fauna_eu(genus=None, species=None):
    """Look up the taxonomic order of a taxon on the Fauna Europaea portal.

    A plain taxon search is tried first with the given terms joined by a
    space. If it yields no result links, a broader search (terms joined by
    ``" AND "``, also covering synonyms, misapplied names and common names)
    is run instead. Each result page is then fetched in turn.

    Args:
        genus: Genus name to search for, or ``None`` to leave it out.
        species: Species epithet to search for, or ``None`` to leave it out.

    Returns:
        The text of the link in the seventh classification breadcrumb of the
        first result page that has one (taken to be the order; that position
        is hard-coded in the XPath), or ``None`` if no page has it.
    """
    search_url = "http://www.fauna-eu.org/cdm_dataportal/search/results/taxon"
    agent = {'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'}
    no_js = {'has_js': '0'}

    terms = [term for term in (genus, species) if term is not None]

    def find_links(params):
        # Run one search and return the result anchors with absolute hrefs.
        response = requests.get(search_url, headers=agent, cookies=no_js, params=params)
        doc = html.document_fromstring(response.text)
        doc.make_links_absolute(search_url)
        return doc.xpath("//a[@class='nameAuthorPart']")

    links = find_links({
        'query': ' '.join(terms),
        'ws': 'portal/taxon/find',
        'search[doTaxa]': '1',
        'search[doSynonyms]': '',
        'search[doTaxaByCommonNames]': '',
    })

    if not links:
        # Nothing found by the plain search: retry with the broader one.
        links = find_links({
            'query': " AND ".join(terms),
            'ws': 'portal/taxon/search',
            'search[doTaxa]': '1',
            'search[doSynonyms]': '1',
            'search[doMisappliedNames]': '1',
            'search[doTaxaByCommonNames]': '1',
        })

    for link in links:
        response = requests.get(link.get('href'), headers=agent, cookies=no_js)
        crumbs = html.document_fromstring(response.text).xpath(
            "//ul[@id='classification-breadcrumbs']/li[7]/a"
        )

        if crumbs:
            return crumbs[0].text_content()

    return None


def search_col(genus=None, species=None):
    """Look up the taxonomic order of a taxon in the Catalogue of Life.

    Builds a scientific-name search URL from whichever of *genus* and
    *species* is given, then fetches each result's detail page in turn and
    reads the link in the table cell that follows the cell labelled
    ``Order``.

    Args:
        genus: Genus name to search for, or ``None`` to leave it out.
        species: Species epithet to search for, or ``None`` to leave it out.

    Returns:
        The order name from the first detail page that lists one, or
        ``None`` when no search term was given or no page lists an order.
    """
    from urllib.parse import quote

    if genus is None and species is None:
        # An unconstrained search could only return an arbitrary taxon.
        return None

    url = "http://www.catalogueoflife.org/col/search/scientific/"

    # Terms are path segments; escape them so characters such as "/" or "?"
    # cannot change the structure of the URL.
    if genus is not None:
        url += "genus/{}/".format(quote(str(genus), safe=''))

    if species is not None:
        url += "species/{}/".format(quote(str(species), safe=''))

    url += "fossil/1/match/1"

    headers = {'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'}
    # Cookie names must not contain "=". The language cookie presumably
    # selects the English interface, which the 'Order' label match relies on.
    cookies = {
        'aci_language': 'en',
        'treeExtinct': '1',
        'has_js': '0',
    }

    page = requests.get(url, headers=headers, cookies=cookies)

    tree = html.document_fromstring(page.text)
    tree.make_links_absolute(url)
    results = tree.xpath("//td[@class='field_header_black']/a")

    for r in results:
        page = requests.get(r.get('href'), headers=headers, cookies=cookies)

        tree = html.document_fromstring(page.text)
        order_results = tree.xpath("//td[preceding-sibling::td[text()='Order']]/a")

        if order_results:
            return order_results[0].text_content()

    return None


def search_wikipedia(query):
    """Look up a taxonomic order via the English Wikipedia.

    Runs a full-text search through the MediaWiki API, then fetches the
    article of each hit in turn and looks for a link wrapped in
    ``<span class="order">`` in the page HTML.

    Args:
        query: Search text (the caller passes a genus name).

    Returns:
        The link text from the first article that contains such a span, or
        ``None`` when *query* is empty, the search returns no hits, or no
        article contains the span.
    """
    from urllib.parse import quote

    if not query:
        # Without a search term there is nothing to look up.
        return None

    headers = {'User-Agent': 'Species Finder/0.1 (andre.vennberg@gmail.com)'}

    search = requests.get("https://en.wikipedia.org/w/api.php",
        headers=headers,
        params={
            'action': 'query',
            'list': 'search',
            'format': 'json',
            'srinfo': '',
            'srsearch': query
        }
    )

    # An error payload has no 'query' member; treat that as "no hits".
    hits = json.loads(search.text).get('query', {}).get('search', [])

    order_regex = re.compile(r'<span class="order"><a.+?>([\w ]+?)</a></span>')

    for hit in hits:
        # Percent-encode the title so characters such as "?" or "#" stay part
        # of the article name instead of starting a query string or fragment.
        title = quote(hit['title'].replace(' ', '_'))
        url = "https://en.wikipedia.org/wiki/{}".format(title)
        page = requests.get(url, headers=headers)

        found = order_regex.search(page.text)

        if found is not None:
            return found.group(1)

    return None


def _guess_order(genus, species, coleoptera):
    """Return ``(order, source)`` for a taxon, or ``None`` if every lookup fails.

    Sources are tried in this order: the Coleoptera genus cross-reference,
    Fauna Europaea, Catalogue of Life, and finally Wikipedia (by genus only).
    """
    if genus in coleoptera:
        return "Coleoptera", "Cross-Reference"

    guess = search_fauna_eu(genus, species)
    if guess is not None:
        return guess, "Fauna-EU"

    guess = search_col(genus, species)
    if guess is not None:
        return guess, "Catalogue of Life"

    # The Wikipedia lookup searches by genus, so it needs one.
    if genus is not None:
        guess = search_wikipedia(genus)
        if guess is not None:
            return guess, "Wikipedia"

    return None


def main(reference_path="/Users/mac/Downloads/part2.xlsx", workbook_path="test.xlsx"):
    """Fill in the missing order column of the species workbook.

    Column D of sheet ``Tabelle1`` in *reference_path* (from the third row
    on) is read as a set of genus names known to be Coleoptera. Every row
    after the header of sheet ``OHNE Subspezies`` in *workbook_path* whose
    order cell (column A) is empty is then looked up, and any order found is
    written to column A. The workbook is saved after each write so an
    interrupted run keeps its progress.

    Args:
        reference_path: Workbook holding the Coleoptera genus list.
        workbook_path: Workbook to update; it is overwritten in place.
    """
    reference = openpyxl.load_workbook(reference_path)['Tabelle1']
    # A set gives constant-time membership tests in the row loop below.
    coleoptera = {str(r[3].value) for i, r in enumerate(reference.rows) if i > 1}

    del reference

    wb = openpyxl.load_workbook(workbook_path)
    ws = wb['OHNE Subspezies']

    print("Worksheets loaded.")

    for row_number, row in enumerate(ws.rows, start=1):
        if row_number == 1:
            # Header row.
            continue

        # Column layout: A order, B match, C status, D genus, E subgenus,
        # F species, G subspecies, H author/year.
        order, genus, species = row[0].value, row[3].value, row[5].value

        if order is not None:
            continue

        if genus is None and species is None:
            # Blank row: nothing to look up.
            continue

        label = " ".join(str(part) for part in (genus, species) if part is not None)
        print("{:40}".format(label), end="")

        found = _guess_order(genus, species, coleoptera)

        if found is None:
            print("Unknown - A{}".format(row_number))
            continue

        guess, source = found
        ws["A{}".format(row_number)] = guess
        print("{:20}{}".format(guess, source))

        # Save right after the write so the last row is persisted too.
        wb.save(workbook_path)

# Run the lookup only when executed as a script, so importing this module
# (for example to reuse the search functions) does not touch any workbook.
if __name__ == "__main__":
    main()