Untitled
a guest
Jul 19th, 2019
"""If you are like me, you have folders full of papers, mostly from arxiv.org, and
need to fetch their BibTeX entries from Google Scholar every time you write something.
This script helps with that by sending very polite (rate-limited) requests to Google
Scholar. This means it takes a while to complete, but it is still recommended to run
it behind a VPN; the free ProtonVPN works well for this purpose.

Copy fetch_bibtex.py into a folder with your papers. It generates a
_generated_bibliography.bib file with all the BibTeX entries pulled from
Google Scholar, using only the filenames of the PDFs within the folder.

Scenarios that work well:
- Arxiv ID in the filename. The script tries the whole title first, then falls
  back to just the arxiv ID. Versioned arxiv IDs (e.g. 1907.00000v3) are handled.
- Words in filenames separated by '_', ' - ' or spaces.

Limitations:
- It only works if the correct paper is the first search result on Google Scholar.
- When a lookup fails, it prints info to the terminal for you to handle manually.

Usage:
    python fetch_bibtex.py
"""

import os
import re
from time import sleep

import scholarly

# import requests

# Regex strings for new-style arxiv IDs
# (YYMM.NNNN before 2015, YYMM.NNNNN from 2015 on)
versioned_arxiv = r"(([0-9]{4}\.[0-9]{4,5})v[0-9]+)"
arxiv = r"([0-9]{4}\.[0-9]{4,5})"

# Build search queries from the PDF filenames in the current folder
papers = os.listdir(".")
titles = []
for p in papers:
    if p[-4:] == ".pdf":
        # Capture versioned arxiv IDs and strip the version
        m = re.match(versioned_arxiv, p)
        if m:
            p = p.replace(m.group(1), m.group(2))

        p = p.replace(" - ", " ")
        p = p.replace("_", " ")
        titles.append(p[:-4])
    else:
        print(f"Skipping file: {p}")


for t in titles:
    r = scholarly.search_pubs_query(t)
    hit = None
    try:
        hit = next(r)
    except StopIteration:
        # No result for the full title: fall back to the bare arxiv ID, if any
        m = re.match(arxiv, t)
        if m:
            print(f"Trying Arxiv ID only for: '{t}'")
            sleep(3)
            r = scholarly.search_pubs_query(m.group(1))
            try:
                hit = next(r)
            except StopIteration:
                print(f"Still no results on: {m.group(1)}")
        else:
            print(f"No results found for: '{t}'")

    if hit:
        # Fetch the BibTeX entry for the first search result and append it
        bibtext = scholarly.scholarly._get_page(hit.url_scholarbib)
        # bibtext = requests.get(hit.url_scholarbib).text
        with open("_generated_bibliography.bib", "a") as f:
            f.write(bibtext)
            f.write("\n")
        sleep(3)
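The filename normalization above can be exercised in isolation, without hitting
Google Scholar. A minimal sketch follows; the helper name `normalize_title` is mine,
not part of the script, but the steps mirror the script's first loop exactly:

```python
import re

# Same pattern as the script, for versioned new-style arxiv IDs
versioned_arxiv = r"(([0-9]{4}\.[0-9]{4,5})v[0-9]+)"


def normalize_title(filename):
    """Turn a PDF filename into a Google Scholar query string,
    mirroring the script's normalization steps."""
    m = re.match(versioned_arxiv, filename)
    if m:
        # Strip the version suffix from a versioned arxiv ID
        filename = filename.replace(m.group(1), m.group(2))
    filename = filename.replace(" - ", " ")
    filename = filename.replace("_", " ")
    return filename[:-4]  # drop the ".pdf" extension


print(normalize_title("1907.00000v3 - Some_Great_Paper.pdf"))
# → 1907.00000 Some Great Paper
```

Running this on a filename without an arxiv ID, such as
`Attention_Is_All_You_Need.pdf`, simply yields the underscore-free title.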