# -*- coding: utf-8 -*- import sys import os import re import wikipedia import urllib import urllib2 import time import login import codecs import winsound ##import win32com.client from _journal_list import * ##speak = win32com.client.Dispatch('Sapi.SpVoice') ##speak.Volume = 100 ##speak.Rate = 1 ##speak.Voice = speak.GetVoices('Name=Microsoft Anna').Item(0) site = wikipedia.getSite() throttle_time = 5 errorfilepath = "C:/Users/Headbomb/Desktop/Pywikipedia/_Article timeouts/" regex = re.compile("(\{\{)|(\}\})") m_codes = {} # m_codes[u"AJ..."]=u"." username = "Bibcode Bot" print "Logging in as Bibcode Bot..." login print "Logged in!" def main(): with codecs.open("_Article list.txt", encoding="utf-8") as f: print "Starting run! \n--------------------------------------" for line in f: line = unicode(line.strip(u" \t\r\n\*\[\]")) print "Getting page: " + line global page page = wikipedia.Page(site, line) if not page.exists(): print "Page does not exist! Skipping to next article.\n--------------------------------------" continue if page.isRedirectPage(): oldpage = line text = page.get(get_redirect=True) target = re.match("\s*\#\s*redirect\s*\[\[(.*?)\]\]", text, flags=re.I).group(1) target = target.split("#")[0] page = wikipedia.Page(site, target) newpage = page print " '" + str(oldpage).strip("[]") + "' redirects to '" + str(newpage).strip("[]") + "'.\n Processing '" + str(newpage).strip("[]") + "' instead." if page.canBeEdited() is False: print "Page cannot be not editted due to protection! Skipping to next article.\n--------------------------------------" continue if not page.botMayEdit(username): print "Page cannot be editted by a bot. Skipping to next article.\n--------------------------------------" continue text = page.get() # \\n matches a linebreak for some reason #print text #bug = re.findall(r"\|\s*title\s*=.*(\\n|\\r|\\t|\\b|\\f|\\a)", text) #if bug != []: #print "Found \\n, \\t, \\r, \\f, \\b or \\a in the title. Skipping article while bug is being solved.\n--------------------------------------" #continue orig_text = text text = parse_template(text) edit_check = id_to_arxiv_count + arxiv_count + bibcode_count + doi_count if edit_check is not 0: if id_to_arxiv_count is not 0: print "\nConverting " + str(id_to_arxiv_count) + " \"id = {{arxiv|...}}\" to \"|arxiv=...\"." print "\nAdding " + str(arxiv_count) + " arxiv eprint(s), " + str(bibcode_count) + " bibcode(s) and " + str(doi_count) + " doi(s) in " + str(counter) + " possible templates." editsummary = "Converting " + str(id_to_arxiv_count) + " \"id = {{arxiv|...}}\" to \"|arxiv=...\". \nAdding " + str(arxiv_count) + " [[arXiv|arxiv eprint(s)]], " + str(bibcode_count) + " [[bibcode|bibcode(s)]] and " + str(doi_count) + " [[digital object identifier|doi(s)]]." #speak.Speak("Converting " + str(id_to_arxiv_count) + " archive I D to archive parameters. Adding " + str(arxiv_count) + " archive preprint, " + str(bibcode_count) + " bibcode and " + str(doi_count) + " d o i.") else: editsummary = "\nAdding " + str(arxiv_count) + " [[arXiv|arxiv eprint(s)]], " + str(bibcode_count) + " [[bibcode|bibcode(s)]] and " + str(doi_count) + " [[digital object identifier|doi(s)]]." #speak.Speak("Adding " + str(arxiv_count) + " archive preprint, " + str(bibcode_count) + " bibcode and " + str(doi_count) + " d o i.") try: page.put(text, editsummary + " Did it miss something? Report bugs, errors, and suggestions at [[User talk:Bibcode Bot]]", maxTries = 2) except wikipedia.MaxTriesExceededError: try: print "Couldn't send data to Wikipedia. Saving page data to " + errorfilepath + page.title() f = open(errorfilepath + page.title() + ".txt", "w") f.write(text.encode("utf-8")) f.close() except: print "Error saving data to file. Printing page:\n\n\n\n\n" print text else: print "\nNo new arxiv eprint, new bibcode or new doi found in this article. Processing next article.\n--------------------------------------" print "\nRun complete!" def get_bibcode(data): #data object that is returned from parse template # Bibcode format is YYYYJJJJJVVVVMPPPPA # YYYY = Year # JJJJJ = Journal code # VVVV = Volume # M = Section code / Headache # PPPP = Page # A = First letter of the last name of the first author global unknown_journal # Extract the year part if "year" not in data and "date" in data and re.search("[12][0-9]{3}", data["date"]) != None: data["year"] = re.search("[12][0-9]{3}", data["date"]).group(0) if "year" in data and not "date" in data and re.search("[12][0-9]{3}", data["year"]) != None: data["year"] = re.search("[12][0-9]{3}", data["year"]).group(0) if "year" not in data != None: print "*** YEAR ERROR *** - No year found in citation." unknown_journal = "None" return False else: bibcode=u"%s" % data["year"] # Let"s figure out the correct journal so we can get the JJJJJ value jkey = "" if not data.has_key("journal"): if data.has_key("work"): data["journal"] = data["work"] elif data.has_key("periodical"): data["journal"] = data["periodical"] else: print "*** JOURNAL ERROR *** - No journal found in citation." unknown_journal = "None" return False if data["journal"]: if data["journal"].lower().startswith("the "): data["journal"] = data["journal"][4:].strip() if data["journal"].endswith("."): data["journal"] = data["journal"].strip(".") for key in journals.keys(): for item in journals[key]: # second part of the tuple is a boolean for regex if item[1]: if re.search(item[0],data["journal"]): jkey = key break # if its not a regex lets escape it and search for the title else: if item[0].lower().startswith("the "): item[0] = item[0][4:].strip() if item[0].endswith("."): item[0] = item[0].strip(".") if data["journal"].lower() == item[0].lower(): jkey = key break if jkey == "": print "*** JOURNAL ERROR *** - Unknown journal detected (" + data["journal"] + ")." unknown_journal = data["journal"] return False else: unknown_journal = "None" # using the J key lets see if there is an M code defined if m_codes.has_key(jkey): m_code = m_codes[jkey] else: # default to . otherwise m_code = "." bibcode+= jkey pad_str=u"" # lets get the volume number and then define the VVVV value if not data.has_key("volume"): print "*** VOLUME ERROR *** - No volume found in citation." return False else: try: data["volume"] = re.search(r"\d+", data["volume"]).group(0) except: print "*** VOLUME ERROR *** - Volume found (" + data["volume"] +"), but not parsable." return False pad = 4-len(data["volume"]) while pad>0: pad=pad-1 pad_str+=u"." bibcode+=pad_str+data["volume"] # boolean to see if we ignore the M code later ignore_m = False # handle both page and pages parameters pg = False pg_0 = False pg_1 = False if data.has_key("page"): if re.search("L\d+",data["page"],re.I): m_code = u"L" if re.search("\d+",data["page"],re.I): pg_0 = re.search(ur"\d+",data["page"],re.I).group(0) else: pg_0 = False if data.has_key("pages"): if re.search("L\d+",data["pages"],re.I): m_code = u"L" if re.search("\d+",data["pages"],re.I): pg_1 = re.search(ur"\d+",data["pages"],re.I).group(0) else: pg_1 = False if not pg_0 and not pg_1: print "*** PAGE ERROR *** - No page detected." return False else: if pg_1: pg = pg_1 else: pg = pg_0 if not data.has_key("page") and not data.has_key("pages"): print "*** PAGE ERROR *** - No page detected." return False # lets define PPPP and wether or not M should be ignored # if its less than 4 lets pad it, if its 4 exactly lets skip ahead if len(pg)<4: pad_str=u"" pad = 4-len(pg) while pad>0: pad=pad-1 pad_str+=u"." pg = pad_str+pg elif len(pg)==5: # if its 5 M should be ignored and the 5th page number should be used instead ignore_m = True elif len(pg)==6: # if its 6 convert the last 2 to a letter and ignore M ignore_m = True alpha = "abcdefghijklmnopqrstuvwxyz" lettercode = alpha[int(pg[:1])] pg = lettercode+pg[2:] # now to combine everything if ignore_m: m_code ="" if data.has_key("last1"): a = data["last1"][0] elif data.has_key("last"): a = data["last"][0] else: a = "." return bibcode+m_code+pg+a def parse_template(text): #Kingpin's regex: \{\{cite\sjournal(((\{\{.+\}\})|[^{]([{][^{])?)+)\}\} found = findalltemplates(text) global counter counter = 0 global id_to_arxiv_count id_to_arxiv_count = 0 global arxiv_count arxiv_count = 0 global bibcode_count bibcode_count = 0 global doi_count doi_count = 0 unknown_journal_list = "None" for item in found: #Used to compare the result at the end of the processing old_item = item #Pre-cleaner (cleans |id={{arxiv|foobar}} to |arxiv=foobar) if re.search("{{\s*arxiv", item, re.IGNORECASE): if re.findall(u"\|(\s*)id(\s*)=(\s*){{\s*arxiv\s*\|\s*(archive)?\s*=?\s*([^(\||}|\s)]*)\s*\|?\s*(id)?\s*=?\s*([^(\||}|\s)]*)(\s*)}}", item, re.IGNORECASE): clean_str = re.sub(u"\|(\s*)id(\s*)=(\s*){{\s*arxiv\s*\|\s*(archive)?\s*=?\s*([^(\||}|\s)]*)\s*\|?\s*(id)?\s*=?\s*([^(\||}|\s)]*)(\s*)}}",ur"|\1arxiv\2=\3\5/\7\8", item, re.IGNORECASE) if re.findall(u"\|(\s*)arxiv(\s*)=(\s*)(.*)/(\s*)(\||})", clean_str): clean_str = re.sub(u"\|(\s*)arxiv(\s*)=(\s*)(.*)/(\s*)(\||})",ur"|\1arxiv\2=\3\4\5\6", clean_str) id_to_arxiv_count += 1 item = clean_str global unknown_journal unknown_journal = "None" counter += 1 pairs = re.finditer(u"(?P\w+)\s*=\s*(?P.*?)(\n\s*|\||\}\})",item) data = {} for pair in pairs: key = pair.group("key").strip() value = pair.group("value").strip(u" []\t\r\n") value = re.sub(r"", "", value, flags=re.DOTALL) if len(value)>0: data[key] = value # The following gets rids of the error messages if any of last1/last/year/date/etc... is missing # This is used to build a message more explicit than "Examining citation 15" # Such as "Schwartz (2000). MISSING JOURNAL, v.194, p.123" # The code might be stupid and weird, but it seems to work just fine # -Headbomb if "last1" not in data != None: if "last" not in data != None: author_message = "MISSING AUTHOR" else: author_message = data["last"] else: author_message = data["last1"] if "year" not in data and "date" in data and re.search("[12][0-9]{3}", data["date"]) != None: data["year"] = re.search("[12][0-9]{3}", data["date"]).group(0) if "year" in data and not "date" in data and re.search("[12][0-9]{3}", data["year"]) != None: data["year"] = re.search("[12][0-9]{3}", data["year"]).group(0) if "year" not in data != None: year_message = "MISSING YEAR" else: year_message = data["year"] if "journal" not in data != None: if "work" not in data != None: if "periodical" not in data != None: journal_message = "MISSING JOURNAL" else: journal_message = data["periodical"] else: journal_message = data["work"] else: journal_message = data["journal"] if "volume" not in data !=None: volume_message = "MISSING" else: volume_message = data["volume"] if "pages" not in data != None: if "page" not in data != None: page_message = "MISSING" else: page_message = data["page"] else: page_message = data["pages"] if "arxiv" not in data != None: arxiv_message = "MISSING" else: arxiv_message = data["arxiv"] if "bibcode" not in data != None: bibcode_message = "MISSING" else: bibcode_message = data["bibcode"] if "doi" not in data != None: doi_message = "MISSING" else: doi_message = data["doi"] #Message identifying what citation we"re dealing with print "\nExamining citation " + str(counter) + " [" + str(page).strip("[]") +"]" print " " + author_message + " (" + year_message + "). " + journal_message + ", vol. " + volume_message + ", p. " + page_message print " arxiv : " + arxiv_message print " bibcode: " + bibcode_message print " doi : " + doi_message #Safety net for now. Will be removed later arxiv = False arxiv_exists = False bibcode = False bibcode_exist = False doi = False doi_exists = False new_str = None #ARXIV, BIBCODE AND DOI ARE FOUND --> DO NOTHING if data.has_key("arxiv") and data.has_key("bibcode") and data.has_key("doi"): print "arxiv (OK), bibcode (OK), doi (OK). Nothing to do." arxiv = data["arxiv"] arxiv_exists = True bibcode = data["bibcode"] bibcode_exists = True doi = data["doi"] doi_exists = True #ARXIV NOT FOUND, BIBCODE FOUND, DOI FOUND --> SEARCH FOR ARXIV if not data.has_key("arxiv") and data.has_key("bibcode") and data.has_key("doi"): print "arxiv (??), bibcode (OK), doi (OK). Searching for arxiv." query = ADS_query(arxiv, data["bibcode"], data["doi"], item) arxiv = query[0] if arxiv != "NOT FOUND!": arxiv_exists = True arxiv_count += 1 else: arxiv_exists = False bibcode = data["bibcode"] bibcode_exists = True doi = data["doi"] doi_exists = True if arxiv_exists: if re.search(u"\|(\s*)arxiv(\s*)=( *)", item): new_str = re.sub(u"\|(\s*)arxiv(\s*)=( *)",ur"|\1arxiv\2=\3 %s" % arxiv, item) else: new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item) #ARXIV FOUND, BIBCODE NOT FOUND, DOI FOUND --> SEARCH FOR BIBCODE if data.has_key("arxiv") and not data.has_key("bibcode") and data.has_key("doi"): print "arxiv (OK), bibcode (??), doi (OK). Searching for bibcode." query = ADS_query(data["arxiv"], bibcode, data["doi"], item) arxiv = data["arxiv"] arxiv_exists = True bibcode = query[1] if bibcode != "NOT FOUND!": bibcode_exists = True bibcode_count += 1 else: bibcode_exists = False doi = data["doi"] doi_exists = True if bibcode_exists: if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item): new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item) else: new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item) #ARXIV FOUND, BIBCODE FOUND, DOI NOT FOUND --> SEARCH FOR DOI if data.has_key("arxiv") and data.has_key("bibcode") and not data.has_key("doi"): print "arxiv (OK), bibcode (OK), doi (??). Searching for doi." query = ADS_query(data["arxiv"], data["bibcode"], doi, item) arxiv = data["arxiv"] arxiv_exists = True bibcode = data["bibcode"] bibcode_exists = True doi = query[2] if doi != "NOT FOUND!": doi_exists = True doi_count += 1 else: doi_exists = False if doi_exists: if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item): new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item) else: new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item) #ARXIV NOT FOUND, BIBCODE NOT FOUND, DOI FOUND --> SEARCH FOR ARXIV AND BIBCODE if not data.has_key("arxiv") and not data.has_key("bibcode") and data.has_key("doi"): print "arxiv (??), bibcode (??), doi (OK). Searching for arxiv and bibcode." query = ADS_query(arxiv, bibcode, data["doi"], item) arxiv = query[0] if arxiv != "NOT FOUND!": arxiv_exists = True arxiv_count += 1 else: arxiv_exist = False bibcode = query[1] if bibcode != "NOT FOUND!": bibcode_exists = True bibcode_count += 1 else: bibcode_exists = False doi = data["doi"] doi_exists = True if arxiv_exists: if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item): new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item) else: new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item) if bibcode_exists: if new_str != None: if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", new_str): new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, new_str) else: new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, new_str) else: if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item): new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item) else: new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item) #ARXIV FOUND, BIBCODE NOT FOUND, DOI NOT FOUND --> SEARCH FOR BIBCODE AND DOI if data.has_key("arxiv") and not data.has_key("bibcode") and not data.has_key("doi"): print "arxiv (OK), bibcode (??), doi (??). Searching for bibcode and doi." query = ADS_query(data["arxiv"], bibcode, doi, item) arxiv = data["arxiv"] arxiv_exists = True bibcode = query[1] if bibcode != "NOT FOUND!": bibcode_exists = True bibcode_count += 1 else: bibcode_exists = False doi = query[2] if doi != "NOT FOUND!": doi_exists = True doi_count += 1 else: doi_exists = False if bibcode_exists: if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item): new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item) else: new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item) if doi_exists: if new_str != None: if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str): new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str) else: new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str) else: if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item): new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item) else: new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item) #ARXIV NOT FOUND, BIBCODE FOUND, DOI NOT FOUND --> SEARCH FOR ARXIV AND DOI if not data.has_key("arxiv") and data.has_key("bibcode") and not data.has_key("doi"): print "arxiv (??), bibcode (OK), doi (??). Searching for arxiv and doi." query = ADS_query(arxiv, data["bibcode"], doi, item) arxiv = query[0] if arxiv != "NOT FOUND!": arxiv_exists = True arxiv_count += 1 else: arxiv_exist = False bibcode = data["bibcode"] bibcode_exists = True doi = query[2] if doi != "NOT FOUND!": doi_exists = True doi_count += 1 else: doi_exists = False if arxiv_exists: if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item): new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item) else: new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item) if doi_exists: if new_str != None: if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str): new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str) else: new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str) else: if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item): new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item) else: new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item) #ARXIV NOT FOUND, BIBCODE NOT FOUND, DOI NOT FOUND --> SEARCH FOR BIBCODE AND DOI if not data.has_key("arxiv") and not data.has_key("bibcode") and not data.has_key("doi"): print "arxiv (??), bibcode (??), doi (??). Guessing bibcode..." query = ADS_query(arxiv, bibcode, doi, item) arxiv = query[0] if arxiv != "NOT FOUND!": arxiv_exists = True arxiv_count += 1 else: arxiv_exist = False bibcode = query[1] if bibcode != "NOT FOUND!": bibcode_exists = True bibcode_count += 1 else: bibcode_exists = False doi = query[2] if doi != "NOT FOUND!": doi_exists = True doi_count += 1 else: doi_exists = False if arxiv_exists: if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item): new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item) else: new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item) if bibcode_exists: if new_str != None: if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", new_str): new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, new_str) else: new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, new_str) else: if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item): new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item) else: new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item) if doi_exists: if new_str != None: if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str): new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str) else: new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str) else: if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item): new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item) else: new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item) if new_str: text = re.sub(re.escape(old_item),new_str,text) else: text = re.sub(re.escape(old_item),item,text) if unknown_journal_list is "None": if unknown_journal is not "None": unknown_journal_list = "\nUnknown journal(s) for " + page.title() + ":\n *" + unicode(unknown_journal) + "\n" else: if unknown_journal is not "None": if not re.search(unicode(unknown_journal) + "\n", unknown_journal_list): unknown_journal_list = unknown_journal_list + " *" + unknown_journal + "\n" print "\nFound:\n " + str(counter) + " {{citation}}/{{cite journal}} template(s)\n " +str(id_to_arxiv_count) + " '|id={{arxiv|...}}' to convert to '|arxiv=...'\n " +str(arxiv_count) + " new arxiv eprint(s)\n " + str(bibcode_count) + " new bibcode(s)\n " + str(doi_count) + " new doi(s)." if unknown_journal_list is "None": print "\nUnknown journals:\n *None" else: print unknown_journal_list f = open("C:/Users/Headbomb/Desktop/Pywikipedia/_Unknown_journals.txt", "a") f.write(unknown_journal_list.encode("utf-8")) f.close() print "\n--------------------------------------" return text def findalltemplates(t): f = [] lowertext = t.lower() while re.search("{{\s*(cite|cite study|citation|cite journal|c journal|cite magazine|cite magazine article|cite paper|citejournal|citepaper|vcite paper|vcite journal|vancite journal)\s*\|", lowertext) != None: firstoffset = re.search("{{\s*(cite|cite study|citation|cite journal|c journal|cite magazine|cite magazine article|cite paper|citejournal|citepaper|vcite paper|vcite journal|vancite journal)\s*\|", lowertext).start() lastoffset = firstoffset counter = 1 while counter > 0: nextbracket = regex.search(lowertext, lastoffset+1) if nextbracket.group(0) == "{{": counter += 1 lastoffset = nextbracket.end() elif nextbracket.group(0) == "}}": counter -= 1 lastoffset = nextbracket.end() f.append(t[firstoffset:lastoffset]) t = t[lastoffset:] lowertext = lowertext[lastoffset:] return f def queryADS(url): retry = True timeout = max(1, throttle_time) retrynum = 0 while retry: try: rawdata = urllib2.urlopen(url).read() retry = False except urllib2.URLError: retrynum += 1 timeout = retrynum * throttle_time if retrynum > 3: print "Cannot connect to ADS site. Aborting..." return "" print "\nError connecting to ADS site. Retrying in " + str(timeout) + " seconds." time.sleep(timeout) continue return rawdata def adv_check_bibcode(code): #Try to find a valid author / section code if code: alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ." url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX" for i in range(27): url += "&bibcode=" + urllib.quote(code[:-1]) + alpha[i] url += "&db_key=ALL" print "Probing for a new author..." raw_html = queryADS(url) bibcode_check = re.findall("@ARTICLE{(...................)", raw_html) if bibcode_check: print " Found! " + bibcode_check[0] return raw_html else: print " Not found!" alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" alphalower = "abcdefghijklmnopqrstuvwxyz" url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=PLAINTEXT" for i in range(26): url += "&bibcode=" + urllib.quote(code[:-1]) + alpha[i] print "Probing for a new section..." raw_html = urllib2.urlopen(url).read() bibcode_check = re.findall("@ARTICLE{(...................)", raw_html) if bibcode_check: print " Found! " + bibcode_check[0] return raw_html else: url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=PLAINTEXT" for i in range(26): url += "&bibcode=" + urllib.quote(code[:-1]) + alphalower[i] raw_html = queryADS(url) if bibcode_check: print " Found! " + bibcode_check[0] return raw_html else: print " Not found!" return "Dummy text" def ADS_query(arxiv, bibcode, doi, item): arxiv_match = False bibcode_match = False doi_match = False raw_html = "Dummy text" pairs = re.finditer(u"(?P\w+)\s*?=\s*?(?P.*?)(\n|\||\})", item) data = {} for pair in pairs: key = pair.group("key").strip() value = pair.group("value").strip(" []") value = re.sub(r"", "", value, flags=re.DOTALL) if len(value)>0: data[key] = value if not arxiv and not bibcode and not doi: bibcode_guess = get_bibcode(data) if bibcode_guess: print "Bibcode guess: " + bibcode_guess url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode_guess.encode("utf-8")) raw_html = queryADS(url) bibcode_check = re.findall("@ARTICLE{(...................)", raw_html) if bibcode_check: print " Valid!" else: print " Invalid!" raw_html = adv_check_bibcode(bibcode_guess) if arxiv and not bibcode and not doi: url = "http://adsabs.harvard.edu/abs/arXiv:" + urllib.quote(arxiv) raw_html = queryADS(url) bibcode_check = re.findall("", raw_html, flags=re.IGNORECASE) if bibcode_check: print " Found bibcode by arxiv query!" + bibcode_check[0] else: print " Did not find bibcode by arxiv query! Guessing bibcode..." bibcode_guess = get_bibcode(data) if bibcode_guess: print "Bibcode guess: " + bibcode_guess url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode_guess.encode("utf-8")) raw_html = queryADS(url) bibcode_check = re.findall("@ARTICLE{(...................)", raw_html) if bibcode_check: print " Valid!" else: print " Invalid!" raw_html = adv_check_bibcode(bibcode_guess) if bibcode: url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode) raw_html = queryADS(url) else: if doi: url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&doi=" + urllib.quote(doi.encode("utf-8")) raw_html = queryADS(url) arxiv_match_0 = re.findall("eprint = {arXiv:(.*)}", raw_html) arxiv_match_1 = re.findall("eprint = {(.*)}", raw_html) arxiv_match_2 = re.findall("arXiv e-print \(arXiv:(.*)\)", raw_html) arxiv_match_3 = re.findall("arXiv e-print \((.*)\)", raw_html) bibcode_match_0 = re.findall("@ARTICLE{(...................)", raw_html) bibcode_match_1 = re.findall("", raw_html, flags=re.IGNORECASE) doi_match_0 = re.findall("doi = {(.*?)}", raw_html) doi_match_1 = re.findall("", raw_html, flags=re.IGNORECASE) if not arxiv_match_0 and not arxiv_match_1 and not arxiv_match_2 and not arxiv_match_3 and not bibcode_match_0 and not bibcode_match_1 and not doi_match_0 and not doi_match_1: return ("NOT FOUND!", "NOT FOUND!", "NOT FOUND!") else: print "Query results:" if arxiv_match_0: arxiv_match = arxiv_match_0[0] print " arxiv : " + arxiv_match if not arxiv_match_0 and arxiv_match_1: arxiv_match = arxiv_match_1[0] print " arxiv : " + arxiv_match if arxiv_match_2: arxiv_match = arxiv_match_2[0] print " arxiv : " + arxiv_match if not arxiv_match_2 and arxiv_match_3: arxiv_match = arxiv_match_3[0] print " arxiv : " + arxiv_match if not arxiv_match: arxiv_match = "NOT FOUND!" print " arxiv : NOT FOUND!" if bibcode_match_0: bibcode_match = bibcode_match_0[0] print " bibcode: " + bibcode_match if bibcode_match_1: bibcode_match = bibcode_match_1[0] print " bibcode: " + bibcode_match if not bibcode_match: bibcode_match = "NOT FOUND!" print " bibcode: NOT FOUND!" if doi_match_0: doi_match = doi_match_0[0] print " doi : " + doi_match if doi_match_1: doi_match = doi_match_1[0] print " doi : " + doi_match if not doi_match: doi_match = "NOT FOUND!" print " doi : NOT FOUND!" return (arxiv_match, bibcode_match, doi_match) if __name__ == "__main__": main()