# -*- coding: utf-8 -*-
import sys
import os
import re
import wikipedia
import urllib
import urllib2
import time
import login
import codecs
import winsound
##import win32com.client
from _journal_list import *
##speak = win32com.client.Dispatch('Sapi.SpVoice')
##speak.Volume = 100
##speak.Rate = 1
##speak.Voice = speak.GetVoices('Name=Microsoft Anna').Item(0)
site = wikipedia.getSite()  # target wiki, taken from the pywikipedia user config
throttle_time = 5  # base delay (seconds) between ADS retry attempts (see queryADS)
errorfilepath = "C:/Users/Headbomb/Desktop/Pywikipedia/_Article timeouts/"  # dump dir for pages whose save failed
regex = re.compile("(\{\{)|(\}\})")  # template open/close braces; used by findalltemplates
m_codes = {}  # optional per-journal bibcode section ("M") codes; default is "."
# m_codes[u"AJ..."]=u"."
username = "Bibcode Bot"
print "Logging in as Bibcode Bot..."
# NOTE(review): a bare module name is a no-op statement.  Presumably the login
# happens as a side effect of "import login" above, or this was meant to call
# the login module's entry point (e.g. login.main()) -- confirm.
login
print "Logged in!"
def main():
with codecs.open("_Article list.txt", encoding="utf-8") as f:
print "Starting run! \n--------------------------------------"
for line in f:
line = unicode(line.strip(u" \t\r\n\*\[\]"))
print "Getting page: " + line
global page
page = wikipedia.Page(site, line)
if not page.exists():
print "Page does not exist! Skipping to next article.\n--------------------------------------"
continue
if page.isRedirectPage():
oldpage = line
text = page.get(get_redirect=True)
target = re.match("\s*\#\s*redirect\s*\[\[(.*?)\]\]", text, flags=re.I).group(1)
target = target.split("#")[0]
page = wikipedia.Page(site, target)
newpage = page
print " '" + str(oldpage).strip("[]") + "' redirects to '" + str(newpage).strip("[]") + "'.\n Processing '" + str(newpage).strip("[]") + "' instead."
if page.canBeEdited() is False:
print "Page cannot be not editted due to protection! Skipping to next article.\n--------------------------------------"
continue
if not page.botMayEdit(username):
print "Page cannot be editted by a bot. Skipping to next article.\n--------------------------------------"
continue
text = page.get()
# \\n matches a linebreak for some reason
#print text
#bug = re.findall(r"\|\s*title\s*=.*(\\n|\\r|\\t|\\b|\\f|\\a)", text)
#if bug != []:
#print "Found \\n, \\t, \\r, \\f, \\b or \\a in the title. Skipping article while bug is being solved.\n--------------------------------------"
#continue
orig_text = text
text = parse_template(text)
edit_check = id_to_arxiv_count + arxiv_count + bibcode_count + doi_count
if edit_check is not 0:
if id_to_arxiv_count is not 0:
print "\nConverting " + str(id_to_arxiv_count) + " \"id = {{arxiv|...}}\" to \"|arxiv=...\"."
print "\nAdding " + str(arxiv_count) + " arxiv eprint(s), " + str(bibcode_count) + " bibcode(s) and " + str(doi_count) + " doi(s) in " + str(counter) + " possible templates."
editsummary = "Converting " + str(id_to_arxiv_count) + " \"id = {{arxiv|...}}\" to \"|arxiv=...\". \nAdding " + str(arxiv_count) + " [[arXiv|arxiv eprint(s)]], " + str(bibcode_count) + " [[bibcode|bibcode(s)]] and " + str(doi_count) + " [[digital object identifier|doi(s)]]."
#speak.Speak("Converting " + str(id_to_arxiv_count) + " archive I D to archive parameters. Adding " + str(arxiv_count) + " archive preprint, " + str(bibcode_count) + " bibcode and " + str(doi_count) + " d o i.")
else:
editsummary = "\nAdding " + str(arxiv_count) + " [[arXiv|arxiv eprint(s)]], " + str(bibcode_count) + " [[bibcode|bibcode(s)]] and " + str(doi_count) + " [[digital object identifier|doi(s)]]."
#speak.Speak("Adding " + str(arxiv_count) + " archive preprint, " + str(bibcode_count) + " bibcode and " + str(doi_count) + " d o i.")
try:
page.put(text, editsummary + " Did it miss something? Report bugs, errors, and suggestions at [[User talk:Bibcode Bot]]", maxTries = 2)
except wikipedia.MaxTriesExceededError:
try:
print "Couldn't send data to Wikipedia. Saving page data to " + errorfilepath + page.title()
f = open(errorfilepath + page.title() + ".txt", "w")
f.write(text.encode("utf-8"))
f.close()
except:
print "Error saving data to file. Printing page:\n\n\n\n\n"
print text
else:
print "\nNo new arxiv eprint, new bibcode or new doi found in this article. Processing next article.\n--------------------------------------"
print "\nRun complete!"
def get_bibcode(data): #data object that is returned from parse template
# Bibcode format is YYYYJJJJJVVVVMPPPPA
# YYYY = Year
# JJJJJ = Journal code
# VVVV = Volume
# M = Section code / Headache
# PPPP = Page
# A = First letter of the last name of the first author
global unknown_journal
# Extract the year part
if "year" not in data and "date" in data and re.search("[12][0-9]{3}", data["date"]) != None:
data["year"] = re.search("[12][0-9]{3}", data["date"]).group(0)
if "year" in data and not "date" in data and re.search("[12][0-9]{3}", data["year"]) != None:
data["year"] = re.search("[12][0-9]{3}", data["year"]).group(0)
if "year" not in data != None:
print "*** YEAR ERROR *** - No year found in citation."
unknown_journal = "None"
return False
else:
bibcode=u"%s" % data["year"]
# Let"s figure out the correct journal so we can get the JJJJJ value
jkey = ""
if not data.has_key("journal"):
if data.has_key("work"):
data["journal"] = data["work"]
elif data.has_key("periodical"):
data["journal"] = data["periodical"]
else:
print "*** JOURNAL ERROR *** - No journal found in citation."
unknown_journal = "None"
return False
if data["journal"]:
if data["journal"].lower().startswith("the "):
data["journal"] = data["journal"][4:].strip()
if data["journal"].endswith("."):
data["journal"] = data["journal"].strip(".")
for key in journals.keys():
for item in journals[key]:
# second part of the tuple is a boolean for regex
if item[1]:
if re.search(item[0],data["journal"]):
jkey = key
break
# if its not a regex lets escape it and search for the title
else:
if item[0].lower().startswith("the "):
item[0] = item[0][4:].strip()
if item[0].endswith("."):
item[0] = item[0].strip(".")
if data["journal"].lower() == item[0].lower():
jkey = key
break
if jkey == "":
print "*** JOURNAL ERROR *** - Unknown journal detected (" + data["journal"] + ")."
unknown_journal = data["journal"]
return False
else:
unknown_journal = "None"
# using the J key lets see if there is an M code defined
if m_codes.has_key(jkey):
m_code = m_codes[jkey]
else:
# default to . otherwise
m_code = "."
bibcode+= jkey
pad_str=u""
# lets get the volume number and then define the VVVV value
if not data.has_key("volume"):
print "*** VOLUME ERROR *** - No volume found in citation."
return False
else:
try:
data["volume"] = re.search(r"\d+", data["volume"]).group(0)
except:
print "*** VOLUME ERROR *** - Volume found (" + data["volume"] +"), but not parsable."
return False
pad = 4-len(data["volume"])
while pad>0:
pad=pad-1
pad_str+=u"."
bibcode+=pad_str+data["volume"]
# boolean to see if we ignore the M code later
ignore_m = False
# handle both page and pages parameters
pg = False
pg_0 = False
pg_1 = False
if data.has_key("page"):
if re.search("L\d+",data["page"],re.I):
m_code = u"L"
if re.search("\d+",data["page"],re.I):
pg_0 = re.search(ur"\d+",data["page"],re.I).group(0)
else:
pg_0 = False
if data.has_key("pages"):
if re.search("L\d+",data["pages"],re.I):
m_code = u"L"
if re.search("\d+",data["pages"],re.I):
pg_1 = re.search(ur"\d+",data["pages"],re.I).group(0)
else:
pg_1 = False
if not pg_0 and not pg_1:
print "*** PAGE ERROR *** - No page detected."
return False
else:
if pg_1:
pg = pg_1
else:
pg = pg_0
if not data.has_key("page") and not data.has_key("pages"):
print "*** PAGE ERROR *** - No page detected."
return False
# lets define PPPP and wether or not M should be ignored
# if its less than 4 lets pad it, if its 4 exactly lets skip ahead
if len(pg)<4:
pad_str=u""
pad = 4-len(pg)
while pad>0:
pad=pad-1
pad_str+=u"."
pg = pad_str+pg
elif len(pg)==5:
# if its 5 M should be ignored and the 5th page number should be used instead
ignore_m = True
elif len(pg)==6:
# if its 6 convert the last 2 to a letter and ignore M
ignore_m = True
alpha = "abcdefghijklmnopqrstuvwxyz"
lettercode = alpha[int(pg[:1])]
pg = lettercode+pg[2:]
# now to combine everything
if ignore_m:
m_code =""
if data.has_key("last1"):
a = data["last1"][0]
elif data.has_key("last"):
a = data["last"][0]
else:
a = "."
return bibcode+m_code+pg+a
def parse_template(text):
#Kingpin's regex: \{\{cite\sjournal(((\{\{.+\}\})|[^{]([{][^{])?)+)\}\}
found = findalltemplates(text)
global counter
counter = 0
global id_to_arxiv_count
id_to_arxiv_count = 0
global arxiv_count
arxiv_count = 0
global bibcode_count
bibcode_count = 0
global doi_count
doi_count = 0
unknown_journal_list = "None"
for item in found:
#Used to compare the result at the end of the processing
old_item = item
#Pre-cleaner (cleans |id={{arxiv|foobar}} to |arxiv=foobar)
if re.search("{{\s*arxiv", item, re.IGNORECASE):
if re.findall(u"\|(\s*)id(\s*)=(\s*){{\s*arxiv\s*\|\s*(archive)?\s*=?\s*([^(\||}|\s)]*)\s*\|?\s*(id)?\s*=?\s*([^(\||}|\s)]*)(\s*)}}", item, re.IGNORECASE):
clean_str = re.sub(u"\|(\s*)id(\s*)=(\s*){{\s*arxiv\s*\|\s*(archive)?\s*=?\s*([^(\||}|\s)]*)\s*\|?\s*(id)?\s*=?\s*([^(\||}|\s)]*)(\s*)}}",ur"|\1arxiv\2=\3\5/\7\8", item, re.IGNORECASE)
if re.findall(u"\|(\s*)arxiv(\s*)=(\s*)(.*)/(\s*)(\||})", clean_str):
clean_str = re.sub(u"\|(\s*)arxiv(\s*)=(\s*)(.*)/(\s*)(\||})",ur"|\1arxiv\2=\3\4\5\6", clean_str)
id_to_arxiv_count += 1
item = clean_str
global unknown_journal
unknown_journal = "None"
counter += 1
pairs = re.finditer(u"(?P<key>\w+)\s*=\s*(?P<value>.*?)(\n\s*|\||\}\})",item)
data = {}
for pair in pairs:
key = pair.group("key").strip()
value = pair.group("value").strip(u" []\t\r\n")
value = re.sub(r"<!--.*?-->", "", value, flags=re.DOTALL)
if len(value)>0:
data[key] = value
# The following gets rids of the error messages if any of last1/last/year/date/etc... is missing
# This is used to build a message more explicit than "Examining citation 15"
# Such as "Schwartz (2000). MISSING JOURNAL, v.194, p.123"
# The code might be stupid and weird, but it seems to work just fine
# -Headbomb
if "last1" not in data != None:
if "last" not in data != None:
author_message = "MISSING AUTHOR"
else:
author_message = data["last"]
else:
author_message = data["last1"]
if "year" not in data and "date" in data and re.search("[12][0-9]{3}", data["date"]) != None:
data["year"] = re.search("[12][0-9]{3}", data["date"]).group(0)
if "year" in data and not "date" in data and re.search("[12][0-9]{3}", data["year"]) != None:
data["year"] = re.search("[12][0-9]{3}", data["year"]).group(0)
if "year" not in data != None:
year_message = "MISSING YEAR"
else:
year_message = data["year"]
if "journal" not in data != None:
if "work" not in data != None:
if "periodical" not in data != None:
journal_message = "MISSING JOURNAL"
else:
journal_message = data["periodical"]
else:
journal_message = data["work"]
else:
journal_message = data["journal"]
if "volume" not in data !=None:
volume_message = "MISSING"
else:
volume_message = data["volume"]
if "pages" not in data != None:
if "page" not in data != None:
page_message = "MISSING"
else:
page_message = data["page"]
else:
page_message = data["pages"]
if "arxiv" not in data != None:
arxiv_message = "MISSING"
else:
arxiv_message = data["arxiv"]
if "bibcode" not in data != None:
bibcode_message = "MISSING"
else:
bibcode_message = data["bibcode"]
if "doi" not in data != None:
doi_message = "MISSING"
else:
doi_message = data["doi"]
#Message identifying what citation we"re dealing with
print "\nExamining citation " + str(counter) + " [" + str(page).strip("[]") +"]"
print " " + author_message + " (" + year_message + "). " + journal_message + ", vol. " + volume_message + ", p. " + page_message
print " arxiv : " + arxiv_message
print " bibcode: " + bibcode_message
print " doi : " + doi_message
#Safety net for now. Will be removed later
arxiv = False
arxiv_exists = False
bibcode = False
bibcode_exist = False
doi = False
doi_exists = False
new_str = None
#ARXIV, BIBCODE AND DOI ARE FOUND --> DO NOTHING
if data.has_key("arxiv") and data.has_key("bibcode") and data.has_key("doi"):
print "arxiv (OK), bibcode (OK), doi (OK). Nothing to do."
arxiv = data["arxiv"]
arxiv_exists = True
bibcode = data["bibcode"]
bibcode_exists = True
doi = data["doi"]
doi_exists = True
#ARXIV NOT FOUND, BIBCODE FOUND, DOI FOUND --> SEARCH FOR ARXIV
if not data.has_key("arxiv") and data.has_key("bibcode") and data.has_key("doi"):
print "arxiv (??), bibcode (OK), doi (OK). Searching for arxiv."
query = ADS_query(arxiv, data["bibcode"], data["doi"], item)
arxiv = query[0]
if arxiv != "NOT FOUND!":
arxiv_exists = True
arxiv_count += 1
else:
arxiv_exists = False
bibcode = data["bibcode"]
bibcode_exists = True
doi = data["doi"]
doi_exists = True
if arxiv_exists:
if re.search(u"\|(\s*)arxiv(\s*)=( *)", item):
new_str = re.sub(u"\|(\s*)arxiv(\s*)=( *)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
else:
new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
#ARXIV FOUND, BIBCODE NOT FOUND, DOI FOUND --> SEARCH FOR BIBCODE
if data.has_key("arxiv") and not data.has_key("bibcode") and data.has_key("doi"):
print "arxiv (OK), bibcode (??), doi (OK). Searching for bibcode."
query = ADS_query(data["arxiv"], bibcode, data["doi"], item)
arxiv = data["arxiv"]
arxiv_exists = True
bibcode = query[1]
if bibcode != "NOT FOUND!":
bibcode_exists = True
bibcode_count += 1
else:
bibcode_exists = False
doi = data["doi"]
doi_exists = True
if bibcode_exists:
if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
else:
new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
#ARXIV FOUND, BIBCODE FOUND, DOI NOT FOUND --> SEARCH FOR DOI
if data.has_key("arxiv") and data.has_key("bibcode") and not data.has_key("doi"):
print "arxiv (OK), bibcode (OK), doi (??). Searching for doi."
query = ADS_query(data["arxiv"], data["bibcode"], doi, item)
arxiv = data["arxiv"]
arxiv_exists = True
bibcode = data["bibcode"]
bibcode_exists = True
doi = query[2]
if doi != "NOT FOUND!":
doi_exists = True
doi_count += 1
else:
doi_exists = False
if doi_exists:
if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
else:
new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)
#ARXIV NOT FOUND, BIBCODE NOT FOUND, DOI FOUND --> SEARCH FOR ARXIV AND BIBCODE
if not data.has_key("arxiv") and not data.has_key("bibcode") and data.has_key("doi"):
print "arxiv (??), bibcode (??), doi (OK). Searching for arxiv and bibcode."
query = ADS_query(arxiv, bibcode, data["doi"], item)
arxiv = query[0]
if arxiv != "NOT FOUND!":
arxiv_exists = True
arxiv_count += 1
else:
arxiv_exist = False
bibcode = query[1]
if bibcode != "NOT FOUND!":
bibcode_exists = True
bibcode_count += 1
else:
bibcode_exists = False
doi = data["doi"]
doi_exists = True
if arxiv_exists:
if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
else:
new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
if bibcode_exists:
if new_str != None:
if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", new_str):
new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, new_str)
else:
new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, new_str)
else:
if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
else:
new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
#ARXIV FOUND, BIBCODE NOT FOUND, DOI NOT FOUND --> SEARCH FOR BIBCODE AND DOI
if data.has_key("arxiv") and not data.has_key("bibcode") and not data.has_key("doi"):
print "arxiv (OK), bibcode (??), doi (??). Searching for bibcode and doi."
query = ADS_query(data["arxiv"], bibcode, doi, item)
arxiv = data["arxiv"]
arxiv_exists = True
bibcode = query[1]
if bibcode != "NOT FOUND!":
bibcode_exists = True
bibcode_count += 1
else:
bibcode_exists = False
doi = query[2]
if doi != "NOT FOUND!":
doi_exists = True
doi_count += 1
else:
doi_exists = False
if bibcode_exists:
if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
else:
new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
if doi_exists:
if new_str != None:
if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str)
else:
new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str)
else:
if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
else:
new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)
#ARXIV NOT FOUND, BIBCODE FOUND, DOI NOT FOUND --> SEARCH FOR ARXIV AND DOI
if not data.has_key("arxiv") and data.has_key("bibcode") and not data.has_key("doi"):
print "arxiv (??), bibcode (OK), doi (??). Searching for arxiv and doi."
query = ADS_query(arxiv, data["bibcode"], doi, item)
arxiv = query[0]
if arxiv != "NOT FOUND!":
arxiv_exists = True
arxiv_count += 1
else:
arxiv_exist = False
bibcode = data["bibcode"]
bibcode_exists = True
doi = query[2]
if doi != "NOT FOUND!":
doi_exists = True
doi_count += 1
else:
doi_exists = False
if arxiv_exists:
if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
else:
new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
if doi_exists:
if new_str != None:
if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str)
else:
new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str)
else:
if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
else:
new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)
#ARXIV NOT FOUND, BIBCODE NOT FOUND, DOI NOT FOUND --> SEARCH FOR BIBCODE AND DOI
if not data.has_key("arxiv") and not data.has_key("bibcode") and not data.has_key("doi"):
print "arxiv (??), bibcode (??), doi (??). Guessing bibcode..."
query = ADS_query(arxiv, bibcode, doi, item)
arxiv = query[0]
if arxiv != "NOT FOUND!":
arxiv_exists = True
arxiv_count += 1
else:
arxiv_exist = False
bibcode = query[1]
if bibcode != "NOT FOUND!":
bibcode_exists = True
bibcode_count += 1
else:
bibcode_exists = False
doi = query[2]
if doi != "NOT FOUND!":
doi_exists = True
doi_count += 1
else:
doi_exists = False
if arxiv_exists:
if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
else:
new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
if bibcode_exists:
if new_str != None:
if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", new_str):
new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, new_str)
else:
new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, new_str)
else:
if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
else:
new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
if doi_exists:
if new_str != None:
if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str)
else:
new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str)
else:
if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
else:
new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)
if new_str:
text = re.sub(re.escape(old_item),new_str,text)
else:
text = re.sub(re.escape(old_item),item,text)
if unknown_journal_list is "None":
if unknown_journal is not "None":
unknown_journal_list = "\nUnknown journal(s) for " + page.title() + ":\n *" + unicode(unknown_journal) + "\n"
else:
if unknown_journal is not "None":
if not re.search(unicode(unknown_journal) + "\n", unknown_journal_list):
unknown_journal_list = unknown_journal_list + " *" + unknown_journal + "\n"
print "\nFound:\n " + str(counter) + " {{citation}}/{{cite journal}} template(s)\n " +str(id_to_arxiv_count) + " '|id={{arxiv|...}}' to convert to '|arxiv=...'\n " +str(arxiv_count) + " new arxiv eprint(s)\n " + str(bibcode_count) + " new bibcode(s)\n " + str(doi_count) + " new doi(s)."
if unknown_journal_list is "None":
print "\nUnknown journals:\n *None"
else:
print unknown_journal_list
f = open("C:/Users/Headbomb/Desktop/Pywikipedia/_Unknown_journals.txt", "a")
f.write(unknown_journal_list.encode("utf-8"))
f.close()
print "\n--------------------------------------"
return text
def findalltemplates(t):
    """Return every citation-family template in `t`, braces balanced.

    Searches case-insensitively for templates such as {{cite journal|...}}
    and walks {{ / }} pairs to find each template's true end, so nested
    templates inside parameters are kept intact.  Original capitalization is
    preserved in the returned strings.  If braces are unbalanced, the
    remainder of the text is returned as the final (truncated) template
    instead of crashing.
    """
    cite_re = re.compile("{{\s*(cite|cite study|citation|cite journal|c journal|cite magazine|cite magazine article|cite paper|citejournal|citepaper|vcite paper|vcite journal|vancite journal)\s*\|")
    brace_re = re.compile("(\{\{)|(\}\})")
    found = []
    lowertext = t.lower()
    while True:
        start_match = cite_re.search(lowertext)
        if start_match is None:
            return found
        firstoffset = start_match.start()
        # Start scanning right after the opening "{{" (depth already 1).
        # The original resumed each scan at end+1, which skipped a brace
        # that began immediately after the previous one (e.g. "}}{{").
        lastoffset = firstoffset + 2
        depth = 1
        while depth > 0:
            nextbracket = brace_re.search(lowertext, lastoffset)
            if nextbracket is None:
                # Unbalanced template: take everything to the end of the text.
                lastoffset = len(t)
                break
            if nextbracket.group(0) == "{{":
                depth += 1
            else:
                depth -= 1
            lastoffset = nextbracket.end()
        found.append(t[firstoffset:lastoffset])
        # Continue after the template just captured.
        t = t[lastoffset:]
        lowertext = lowertext[lastoffset:]
def queryADS(url):
retry = True
timeout = max(1, throttle_time)
retrynum = 0
while retry:
try:
rawdata = urllib2.urlopen(url).read()
retry = False
except urllib2.URLError:
retrynum += 1
timeout = retrynum * throttle_time
if retrynum > 3:
print "Cannot connect to ADS site. Aborting..."
return ""
print "\nError connecting to ADS site. Retrying in " + str(timeout) + " seconds."
time.sleep(timeout)
continue
return rawdata
def adv_check_bibcode(code): #Try to find a valid author / section code
if code:
alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ."
url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX"
for i in range(27):
url += "&bibcode=" + urllib.quote(code[:-1]) + alpha[i]
url += "&db_key=ALL"
print "Probing for a new author..."
raw_html = queryADS(url)
bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
if bibcode_check:
print " Found! " + bibcode_check[0]
return raw_html
else:
print " Not found!"
alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
alphalower = "abcdefghijklmnopqrstuvwxyz"
url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=PLAINTEXT"
for i in range(26):
url += "&bibcode=" + urllib.quote(code[:-1]) + alpha[i]
print "Probing for a new section..."
raw_html = urllib2.urlopen(url).read()
bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
if bibcode_check:
print " Found! " + bibcode_check[0]
return raw_html
else:
url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=PLAINTEXT"
for i in range(26):
url += "&bibcode=" + urllib.quote(code[:-1]) + alphalower[i]
raw_html = queryADS(url)
if bibcode_check:
print " Found! " + bibcode_check[0]
return raw_html
else:
print " Not found!"
return "Dummy text"
def ADS_query(arxiv, bibcode, doi, item):
    """Query NASA ADS for the identifiers missing from one citation.

    arxiv, bibcode, doi -- the already-known identifier values, or False for
    each one that is unknown.
    item -- the raw citation template text; it is re-parsed here so that
    get_bibcode() can build a bibcode guess when nothing is known.

    Returns a (arxiv, bibcode, doi) tuple where each element is the value
    scraped from the ADS response, or the string "NOT FOUND!".
    """
    arxiv_match = False
    bibcode_match = False
    doi_match = False
    # Placeholder so the scraping regexes below always get a string,
    # even when no query was issued.
    raw_html = "Dummy text"
    # Re-extract |key=value pairs from the template for get_bibcode().
    # NOTE(review): this delimiter set ((\n|\||\})) differs slightly from the
    # one used in parse_template ((\n\s*|\||\}\})) -- confirm intentional.
    pairs = re.finditer(u"(?P<key>\w+)\s*?=\s*?(?P<value>.*?)(\n|\||\})", item)
    data = {}
    for pair in pairs:
        key = pair.group("key").strip()
        value = pair.group("value").strip(" []")
        # Drop HTML comments embedded in parameter values.
        value = re.sub(r"<!--.*?-->", "", value, flags=re.DOTALL)
        if len(value)>0:
            data[key] = value
    # Nothing known: build a bibcode guess from the citation fields and
    # validate it against ADS (with author/section probing as fallback).
    if not arxiv and not bibcode and not doi:
        bibcode_guess = get_bibcode(data)
        if bibcode_guess:
            print "Bibcode guess: " + bibcode_guess
            url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode_guess.encode("utf-8"))
            raw_html = queryADS(url)
            bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
            if bibcode_check:
                print " Valid!"
            else:
                print " Invalid!"
                raw_html = adv_check_bibcode(bibcode_guess)
    # Only the arxiv id known: resolve it via the abstract page, falling
    # back to a guessed bibcode exactly as above.
    if arxiv and not bibcode and not doi:
        url = "http://adsabs.harvard.edu/abs/arXiv:" + urllib.quote(arxiv)
        raw_html = queryADS(url)
        bibcode_check = re.findall("<A href=\"http://adsabs\.harvard\.edu/abs/(...................)\">", raw_html, flags=re.IGNORECASE)
        if bibcode_check:
            print " Found bibcode by arxiv query!" + bibcode_check[0]
        else:
            print " Did not find bibcode by arxiv query! Guessing bibcode..."
            bibcode_guess = get_bibcode(data)
            if bibcode_guess:
                print "Bibcode guess: " + bibcode_guess
                url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode_guess.encode("utf-8"))
                raw_html = queryADS(url)
                bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
                if bibcode_check:
                    print " Valid!"
                else:
                    print " Invalid!"
                    raw_html = adv_check_bibcode(bibcode_guess)
    # A known bibcode gives a direct BIBTEX query; otherwise fall back to a
    # known doi.  Note this runs AFTER (and can replace) the queries above.
    if bibcode:
        url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode)
        raw_html = queryADS(url)
    else:
        if doi:
            url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&doi=" + urllib.quote(doi.encode("utf-8"))
            raw_html = queryADS(url)
    # Scrape identifiers from whatever response we ended up with.  The _0
    # patterns match the BIBTEX format, the _1 patterns the HTML pages;
    # bibcodes are fixed-width 19-character strings.
    arxiv_match_0 = re.findall("eprint = {arXiv:(.*)}", raw_html)
    arxiv_match_1 = re.findall("eprint = {(.*)}", raw_html)
    arxiv_match_2 = re.findall("arXiv e-print \(arXiv:(.*)\)", raw_html)
    arxiv_match_3 = re.findall("arXiv e-print \((.*)\)", raw_html)
    bibcode_match_0 = re.findall("@ARTICLE{(...................)", raw_html)
    bibcode_match_1 = re.findall("<A href=\"http://adsabs\.harvard\.edu/abs/(...................)\">", raw_html, flags=re.IGNORECASE)
    doi_match_0 = re.findall("doi = {(.*?)}", raw_html)
    doi_match_1 = re.findall("<A href=\"http://dx\.doi\.org/(.*)\">", raw_html, flags=re.IGNORECASE)
    if not arxiv_match_0 and not arxiv_match_1 and not arxiv_match_2 and not arxiv_match_3 and not bibcode_match_0 and not bibcode_match_1 and not doi_match_0 and not doi_match_1:
        return ("NOT FOUND!", "NOT FOUND!", "NOT FOUND!")
    else:
        # Report what was found; for each identifier the more specific
        # pattern wins (e.g. "arXiv:"-prefixed over plain eprint).
        print "Query results:"
        if arxiv_match_0:
            arxiv_match = arxiv_match_0[0]
            print "   arxiv  : " + arxiv_match
        if not arxiv_match_0 and arxiv_match_1:
            arxiv_match = arxiv_match_1[0]
            print "   arxiv  : " + arxiv_match
        if arxiv_match_2:
            arxiv_match = arxiv_match_2[0]
            print "   arxiv  : " + arxiv_match
        if not arxiv_match_2 and arxiv_match_3:
            arxiv_match = arxiv_match_3[0]
            print "   arxiv  : " + arxiv_match
        if not arxiv_match:
            arxiv_match = "NOT FOUND!"
            print "   arxiv  : NOT FOUND!"
        if bibcode_match_0:
            bibcode_match = bibcode_match_0[0]
            print "   bibcode: " + bibcode_match
        if bibcode_match_1:
            bibcode_match = bibcode_match_1[0]
            print "   bibcode: " + bibcode_match
        if not bibcode_match:
            bibcode_match = "NOT FOUND!"
            print "   bibcode: NOT FOUND!"
        if doi_match_0:
            doi_match = doi_match_0[0]
            print "   doi    : " + doi_match
        if doi_match_1:
            doi_match = doi_match_1[0]
            print "   doi    : " + doi_match
        if not doi_match:
            doi_match = "NOT FOUND!"
            print "   doi    : NOT FOUND!"
        return (arxiv_match, bibcode_match, doi_match)
# Script entry point: run the bot when executed directly.
if __name__ == "__main__":
    main()