Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import sys
- import os
- import re
- import wikipedia
- import urllib
- import urllib2
- import time
- import login
- import codecs
- import winsound
- ##import win32com.client
- from _journal_list import *
- ##speak = win32com.client.Dispatch('Sapi.SpVoice')
- ##speak.Volume = 100
- ##speak.Rate = 1
- ##speak.Voice = speak.GetVoices('Name=Microsoft Anna').Item(0)
# --- Bot configuration and one-time setup ---

# Target wiki site, taken from the pywikipedia user config.
site = wikipedia.getSite()
# Base delay (seconds) between ADS query retries; see queryADS().
throttle_time = 5
# Directory where page text is dumped when a save to Wikipedia fails.
errorfilepath = "C:/Users/Headbomb/Desktop/Pywikipedia/_Article timeouts/"
# Matches template open/close markers; used to find the end of a citation template.
regex = re.compile("(\{\{)|(\}\})")
# Per-journal bibcode "M" (section-code) overrides; "." is used when absent.
m_codes = {}
# m_codes[u"AJ..."]=u"."
username = "Bibcode Bot"
print "Logging in as Bibcode Bot..."
# NOTE(review): a bare module reference is a no-op -- this line does NOT log
# in. Presumably login.main() (or pywikipedia's lazy login on first write)
# was intended; confirm before relying on the "Logged in!" message below.
login
print "Logged in!"
- def main():
- with codecs.open("_Article list.txt", encoding="utf-8") as f:
- print "Starting run! \n--------------------------------------"
- for line in f:
- line = unicode(line.strip(u" \t\r\n\*\[\]"))
- print "Getting page: " + line
- global page
- page = wikipedia.Page(site, line)
- if not page.exists():
- print "Page does not exist! Skipping to next article.\n--------------------------------------"
- continue
- if page.isRedirectPage():
- oldpage = line
- text = page.get(get_redirect=True)
- target = re.match("\s*\#\s*redirect\s*\[\[(.*?)\]\]", text, flags=re.I).group(1)
- target = target.split("#")[0]
- page = wikipedia.Page(site, target)
- newpage = page
- print " '" + str(oldpage).strip("[]") + "' redirects to '" + str(newpage).strip("[]") + "'.\n Processing '" + str(newpage).strip("[]") + "' instead."
- if page.canBeEdited() is False:
- print "Page cannot be not editted due to protection! Skipping to next article.\n--------------------------------------"
- continue
- if not page.botMayEdit(username):
- print "Page cannot be editted by a bot. Skipping to next article.\n--------------------------------------"
- continue
- text = page.get()
- # \\n matches a linebreak for some reason
- #print text
- #bug = re.findall(r"\|\s*title\s*=.*(\\n|\\r|\\t|\\b|\\f|\\a)", text)
- #if bug != []:
- #print "Found \\n, \\t, \\r, \\f, \\b or \\a in the title. Skipping article while bug is being solved.\n--------------------------------------"
- #continue
- orig_text = text
- text = parse_template(text)
- edit_check = id_to_arxiv_count + arxiv_count + bibcode_count + doi_count
- if edit_check is not 0:
- if id_to_arxiv_count is not 0:
- print "\nConverting " + str(id_to_arxiv_count) + " \"id = {{arxiv|...}}\" to \"|arxiv=...\"."
- print "\nAdding " + str(arxiv_count) + " arxiv eprint(s), " + str(bibcode_count) + " bibcode(s) and " + str(doi_count) + " doi(s) in " + str(counter) + " possible templates."
- editsummary = "Converting " + str(id_to_arxiv_count) + " \"id = {{arxiv|...}}\" to \"|arxiv=...\". \nAdding " + str(arxiv_count) + " [[arXiv|arxiv eprint(s)]], " + str(bibcode_count) + " [[bibcode|bibcode(s)]] and " + str(doi_count) + " [[digital object identifier|doi(s)]]."
- #speak.Speak("Converting " + str(id_to_arxiv_count) + " archive I D to archive parameters. Adding " + str(arxiv_count) + " archive preprint, " + str(bibcode_count) + " bibcode and " + str(doi_count) + " d o i.")
- else:
- editsummary = "\nAdding " + str(arxiv_count) + " [[arXiv|arxiv eprint(s)]], " + str(bibcode_count) + " [[bibcode|bibcode(s)]] and " + str(doi_count) + " [[digital object identifier|doi(s)]]."
- #speak.Speak("Adding " + str(arxiv_count) + " archive preprint, " + str(bibcode_count) + " bibcode and " + str(doi_count) + " d o i.")
- try:
- page.put(text, editsummary + " Did it miss something? Report bugs, errors, and suggestions at [[User talk:Bibcode Bot]]", maxTries = 2)
- except wikipedia.MaxTriesExceededError:
- try:
- print "Couldn't send data to Wikipedia. Saving page data to " + errorfilepath + page.title()
- f = open(errorfilepath + page.title() + ".txt", "w")
- f.write(text.encode("utf-8"))
- f.close()
- except:
- print "Error saving data to file. Printing page:\n\n\n\n\n"
- print text
- else:
- print "\nNo new arxiv eprint, new bibcode or new doi found in this article. Processing next article.\n--------------------------------------"
- print "\nRun complete!"
- def get_bibcode(data): #data object that is returned from parse template
- # Bibcode format is YYYYJJJJJVVVVMPPPPA
- # YYYY = Year
- # JJJJJ = Journal code
- # VVVV = Volume
- # M = Section code / Headache
- # PPPP = Page
- # A = First letter of the last name of the first author
- global unknown_journal
- # Extract the year part
- if "year" not in data and "date" in data and re.search("[12][0-9]{3}", data["date"]) != None:
- data["year"] = re.search("[12][0-9]{3}", data["date"]).group(0)
- if "year" in data and not "date" in data and re.search("[12][0-9]{3}", data["year"]) != None:
- data["year"] = re.search("[12][0-9]{3}", data["year"]).group(0)
- if "year" not in data != None:
- print "*** YEAR ERROR *** - No year found in citation."
- unknown_journal = "None"
- return False
- else:
- bibcode=u"%s" % data["year"]
- # Let"s figure out the correct journal so we can get the JJJJJ value
- jkey = ""
- if not data.has_key("journal"):
- if data.has_key("work"):
- data["journal"] = data["work"]
- elif data.has_key("periodical"):
- data["journal"] = data["periodical"]
- else:
- print "*** JOURNAL ERROR *** - No journal found in citation."
- unknown_journal = "None"
- return False
- if data["journal"]:
- if data["journal"].lower().startswith("the "):
- data["journal"] = data["journal"][4:].strip()
- if data["journal"].endswith("."):
- data["journal"] = data["journal"].strip(".")
- for key in journals.keys():
- for item in journals[key]:
- # second part of the tuple is a boolean for regex
- if item[1]:
- if re.search(item[0],data["journal"]):
- jkey = key
- break
- # if its not a regex lets escape it and search for the title
- else:
- if item[0].lower().startswith("the "):
- item[0] = item[0][4:].strip()
- if item[0].endswith("."):
- item[0] = item[0].strip(".")
- if data["journal"].lower() == item[0].lower():
- jkey = key
- break
- if jkey == "":
- print "*** JOURNAL ERROR *** - Unknown journal detected (" + data["journal"] + ")."
- unknown_journal = data["journal"]
- return False
- else:
- unknown_journal = "None"
- # using the J key lets see if there is an M code defined
- if m_codes.has_key(jkey):
- m_code = m_codes[jkey]
- else:
- # default to . otherwise
- m_code = "."
- bibcode+= jkey
- pad_str=u""
- # lets get the volume number and then define the VVVV value
- if not data.has_key("volume"):
- print "*** VOLUME ERROR *** - No volume found in citation."
- return False
- else:
- try:
- data["volume"] = re.search(r"\d+", data["volume"]).group(0)
- except:
- print "*** VOLUME ERROR *** - Volume found (" + data["volume"] +"), but not parsable."
- return False
- pad = 4-len(data["volume"])
- while pad>0:
- pad=pad-1
- pad_str+=u"."
- bibcode+=pad_str+data["volume"]
- # boolean to see if we ignore the M code later
- ignore_m = False
- # handle both page and pages parameters
- pg = False
- pg_0 = False
- pg_1 = False
- if data.has_key("page"):
- if re.search("L\d+",data["page"],re.I):
- m_code = u"L"
- if re.search("\d+",data["page"],re.I):
- pg_0 = re.search(ur"\d+",data["page"],re.I).group(0)
- else:
- pg_0 = False
- if data.has_key("pages"):
- if re.search("L\d+",data["pages"],re.I):
- m_code = u"L"
- if re.search("\d+",data["pages"],re.I):
- pg_1 = re.search(ur"\d+",data["pages"],re.I).group(0)
- else:
- pg_1 = False
- if not pg_0 and not pg_1:
- print "*** PAGE ERROR *** - No page detected."
- return False
- else:
- if pg_1:
- pg = pg_1
- else:
- pg = pg_0
- if not data.has_key("page") and not data.has_key("pages"):
- print "*** PAGE ERROR *** - No page detected."
- return False
- # lets define PPPP and wether or not M should be ignored
- # if its less than 4 lets pad it, if its 4 exactly lets skip ahead
- if len(pg)<4:
- pad_str=u""
- pad = 4-len(pg)
- while pad>0:
- pad=pad-1
- pad_str+=u"."
- pg = pad_str+pg
- elif len(pg)==5:
- # if its 5 M should be ignored and the 5th page number should be used instead
- ignore_m = True
- elif len(pg)==6:
- # if its 6 convert the last 2 to a letter and ignore M
- ignore_m = True
- alpha = "abcdefghijklmnopqrstuvwxyz"
- lettercode = alpha[int(pg[:1])]
- pg = lettercode+pg[2:]
- # now to combine everything
- if ignore_m:
- m_code =""
- if data.has_key("last1"):
- a = data["last1"][0]
- elif data.has_key("last"):
- a = data["last"][0]
- else:
- a = "."
- return bibcode+m_code+pg+a
- def parse_template(text):
- #Kingpin's regex: \{\{cite\sjournal(((\{\{.+\}\})|[^{]([{][^{])?)+)\}\}
- found = findalltemplates(text)
- global counter
- counter = 0
- global id_to_arxiv_count
- id_to_arxiv_count = 0
- global arxiv_count
- arxiv_count = 0
- global bibcode_count
- bibcode_count = 0
- global doi_count
- doi_count = 0
- unknown_journal_list = "None"
- for item in found:
- #Used to compare the result at the end of the processing
- old_item = item
- #Pre-cleaner (cleans |id={{arxiv|foobar}} to |arxiv=foobar)
- if re.search("{{\s*arxiv", item, re.IGNORECASE):
- if re.findall(u"\|(\s*)id(\s*)=(\s*){{\s*arxiv\s*\|\s*(archive)?\s*=?\s*([^(\||}|\s)]*)\s*\|?\s*(id)?\s*=?\s*([^(\||}|\s)]*)(\s*)}}", item, re.IGNORECASE):
- clean_str = re.sub(u"\|(\s*)id(\s*)=(\s*){{\s*arxiv\s*\|\s*(archive)?\s*=?\s*([^(\||}|\s)]*)\s*\|?\s*(id)?\s*=?\s*([^(\||}|\s)]*)(\s*)}}",ur"|\1arxiv\2=\3\5/\7\8", item, re.IGNORECASE)
- if re.findall(u"\|(\s*)arxiv(\s*)=(\s*)(.*)/(\s*)(\||})", clean_str):
- clean_str = re.sub(u"\|(\s*)arxiv(\s*)=(\s*)(.*)/(\s*)(\||})",ur"|\1arxiv\2=\3\4\5\6", clean_str)
- id_to_arxiv_count += 1
- item = clean_str
- global unknown_journal
- unknown_journal = "None"
- counter += 1
- pairs = re.finditer(u"(?P<key>\w+)\s*=\s*(?P<value>.*?)(\n\s*|\||\}\})",item)
- data = {}
- for pair in pairs:
- key = pair.group("key").strip()
- value = pair.group("value").strip(u" []\t\r\n")
- value = re.sub(r"<!--.*?-->", "", value, flags=re.DOTALL)
- if len(value)>0:
- data[key] = value
- # The following gets rids of the error messages if any of last1/last/year/date/etc... is missing
- # This is used to build a message more explicit than "Examining citation 15"
- # Such as "Schwartz (2000). MISSING JOURNAL, v.194, p.123"
- # The code might be stupid and weird, but it seems to work just fine
- # -Headbomb
- if "last1" not in data != None:
- if "last" not in data != None:
- author_message = "MISSING AUTHOR"
- else:
- author_message = data["last"]
- else:
- author_message = data["last1"]
- if "year" not in data and "date" in data and re.search("[12][0-9]{3}", data["date"]) != None:
- data["year"] = re.search("[12][0-9]{3}", data["date"]).group(0)
- if "year" in data and not "date" in data and re.search("[12][0-9]{3}", data["year"]) != None:
- data["year"] = re.search("[12][0-9]{3}", data["year"]).group(0)
- if "year" not in data != None:
- year_message = "MISSING YEAR"
- else:
- year_message = data["year"]
- if "journal" not in data != None:
- if "work" not in data != None:
- if "periodical" not in data != None:
- journal_message = "MISSING JOURNAL"
- else:
- journal_message = data["periodical"]
- else:
- journal_message = data["work"]
- else:
- journal_message = data["journal"]
- if "volume" not in data !=None:
- volume_message = "MISSING"
- else:
- volume_message = data["volume"]
- if "pages" not in data != None:
- if "page" not in data != None:
- page_message = "MISSING"
- else:
- page_message = data["page"]
- else:
- page_message = data["pages"]
- if "arxiv" not in data != None:
- arxiv_message = "MISSING"
- else:
- arxiv_message = data["arxiv"]
- if "bibcode" not in data != None:
- bibcode_message = "MISSING"
- else:
- bibcode_message = data["bibcode"]
- if "doi" not in data != None:
- doi_message = "MISSING"
- else:
- doi_message = data["doi"]
- #Message identifying what citation we"re dealing with
- print "\nExamining citation " + str(counter) + " [" + str(page).strip("[]") +"]"
- print " " + author_message + " (" + year_message + "). " + journal_message + ", vol. " + volume_message + ", p. " + page_message
- print " arxiv : " + arxiv_message
- print " bibcode: " + bibcode_message
- print " doi : " + doi_message
- #Safety net for now. Will be removed later
- arxiv = False
- arxiv_exists = False
- bibcode = False
- bibcode_exist = False
- doi = False
- doi_exists = False
- new_str = None
- #ARXIV, BIBCODE AND DOI ARE FOUND --> DO NOTHING
- if data.has_key("arxiv") and data.has_key("bibcode") and data.has_key("doi"):
- print "arxiv (OK), bibcode (OK), doi (OK). Nothing to do."
- arxiv = data["arxiv"]
- arxiv_exists = True
- bibcode = data["bibcode"]
- bibcode_exists = True
- doi = data["doi"]
- doi_exists = True
- #ARXIV NOT FOUND, BIBCODE FOUND, DOI FOUND --> SEARCH FOR ARXIV
- if not data.has_key("arxiv") and data.has_key("bibcode") and data.has_key("doi"):
- print "arxiv (??), bibcode (OK), doi (OK). Searching for arxiv."
- query = ADS_query(arxiv, data["bibcode"], data["doi"], item)
- arxiv = query[0]
- if arxiv != "NOT FOUND!":
- arxiv_exists = True
- arxiv_count += 1
- else:
- arxiv_exists = False
- bibcode = data["bibcode"]
- bibcode_exists = True
- doi = data["doi"]
- doi_exists = True
- if arxiv_exists:
- if re.search(u"\|(\s*)arxiv(\s*)=( *)", item):
- new_str = re.sub(u"\|(\s*)arxiv(\s*)=( *)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
- else:
- new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
- #ARXIV FOUND, BIBCODE NOT FOUND, DOI FOUND --> SEARCH FOR BIBCODE
- if data.has_key("arxiv") and not data.has_key("bibcode") and data.has_key("doi"):
- print "arxiv (OK), bibcode (??), doi (OK). Searching for bibcode."
- query = ADS_query(data["arxiv"], bibcode, data["doi"], item)
- arxiv = data["arxiv"]
- arxiv_exists = True
- bibcode = query[1]
- if bibcode != "NOT FOUND!":
- bibcode_exists = True
- bibcode_count += 1
- else:
- bibcode_exists = False
- doi = data["doi"]
- doi_exists = True
- if bibcode_exists:
- if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
- new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
- else:
- new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
- #ARXIV FOUND, BIBCODE FOUND, DOI NOT FOUND --> SEARCH FOR DOI
- if data.has_key("arxiv") and data.has_key("bibcode") and not data.has_key("doi"):
- print "arxiv (OK), bibcode (OK), doi (??). Searching for doi."
- query = ADS_query(data["arxiv"], data["bibcode"], doi, item)
- arxiv = data["arxiv"]
- arxiv_exists = True
- bibcode = data["bibcode"]
- bibcode_exists = True
- doi = query[2]
- if doi != "NOT FOUND!":
- doi_exists = True
- doi_count += 1
- else:
- doi_exists = False
- if doi_exists:
- if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
- new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
- else:
- new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)
- #ARXIV NOT FOUND, BIBCODE NOT FOUND, DOI FOUND --> SEARCH FOR ARXIV AND BIBCODE
- if not data.has_key("arxiv") and not data.has_key("bibcode") and data.has_key("doi"):
- print "arxiv (??), bibcode (??), doi (OK). Searching for arxiv and bibcode."
- query = ADS_query(arxiv, bibcode, data["doi"], item)
- arxiv = query[0]
- if arxiv != "NOT FOUND!":
- arxiv_exists = True
- arxiv_count += 1
- else:
- arxiv_exist = False
- bibcode = query[1]
- if bibcode != "NOT FOUND!":
- bibcode_exists = True
- bibcode_count += 1
- else:
- bibcode_exists = False
- doi = data["doi"]
- doi_exists = True
- if arxiv_exists:
- if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
- new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
- else:
- new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
- if bibcode_exists:
- if new_str != None:
- if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", new_str):
- new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, new_str)
- else:
- new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, new_str)
- else:
- if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
- new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
- else:
- new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
- #ARXIV FOUND, BIBCODE NOT FOUND, DOI NOT FOUND --> SEARCH FOR BIBCODE AND DOI
- if data.has_key("arxiv") and not data.has_key("bibcode") and not data.has_key("doi"):
- print "arxiv (OK), bibcode (??), doi (??). Searching for bibcode and doi."
- query = ADS_query(data["arxiv"], bibcode, doi, item)
- arxiv = data["arxiv"]
- arxiv_exists = True
- bibcode = query[1]
- if bibcode != "NOT FOUND!":
- bibcode_exists = True
- bibcode_count += 1
- else:
- bibcode_exists = False
- doi = query[2]
- if doi != "NOT FOUND!":
- doi_exists = True
- doi_count += 1
- else:
- doi_exists = False
- if bibcode_exists:
- if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
- new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
- else:
- new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
- if doi_exists:
- if new_str != None:
- if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
- new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str)
- else:
- new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str)
- else:
- if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
- new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
- else:
- new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)
- #ARXIV NOT FOUND, BIBCODE FOUND, DOI NOT FOUND --> SEARCH FOR ARXIV AND DOI
- if not data.has_key("arxiv") and data.has_key("bibcode") and not data.has_key("doi"):
- print "arxiv (??), bibcode (OK), doi (??). Searching for arxiv and doi."
- query = ADS_query(arxiv, data["bibcode"], doi, item)
- arxiv = query[0]
- if arxiv != "NOT FOUND!":
- arxiv_exists = True
- arxiv_count += 1
- else:
- arxiv_exist = False
- bibcode = data["bibcode"]
- bibcode_exists = True
- doi = query[2]
- if doi != "NOT FOUND!":
- doi_exists = True
- doi_count += 1
- else:
- doi_exists = False
- if arxiv_exists:
- if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
- new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
- else:
- new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
- if doi_exists:
- if new_str != None:
- if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
- new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str)
- else:
- new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str)
- else:
- if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
- new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
- else:
- new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)
- #ARXIV NOT FOUND, BIBCODE NOT FOUND, DOI NOT FOUND --> SEARCH FOR BIBCODE AND DOI
- if not data.has_key("arxiv") and not data.has_key("bibcode") and not data.has_key("doi"):
- print "arxiv (??), bibcode (??), doi (??). Guessing bibcode..."
- query = ADS_query(arxiv, bibcode, doi, item)
- arxiv = query[0]
- if arxiv != "NOT FOUND!":
- arxiv_exists = True
- arxiv_count += 1
- else:
- arxiv_exist = False
- bibcode = query[1]
- if bibcode != "NOT FOUND!":
- bibcode_exists = True
- bibcode_count += 1
- else:
- bibcode_exists = False
- doi = query[2]
- if doi != "NOT FOUND!":
- doi_exists = True
- doi_count += 1
- else:
- doi_exists = False
- if arxiv_exists:
- if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
- new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
- else:
- new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
- if bibcode_exists:
- if new_str != None:
- if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", new_str):
- new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, new_str)
- else:
- new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, new_str)
- else:
- if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
- new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
- else:
- new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
- if doi_exists:
- if new_str != None:
- if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
- new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str)
- else:
- new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str)
- else:
- if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
- new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
- else:
- new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)
- if new_str:
- text = re.sub(re.escape(old_item),new_str,text)
- else:
- text = re.sub(re.escape(old_item),item,text)
- if unknown_journal_list is "None":
- if unknown_journal is not "None":
- unknown_journal_list = "\nUnknown journal(s) for " + page.title() + ":\n *" + unicode(unknown_journal) + "\n"
- else:
- if unknown_journal is not "None":
- if not re.search(unicode(unknown_journal) + "\n", unknown_journal_list):
- unknown_journal_list = unknown_journal_list + " *" + unknown_journal + "\n"
- print "\nFound:\n " + str(counter) + " {{citation}}/{{cite journal}} template(s)\n " +str(id_to_arxiv_count) + " '|id={{arxiv|...}}' to convert to '|arxiv=...'\n " +str(arxiv_count) + " new arxiv eprint(s)\n " + str(bibcode_count) + " new bibcode(s)\n " + str(doi_count) + " new doi(s)."
- if unknown_journal_list is "None":
- print "\nUnknown journals:\n *None"
- else:
- print unknown_journal_list
- f = open("C:/Users/Headbomb/Desktop/Pywikipedia/_Unknown_journals.txt", "a")
- f.write(unknown_journal_list.encode("utf-8"))
- f.close()
- print "\n--------------------------------------"
- return text
def findalltemplates(t):
    """Return a list of every complete {{cite ...}}/{{citation ...}} style
    template in t, each including any templates nested inside it, in
    document order.  Matching is case-insensitive (via a lowered copy);
    the returned text preserves the original case."""
    f = []
    # Compiled locally so the function is self-contained instead of
    # depending on the module-level "regex" global.
    start_re = re.compile("{{\s*(cite|cite study|citation|cite journal|c journal|cite magazine|cite magazine article|cite paper|citejournal|citepaper|vcite paper|vcite journal|vancite journal)\s*\|")
    brace_re = re.compile("(\{\{)|(\}\})")
    lowertext = t.lower()
    while True:
        match = start_re.search(lowertext)
        if match is None:
            return f
        firstoffset = match.start()
        lastoffset = firstoffset
        # Walk forward over {{ / }} pairs until the opening brace is closed.
        depth = 1
        while depth > 0:
            nextbracket = brace_re.search(lowertext, lastoffset + 1)
            if nextbracket is None:
                # Unbalanced braces: no closing "}}" exists.  Abandon the
                # malformed template (the original crashed here with
                # AttributeError on nextbracket.group).
                return f
            if nextbracket.group(0) == "{{":
                depth += 1
            else:
                depth -= 1
            lastoffset = nextbracket.end()
        f.append(t[firstoffset:lastoffset])
        # Continue scanning after the template just captured.
        t = t[lastoffset:]
        lowertext = lowertext[lastoffset:]
- def queryADS(url):
- retry = True
- timeout = max(1, throttle_time)
- retrynum = 0
- while retry:
- try:
- rawdata = urllib2.urlopen(url).read()
- retry = False
- except urllib2.URLError:
- retrynum += 1
- timeout = retrynum * throttle_time
- if retrynum > 3:
- print "Cannot connect to ADS site. Aborting..."
- return ""
- print "\nError connecting to ADS site. Retrying in " + str(timeout) + " seconds."
- time.sleep(timeout)
- continue
- return rawdata
- def adv_check_bibcode(code): #Try to find a valid author / section code
- if code:
- alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ."
- url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX"
- for i in range(27):
- url += "&bibcode=" + urllib.quote(code[:-1]) + alpha[i]
- url += "&db_key=ALL"
- print "Probing for a new author..."
- raw_html = queryADS(url)
- bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
- if bibcode_check:
- print " Found! " + bibcode_check[0]
- return raw_html
- else:
- print " Not found!"
- alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
- alphalower = "abcdefghijklmnopqrstuvwxyz"
- url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=PLAINTEXT"
- for i in range(26):
- url += "&bibcode=" + urllib.quote(code[:-1]) + alpha[i]
- print "Probing for a new section..."
- raw_html = urllib2.urlopen(url).read()
- bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
- if bibcode_check:
- print " Found! " + bibcode_check[0]
- return raw_html
- else:
- url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=PLAINTEXT"
- for i in range(26):
- url += "&bibcode=" + urllib.quote(code[:-1]) + alphalower[i]
- raw_html = queryADS(url)
- if bibcode_check:
- print " Found! " + bibcode_check[0]
- return raw_html
- else:
- print " Not found!"
- return "Dummy text"
def ADS_query(arxiv, bibcode, doi, item):
    """Query ADS (adsabs.harvard.edu) for a citation and return the tuple
    (arxiv, bibcode, doi), each element either the identifier scraped from
    the response or the string "NOT FOUND!".

    Lookup order: a known bibcode is queried directly; failing that a known
    doi; an arxiv id alone is resolved via the abstract page; with nothing
    known, a bibcode is guessed from the citation fields (get_bibcode) and
    probed (adv_check_bibcode).  item is the raw template text, re-parsed
    here into a key/value dict for the guesser.
    """
    arxiv_match = False
    bibcode_match = False
    doi_match = False
    # Placeholder so the scraping regexes below always see a string, even
    # when no query was made.
    raw_html = "Dummy text"
    # Parse |key=value pairs out of the raw template text.
    pairs = re.finditer(u"(?P<key>\w+)\s*?=\s*?(?P<value>.*?)(\n|\||\})", item)
    data = {}
    for pair in pairs:
        key = pair.group("key").strip()
        value = pair.group("value").strip(" []")
        value = re.sub(r"<!--.*?-->", "", value, flags=re.DOTALL)
        if len(value)>0:
            data[key] = value
    # No identifier at all: guess a bibcode from year/journal/volume/page.
    if not arxiv and not bibcode and not doi:
        bibcode_guess = get_bibcode(data)
        if bibcode_guess:
            print "Bibcode guess: " + bibcode_guess
            url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode_guess.encode("utf-8"))
            raw_html = queryADS(url)
            bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
            if bibcode_check:
                print " Valid!"
            else:
                print " Invalid!"
                # Wrong guess: probe author-initial / section variants.
                raw_html = adv_check_bibcode(bibcode_guess)
    # Only an arxiv id: resolve it through the ADS abstract page; the
    # scraper below picks the bibcode out of the page's links.
    if arxiv and not bibcode and not doi:
        url = "http://adsabs.harvard.edu/abs/arXiv:" + urllib.quote(arxiv)
        raw_html = queryADS(url)
        bibcode_check = re.findall("<A href=\"http://adsabs\.harvard\.edu/abs/(...................)\">", raw_html, flags=re.IGNORECASE)
        if bibcode_check:
            print " Found bibcode by arxiv query!" + bibcode_check[0]
        else:
            # Fall back to guessing, exactly as in the nothing-known case.
            print " Did not find bibcode by arxiv query! Guessing bibcode..."
            bibcode_guess = get_bibcode(data)
            if bibcode_guess:
                print "Bibcode guess: " + bibcode_guess
                url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode_guess.encode("utf-8"))
                raw_html = queryADS(url)
                bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
                if bibcode_check:
                    print " Valid!"
                else:
                    print " Invalid!"
                    raw_html = adv_check_bibcode(bibcode_guess)
    # A known bibcode is the most direct query; otherwise try a known doi.
    if bibcode:
        url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode)
        raw_html = queryADS(url)
    else:
        if doi:
            url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&doi=" + urllib.quote(doi.encode("utf-8"))
            raw_html = queryADS(url)
    # Scrape identifiers from whatever response we ended up with: BIBTEX
    # fields (_0 variants) and abstract-page HTML links (_1 variants).
    arxiv_match_0 = re.findall("eprint = {arXiv:(.*)}", raw_html)
    arxiv_match_1 = re.findall("eprint = {(.*)}", raw_html)
    arxiv_match_2 = re.findall("arXiv e-print \(arXiv:(.*)\)", raw_html)
    arxiv_match_3 = re.findall("arXiv e-print \((.*)\)", raw_html)
    bibcode_match_0 = re.findall("@ARTICLE{(...................)", raw_html)
    bibcode_match_1 = re.findall("<A href=\"http://adsabs\.harvard\.edu/abs/(...................)\">", raw_html, flags=re.IGNORECASE)
    doi_match_0 = re.findall("doi = {(.*?)}", raw_html)
    doi_match_1 = re.findall("<A href=\"http://dx\.doi\.org/(.*)\">", raw_html, flags=re.IGNORECASE)
    if not arxiv_match_0 and not arxiv_match_1 and not arxiv_match_2 and not arxiv_match_3 and not bibcode_match_0 and not bibcode_match_1 and not doi_match_0 and not doi_match_1:
        return ("NOT FOUND!", "NOT FOUND!", "NOT FOUND!")
    else:
        print "Query results:"
        # Prefer the "arXiv:"-prefixed forms (_0/_2) over the bare ones.
        if arxiv_match_0:
            arxiv_match = arxiv_match_0[0]
            print " arxiv : " + arxiv_match
        if not arxiv_match_0 and arxiv_match_1:
            arxiv_match = arxiv_match_1[0]
            print " arxiv : " + arxiv_match
        if arxiv_match_2:
            arxiv_match = arxiv_match_2[0]
            print " arxiv : " + arxiv_match
        if not arxiv_match_2 and arxiv_match_3:
            arxiv_match = arxiv_match_3[0]
            print " arxiv : " + arxiv_match
        if not arxiv_match:
            arxiv_match = "NOT FOUND!"
            print " arxiv : NOT FOUND!"
        # For bibcode/doi the HTML-link match (_1) wins when both matched.
        if bibcode_match_0:
            bibcode_match = bibcode_match_0[0]
            print " bibcode: " + bibcode_match
        if bibcode_match_1:
            bibcode_match = bibcode_match_1[0]
            print " bibcode: " + bibcode_match
        if not bibcode_match:
            bibcode_match = "NOT FOUND!"
            print " bibcode: NOT FOUND!"
        if doi_match_0:
            doi_match = doi_match_0[0]
            print " doi : " + doi_match
        if doi_match_1:
            doi_match = doi_match_1[0]
            print " doi : " + doi_match
        if not doi_match:
            doi_match = "NOT FOUND!"
            print " doi : NOT FOUND!"
        return (arxiv_match, bibcode_match, doi_match)
# Standard script entry point: only run the bot when executed directly.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement