Bibcode bot - main code

By: Headbomb on Apr 22nd, 2011  |  syntax: Python
# -*- coding: utf-8 -*-
import sys
import os
import re
import wikipedia
import urllib
import urllib2
import time
import login
import codecs
from _journal_list import *
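# _journal_list is expected (assumption, inferred from the lookup loop in get_bibcode below)
# to define a dict named "journals" mapping 5-character bibcode journal codes to sequences of
# [title_or_pattern, is_regex] pairs, e.g.
#   journals = {u"ApJ..": [[u"Astrophysical Journal", False], [ur"Astrophys(\.|ical)?\s*J(\.|ournal)?", True]]}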

site = wikipedia.getSite()
throttle_time = 5
errorfilepath = "C:/Users/Headbomb/Desktop/Pywikipedia/_Article timeouts/"
regex = re.compile("(\{\{)|(\}\})")
m_codes = {}
# m_codes[u"AJ..."]=u"."

username = "Bibcode Bot"
print "Logging in as Bibcode Bot..."
login  # NOTE: bare module reference; assumes credentials were already cached via the framework's login.py
print "Logged in!"

def main():
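  # Assumed input format for "_Article list.txt" (inferred from the strip() below):
  # one page title per line, optionally written as a wiki list entry, e.g.
  #   * [[Crab Nebula]]
  #   Supernova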
  with codecs.open("_Article list.txt", encoding="utf-8") as f:
    print "Starting run! \n--------------------------------------"
    for line in f:
      line = unicode(line.strip(u" \t\r\n*[]"))
      print "Getting page: " + line
      global page
      page = wikipedia.Page(site, line)
      if not page.exists():
        print "Page does not exist! Skipping to next article.\n--------------------------------------"
        continue
      if page.isRedirectPage():
        oldpage = line
        page = page.getRedirectTarget()
        newpage = page.title()
        print "   '" + str(oldpage) + "' redirects to '" + str(newpage) + "'.\n   Processing '" + str(newpage) + "' instead."
      if not page.canBeEdited():
        print "Page cannot be edited due to protection! Skipping to next article.\n--------------------------------------"
        continue
      if not page.botMayEdit(username):
        print "Page cannot be edited by a bot. Skipping to next article.\n--------------------------------------"
        continue
      text = page.get()
      orig_text = text
      text = parse_template(text)
      if text != orig_text:
        if id_to_arxiv_count != 0:
          print "\nConverting " + str(id_to_arxiv_count) + " \"id = {{arxiv|...}}\" to \"|arxiv=...\"."
          print "\nAdding " + str(arxiv_count) + " arxiv eprint(s), " + str(bibcode_count) + " bibcode(s) and " + str(doi_count) + " doi(s) in " + str(counter) + " possible templates."
          editsummary = "Converting " + str(id_to_arxiv_count) + " \"id = {{arxiv|...}}\" to \"|arxiv=...\". \nAdding " + str(arxiv_count) + " [[arXiv|arxiv eprint(s)]], " + str(bibcode_count) + " [[bibcode|bibcode(s)]] and " + str(doi_count) + " [[digital object identifier|doi(s)]]."
        else:
          editsummary = "\nAdding " + str(arxiv_count) + " [[arXiv|arxiv eprint(s)]], " + str(bibcode_count) + " [[bibcode|bibcode(s)]] and " + str(doi_count) + " [[digital object identifier|doi(s)]]."
        try:
          page.put(text, editsummary + " Did it miss something? Report bugs, errors, and suggestions at [[User talk:Bibcode Bot]]", maxTries = 2)
        except wikipedia.MaxTriesExceededError:
          try:
            print "Couldn't send data to Wikipedia.  Saving page data to " + errorfilepath + page.title()
            errorfile = open(errorfilepath + page.title() + ".txt", "w")
            errorfile.write(text.encode("utf-8"))
            errorfile.close()
          except:
            print "Error saving data to file.  Printing page:\n\n\n\n\n"
            print text
      else:
        print "\nNo new arxiv eprint, new bibcode or new doi found in this article. Processing next article.\n--------------------------------------"
    print "\nRun complete!"

def get_bibcode(data): #data object that is returned from parse_template
    # Bibcode format is YYYYJJJJJVVVVMPPPPA
    # YYYY  = Year
    # JJJJJ = Journal code
    # VVVV  = Volume
    # M     = Section code / Headache
    # PPPP  = Page
    # A     = First letter of the last name of the first author
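    # Illustrative example (not taken from a specific citation): a 19-character code such as
    # 1999ApJ...520L..69S would decode as year 1999, journal "ApJ..", volume ".520",
    # section "L", page "..69", and first-author initial "S".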
  global unknown_journal
  # Extract the year part
  if "year" not in data and "date" in data and re.search("[12][0-9]{3}", data["date"]) != None:
    data["year"] = re.search("[12][0-9]{3}", data["date"]).group(0)
  if "year" in data and not "date" in data and re.search("[12][0-9]{3}", data["year"]) != None:
    data["year"] = re.search("[12][0-9]{3}", data["year"]).group(0)
  if "year" not in data:
    print "*** YEAR ERROR *** - No year found in citation."
    unknown_journal = "None"
    return False
  else:
    bibcode = u"%s" % data["year"]
  # Let's figure out the correct journal so we can get the JJJJJ value
  jkey = ""
  if not data.has_key("journal"):
    if data.has_key("work"):
      data["journal"] = data["work"]
    elif data.has_key("periodical"):
      data["journal"] = data["periodical"]
    else:
      print "*** JOURNAL ERROR *** - No journal found in citation."
      unknown_journal = "None"
      return False
  if data["journal"]:
    if data["journal"].lower().startswith("the "):
      data["journal"] = data["journal"][4:].strip()
    if data["journal"].endswith("."):
      data["journal"] = data["journal"].strip(".")
    for key in journals.keys():
      for item in journals[key]:
        # second part of the pair is a boolean flagging a regex pattern
        if item[1]:
          if re.search(item[0], data["journal"]):
            jkey = key
            break
        # if it's not a regex, compare the plain title (case-insensitively)
        else:
          if item[0].lower().startswith("the "):
            item[0] = item[0][4:].strip()
          if item[0].endswith("."):
            item[0] = item[0].strip(".")
          if data["journal"].lower() == item[0].lower():
            jkey = key
            break
  if jkey == "":
    print "*** JOURNAL ERROR *** - Unknown journal detected (" + data["journal"] + ")."
    unknown_journal = data["journal"]
    return False
  else:
    unknown_journal = "None"
  # using the journal key, see if there is an M code defined
  if m_codes.has_key(jkey):
    m_code = m_codes[jkey]
  else:
    # default to . otherwise
    m_code = "."
  bibcode += jkey
  pad_str = u""
  # let's get the volume number and then define the VVVV value
  if not data.has_key("volume"):
    print "*** VOLUME ERROR *** - No volume found in citation."
    return False
  else:
    try:
      data["volume"] = re.search(r"\d+", data["volume"]).group(0)
    except:
      print "*** VOLUME ERROR *** - Volume found (" + data["volume"] + "), but not parsable."
      return False
    pad = 4 - len(data["volume"])
    while pad > 0:
      pad = pad - 1
      pad_str += u"."
    bibcode += pad_str + data["volume"]
  # boolean to see if we ignore the M code later
  ignore_m = False
  # handle both page and pages parameters
  pg = False
  pg_0 = False
  pg_1 = False
  if data.has_key("page"):
    if re.search("L\d+", data["page"], re.I):
      m_code = u"L"
    if re.search("\d+", data["page"], re.I):
      pg_0 = re.search(ur"\d+", data["page"], re.I).group(0)
    else:
      pg_0 = False
  if data.has_key("pages"):
    if re.search("L\d+", data["pages"], re.I):
      m_code = u"L"
    if re.search("\d+", data["pages"], re.I):
      pg_1 = re.search(ur"\d+", data["pages"], re.I).group(0)
    else:
      pg_1 = False
  if not pg_0 and not pg_1:
    print "*** PAGE ERROR *** - No page detected."
    return False
  else:
    if pg_1:
      pg = pg_1
    else:
      pg = pg_0
  if not data.has_key("page") and not data.has_key("pages"):
    print "*** PAGE ERROR *** - No page detected."
    return False
  # let's define PPPP and whether or not M should be ignored
  # if it's less than 4, pad it; if it's exactly 4, skip ahead
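  # e.g. pg "69" is padded to "..69"; pg "1234" is used as-is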
  if len(pg) < 4:
    pad_str = u""
    pad = 4 - len(pg)
    while pad > 0:
      pad = pad - 1
      pad_str += u"."
    pg = pad_str + pg
  elif len(pg) == 5:
    # if it's 5, M should be ignored and the 5th page digit used instead
    ignore_m = True
  elif len(pg) == 6:
    # if it's 6, replace the leading digits with a letter code and ignore M
    ignore_m = True
    alpha = "abcdefghijklmnopqrstuvwxyz"
    lettercode = alpha[int(pg[:1])]
    pg = lettercode + pg[2:]
  # now to combine everything
  if ignore_m:
    m_code = ""
  if data.has_key("last1"):
    a = data["last1"][0]
  elif data.has_key("last"):
    a = data["last"][0]
  else:
    a = "."
  return bibcode + m_code + pg + a
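  # Illustrative worked example (assumes the journals map resolves "Astrophysical Journal" to u"ApJ.."):
  #   get_bibcode({"year": u"1999", "journal": u"Astrophysical Journal", "volume": u"520",
  #                "pages": u"L69", "last1": u"Smith"})  ->  u"1999ApJ...520L..69S"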

def parse_template(text):
  #Kingpin's regex:  \{\{cite\sjournal(((\{\{.+\}\})|[^{]([{][^{])?)+)\}\}
  found = findalltemplates(text)
  global counter
  counter = 0
  global id_to_arxiv_count
  id_to_arxiv_count = 0
  global arxiv_count
  arxiv_count = 0
  global bibcode_count
  bibcode_count = 0
  global doi_count
  doi_count = 0
  unknown_journal_list = "None"
  for item in found:
    #Used to compare the result at the end of the processing
    old_item = item
    #Pre-cleaner (cleans |id={{arxiv|foobar}} to |arxiv=foobar)
    if re.search("{{\s*arxiv", item):
      if re.findall(u"\|(\s*)id(\s*)=(\s*){{\s*arxiv\s*\|\s*(archive)?\s*=?\s*([^(\||}|\s)]*)\s*\|?\s*(id)?\s*=?\s*([^(\||}|\s)]*)(\s*)}}", item):
        clean_str = re.sub(u"\|(\s*)id(\s*)=(\s*){{\s*arxiv\s*\|\s*(archive)?\s*=?\s*([^(\||}|\s)]*)\s*\|?\s*(id)?\s*=?\s*([^(\||}|\s)]*)(\s*)}}",ur"|\1arxiv\2=\3\5/\7\8", item)
        if re.findall(u"\|(\s*)arxiv(\s*)=(\s*)(.*)/(\s*)(\||})", clean_str):
          clean_str = re.sub(u"\|(\s*)arxiv(\s*)=(\s*)(.*)/(\s*)(\||})",ur"|\1arxiv\2=\3\4\5\6", clean_str)
        id_to_arxiv_count += 1
        item = clean_str

    global unknown_journal
    unknown_journal = "None"
    counter += 1
    pairs = re.finditer(u"(?P<key>\w+)\s*=\s*(?P<value>.*?)(\n\s*|\||\})",item)
    data = {}
    for pair in pairs:
      key = pair.group("key").strip()
      value = pair.group("value").strip(u" []\t\r\n")
      value = re.sub(r"<!--.*?-->", "", value, flags=re.DOTALL)
      if len(value)>0:
        data[key] = value

    # The following gets rid of the error messages if any of last1/last/year/date/etc... is missing
    # This is used to build a message more explicit than "Examining citation 15"
    # Such as "Schwartz (2000). MISSING JOURNAL, v.194, p.123"
    # The code might be stupid and weird, but it seems to work just fine
    # -Headbomb
    if "last1" not in data:
      if "last" not in data:
        author_message = "MISSING AUTHOR"
      else:
        author_message = data["last"]
    else:
      author_message = data["last1"]
    if "year" not in data and "date" in data and re.search("[12][0-9]{3}", data["date"]) != None:
      data["year"] = re.search("[12][0-9]{3}", data["date"]).group(0)
    if "year" in data and not "date" in data and re.search("[12][0-9]{3}", data["year"]) != None:
      data["year"] = re.search("[12][0-9]{3}", data["year"]).group(0)
    if "year" not in data:
      year_message = "MISSING YEAR"
    else:
      year_message = data["year"]
    if "journal" not in data:
      if "work" not in data:
        if "periodical" not in data:
          journal_message = "MISSING JOURNAL"
        else:
          journal_message = data["periodical"]
      else:
        journal_message = data["work"]
    else:
      journal_message = data["journal"]
    if "volume" not in data:
      volume_message = "MISSING"
    else:
      volume_message = data["volume"]
    if "pages" not in data:
      if "page" not in data:
        page_message = "MISSING"
      else:
        page_message = data["page"]
    else:
      page_message = data["pages"]
    if "arxiv" not in data:
      arxiv_message = "MISSING"
    else:
      arxiv_message = data["arxiv"]
    if "bibcode" not in data:
      bibcode_message = "MISSING"
    else:
      bibcode_message = data["bibcode"]
    if "doi" not in data:
      doi_message = "MISSING"
    else:
      doi_message = data["doi"]

    #Message identifying what citation we're dealing with
    print "\nExamining citation " + str(counter)
    print "   " + author_message + " (" + year_message + "). " + journal_message + ", v." + volume_message + ", p." + page_message
    print "   arxiv  : " + arxiv_message
    print "   bibcode: " + bibcode_message
    print "   doi    : " + doi_message

    #Safety net for now. Will be removed later
    arxiv = False
    arxiv_exists = False
    bibcode = False
    bibcode_exists = False
    doi = False
    doi_exists = False
    new_str = None

    #ARXIV, BIBCODE AND DOI ARE FOUND --> DO NOTHING
    if data.has_key("arxiv") and data.has_key("bibcode") and data.has_key("doi"):
      print "arxiv (OK), bibcode (OK), doi (OK). Nothing to do."
      arxiv = data["arxiv"]
      arxiv_exists = True
      bibcode = data["bibcode"]
      bibcode_exists = True
      doi = data["doi"]
      doi_exists = True

    #ARXIV NOT FOUND, BIBCODE FOUND, DOI FOUND --> SEARCH FOR ARXIV
    if not data.has_key("arxiv") and data.has_key("bibcode") and data.has_key("doi"):
      print "arxiv (??), bibcode (OK), doi (OK). Searching for arxiv."
      query = ADS_query(arxiv, data["bibcode"], data["doi"], item)
      arxiv = query[0]
      if arxiv != "NOT FOUND!":
        arxiv_exists = True
        arxiv_count += 1
      else:
        arxiv_exists = False
      bibcode = data["bibcode"]
      bibcode_exists = True
      doi = data["doi"]
      doi_exists = True
      if arxiv_exists:
        if re.search(u"\|(\s*)arxiv(\s*)=( *)", item):
          new_str = re.sub(u"\|(\s*)arxiv(\s*)=( *)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
        else:
          new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)

    #ARXIV FOUND, BIBCODE NOT FOUND, DOI FOUND --> SEARCH FOR BIBCODE
    if data.has_key("arxiv") and not data.has_key("bibcode") and data.has_key("doi"):
      print "arxiv (OK), bibcode (??), doi (OK). Searching for bibcode."
      query = ADS_query(data["arxiv"], bibcode, data["doi"], item)
      arxiv = data["arxiv"]
      arxiv_exists = True
      bibcode = query[1]
      if bibcode != "NOT FOUND!":
        bibcode_exists = True
        bibcode_count += 1
      else:
        bibcode_exists = False
      doi = data["doi"]
      doi_exists = True
      if bibcode_exists:
        if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
        else:
          new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)

    #ARXIV FOUND, BIBCODE FOUND, DOI NOT FOUND --> SEARCH FOR DOI
    if data.has_key("arxiv") and data.has_key("bibcode") and not data.has_key("doi"):
      print "arxiv (OK), bibcode (OK), doi (??). Searching for doi."
      query = ADS_query(data["arxiv"], data["bibcode"], doi, item)
      arxiv = data["arxiv"]
      arxiv_exists = True
      bibcode = data["bibcode"]
      bibcode_exists = True
      doi = query[2]
      if doi != "NOT FOUND!":
        doi_exists = True
        doi_count += 1
      else:
        doi_exists = False
      if doi_exists:
        if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
        else:
          new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)

    #ARXIV NOT FOUND, BIBCODE NOT FOUND, DOI FOUND --> SEARCH FOR ARXIV AND BIBCODE
    if not data.has_key("arxiv") and not data.has_key("bibcode") and data.has_key("doi"):
      print "arxiv (??), bibcode (??), doi (OK). Searching for arxiv and bibcode."
      query = ADS_query(arxiv, bibcode, data["doi"], item)
      arxiv = query[0]
      if arxiv != "NOT FOUND!":
        arxiv_exists = True
        arxiv_count += 1
      else:
        arxiv_exists = False
      bibcode = query[1]
      if bibcode != "NOT FOUND!":
        bibcode_exists = True
        bibcode_count += 1
      else:
        bibcode_exists = False
      doi = data["doi"]
      doi_exists = True
      if arxiv_exists:
        if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
        else:
          new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
      if bibcode_exists:
        if new_str != None:
          if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, new_str)
          else:
            new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, new_str)
        else:
          if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
          else:
            new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)

    #ARXIV FOUND, BIBCODE NOT FOUND, DOI NOT FOUND --> SEARCH FOR BIBCODE AND DOI
    if data.has_key("arxiv") and not data.has_key("bibcode") and not data.has_key("doi"):
      print "arxiv (OK), bibcode (??), doi (??). Searching for bibcode and doi."
      query = ADS_query(data["arxiv"], bibcode, doi, item)
      arxiv = data["arxiv"]
      arxiv_exists = True
      bibcode = query[1]
      if bibcode != "NOT FOUND!":
        bibcode_exists = True
        bibcode_count += 1
      else:
        bibcode_exists = False
      doi = query[2]
      if doi != "NOT FOUND!":
        doi_exists = True
        doi_count += 1
      else:
        doi_exists = False
      if bibcode_exists:
        if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
        else:
          new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
      if doi_exists:
        if new_str != None:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str)
          else:
            new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str)
        else:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
          else:
            new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)

    #ARXIV NOT FOUND, BIBCODE FOUND, DOI NOT FOUND --> SEARCH FOR ARXIV AND DOI
    if not data.has_key("arxiv") and data.has_key("bibcode") and not data.has_key("doi"):
      print "arxiv (??), bibcode (OK), doi (??). Searching for arxiv and doi."
      query = ADS_query(arxiv, data["bibcode"], doi, item)
      arxiv = query[0]
      if arxiv != "NOT FOUND!":
        arxiv_exists = True
        arxiv_count += 1
      else:
        arxiv_exists = False
      bibcode = data["bibcode"]
      bibcode_exists = True
      doi = query[2]
      if doi != "NOT FOUND!":
        doi_exists = True
        doi_count += 1
      else:
        doi_exists = False
      if arxiv_exists:
        if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
        else:
          new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
      if doi_exists:
        if new_str != None:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str)
          else:
            new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str)
        else:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
          else:
            new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)

    #ARXIV NOT FOUND, BIBCODE NOT FOUND, DOI NOT FOUND --> GUESS BIBCODE AND SEARCH FOR ALL THREE
    if not data.has_key("arxiv") and not data.has_key("bibcode") and not data.has_key("doi"):
      print "arxiv (??), bibcode (??), doi (??). Guessing bibcode..."
      query = ADS_query(arxiv, bibcode, doi, item)
      arxiv = query[0]
      if arxiv != "NOT FOUND!":
        arxiv_exists = True
        arxiv_count += 1
      else:
        arxiv_exists = False
      bibcode = query[1]
      if bibcode != "NOT FOUND!":
        bibcode_exists = True
        bibcode_count += 1
      else:
        bibcode_exists = False
      doi = query[2]
      if doi != "NOT FOUND!":
        doi_exists = True
        doi_count += 1
      else:
        doi_exists = False
      if arxiv_exists:
        if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)",ur"|\1arxiv\2=\3 %s" % arxiv, item)
        else:
          new_str = re.sub("\}\}$",u"|arxiv = %s }}" % arxiv, item)
      if bibcode_exists:
        if new_str != None:
          if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, new_str)
          else:
            new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, new_str)
        else:
          if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)",ur"|\1bibcode\2=\3 %s" % bibcode, item)
          else:
            new_str = re.sub("\}\}$",u"|bibcode = %s }}" % bibcode, item)
      if doi_exists:
        if new_str != None:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, new_str)
          else:
            new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, new_str)
        else:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)",ur"|\1doi\2=\3 %s" % doi, item)
          else:
            new_str = re.sub("\}\}$",u"|doi = %s }}" % doi, item)

    # plain string replacement (rather than re.sub) avoids re-interpreting backslashes in the citation text
    if new_str:
      text = text.replace(old_item, new_str)
    else:
      text = text.replace(old_item, item)
    if unknown_journal_list == "None":
      if unknown_journal != "None":
        unknown_journal_list = "\nUnknown journal(s) for " + page.title() + ":\n   *" + unicode(unknown_journal) + "\n"
    else:
      if unknown_journal != "None":
        if not re.search(re.escape(unicode(unknown_journal)) + "\n", unknown_journal_list):
          unknown_journal_list = unknown_journal_list + "   *" + unknown_journal + "\n"
  print "\nFound:\n   " + str(counter) + " {{citation}}/{{cite journal}} template(s)\n   " + str(id_to_arxiv_count) + " '|id={{arxiv|...}}' to convert to '|arxiv=...'\n   " + str(arxiv_count) + " new arxiv eprint(s)\n   " + str(bibcode_count) + " new bibcode(s)\n   " + str(doi_count) + " new doi(s)."
  if unknown_journal_list == "None":
    print "\nUnknown journals:\n   *None"
  else:
    print unknown_journal_list
    f = open("C:/Users/Headbomb/Desktop/Pywikipedia/_Unknown_journals.txt", "a")
    f.write(unknown_journal_list.encode("utf-8"))
    f.close()
  return text


def findalltemplates(t):
  f = []
  lowertext = t.lower()
  while re.search("{{\s*(cite|cite study|citation|cite journal|c journal|cite magazine|cite magazine article|cite paper|citejournal|citepaper|vcite paper)\s*\|", lowertext) != None:
    firstoffset = re.search("{{\s*(cite|cite study|citation|cite journal|c journal|cite magazine|cite magazine article|cite paper|citejournal|citepaper|vcite paper)\s*\|", lowertext).start()
    lastoffset = firstoffset
    counter = 1
    while counter > 0:
      nextbracket = regex.search(lowertext, lastoffset+1)
      if nextbracket.group(0) == "{{":
        counter += 1
        lastoffset = nextbracket.end()
      elif nextbracket.group(0) == "}}":
        counter -= 1
        lastoffset = nextbracket.end()
    f.append(t[firstoffset:lastoffset])
    t = t[lastoffset:]
    lowertext = lowertext[lastoffset:]
  return f
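# Illustrative (assumed) behaviour of findalltemplates:
#   findalltemplates(u"See {{cite journal |journal=Nature |volume=1 }} for details")
#   would return [u"{{cite journal |journal=Nature |volume=1 }}"]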

def queryADS(url):
  retry = True
  timeout = max(1, throttle_time)
  retrynum = 0
  while retry:
    try:
      rawdata = urllib2.urlopen(url).read()
      retry = False
    except urllib2.URLError:
      retrynum += 1
      timeout = retrynum * throttle_time
      if retrynum > 3:
        print "Cannot connect to ADS site.  Aborting..."
        return ""
      print "\nError connecting to ADS site.  Retrying in " + str(timeout) + " seconds."
      time.sleep(timeout)
      continue
  return rawdata

def adv_check_bibcode(code): #Try to find a valid author / section code
  if code:
    alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ."
    url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX"
    for i in range(27):
      url += "&bibcode=" + urllib.quote(code[:-1]) + alpha[i]
      url += "&db_key=ALL"
    print "Probing for a new author..."
    raw_html = queryADS(url)
    bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
    if bibcode_check:
      print "   Found! " + bibcode_check[0]
      return raw_html
    else:
      print "   Not found!"
      alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      alphalower = "abcdefghijklmnopqrstuvwxyz"
      url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=PLAINTEXT"
      for i in range(26):
        url += "&bibcode=" + urllib.quote(code[:-1]) + alpha[i]
      print "Probing for a new section..."
      raw_html = urllib2.urlopen(url).read()
      bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
      if bibcode_check:
        print "   Found! " + bibcode_check[0]
        return raw_html
      else:
        url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=PLAINTEXT"
        for i in range(26):
          url += "&bibcode=" + urllib.quote(code[:-1]) + alphalower[i]
        raw_html = queryADS(url)
        bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)  # re-check against the new query result
        if bibcode_check:
          print "   Found! " + bibcode_check[0]
          return raw_html
        else:
          print "   Not found!"
          return "Dummy text"

def ADS_query(arxiv, bibcode, doi, item):
  arxiv_match = False
  bibcode_match = False
  doi_match = False
  raw_html = "Dummy text"
  pairs = re.finditer(u"(?P<key>\w+)\s*?=\s*?(?P<value>.*?)(\n|\||\})", item)
  data = {}
  for pair in pairs:
    key = pair.group("key").strip()
    value = pair.group("value").strip(" []")
    value = re.sub(r"<!--.*?-->", "", value, flags=re.DOTALL)
    if len(value)>0:
      data[key] = value
  if not arxiv and not bibcode and not doi:
    bibcode_guess = get_bibcode(data)
    if bibcode_guess:
      print "Bibcode guess: " + bibcode_guess
      url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode_guess.encode("utf-8"))
      raw_html = queryADS(url)
      bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
      if bibcode_check:
        print "   Valid!"
      else:
        print "   Invalid!"
        raw_html = adv_check_bibcode(bibcode_guess)
  if arxiv and not bibcode and not doi:
    url = "http://adsabs.harvard.edu/abs/arXiv:" + urllib.quote(arxiv)
    raw_html = queryADS(url)
    bibcode_check = re.findall("<A href=\"http://adsabs\.harvard\.edu/abs/(...................)\">", raw_html, flags=re.IGNORECASE)
    if bibcode_check:
      print "   Found bibcode by arxiv query! " + bibcode_check[0]
    else:
      print "   Did not find bibcode by arxiv query! Guessing bibcode..."
      bibcode_guess = get_bibcode(data)
      if bibcode_guess:
        print "Bibcode guess: " + bibcode_guess
        url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode_guess.encode("utf-8"))
        raw_html = queryADS(url)
        bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
        if bibcode_check:
          print "   Valid!"
        else:
          print "   Invalid!"
          raw_html = adv_check_bibcode(bibcode_guess)
  if bibcode:
    url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode)
    raw_html = queryADS(url)
  else:
    if doi:
      url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&doi=" + urllib.quote(doi.encode("utf-8"))
      raw_html = queryADS(url)
  arxiv_match_0 = re.findall("eprint = {arXiv:(.*)}", raw_html)
  arxiv_match_1 = re.findall("\(arXiv:(.*)\)", raw_html)
  bibcode_match_0 = re.findall("@ARTICLE{(...................)", raw_html)
  bibcode_match_1 = re.findall("<A href=\"http://adsabs\.harvard\.edu/abs/(...................)\">", raw_html, flags=re.IGNORECASE)
  doi_match_0 = re.findall("doi = {(.*?)}", raw_html)
  doi_match_1 = re.findall("<A href=\"http://dx\.doi\.org/(.*)\">", raw_html, flags=re.IGNORECASE)
  if not arxiv_match_0 and not arxiv_match_1 and not bibcode_match_0 and not bibcode_match_1 and not doi_match_0 and not doi_match_1:
    return ("NOT FOUND!", "NOT FOUND!", "NOT FOUND!")
  else:
    print "Query results:"
  if arxiv_match_0:
    arxiv_match = arxiv_match_0[0]
    print "   arxiv  : " + arxiv_match
  if arxiv_match_1:
    arxiv_match = arxiv_match_1[0]
    print "   arxiv  : " + arxiv_match
  if not arxiv_match:
    arxiv_match = "NOT FOUND!"
    print "   arxiv  : NOT FOUND!"
  if bibcode_match_0:
    bibcode_match = bibcode_match_0[0]
    print "   bibcode: " + bibcode_match
  if bibcode_match_1:
    bibcode_match = bibcode_match_1[0]
    print "   bibcode: " + bibcode_match
  if not bibcode_match:
    bibcode_match = "NOT FOUND!"
    print "   bibcode: NOT FOUND!"
  if doi_match_0:
    doi_match = doi_match_0[0]
    print "   doi    : " + doi_match
  if doi_match_1:
    doi_match = doi_match_1[0]
    print "   doi    : " + doi_match
  if not doi_match:
    doi_match = "NOT FOUND!"
    print "   doi    : NOT FOUND!"
  return (arxiv_match, bibcode_match, doi_match)

if __name__ == "__main__":
  main()
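# Typical invocation (assumption; the file name below is illustrative): run this script with the
# old pywikipedia (compat) framework on the path, and with "_Article list.txt" and
# "_journal_list.py" in the working directory, e.g.
#   python bibcode_bot.py
# Unsent page data and the unknown-journal log are written to the hard-coded
# C:/Users/Headbomb/Desktop/Pywikipedia/ paths defined above.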