Bibcode bot - main code
Headbomb, Apr 22nd, 2011 (Python)
# -*- coding: utf-8  -*-
import sys
import os
import re
import wikipedia
import urllib
import urllib2
import time
import login
import codecs
import winsound
##import win32com.client
from _journal_list import *

##speak = win32com.client.Dispatch('Sapi.SpVoice')
##speak.Volume = 100
##speak.Rate = 1
##speak.Voice = speak.GetVoices('Name=Microsoft Anna').Item(0)

site = wikipedia.getSite()
throttle_time = 5
errorfilepath = "C:/Users/Headbomb/Desktop/Pywikipedia/_Article timeouts/"
regex = re.compile("(\{\{)|(\}\})")
m_codes = {}
# m_codes[u"AJ..."] = u"."

username = "Bibcode Bot"
print "Logging in as Bibcode Bot..."
# Log in through the site object; credentials are expected to be cached by
# the pywikipedia login.py script imported above.
site.forceLogin()
print "Logged in!"

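# main() works through "_Article list.txt", one article per line. Going by
# the characters stripped off each line below, entries may be plain titles
# or wiki-list items; the exact file format is an assumption, e.g.:
#   * [[Astrophysical jet]]
#   Kepler's laws of planetary motion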
def main():
  with codecs.open("_Article list.txt", encoding="utf-8") as f:
    print "Starting run! \n--------------------------------------"
    for line in f:
      line = unicode(line.strip(u" \t\r\n*[]"))
      print "Getting page: " + line
      global page
      page = wikipedia.Page(site, line)
      if not page.exists():
        print "Page does not exist! Skipping to next article.\n--------------------------------------"
        continue
      if page.isRedirectPage():
        oldpage = line
        text = page.get(get_redirect=True)
        target = re.match("\s*\#\s*redirect\s*\[\[(.*?)\]\]", text, flags=re.I).group(1)
        target = target.split("#")[0]
        page = wikipedia.Page(site, target)
        newpage = page
        print "   '" + str(oldpage).strip("[]") + "' redirects to '" + str(newpage).strip("[]") + "'.\n   Processing '" + str(newpage).strip("[]") + "' instead."
      if not page.canBeEdited():
        print "Page cannot be edited due to protection! Skipping to next article.\n--------------------------------------"
        continue
      if not page.botMayEdit(username):
        print "Page cannot be edited by a bot. Skipping to next article.\n--------------------------------------"
        continue

      text = page.get()

      # \\n matches a linebreak for some reason
      #print text
      #bug = re.findall(r"\|\s*title\s*=.*(\\n|\\r|\\t|\\b|\\f|\\a)", text)
      #if bug != []:
        #print "Found \\n, \\t, \\r, \\f, \\b or \\a in the title. Skipping article while bug is being solved.\n--------------------------------------"
        #continue

      orig_text = text
      text = parse_template(text)
      edit_check = id_to_arxiv_count + arxiv_count + bibcode_count + doi_count
      if edit_check != 0:
        if id_to_arxiv_count != 0:
          print "\nConverting " + str(id_to_arxiv_count) + " \"id = {{arxiv|...}}\" to \"|arxiv=...\"."
          print "\nAdding " + str(arxiv_count) + " arxiv eprint(s), " + str(bibcode_count) + " bibcode(s) and " + str(doi_count) + " doi(s) in " + str(counter) + " possible templates."
          editsummary = "Converting " + str(id_to_arxiv_count) + " \"id = {{arxiv|...}}\" to \"|arxiv=...\". Adding " + str(arxiv_count) + " [[arXiv|arxiv eprint(s)]], " + str(bibcode_count) + " [[bibcode|bibcode(s)]] and " + str(doi_count) + " [[digital object identifier|doi(s)]]."
          #speak.Speak("Converting " + str(id_to_arxiv_count) + " archive I D to archive parameters. Adding " + str(arxiv_count) + " archive preprint, " + str(bibcode_count) + " bibcode and " + str(doi_count) + " d o i.")
        else:
          editsummary = "Adding " + str(arxiv_count) + " [[arXiv|arxiv eprint(s)]], " + str(bibcode_count) + " [[bibcode|bibcode(s)]] and " + str(doi_count) + " [[digital object identifier|doi(s)]]."
          #speak.Speak("Adding " + str(arxiv_count) + " archive preprint, " + str(bibcode_count) + " bibcode and " + str(doi_count) + " d o i.")
        try:
          page.put(text, editsummary + " Did it miss something? Report bugs, errors, and suggestions at [[User talk:Bibcode Bot]]", maxTries=2)
        except wikipedia.MaxTriesExceededError:
          try:
            print "Couldn't send data to Wikipedia. Saving page data to " + errorfilepath + page.title()
            # use a separate handle so the article-list file "f" stays open
            err_file = open(errorfilepath + page.title() + ".txt", "w")
            err_file.write(text.encode("utf-8"))
            err_file.close()
          except Exception:
            print "Error saving data to file. Printing page:\n\n\n\n\n"
            print text
      else:
        print "\nNo new arxiv eprint, new bibcode or new doi found in this article. Processing next article.\n--------------------------------------"
    print "\nRun complete!"

def get_bibcode(data):  # data: the citation dict built by parse_template
  # Bibcode format is YYYYJJJJJVVVVMPPPPA
  # YYYY  = Year
  # JJJJJ = Journal code
  # VVVV  = Volume
  # M     = Section code / Headache
  # PPPP  = Page
  # A     = First letter of the last name of the first author
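  # For example, a bibcode like 1970ApJ...161L..77K breaks down as
  # 1970 (year) + ApJ.. (journal code, dot-padded) + .161 (volume,
  # dot-padded) + L (Letters section code) + ..77 (page, dot-padded)
  # + K (first author's initial); unused positions are filled with dots.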
  global unknown_journal
  # Extract the year part
  if "year" not in data and "date" in data and re.search("[12][0-9]{3}", data["date"]) != None:
    data["year"] = re.search("[12][0-9]{3}", data["date"]).group(0)
  if "year" in data and "date" not in data and re.search("[12][0-9]{3}", data["year"]) != None:
    data["year"] = re.search("[12][0-9]{3}", data["year"]).group(0)
  if "year" not in data:
    print "*** YEAR ERROR *** - No year found in citation."
    unknown_journal = "None"
    return False
  else:
    bibcode = u"%s" % data["year"]
  # Let's figure out the correct journal so we can get the JJJJJ value
  jkey = ""
  if "journal" not in data:
    if "work" in data:
      data["journal"] = data["work"]
    elif "periodical" in data:
      data["journal"] = data["periodical"]
    else:
      print "*** JOURNAL ERROR *** - No journal found in citation."
      unknown_journal = "None"
      return False
  if data["journal"]:
    if data["journal"].lower().startswith("the "):
      data["journal"] = data["journal"][4:].strip()
    if data["journal"].endswith("."):
      data["journal"] = data["journal"].strip(".")
    for key in journals.keys():
      for item in journals[key]:
        # second part of the tuple is a boolean for regex
        if item[1]:
          if re.search(item[0], data["journal"]):
            jkey = key
            break
        # if it's not a regex, normalize the title and compare directly
        # (on a local copy, since tuples cannot be modified in place)
        else:
          name = item[0]
          if name.lower().startswith("the "):
            name = name[4:].strip()
          if name.endswith("."):
            name = name.strip(".")
          if data["journal"].lower() == name.lower():
            jkey = key
            break
      if jkey:
        # stop at the first match so a later journal cannot overwrite it
        break
  if jkey == "":
    print "*** JOURNAL ERROR *** - Unknown journal detected (" + data["journal"] + ")."
    unknown_journal = data["journal"]
    return False
  else:
    unknown_journal = "None"
  # using the J key, let's see if there is an M code defined
  if jkey in m_codes:
    m_code = m_codes[jkey]
  else:
    # default to . otherwise
    m_code = "."
  bibcode += jkey
  # let's get the volume number and then define the VVVV value
  if "volume" not in data:
    print "*** VOLUME ERROR *** - No volume found in citation."
    return False
  else:
    try:
      data["volume"] = re.search(r"\d+", data["volume"]).group(0)
    except AttributeError:
      print "*** VOLUME ERROR *** - Volume found (" + data["volume"] + "), but not parsable."
      return False
    # left-pad the volume with dots to 4 characters
    bibcode += u"." * max(0, 4 - len(data["volume"])) + data["volume"]
  # boolean to see if we ignore the M code later
  ignore_m = False
  # handle both page and pages parameters
  pg = False
  pg_0 = False
  pg_1 = False
  if "page" in data:
    if re.search("L\d+", data["page"], re.I):
      m_code = u"L"
    if re.search("\d+", data["page"], re.I):
      pg_0 = re.search(ur"\d+", data["page"], re.I).group(0)
    else:
      pg_0 = False
  if "pages" in data:
    if re.search("L\d+", data["pages"], re.I):
      m_code = u"L"
    if re.search("\d+", data["pages"], re.I):
      pg_1 = re.search(ur"\d+", data["pages"], re.I).group(0)
    else:
      pg_1 = False
  if not pg_0 and not pg_1:
    print "*** PAGE ERROR *** - No page detected."
    return False
  else:
    if pg_1:
      pg = pg_1
    else:
      pg = pg_0
  # let's define PPPP and whether or not M should be ignored
  # if it's shorter than 4 digits, pad it; if it's exactly 4, use it as-is
  if len(pg) < 4:
    pg = u"." * (4 - len(pg)) + pg
  elif len(pg) == 5:
    # if it's 5 digits, M is ignored and the extra page digit takes its place
    ignore_m = True
  elif len(pg) == 6:
    # if it's 6 digits, the first two digits are replaced by a letter
    # (indexed by the leading digit) and M is ignored
    ignore_m = True
    alpha = "abcdefghijklmnopqrstuvwxyz"
    lettercode = alpha[int(pg[:1])]
    pg = lettercode + pg[2:]
  # now to combine everything
  if ignore_m:
    m_code = ""
  if "last1" in data:
    a = data["last1"][0]
  elif "last" in data:
    a = data["last"][0]
  else:
    a = "."
  return bibcode + m_code + pg + a

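# Illustrative sketch (not part of the bot's control flow): assuming the
# journal list maps "Astrophysical Journal" to the ADS code u"ApJ..",
#   get_bibcode({"year": u"1970", "journal": u"Astrophysical Journal",
#                "volume": u"161", "pages": u"L77", "last1": u"Kurtz"})
# returns u"1970ApJ...161L..77K": the leading "L" in the page sets the
# section code, and volume and page are each dot-padded to 4 characters.
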
def parse_template(text):
  # Kingpin's regex:  \{\{cite\sjournal(((\{\{.+\}\})|[^{]([{][^{])?)+)\}\}
  found = findalltemplates(text)
  global counter
  counter = 0
  global id_to_arxiv_count
  id_to_arxiv_count = 0
  global arxiv_count
  arxiv_count = 0
  global bibcode_count
  bibcode_count = 0
  global doi_count
  doi_count = 0
  unknown_journal_list = "None"
  for item in found:
    # Used to compare the result at the end of the processing
    old_item = item
    # Pre-cleaner (cleans |id={{arxiv|foobar}} to |arxiv=foobar)
    if re.search("{{\s*arxiv", item, re.IGNORECASE):
      if re.findall(u"\|(\s*)id(\s*)=(\s*){{\s*arxiv\s*\|\s*(archive)?\s*=?\s*([^(\||}|\s)]*)\s*\|?\s*(id)?\s*=?\s*([^(\||}|\s)]*)(\s*)}}", item, re.IGNORECASE):
        clean_str = re.sub(u"\|(\s*)id(\s*)=(\s*){{\s*arxiv\s*\|\s*(archive)?\s*=?\s*([^(\||}|\s)]*)\s*\|?\s*(id)?\s*=?\s*([^(\||}|\s)]*)(\s*)}}", ur"|\1arxiv\2=\3\5/\7\8", item, flags=re.IGNORECASE)
        if re.findall(u"\|(\s*)arxiv(\s*)=(\s*)(.*)/(\s*)(\||})", clean_str):
          clean_str = re.sub(u"\|(\s*)arxiv(\s*)=(\s*)(.*)/(\s*)(\||})", ur"|\1arxiv\2=\3\4\5\6", clean_str)
        id_to_arxiv_count += 1
        item = clean_str

    global unknown_journal
    unknown_journal = "None"
    counter += 1
    pairs = re.finditer(u"(?P<key>\w+)\s*=\s*(?P<value>.*?)(\n\s*|\||\}\})", item)
    data = {}
    for pair in pairs:
      key = pair.group("key").strip()
      value = pair.group("value").strip(u" []\t\r\n")
      value = re.sub(r"<!--.*?-->", "", value, flags=re.DOTALL)
      if len(value) > 0:
        data[key] = value

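    # At this point a fragment like u"|volume = 12 |pages=34-56\n" has been
    # parsed into data = {u"volume": u"12", u"pages": u"34-56"}: keys come
    # from the named groups above, values are stripped of brackets and
    # whitespace, and HTML comments are discarded.
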
    # The following gets rid of the error messages if any of last1/last/year/date/etc. is missing
    # This is used to build a message more explicit than "Examining citation 15"
    # Such as "Schwartz (2000). MISSING JOURNAL, v.194, p.123"
    # The code might be stupid and weird, but it seems to work just fine
    # -Headbomb
    if "last1" not in data:
      if "last" not in data:
        author_message = "MISSING AUTHOR"
      else:
        author_message = data["last"]
    else:
      author_message = data["last1"]
    if "year" not in data and "date" in data and re.search("[12][0-9]{3}", data["date"]) != None:
      data["year"] = re.search("[12][0-9]{3}", data["date"]).group(0)
    if "year" in data and "date" not in data and re.search("[12][0-9]{3}", data["year"]) != None:
      data["year"] = re.search("[12][0-9]{3}", data["year"]).group(0)
    if "year" not in data:
      year_message = "MISSING YEAR"
    else:
      year_message = data["year"]
    if "journal" not in data:
      if "work" not in data:
        if "periodical" not in data:
          journal_message = "MISSING JOURNAL"
        else:
          journal_message = data["periodical"]
      else:
        journal_message = data["work"]
    else:
      journal_message = data["journal"]
    if "volume" not in data:
      volume_message = "MISSING"
    else:
      volume_message = data["volume"]
    if "pages" not in data:
      if "page" not in data:
        page_message = "MISSING"
      else:
        page_message = data["page"]
    else:
      page_message = data["pages"]
    if "arxiv" not in data:
      arxiv_message = "MISSING"
    else:
      arxiv_message = data["arxiv"]
    if "bibcode" not in data:
      bibcode_message = "MISSING"
    else:
      bibcode_message = data["bibcode"]
    if "doi" not in data:
      doi_message = "MISSING"
    else:
      doi_message = data["doi"]

    # Message identifying what citation we're dealing with
    print "\nExamining citation " + str(counter) + " [" + str(page).strip("[]") + "]"
    print "   " + author_message + " (" + year_message + "). " + journal_message + ", vol. " + volume_message + ", p. " + page_message
    print "   arxiv  : " + arxiv_message
    print "   bibcode: " + bibcode_message
    print "   doi    : " + doi_message

    # Safety net for now. Will be removed later
    arxiv = False
    arxiv_exists = False
    bibcode = False
    bibcode_exists = False
    doi = False
    doi_exists = False
    new_str = None

    # ARXIV, BIBCODE AND DOI ARE FOUND --> DO NOTHING
    if "arxiv" in data and "bibcode" in data and "doi" in data:
      print "arxiv (OK), bibcode (OK), doi (OK). Nothing to do."
      arxiv = data["arxiv"]
      arxiv_exists = True
      bibcode = data["bibcode"]
      bibcode_exists = True
      doi = data["doi"]
      doi_exists = True

    # ARXIV NOT FOUND, BIBCODE FOUND, DOI FOUND --> SEARCH FOR ARXIV
    if "arxiv" not in data and "bibcode" in data and "doi" in data:
      print "arxiv (??), bibcode (OK), doi (OK). Searching for arxiv."
      query = ADS_query(arxiv, data["bibcode"], data["doi"], item)
      arxiv = query[0]
      if arxiv != "NOT FOUND!":
        arxiv_exists = True
        arxiv_count += 1
      else:
        arxiv_exists = False
      bibcode = data["bibcode"]
      bibcode_exists = True
      doi = data["doi"]
      doi_exists = True
      if arxiv_exists:
        if re.search(u"\|(\s*)arxiv(\s*)=( *)", item):
          new_str = re.sub(u"\|(\s*)arxiv(\s*)=( *)", ur"|\1arxiv\2=\3 %s" % arxiv, item)
        else:
          new_str = re.sub("\}\}$", u"|arxiv = %s }}" % arxiv, item)

    # ARXIV FOUND, BIBCODE NOT FOUND, DOI FOUND --> SEARCH FOR BIBCODE
    if "arxiv" in data and "bibcode" not in data and "doi" in data:
      print "arxiv (OK), bibcode (??), doi (OK). Searching for bibcode."
      query = ADS_query(data["arxiv"], bibcode, data["doi"], item)
      arxiv = data["arxiv"]
      arxiv_exists = True
      bibcode = query[1]
      if bibcode != "NOT FOUND!":
        bibcode_exists = True
        bibcode_count += 1
      else:
        bibcode_exists = False
      doi = data["doi"]
      doi_exists = True
      if bibcode_exists:
        if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)", ur"|\1bibcode\2=\3 %s" % bibcode, item)
        else:
          new_str = re.sub("\}\}$", u"|bibcode = %s }}" % bibcode, item)

    # ARXIV FOUND, BIBCODE FOUND, DOI NOT FOUND --> SEARCH FOR DOI
    if "arxiv" in data and "bibcode" in data and "doi" not in data:
      print "arxiv (OK), bibcode (OK), doi (??). Searching for doi."
      query = ADS_query(data["arxiv"], data["bibcode"], doi, item)
      arxiv = data["arxiv"]
      arxiv_exists = True
      bibcode = data["bibcode"]
      bibcode_exists = True
      doi = query[2]
      if doi != "NOT FOUND!":
        doi_exists = True
        doi_count += 1
      else:
        doi_exists = False
      if doi_exists:
        if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)", ur"|\1doi\2=\3 %s" % doi, item)
        else:
          new_str = re.sub("\}\}$", u"|doi = %s }}" % doi, item)

    # ARXIV NOT FOUND, BIBCODE NOT FOUND, DOI FOUND --> SEARCH FOR ARXIV AND BIBCODE
    if "arxiv" not in data and "bibcode" not in data and "doi" in data:
      print "arxiv (??), bibcode (??), doi (OK). Searching for arxiv and bibcode."
      query = ADS_query(arxiv, bibcode, data["doi"], item)
      arxiv = query[0]
      if arxiv != "NOT FOUND!":
        arxiv_exists = True
        arxiv_count += 1
      else:
        arxiv_exists = False
      bibcode = query[1]
      if bibcode != "NOT FOUND!":
        bibcode_exists = True
        bibcode_count += 1
      else:
        bibcode_exists = False
      doi = data["doi"]
      doi_exists = True
      if arxiv_exists:
        if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)", ur"|\1arxiv\2=\3 %s" % arxiv, item)
        else:
          new_str = re.sub("\}\}$", u"|arxiv = %s }}" % arxiv, item)
      if bibcode_exists:
        if new_str != None:
          if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)", ur"|\1bibcode\2=\3 %s" % bibcode, new_str)
          else:
            new_str = re.sub("\}\}$", u"|bibcode = %s }}" % bibcode, new_str)
        else:
          if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)", ur"|\1bibcode\2=\3 %s" % bibcode, item)
          else:
            new_str = re.sub("\}\}$", u"|bibcode = %s }}" % bibcode, item)

    # ARXIV FOUND, BIBCODE NOT FOUND, DOI NOT FOUND --> SEARCH FOR BIBCODE AND DOI
    if "arxiv" in data and "bibcode" not in data and "doi" not in data:
      print "arxiv (OK), bibcode (??), doi (??). Searching for bibcode and doi."
      query = ADS_query(data["arxiv"], bibcode, doi, item)
      arxiv = data["arxiv"]
      arxiv_exists = True
      bibcode = query[1]
      if bibcode != "NOT FOUND!":
        bibcode_exists = True
        bibcode_count += 1
      else:
        bibcode_exists = False
      doi = query[2]
      if doi != "NOT FOUND!":
        doi_exists = True
        doi_count += 1
      else:
        doi_exists = False
      if bibcode_exists:
        if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)", ur"|\1bibcode\2=\3 %s" % bibcode, item)
        else:
          new_str = re.sub("\}\}$", u"|bibcode = %s }}" % bibcode, item)
      if doi_exists:
        if new_str != None:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)", ur"|\1doi\2=\3 %s" % doi, new_str)
          else:
            new_str = re.sub("\}\}$", u"|doi = %s }}" % doi, new_str)
        else:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)", ur"|\1doi\2=\3 %s" % doi, item)
          else:
            new_str = re.sub("\}\}$", u"|doi = %s }}" % doi, item)

    # ARXIV NOT FOUND, BIBCODE FOUND, DOI NOT FOUND --> SEARCH FOR ARXIV AND DOI
    if "arxiv" not in data and "bibcode" in data and "doi" not in data:
      print "arxiv (??), bibcode (OK), doi (??). Searching for arxiv and doi."
      query = ADS_query(arxiv, data["bibcode"], doi, item)
      arxiv = query[0]
      if arxiv != "NOT FOUND!":
        arxiv_exists = True
        arxiv_count += 1
      else:
        arxiv_exists = False
      bibcode = data["bibcode"]
      bibcode_exists = True
      doi = query[2]
      if doi != "NOT FOUND!":
        doi_exists = True
        doi_count += 1
      else:
        doi_exists = False
      if arxiv_exists:
        if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)", ur"|\1arxiv\2=\3 %s" % arxiv, item)
        else:
          new_str = re.sub("\}\}$", u"|arxiv = %s }}" % arxiv, item)
      if doi_exists:
        if new_str != None:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)", ur"|\1doi\2=\3 %s" % doi, new_str)
          else:
            new_str = re.sub("\}\}$", u"|doi = %s }}" % doi, new_str)
        else:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)", ur"|\1doi\2=\3 %s" % doi, item)
          else:
            new_str = re.sub("\}\}$", u"|doi = %s }}" % doi, item)

    # ARXIV NOT FOUND, BIBCODE NOT FOUND, DOI NOT FOUND --> GUESS THE BIBCODE, THEN SEARCH FOR ALL THREE
    if "arxiv" not in data and "bibcode" not in data and "doi" not in data:
      print "arxiv (??), bibcode (??), doi (??). Guessing bibcode..."
      query = ADS_query(arxiv, bibcode, doi, item)
      arxiv = query[0]
      if arxiv != "NOT FOUND!":
        arxiv_exists = True
        arxiv_count += 1
      else:
        arxiv_exists = False
      bibcode = query[1]
      if bibcode != "NOT FOUND!":
        bibcode_exists = True
        bibcode_count += 1
      else:
        bibcode_exists = False
      doi = query[2]
      if doi != "NOT FOUND!":
        doi_exists = True
        doi_count += 1
      else:
        doi_exists = False
      if arxiv_exists:
        if re.search(u"\|(\s*)arxiv(\s*)=([ \t]*)", item):
          new_str = re.sub(u"\|(\s*)arxiv(\s*)=([ \t]*)", ur"|\1arxiv\2=\3 %s" % arxiv, item)
        else:
          new_str = re.sub("\}\}$", u"|arxiv = %s }}" % arxiv, item)
      if bibcode_exists:
        if new_str != None:
          if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)", ur"|\1bibcode\2=\3 %s" % bibcode, new_str)
          else:
            new_str = re.sub("\}\}$", u"|bibcode = %s }}" % bibcode, new_str)
        else:
          if re.search(u"\|(\s*)bibcode(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)bibcode(\s*)=([ \t]*)", ur"|\1bibcode\2=\3 %s" % bibcode, item)
          else:
            new_str = re.sub("\}\}$", u"|bibcode = %s }}" % bibcode, item)
      if doi_exists:
        if new_str != None:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", new_str):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)", ur"|\1doi\2=\3 %s" % doi, new_str)
          else:
            new_str = re.sub("\}\}$", u"|doi = %s }}" % doi, new_str)
        else:
          if re.search(u"\|(\s*)doi(\s*)=([ \t]*)", item):
            new_str = re.sub(u"\|(\s*)doi(\s*)=([ \t]*)", ur"|\1doi\2=\3 %s" % doi, item)
          else:
            new_str = re.sub("\}\}$", u"|doi = %s }}" % doi, item)

    if new_str:
      # plain string replacement, since new_str may contain backslashes
      # that re.sub would reinterpret in the replacement text
      text = text.replace(old_item, new_str)
    else:
      text = text.replace(old_item, item)
    if unknown_journal_list == "None":
      if unknown_journal != "None":
        unknown_journal_list = "\nUnknown journal(s) for " + page.title() + ":\n   *" + unicode(unknown_journal) + "\n"
    else:
      if unknown_journal != "None":
        if not re.search(re.escape(unicode(unknown_journal)) + "\n", unknown_journal_list):
          unknown_journal_list = unknown_journal_list + "   *" + unknown_journal + "\n"
  print "\nFound:\n   " + str(counter) + " {{citation}}/{{cite journal}} template(s)\n   " + str(id_to_arxiv_count) + " '|id={{arxiv|...}}' to convert to '|arxiv=...'\n   " + str(arxiv_count) + " new arxiv eprint(s)\n   " + str(bibcode_count) + " new bibcode(s)\n   " + str(doi_count) + " new doi(s)."
  if unknown_journal_list == "None":
    print "\nUnknown journals:\n   *None"
  else:
    print unknown_journal_list
    f = open("C:/Users/Headbomb/Desktop/Pywikipedia/_Unknown_journals.txt", "a")
    f.write(unknown_journal_list.encode("utf-8"))
    f.close()
  print "\n--------------------------------------"
  return text


def findalltemplates(t):
  f = []
  lowertext = t.lower()
  while re.search("{{\s*(cite|cite study|citation|cite journal|c journal|cite magazine|cite magazine article|cite paper|citejournal|citepaper|vcite paper|vcite journal|vancite journal)\s*\|", lowertext) != None:
    firstoffset = re.search("{{\s*(cite|cite study|citation|cite journal|c journal|cite magazine|cite magazine article|cite paper|citejournal|citepaper|vcite paper|vcite journal|vancite journal)\s*\|", lowertext).start()
    lastoffset = firstoffset
    counter = 1
    while counter > 0:
      nextbracket = regex.search(lowertext, lastoffset + 1)
      if nextbracket is None:
        # unbalanced braces; give up on the rest of the page rather than
        # looping forever
        return f
      if nextbracket.group(0) == "{{":
        counter += 1
        lastoffset = nextbracket.end()
      elif nextbracket.group(0) == "}}":
        counter -= 1
        lastoffset = nextbracket.end()
    f.append(t[firstoffset:lastoffset])
    t = t[lastoffset:]
    lowertext = lowertext[lastoffset:]
  return f

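# Illustrative example of findalltemplates: given the wikitext
#   u"foo {{cite journal |title=A |journal={{noitalic|B}} }} bar"
# it returns the whole template, nested braces included:
#   [u"{{cite journal |title=A |journal={{noitalic|B}} }}"]
# which is the nested case that a single regex handles poorly.
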
def queryADS(url):
  retry = True
  timeout = max(1, throttle_time)
  retrynum = 0
  while retry:
    try:
      rawdata = urllib2.urlopen(url).read()
      retry = False
    except urllib2.URLError:
      retrynum += 1
      timeout = retrynum * throttle_time
      if retrynum > 3:
        print "Cannot connect to ADS site. Aborting..."
        return ""
      print "\nError connecting to ADS site. Retrying in " + str(timeout) + " seconds."
      time.sleep(timeout)
      continue
  return rawdata

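# Note on queryADS's retry loop: the backoff is linear, so with
# throttle_time = 5 the waits are 5 s, 10 s and 15 s after the first three
# failures; the fourth failure aborts and returns an empty string.
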
def adv_check_bibcode(code):  # Try to find a valid author / section code
  if code:
    alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ."
    url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX"
    for i in range(27):
      url += "&bibcode=" + urllib.quote(code[:-1]) + alpha[i]
      url += "&db_key=ALL"
    print "Probing for a new author..."
    raw_html = queryADS(url)
    bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
    if bibcode_check:
      print "   Found! " + bibcode_check[0]
      return raw_html
    else:
      print "   Not found!"
      alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      alphalower = "abcdefghijklmnopqrstuvwxyz"
      url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=PLAINTEXT"
      for i in range(26):
        url += "&bibcode=" + urllib.quote(code[:-1]) + alpha[i]
      print "Probing for a new section..."
      raw_html = queryADS(url)
      bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
      if bibcode_check:
        print "   Found! " + bibcode_check[0]
        return raw_html
      else:
        url = u"http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=PLAINTEXT"
        for i in range(26):
          url += "&bibcode=" + urllib.quote(code[:-1]) + alphalower[i]
        raw_html = queryADS(url)
        # check the fresh result rather than the stale bibcode_check from
        # the uppercase probe
        bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
        if bibcode_check:
          print "   Found! " + bibcode_check[0]
          return raw_html
        else:
          print "   Not found!"
          return "Dummy text"
  # callers expect a string they can run re.findall over
  return "Dummy text"

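# ADS_query takes whichever of (arxiv, bibcode, doi) are already known
# (False otherwise) plus the raw template text, queries ADS, and returns an
# (arxiv, bibcode, doi) tuple in which "NOT FOUND!" stands in for any
# identifier ADS did not report.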
def ADS_query(arxiv, bibcode, doi, item):
  arxiv_match = False
  bibcode_match = False
  doi_match = False
  raw_html = "Dummy text"
  pairs = re.finditer(u"(?P<key>\w+)\s*?=\s*?(?P<value>.*?)(\n|\||\})", item)
  data = {}
  for pair in pairs:
    key = pair.group("key").strip()
    value = pair.group("value").strip(" []")
    value = re.sub(r"<!--.*?-->", "", value, flags=re.DOTALL)
    if len(value) > 0:
      data[key] = value
  if not arxiv and not bibcode and not doi:
    bibcode_guess = get_bibcode(data)
    if bibcode_guess:
      print "Bibcode guess: " + bibcode_guess
      url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode_guess.encode("utf-8"))
      raw_html = queryADS(url)
      bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
      if bibcode_check:
        print "   Valid!"
      else:
        print "   Invalid!"
        raw_html = adv_check_bibcode(bibcode_guess)
  if arxiv and not bibcode and not doi:
    url = "http://adsabs.harvard.edu/abs/arXiv:" + urllib.quote(arxiv)
    raw_html = queryADS(url)
    bibcode_check = re.findall("<A href=\"http://adsabs\.harvard\.edu/abs/(...................)\">", raw_html, flags=re.IGNORECASE)
    if bibcode_check:
      print "   Found bibcode by arxiv query! " + bibcode_check[0]
    else:
      print "   Did not find bibcode by arxiv query! Guessing bibcode..."
      bibcode_guess = get_bibcode(data)
      if bibcode_guess:
        print "Bibcode guess: " + bibcode_guess
        url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode_guess.encode("utf-8"))
        raw_html = queryADS(url)
        bibcode_check = re.findall("@ARTICLE{(...................)", raw_html)
        if bibcode_check:
          print "   Valid!"
        else:
          print "   Invalid!"
          raw_html = adv_check_bibcode(bibcode_guess)
  if bibcode:
    url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&bibcode=" + urllib.quote(bibcode)
    raw_html = queryADS(url)
  else:
    if doi:
      url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=ALL&data_type=BIBTEX&doi=" + urllib.quote(doi.encode("utf-8"))
      raw_html = queryADS(url)
  arxiv_match_0 = re.findall("eprint = {arXiv:(.*)}", raw_html)
  arxiv_match_1 = re.findall("eprint = {(.*)}", raw_html)
  arxiv_match_2 = re.findall("arXiv e-print \(arXiv:(.*)\)", raw_html)
  arxiv_match_3 = re.findall("arXiv e-print \((.*)\)", raw_html)
  bibcode_match_0 = re.findall("@ARTICLE{(...................)", raw_html)
  bibcode_match_1 = re.findall("<A href=\"http://adsabs\.harvard\.edu/abs/(...................)\">", raw_html, flags=re.IGNORECASE)
  doi_match_0 = re.findall("doi = {(.*?)}", raw_html)
  doi_match_1 = re.findall("<A href=\"http://dx\.doi\.org/(.*)\">", raw_html, flags=re.IGNORECASE)
  if not arxiv_match_0 and not arxiv_match_1 and not arxiv_match_2 and not arxiv_match_3 and not bibcode_match_0 and not bibcode_match_1 and not doi_match_0 and not doi_match_1:
    return ("NOT FOUND!", "NOT FOUND!", "NOT FOUND!")
  else:
    print "Query results:"
  if arxiv_match_0:
    arxiv_match = arxiv_match_0[0]
    print "   arxiv  : " + arxiv_match
  if not arxiv_match_0 and arxiv_match_1:
    arxiv_match = arxiv_match_1[0]
    print "   arxiv  : " + arxiv_match
  if arxiv_match_2:
    arxiv_match = arxiv_match_2[0]
    print "   arxiv  : " + arxiv_match
  if not arxiv_match_2 and arxiv_match_3:
    arxiv_match = arxiv_match_3[0]
    print "   arxiv  : " + arxiv_match
  if not arxiv_match:
    arxiv_match = "NOT FOUND!"
    print "   arxiv  : NOT FOUND!"
  if bibcode_match_0:
    bibcode_match = bibcode_match_0[0]
    print "   bibcode: " + bibcode_match
  if bibcode_match_1:
    bibcode_match = bibcode_match_1[0]
    print "   bibcode: " + bibcode_match
  if not bibcode_match:
    bibcode_match = "NOT FOUND!"
    print "   bibcode: NOT FOUND!"
  if doi_match_0:
    doi_match = doi_match_0[0]
    print "   doi    : " + doi_match
  if doi_match_1:
    doi_match = doi_match_1[0]
    print "   doi    : " + doi_match
  if not doi_match:
    doi_match = "NOT FOUND!"
    print "   doi    : NOT FOUND!"
  return (arxiv_match, bibcode_match, doi_match)


if __name__ == "__main__":
  main()