Want more features on Pastebin? Sign Up, it's FREE!
Guest

Jakebot code (4)

By: a guest on Dec 30th, 2013  |  syntax: Java  |  size: 11.93 KB  |  views: 69  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
This paste has a previous version, view the difference. Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. package jakebot;
  2.  
  import java.io.BufferedReader;
  import java.io.IOException;
  import java.io.InputStreamReader;
  import java.net.MalformedURLException;
  import java.net.URL;
  import java.net.URLEncoder;
  import java.nio.charset.StandardCharsets;
  import java.util.ArrayList;
  import java.util.List;
  import java.util.Locale;
  8.  
  /**
   * A bot to tag new unreferenced articles on the English Wikipedia.
   *
   * <p>The bot lists recently created main-namespace pages through the
   * MediaWiki API, reads each page's wiki markup out of its edit window and,
   * when the markup shows no sign of references, launches an edit request that
   * prepends a maintenance tag to the page.
   *
   * @author King jakob c 2
   */
  public class Jakebot {

      /** Entry point of the MediaWiki API. */
      private static final String API_URL = "https://en.wikipedia.org/w/api.php";

      /** Entry point used to load a page's edit window. */
      private static final String INDEX_URL = "https://en.wikipedia.org/w/index.php";

      /** Start of a title attribute in an XML API response. */
      private static final String TITLE_ATTRIBUTE = "title=\"";

      /** End of the opening tag of the edit window's textarea. */
      private static final String TEXTBOX_OPEN = "\"wpTextbox1\">";

      /** Closing tag of the edit window's textarea. */
      private static final String TEXTBOX_CLOSE = "</textarea>";

      /** Lower-case fragments whose presence means the article cites something. */
      private static final String[] SOURCE_MARKERS = {
          "reflist", "<ref>", "<ref name", "<ref group", "{{sfn", "<references",
          "{{refbegin", "{{noteslist", "further reading"
      };

      /** Lower-case section headings that count as a references section. */
      private static final String[] SOURCE_HEADINGS = {
          "references", "notes", "citations", "sources"
      };

      /**
       * Names of the "unreferenced" / "BLP unsourced" maintenance templates.
       * The two canonical names lead the list; the rest is the original set.
       */
      private static final String[] TAG_KEYWORDS = {
          "Unreferenced", "BLP unsourced",
          "UnsourcedBLP", "BLPunreferenced", "Unreferencedblp",
          "Blpunsourced", "BLPunsourced", "Unsourcedblp", "BLPUnreferenced",
          "Unsourced BLP", "BLP unreferenced", "Blpunref", "Unreferenced BLP",
          "Blpunreferenced", "UnreferencedBLP", "BLPUnsourced", "Unreferenced blp",
          "BLP Unreferenced", "Blp-unreferenced", "Userspace BLP", "Unreferenced-blp",
          "Unreferenced-BLP", "Blpnoref", "Blp unreferenced", "BLPnoref", "Unref BLP",
          "Blp unsourced", "Urblp", "Ublp", "Blp-unsourced", "BLPunref", "Unsourced-blp",
          "Noref-blp", "Unsourced blp",
          "Unsourced", "Unverified", "Unref", "References", "Uncited-article",
          "Citesources", "NR", "No references", "Unrefarticle", "Unreferenced article",
          "Noref", "Norefs", "Noreferences", "Cleanup-cite", "References needed",
          "Nr", "No refs", "UnreferencedArticle", "No ref", "Unreferenced stub",
          "Needs references", "Noreference", "No reference", "Refsneeded", "Refs needed",
          "Ref needed", "Nosources", "No sources", "UNref", "UNREF", "Unr"
      };

      /**
       * Lists the newest articles and requests a tag for every one that looks
       * unreferenced and that the bot has not edited before.
       *
       * @param args the command line arguments (unused)
       * @throws MalformedURLException if a request URL cannot be built
       * @throws IOException if a page cannot be fetched or the edit request
       *         cannot be launched
       */
      public static void main(String[] args) throws MalformedURLException, IOException {
          // format=xml is requested explicitly: every check in this class looks
          // for literal XML, so the response format must not be left to the
          // server's default.
          String response = fetch(API_URL + "?action=query&format=xml"
                  + "&list=recentchanges&rctype=new&rcprop=title|timestamp"
                  + "&rcnamespace=0&rclimit=500");

          for (String title : extractTitles(response)) {
              // Loading up the edit window of a page to get the wiki markup.
              String editWindow = fetch(INDEX_URL + "?title=" + encode(title) + "&action=edit");

              String markup;
              try {
                  markup = parseArticle(editWindow);
              } catch (IllegalArgumentException ignored) {
                  // No edit box in the response, so there is nothing to judge;
                  // tagging blind would be worse than skipping the page.
                  continue;
              }

              // Both tags are skipped when the bot already edited the page;
              // previously only the {{Unreferenced}} branch checked this.
              if (!isEligibleForTagging(markup, title) || alreadyedited(title)) {
                  continue;
              }

              // If it is a BLP, tag it with {{BLP unsourced}} instead.
              requestTag(title, noblpcat(title) ? "{{Unreferenced}}" : "{{BLP unsourced}}");
          }
      }

      /**
       * Reads the value of the first {@code title="..."} attribute in a piece
       * of an XML API response.
       *
       * @param s a fragment of the response, normally starting at the attribute
       * @return the article title with XML entities decoded, or an empty string
       *         when {@code s} is {@code null} or holds no complete attribute
       */
      public static String parseFromNewPages(String s) {
          if (s == null) {
              return "";
          }
          int attribute = s.indexOf(TITLE_ATTRIBUTE);
          if (attribute < 0) {
              return "";
          }
          int valueStart = attribute + TITLE_ATTRIBUTE.length();
          // A quote inside the value is escaped as &quot;, so the next literal
          // quote character is the end of the attribute.
          int valueEnd = s.indexOf('"', valueStart);
          if (valueEnd < 0) {
              return "";
          }
          return unescapeXml(s.substring(valueStart, valueEnd));
      }

      /**
       * Gets the wiki markup content of an article from the HTML of the edit
       * window.
       *
       * @param article the HTML of the edit window of an article
       * @return everything between the {@code wpTextbox1} textarea's opening tag
       *         and its closing tag; the trailing {@code </textarea>} is kept,
       *         as it always has been, and the text is returned exactly as it
       *         appears in the HTML (no entity decoding)
       * @throws IllegalArgumentException if the textarea is missing or is never
       *         closed
       */
      public static String parseArticle(String article) {
          int marker = article.indexOf(TEXTBOX_OPEN);
          if (marker < 0) {
              throw new IllegalArgumentException("edit window has no wpTextbox1 textarea");
          }
          int start = marker + TEXTBOX_OPEN.length();
          int close = article.indexOf(TEXTBOX_CLOSE, start);
          if (close < 0) {
              throw new IllegalArgumentException("wpTextbox1 textarea is not closed");
          }
          return article.substring(start, close + TEXTBOX_CLOSE.length());
      }

      /**
       * Check if the bot should tag the page as unreferenced or not.
       *
       * <p>An article is eligible when its markup has no reflist (or variants),
       * no {@code <ref>} tags (or variants), no {{sfn}} template, no further
       * reading, references, notes, citations or sources section, no
       * {{nobots}}, no existing unreferenced tag, and when the API reports no
       * external links and no disambiguation box for it.
       *
       * @param article the wiki markup of an article, either plain or as
       *        extracted by {@link #parseArticle(String)}
       * @param title the plain (not URL-encoded) title of the article
       * @return true if the article should be tagged as unreferenced
       * @throws IOException if an API query fails
       */
      public static boolean isEligibleForTagging(String article, String title) throws IOException {
          // The edit window is expected to deliver the markup HTML-escaped
          // (&lt;ref&gt;), so decode it before looking for tags.
          String markup = unescapeXml(article);
          String lowered = markup.toLowerCase(Locale.ROOT);

          // Cheap local checks first; the API is only asked when they all pass.
          if (lowered.contains("{{nobots}}")
                  || hasSourceMarkup(lowered)
                  || isAlreadyTagged(markup, lowered)) {
              return false;
          }
          return noextlinks(title) && nodabs(title);
      }

      /**
       * Uses a Wikipedia API query to search for a dmbox template.
       *
       * @param title the plain (not URL-encoded) article title
       * @return true if the page is not a disambiguation page
       * @throws MalformedURLException if the query URL cannot be built
       * @throws IOException if the query fails
       */
      public static boolean nodabs(String title) throws MalformedURLException, IOException {
          // The page is selected with "titles"; the former "title" parameter
          // did not name a page for the query at all.
          String templates = fetch(API_URL + "?action=query&format=xml&prop=templates"
                  + "&tllimit=500&titles=" + encode(title));
          return !templates.contains("Template:Dmbox");
      }

      /**
       * Uses a Wikipedia API query to search for external links in an article.
       *
       * @param title the plain (not URL-encoded) article title
       * @return true if there are no external links
       * @throws MalformedURLException if the query URL cannot be built
       * @throws IOException if the query fails
       */
      public static boolean noextlinks(String title) throws MalformedURLException, IOException {
          String links = fetch(API_URL + "?action=query&format=xml&prop=extlinks&titles="
                  + encode(title));
          return !links.contains("<el xml:space=\"preserve\">");
      }

      /**
       * Looks for the name of an "unreferenced" or "BLP unsourced" template.
       * Despite what the method name suggests, the result is {@code true} when
       * a keyword <em>is</em> present.
       *
       * @param article the wiki markup of an article, in its original case
       * @return true if the markup contains one of the template names
       *         (case-sensitive substring match)
       */
      public static boolean noblpkeywords(String article) {
          for (String keyword : TAG_KEYWORDS) {
              if (article.contains(keyword)) {
                  return true;
              }
          }
          return false;
      }

      /**
       * Uses a Wikipedia API query to check the article's categories.
       *
       * @param title the plain (not URL-encoded) article title
       * @return true if the article is not in "Category:Living people"
       * @throws MalformedURLException if the query URL cannot be built
       * @throws IOException if the query fails
       */
      public static boolean noblpcat(String title) throws MalformedURLException, IOException {
          String cats = fetch(API_URL + "?action=query&format=xml&prop=categories"
                  + "&cllimit=500&titles=" + encode(title));
          // Both spellings must be absent; the second test used to be
          // un-negated, which made this method return false for every page.
          return !cats.contains("Category:Living people")
                  && !cats.contains("Category: Living people");
      }

      /**
       * Uses a Wikipedia API query to check whether the bot edited the page.
       *
       * @param title the plain (not URL-encoded) article title
       * @return true if "Jakebot" appears in the list of revision authors
       * @throws IOException if the query fails
       */
      public static boolean alreadyedited(String title) throws IOException {
          String users = fetch(API_URL + "?action=query&format=xml&prop=revisions"
                  + "&rvprop=user&rvlimit=500&titles=" + encode(title));
          return users.contains("Jakebot");
      }

      /** Collects every title attribute value found in a recentchanges response. */
      private static List<String> extractTitles(String response) {
          List<String> titles = new ArrayList<String>();
          int from = response.indexOf(TITLE_ATTRIBUTE);
          while (from >= 0) {
              int valueEnd = response.indexOf('"', from + TITLE_ATTRIBUTE.length());
              if (valueEnd < 0) {
                  break;
              }
              String title = parseFromNewPages(response.substring(from, valueEnd + 1));
              if (!title.isEmpty()) {
                  titles.add(title);
              }
              from = response.indexOf(TITLE_ATTRIBUTE, valueEnd + 1);
          }
          return titles;
      }

      /** Tells whether lower-cased markup has a citation marker or a references heading. */
      private static boolean hasSourceMarkup(String lowered) {
          for (String marker : SOURCE_MARKERS) {
              if (lowered.contains(marker)) {
                  return true;
              }
          }
          for (String heading : SOURCE_HEADINGS) {
              // The four spacing variants of "==Heading==".
              if (lowered.contains("=" + heading + "=")
                      || lowered.contains("= " + heading + " =")
                      || lowered.contains("=" + heading + " =")
                      || lowered.contains("= " + heading + "=")) {
                  return true;
              }
          }
          return false;
      }

      /** Tells whether the markup already carries an unreferenced-style tag. */
      private static boolean isAlreadyTagged(String markup, String lowered) {
          // The keyword list is mixed case, so it has to see the original text;
          // it used to be handed the lower-cased text and could never match.
          if (noblpkeywords(markup)) {
              return true;
          }
          // Also accept any capitalisation when the name opens a template.
          for (String keyword : TAG_KEYWORDS) {
              if (lowered.contains("{{" + keyword.toLowerCase(Locale.ROOT))) {
                  return true;
              }
          }
          return false;
      }

      /** Launches the edit request that prepends {@code template} to the page. */
      private static void requestTag(String title, String template) throws IOException {
          // NOTE(review): the summary still reads "Tagging short article as
          // stub"; it looks inherited from another bot -- confirm the wording.
          String url = API_URL + "?action=edit&format=json&title=" + encode(title)
                  + "&summary=Tagging%20short"
                  + "%20article%20as%20stub%20(%5B%5BWP%3ABOT%7CBot%20edit"
                  + "%5D%5D)&bot=&prependtext=" + encode(template)
                  + "&assert=bot&prop=info";
          // One array element per argument: the URL used to be glued straight
          // onto "FileProtocolHandler" with no separating space.
          Runtime.getRuntime().exec(
                  new String[] {"rundll32", "url.dll,FileProtocolHandler", url});
      }

      /** Downloads a URL as UTF-8 text, joining its lines without separators. */
      private static String fetch(String address) throws IOException {
          StringBuilder text = new StringBuilder();
          try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                  new URL(address).openStream(), StandardCharsets.UTF_8))) {
              String line;
              while ((line = reader.readLine()) != null) {
                  text.append(line);
              }
          }
          return text.toString();
      }

      /** Percent-encodes a value for use inside a URL query string. */
      private static String encode(String value) throws IOException {
          // URLEncoder writes a space as '+'; %20 is unambiguous everywhere.
          return URLEncoder.encode(value, StandardCharsets.UTF_8.name()).replace("+", "%20");
      }

      /** Decodes the basic XML/HTML entities; {@code &amp;} goes last to avoid decoding twice. */
      private static String unescapeXml(String text) {
          return text.replace("&lt;", "<")
                  .replace("&gt;", ">")
                  .replace("&quot;", "\"")
                  .replace("&#039;", "'")
                  .replace("&#39;", "'")
                  .replace("&apos;", "'")
                  .replace("&amp;", "&");
      }
  }
clone this paste RAW Paste Data