Advertisement
Guest User

Jakebot code

a guest
Dec 14th, 2013
173
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. Jakebot code below...
  2. package jakebot;
  3.  
  4. import java.io.BufferedReader;
  5. import java.io.IOException;
  6. import java.io.InputStreamReader;
  7. import java.net.MalformedURLException;
  8. import java.net.URL;
  9.  
  /**
   * A bot to tag new unreferenced articles on the English Wikipedia.
   *
   * <p>The bot downloads Special:NewPages, pulls the article titles out of the
   * HTML, loads the edit form of every article to read its wiki markup, and
   * opens an API edit URL for each article that appears to cite no sources.
   *
   * @author King jakob c 2
   */
  public class Jakebot {

      /** Entry point used to fetch Special:NewPages and the article edit forms. */
      private static final String INDEX_URL = "https://en.wikipedia.org/w/index.php";

      /** Tag that precedes every entry in the HTML of Special:NewPages. */
      private static final String LIST_ITEM_TAG = "<li>";

      /** The first five list item tags on Special:NewPages do not precede an article. */
      private static final int NON_ARTICLE_LIST_ITEMS = 5;

      /** Number of characters between a list item tag and the start of the title. */
      private static final int TITLE_OFFSET = 32;

      /** Text that immediately precedes the wiki markup in the edit form. */
      private static final String TEXTBOX_START = "\"wpTextbox1\">";

      /** Text that immediately follows the wiki markup in the edit form. */
      private static final String TEXTBOX_END = "</textarea>";

      /**
       * Lower-case fragments whose presence means the page must not be tagged:
       * it already has some kind of sourcing, is already tagged, is up for
       * speedy deletion, is a disambiguation page, or carries a bot template.
       */
      private static final String[] DISQUALIFYING_MARKERS = {
          "<ref",              // <ref>, <ref name=...> and <references/>
          "reflist",
          "==references==",
          "== references ==",
          "http",              // any external link
          "further reading",
          "{{db",              // speedy deletion templates
          "{{unreferenced",
          "{{bots}}",
          "{{nobots",
          "disambiguation"
      };

      /**
       * Runs the bot once over the 500 newest pages.
       *
       * @param args the command line arguments (unused)
       * @throws MalformedURLException if one of the page addresses is malformed
       * @throws IOException if a page cannot be downloaded or the browser
       *         cannot be launched
       */
      public static void main(String[] args) throws MalformedURLException, IOException {
          String newPagesHtml = fetch(INDEX_URL + "?title=Special:NewPages&offset=&limit=500");

          // Titles are handled as they are found, so no fixed-size buffer can
          // overflow and no unfilled (null) slots have to be skipped later.
          int listItemsSeen = 0;
          int tagStart = newPagesHtml.indexOf(LIST_ITEM_TAG);
          while (tagStart >= 0) {
              listItemsSeen++;
              if (listItemsSeen > NON_ARTICLE_LIST_ITEMS) {
                  String title = parseFromNewPages(newPagesHtml.substring(tagStart));
                  if (!title.isEmpty()) {
                      tagIfUnreferenced(title);
                  }
              }
              tagStart = newPagesHtml.indexOf(LIST_ITEM_TAG, tagStart + LIST_ITEM_TAG.length());
          }
      }

      /**
       * Parses out an article title from the HTML in Special:NewPages.
       *
       * @param s a piece of the HTML of Special:NewPages that starts at a
       *          list item tag
       * @return the text between 32 characters after the start of {@code s}
       *         and the next {@code &} sign, or an empty string when there is
       *         no such {@code &} sign
       */
      public static String parseFromNewPages(String s) {
          int titleEnd = s.indexOf('&', TITLE_OFFSET);
          if (titleEnd < 0) {
              return "";
          }
          return s.substring(TITLE_OFFSET, titleEnd);
      }

      /**
       * Gets the wiki markup content of an article from the HTML of the edit
       * window.
       *
       * @param article the HTML of the edit window of an article
       * @return the wiki markup of the article with HTML escaping undone and
       *         without the closing textarea tag, or an empty string when the
       *         edit box cannot be located
       */
      public static String parseArticle(String article) {
          int markerStart = article.indexOf(TEXTBOX_START);
          if (markerStart < 0) {
              return "";
          }
          int contentStart = markerStart + TEXTBOX_START.length();
          int contentEnd = article.indexOf(TEXTBOX_END, contentStart);
          if (contentEnd < 0) {
              return "";
          }
          return unescapeHtml(article.substring(contentStart, contentEnd));
      }

      /**
       * Check if the bot should tag the page as unreferenced or not.
       *
       * <p>A page is eligible only when it has no reference tags, reflist,
       * references section, external links or further reading, is not already
       * tagged as unreferenced, is not up for speedy deletion, is not a
       * disambiguation page, and carries no bot template.
       *
       * @param article the wiki markup of an article
       * @return true if the article should be tagged as unreferenced
       */
      public static boolean isEligibleForTagging(String article) {
          // Locale.ROOT keeps the match independent of the machine's locale
          // (e.g. the Turkish dotless i would break "disambiguation").
          String text = article.toLowerCase(java.util.Locale.ROOT);
          for (String marker : DISQUALIFYING_MARKERS) {
              if (text.contains(marker)) {
                  return false;
              }
          }
          return true;
      }

      /** Loads the markup of one page and opens the tagging URL if it is unreferenced. */
      private static void tagIfUnreferenced(String title) throws IOException {
          String editHtml = fetch(INDEX_URL + "?title=" + title + "&action=edit");
          String markup = parseArticle(editHtml);

          // An empty result means the markup could not be read; never tag blind.
          if (markup.isEmpty() || !isEligibleForTagging(markup)) {
              return;
          }

          // The URL is handed straight to the desktop's browser instead of
          // being spliced into a "cmd /c start" command line, where "&" would
          // be treated as a command separator and spaces would split the URL.
          java.awt.Desktop.getDesktop().browse(java.net.URI.create(buildTagUrl(title)));
      }

      /** Builds the API URL that appends the Unreferenced template to the given page. */
      private static String buildTagUrl(String title) {
          // NOTE(review): the MediaWiki edit API is documented as requiring a
          // POST request with an edit token, so opening this URL in a browser
          // may not save anything - confirm before relying on it.
          return "https://en.wikipedia.org/w/api.php?action=edit&format=json"
                  + "&title=" + title
                  + "&summary=Tagging%20unreferenced%20article"
                  + "(%5B%5BWP%3ABOT%7CBot%20edit%5D%5D)"
                  + "&bot=&appendtext=%7B%7BUnreferenced%7D%7D&assert=bot&prop=info";
      }

      /**
       * Downloads a page and returns its lines joined without line separators.
       * Adapted from
       * http://stackoverflow.com/questions/6188901/reading-the-content-of-web-page
       */
      private static String fetch(String address) throws IOException {
          StringBuilder content = new StringBuilder();
          try (BufferedReader reader = new BufferedReader(
                  new InputStreamReader(new URL(address).openStream(), "UTF-8"))) {
              String line;
              while ((line = reader.readLine()) != null) {
                  content.append(line);
              }
          }
          return content.toString();
      }

      /** Undoes the HTML escaping applied to text placed inside a textarea. */
      private static String unescapeHtml(String escaped) {
          // "&amp;" is replaced last so that "&amp;lt;" becomes "&lt;", not "<".
          return escaped
                  .replace("&lt;", "<")
                  .replace("&gt;", ">")
                  .replace("&quot;", "\"")
                  .replace("&#039;", "'")
                  .replace("&amp;", "&");
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement