// Jakebot code below.
package jakebot;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
/**
* A bot to tag new unreferenced articles on the English Wikipedia.
* @author King jakob c 2
*/
public class Jakebot {

    /**
     * Entry point. Fetches Special:NewPages, extracts the newest article
     * titles, downloads each article's wiki markup via its edit window, and
     * tags eligible articles as unreferenced through the MediaWiki API.
     *
     * @param args the command line arguments (unused)
     * @throws MalformedURLException if a constructed URL is malformed
     * @throws IOException if fetching a page fails
     */
    public static void main(String[] args) throws MalformedURLException, IOException {
        // Fetch the HTML of Special:NewPages (the 500 newest articles).
        // Adapted from
        // http://stackoverflow.com/questions/6188901/reading-the-content-of-web-page
        URL newpages = new URL("https://en.wikipedia.org/w/index.php?title=Speci"
                + "al:NewPages&offset=&limit=500");
        StringBuilder wholepagebuilder = new StringBuilder();
        // try-with-resources guarantees the reader/stream is closed even on error
        try (BufferedReader newpagesreader = new BufferedReader(
                new InputStreamReader(newpages.openStream()))) {
            String inputLine;
            while ((inputLine = newpagesreader.readLine()) != null) {
                wholepagebuilder.append(inputLine);
            }
        }
        String wholepage = wholepagebuilder.toString();

        // The names of the 500 newest articles.
        String[] newpageslist = new String[500];
        // Each <li> tag except for the first 5 precedes an article.
        // (The original source had the literal "<li>" stripped out of this
        // comment, which left a bare line that broke compilation.)
        int litags = 0;
        int newpagesfilled = 0;
        for (int i = 0; i < wholepage.length() - 4; i++) {
            if (wholepage.startsWith("<li>", i)) {
                litags++;
                // Bounds check added: more than 505 <li> tags would otherwise
                // overflow the 500-slot array.
                if (litags > 5 && newpagesfilled < newpageslist.length) {
                    // The content between 32 characters after the <li> and the
                    // next & sign is the name of the article.
                    newpageslist[newpagesfilled] = parseFromNewPages(wholepage.substring(i));
                    newpagesfilled++;
                }
            }
        }

        // Check whether each page is unreferenced, then tag it if so.
        for (int i = 0; i < newpageslist.length; i++) {
            // Unfilled slots are genuinely null — the original called
            // .equals() on them, which throws NullPointerException. Some
            // parses may also yield the literal string "null"; skip both.
            if (newpageslist[i] == null || newpageslist[i].equals("null")) {
                continue;
            }
            // Load the edit window of the page to get the wiki markup.
            URL anewpage = new URL("https://en.wikipedia.org/w/index.php?ti"
                    + "tle=" + newpageslist[i] + "&action=edit");
            StringBuilder articlebuilder = new StringBuilder();
            try (BufferedReader pagereader = new BufferedReader(
                    new InputStreamReader(anewpage.openStream()))) {
                String inputLine2;
                while ((inputLine2 = pagereader.readLine()) != null) {
                    articlebuilder.append(inputLine2);
                }
            }
            // cleanarticle = the page's wiki markup, not HTML.
            String cleanarticle = parseArticle(articlebuilder.toString());
            // Use the API to tag the article as unreferenced, if eligible.
            if (isEligibleForTagging(cleanarticle)) {
                // Bug fix: the API "title" parameter must be the article's
                // TITLE; the original passed the article's markup instead.
                Process p = Runtime.getRuntime().exec("cmd /c start "
                        + "https://en.wikipedia.org/w/api.php?action=edit&format="
                        + "json&title=" + newpageslist[i] + "&summary=Tagging unref"
                        + "erenced article(%5B%5BWP%3ABOT%7CBot%20edit"
                        + "%5D%5D)&bot=&appendtext={{Unreferenced}}&assert=bot&"
                        + "prop=info");
                p.destroy(); // and close the window
            }
        }
    }

    /**
     * Parses an article title out of a fragment of Special:NewPages HTML.
     * The title starts 32 characters into the fragment (just past the
     * {@code <li>} markup) and runs up to the next {@code '&'}.
     *
     * @param s a piece of the HTML of Special:NewPages, starting at a
     *          {@code <li>} tag
     * @return the article name, or {@code ""} if no {@code '&'} terminator
     *         is found after position 32
     */
    public static String parseFromNewPages(String s) {
        int start = 32;
        int amp = s.indexOf('&', start);
        if (amp >= 0) {
            return s.substring(start, amp);
        }
        // No terminator found (fragment too short or malformed) — mirrors
        // the original's fall-through behavior of returning "".
        return "";
    }

    /**
     * Extracts the wiki markup of an article from the HTML of its edit
     * window. The markup lives inside the textarea whose id is
     * {@code wpTextbox1}.
     *
     * <p>Bug fix: the original's loop terminator was
     * {@code articlecontent.contains("")}, which is always true (the closing
     * {@code </textarea>} sentinel had been stripped from the source), so it
     * returned after a single character and could also index past the end of
     * the string when the marker was absent.
     *
     * @param article the HTML of the edit window of an article
     * @return the wiki markup of the article, or {@code ""} if the edit
     *         textarea cannot be located
     */
    public static String parseArticle(String article) {
        // Markup begins right after the "wpTextbox1"> marker (13 chars long).
        int begin = article.indexOf('"' + "wpTextbox1" + '"' + ">");
        if (begin < 0) {
            return ""; // marker absent — original would have thrown
        }
        int start = begin + 13;
        // Markup ends at the closing tag of the edit textarea.
        int end = article.indexOf("</textarea>", start);
        if (end < 0) {
            end = article.length();
        }
        return article.substring(start, end);
    }

    /**
     * Checks whether the bot should tag the page as unreferenced.
     * Returns true only when the markup shows no sign of referencing
     * (reflist with citations, a References section, external links, further
     * reading) and no sign that tagging would be wrong (already tagged,
     * speedy-deletion candidate, bot-excluded, disambiguation page).
     *
     * @param article the wiki markup of an article
     * @return true if the article should be tagged as unreferenced
     */
    public static boolean isEligibleForTagging(String article) {
        // Case-insensitive matching throughout.
        article = article.toLowerCase();
        // Bug fix: the original tested for the misspelling "==referneces=="
        // and so never detected a real References section.
        if (!(article.contains("reflist") && article.contains("[")) &&
                !article.contains("==references==") && !article.contains("http")
                && !article.contains("further reading") && !article.contains("{{db")
                && !article.contains("{{unreferenced")
                && !article.contains("{{bots}}") &&
                !article.contains("disambiguation")) {
            return true;
        }
        return false;
    }
}