package jakebot;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
/**
 * A bot to tag new unreferenced articles on the English Wikipedia.
 *
 * <p>The bot lists recently created main-namespace pages, downloads the wiki
 * markup of each one, and, when no sign of referencing is found, opens an
 * edit request that prepends {@code {{Unreferenced}}} (or
 * {@code {{BLP unsourced}}} for pages in "Category:Living people").
 *
 * @author King jakob c 2
 */
public class Jakebot {

    /** Endpoint of the MediaWiki web API. */
    private static final String API = "https://en.wikipedia.org/w/api.php";

    /** Endpoint used to load the edit form of a page. */
    private static final String INDEX = "https://en.wikipedia.org/w/index.php";

    /**
     * Identifies the bot to the server instead of the generic Java agent.
     * TODO: add contact information as asked for by the Wikimedia User-Agent
     * policy.
     */
    private static final String USER_AGENT
            = "Jakebot (English Wikipedia unreferenced-article tagging bot)";

    /** Start of a title attribute in an XML-formatted API response. */
    private static final String TITLE_ATTRIBUTE = " title=\"";

    /** Lower-case fragments whose presence means the article should be left alone. */
    private static final String[] REFERENCE_MARKERS = {
        "reflist", "<ref>", "<ref name", "<ref group", "{{sfn", "<references",
        "{{refbegin", "{{noteslist", "{{notelist", "further reading", "{{nobots}}"
    };

    /** Section headings (lower case, spaces removed) that indicate sourcing. */
    private static final String[] REFERENCE_HEADINGS = {
        "=references=", "=notes=", "=citations=", "=sources="
    };

    /**
     * Names of the "unreferenced" maintenance templates and their redirects,
     * compared case-insensitively against template invocations.
     */
    private static final String[] TAG_TEMPLATES = {
        "Unreferenced", "BLP unsourced",
        "UnsourcedBLP", "BLPunreferenced", "Unreferencedblp",
        "Blpunsourced", "BLPunsourced", "Unsourcedblp", "BLPUnreferenced",
        "Unsourced BLP", "BLP unreferenced", "Blpunref", "Unreferenced BLP",
        "Blpunreferenced", "UnreferencedBLP", "BLPUnsourced", "Unreferenced blp",
        "BLP Unreferenced", "Blp-unreferenced", "Userspace BLP", "Unreferenced-blp",
        "Unreferenced-BLP", "Blpnoref", "Blp unreferenced", "BLPnoref", "Unref BLP",
        "Blp unsourced", "Urblp", "Ublp", "Blp-unsourced", "BLPunref", "Unsourced-blp",
        "Noref-blp", "Unsourced blp",
        "Unsourced", "Unverified", "Unref", "References", "Uncited-article",
        "Citesources", "NR", "No references", "Unrefarticle", "Unreferenced article",
        "Noref", "Norefs", "Noreferences", "Cleanup-cite", "References needed",
        "Nr", "No refs", "UnreferencedArticle", "No ref", "Unreferenced stub",
        "Needs references", "Noreference", "No reference", "Refsneeded", "Refs needed",
        "Ref needed", "Nosources", "No sources", "UNref", "UNREF", "Unr"
    };

    /**
     * Lists the newest articles and tags the unreferenced ones.
     *
     * <p>A failure while handling one page is reported on standard error and
     * the run continues with the next page; a failure to obtain the list of
     * new pages aborts the run.
     *
     * @param args the command line arguments (unused)
     * @throws MalformedURLException if the new-pages query URL is invalid
     * @throws IOException if the list of new pages cannot be downloaded
     */
    public static void main(String[] args) throws MalformedURLException, IOException {
        // Equivalent of Special:NewPages, requested as XML so the attribute
        // values can be parsed directly.
        String wholepage = fetch(API + "?action=query&format=xml"
                + "&list=recentchanges&rctype=new&rcprop=title%7Ctimestamp"
                + "&rcnamespace=0&rclimit=500");

        // A list rather than a fixed 500-slot array: it holds exactly the
        // titles that were found, with no null entries to trip over.
        List<String> newpageslist = new ArrayList<>();
        int at = wholepage.indexOf(TITLE_ATTRIBUTE);
        while (at >= 0) {
            // Skip the leading space so the helper sees text starting at "title=".
            String title = parseFromNewPages(wholepage.substring(at + 1));
            if (!title.isEmpty()) {
                newpageslist.add(title);
            }
            at = wholepage.indexOf(TITLE_ATTRIBUTE, at + TITLE_ATTRIBUTE.length());
        }

        for (String title : newpageslist) {
            try {
                tagIfUnreferenced(title);
            } catch (IOException e) {
                System.err.println("Skipping \"" + title + "\": " + e);
            }
        }
    }

    /**
     * Checks a single page and requests the appropriate tag when it is eligible.
     */
    private static void tagIfUnreferenced(String title) throws IOException {
        // Loading up the edit window of a page to get the wiki markup.
        String editForm = fetch(INDEX + "?title=" + URLEncoder.encode(title, "UTF-8")
                + "&action=edit");
        String cleanarticle = parseArticle(editForm);
        if (cleanarticle.isEmpty()) {
            // No markup could be extracted; an empty text would otherwise
            // look "unreferenced", so do not touch the page.
            return;
        }
        if (!isEligibleForTagging(cleanarticle, title) || alreadyedited(title)) {
            return;
        }
        // Pages about living people get {{BLP unsourced}} instead.
        requestTag(title, noblpcat(title) ? "{{Unreferenced}}" : "{{BLP unsourced}}");
    }

    /**
     * Opens the API edit request that prepends {@code template} to the page.
     * Relies on the Windows {@code rundll32} URL handler, as the original did.
     */
    private static void requestTag(String title, String template) throws IOException {
        // NOTE(review): the summary still reads "Tagging short article as
        // stub", which does not describe this edit - confirm the wording.
        String url = API + "?action=edit&format="
                + "json&title=" + URLEncoder.encode(title, "UTF-8")
                + "&summary=Tagging%20short"
                + "%20article%20as%20stub%20(%5B%5BWP%3ABOT%7CBot%20edit"
                + "%5D%5D)&bot=&prependtext=" + URLEncoder.encode(template, "UTF-8")
                + "&assert=bot&prop=info";
        // Separate arguments: the URL must not be glued to the handler name
        // and must not be split on whitespace.
        new ProcessBuilder("rundll32", "url.dll,FileProtocolHandler", url).start();
    }

    /**
     * Downloads a URL and returns its body as text, one '\n' after each line.
     * Read loop adapted from
     * http://stackoverflow.com/questions/6188901/reading-the-content-of-web-page
     */
    private static String fetch(String address) throws IOException {
        URLConnection connection = new URL(address).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);
        StringBuilder body = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                connection.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line).append('\n');
            }
        }
        return body.toString();
    }

    /** Replaces the basic HTML/XML character entities with their characters. */
    private static String unescapeEntities(String text) {
        // "&amp;" goes last so that "&amp;lt;" becomes "&lt;", not "<".
        return text.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", "\"")
                .replace("&#039;", "'")
                .replace("&#39;", "'")
                .replace("&amp;", "&");
    }

    /**
     * Parses out an article title from a new-pages listing.
     *
     * <p>The text is expected to begin with {@code title=}. Three value forms
     * are understood: a double-quoted attribute ({@code title="Foo"}), an
     * attribute whose quotes are HTML-escaped ({@code title=&quot;Foo&quot;}),
     * and a bare value that ends at the next {@code &} sign
     * ({@code title=Foo&action=edit}). Character entities inside a quoted
     * value are decoded.
     *
     * @param s a piece of the listing, starting at {@code title=}
     * @return the article name, or an empty string if the value is not
     * terminated
     */
    public static String parseFromNewPages(String s) {
        if (s == null) {
            return "";
        }
        int start = s.startsWith("title=") ? "title=".length() : 0;

        String delimiter;
        int unescapeRounds;
        if (s.startsWith("\"", start)) {
            delimiter = "\"";
            unescapeRounds = 1;
        } else if (s.startsWith("&quot;", start)) {
            // The whole markup was escaped once more, so decode twice.
            delimiter = "&quot;";
            unescapeRounds = 2;
        } else {
            delimiter = "&";
            unescapeRounds = 0;
        }
        if (unescapeRounds > 0) {
            start += delimiter.length();
        }

        int end = s.indexOf(delimiter, start);
        if (end < 0) {
            return "";
        }
        String title = s.substring(start, end);
        for (int round = 0; round < unescapeRounds; round++) {
            title = unescapeEntities(title);
        }
        return title;
    }

    /**
     * Gets the wiki markup content of an article from the HTML of the edit
     * window.
     *
     * <p>The markup is the text between the opening tag ending in
     * {@code "wpTextbox1">} and the following {@code </textarea>}, with
     * character entities decoded so that, for example, {@code <ref>} appears
     * literally.
     *
     * @param article the HTML of the edit window of an article
     * @return wiki markup of an article, or an empty string if the edit box
     * cannot be located
     */
    public static String parseArticle(String article) {
        if (article == null) {
            return "";
        }
        String openingTagEnd = "\"wpTextbox1\">";
        int marker = article.indexOf(openingTagEnd);
        if (marker < 0) {
            return "";
        }
        int begin = marker + openingTagEnd.length();
        int end = article.indexOf("</textarea>", begin);
        if (end < 0) {
            return "";
        }
        return unescapeEntities(article.substring(begin, end));
    }

    /**
     * Check if the bot should tag the page as unreferenced or not.
     *
     * <p>If the article lacks a reflist (or variants), {@code <ref>} tags (or
     * variants), an {{sfn}} template, external links, further reading, and a
     * references, notes, citations or sources section, it is considered
     * unreferenced. If it is a disambiguation page, is already tagged as
     * unreferenced, or has {{nobots}}, it won't be tagged. All checks on the
     * text run first; the two API queries are made only if the text passes.
     *
     * @param article the wiki markup of an article
     * @param title the title of the article
     * @return true if the article should be tagged as unreferenced
     * @throws IOException if an API query fails
     */
    public static boolean isEligibleForTagging(String article, String title) throws IOException {
        if (article == null) {
            return false;
        }
        String lower = article.toLowerCase(Locale.ROOT);
        for (String marker : REFERENCE_MARKERS) {
            if (lower.contains(marker)) {
                return false;
            }
        }
        // Dropping spaces lets one literal cover "==Notes==", "== Notes ==",
        // "==Notes ==" and so on.
        String compact = lower.replace(" ", "");
        for (String heading : REFERENCE_HEADINGS) {
            if (compact.contains(heading)) {
                return false;
            }
        }
        if (noblpkeywords(article)) {
            return false;
        }
        return noextlinks(title) && nodabs(title);
    }

    /**
     * Uses a Wikipedia API query to search for a dmbox template.
     *
     * @param title article title
     * @return true if the page is not a disambiguation page
     * @throws MalformedURLException if the query URL is invalid
     * @throws IOException if the query fails
     */
    public static boolean nodabs(String title) throws MalformedURLException, IOException {
        // tltemplates restricts the answer to the one template of interest,
        // so the default result limit cannot hide it.
        String templates = fetch(API + "?action=query&format=xml&prop="
                + "templates&tltemplates=" + URLEncoder.encode("Template:Dmbox", "UTF-8")
                + "&titles=" + URLEncoder.encode(title, "UTF-8"));
        return !templates.contains("Template:Dmbox");
    }

    /**
     * Uses a Wikipedia API query to search for external links in an article.
     *
     * @param title article title
     * @return true if there are no external links
     * @throws MalformedURLException if the query URL is invalid
     * @throws IOException if the query fails
     */
    public static boolean noextlinks(String title) throws MalformedURLException, IOException {
        String links = fetch(API + "?action=query&format=xml&pr"
                + "op=extlinks&titles=" + URLEncoder.encode(title, "UTF-8"));
        return !links.contains("<el ") && !links.contains("<el>");
    }

    /**
     * Checks whether the markup already carries an "unreferenced" maintenance
     * tag.
     *
     * <p>Despite the name, this returns {@code true} when a tag IS present.
     * A tag is a template invocation ({@code {{Name}}} or
     * {@code {{Name|...}}}) whose name matches one of the known template
     * names, ignoring case, surrounding whitespace, underscores and an
     * optional {@code Template:} prefix.
     *
     * @param article the wiki markup of an article
     * @return true if the article is already tagged as unreferenced
     */
    public static boolean noblpkeywords(String article) {
        if (article == null) {
            return false;
        }
        String prefix = "Template:";
        int open = article.indexOf("{{");
        while (open >= 0) {
            int nameStart = open + 2;
            int nameEnd = nameStart;
            // The template name ends at the first parameter or closing brace.
            while (nameEnd < article.length()) {
                char c = article.charAt(nameEnd);
                if (c == '|' || c == '}' || c == '{') {
                    break;
                }
                nameEnd++;
            }
            String name = article.substring(nameStart, nameEnd).replace('_', ' ').trim();
            if (name.regionMatches(true, 0, prefix, 0, prefix.length())) {
                name = name.substring(prefix.length()).trim();
            }
            for (String keyword : TAG_TEMPLATES) {
                if (keyword.equalsIgnoreCase(name)) {
                    return true;
                }
            }
            open = article.indexOf("{{", nameStart);
        }
        return false;
    }

    /**
     * Uses a Wikipedia API query to check for the "Living people" category.
     *
     * @param title article title
     * @return true if the page is NOT in Category:Living people
     * @throws MalformedURLException if the query URL is invalid
     * @throws IOException if the query fails
     */
    public static boolean noblpcat(String title) throws MalformedURLException, IOException {
        String cats = fetch(API + "?action=query&format=xml&prop=categories"
                + "&clcategories=" + URLEncoder.encode("Category:Living people", "UTF-8")
                + "&titles=" + URLEncoder.encode(title, "UTF-8"));
        // Neither spelling may be present.
        return !cats.contains("Category:Living people")
                && !cats.contains("Category: Living people");
    }

    /**
     * Uses a Wikipedia API query to check whether the bot is among the users
     * of the page's last 500 revisions.
     *
     * @param title article title
     * @return true if a user whose name starts with "Jakebot" has edited the page
     * @throws IOException if the query fails
     */
    public static boolean alreadyedited(String title) throws IOException {
        String users = fetch(API + "?action=query&format=xml&prop=revisions"
                + "&rvprop=user&rvlimit=500&titles=" + URLEncoder.encode(title, "UTF-8"));
        // Match the user attribute only, so a page merely titled "Jakebot..."
        // is not mistaken for an edit by the bot.
        return users.contains("user=\"Jakebot");
    }
}