Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Jakebot code below...
- package jakebot;
import java.awt.Desktop;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
/**
 * A bot to tag new unreferenced articles on the English Wikipedia.
 *
 * <p>The bot downloads Special:NewPages, extracts the listed page titles,
 * downloads the edit form of every page to obtain its wiki markup, and opens
 * an "append {{Unreferenced}}" API request in the default browser for every
 * page that {@link #isEligibleForTagging(String)} accepts.
 *
 * @author King jakob c 2
 */
public class Jakebot {

    /** Special:NewPages, requested with a limit of 500 entries. */
    private static final String NEW_PAGES_URL =
            "https://en.wikipedia.org/w/index.php?title=Special:NewPages&offset=&limit=500";

    /** Prefix of the index.php URL; the (already URL-encoded) title follows. */
    private static final String INDEX_URL_PREFIX =
            "https://en.wikipedia.org/w/index.php?title=";

    /** Prefix of the tagging API request; the (already URL-encoded) title follows. */
    private static final String TAG_URL_PREFIX =
            "https://en.wikipedia.org/w/api.php?action=edit&format=json&title=";

    /**
     * Remainder of the tagging API request. Spaces and the braces of
     * {{Unreferenced}} are percent-encoded so the whole string is a valid URI.
     */
    private static final String TAG_URL_SUFFIX =
            "&summary=Tagging%20unreferenced%20article(%5B%5BWP%3ABOT%7CBot%20edit%5D%5D)"
            + "&bot=&appendtext=%7B%7BUnreferenced%7D%7D&assert=bot&prop=info";

    /** The tag that precedes every entry of the new-pages listing. */
    private static final String LIST_ITEM_TAG = "<li>";

    /** The first few {@code <li>} tags of the page do not precede an article. */
    private static final int SKIPPED_LIST_ITEMS = 5;

    /** Number of characters between the start of {@code <li>} and the title. */
    private static final int TITLE_OFFSET = 32;

    /** End of the opening tag of the edit form's text area. */
    private static final String TEXTBOX_MARKER = "\"wpTextbox1\">";

    /** Closing tag of the edit form's text area. */
    private static final String TEXTAREA_END = "</textarea>";

    /**
     * Runs one pass over Special:NewPages and tags every eligible page.
     *
     * @param args the command line arguments (unused)
     * @throws MalformedURLException if a page URL cannot be built
     * @throws IOException if a page cannot be downloaded or no browser can be
     *         opened for the tagging request
     */
    public static void main(String[] args) throws MalformedURLException, IOException {
        String newPagesHtml = readPage(new URL(NEW_PAGES_URL));
        for (String title : extractNewPageTitles(newPagesHtml)) {
            // Loading up the edit window of a page to get the wiki markup.
            String editHtml = readPage(new URL(INDEX_URL_PREFIX + title + "&action=edit"));
            String wikiMarkup = parseArticle(editHtml);
            if (wikiMarkup.isEmpty()) {
                // No markup could be extracted; never tag a page we could not read.
                continue;
            }
            if (isEligibleForTagging(wikiMarkup)) {
                openTagRequest(title);
            }
        }
    }

    /**
     * Downloads a page and returns its lines concatenated without separators.
     * Adapted from
     * http://stackoverflow.com/questions/6188901/reading-the-content-of-web-page
     */
    private static String readPage(URL url) throws IOException {
        StringBuilder page = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
        }
        return page.toString();
    }

    /** Collects the non-empty titles that follow each listing {@code <li>} tag. */
    private static List<String> extractNewPageTitles(String html) {
        List<String> titles = new ArrayList<>();
        int seen = 0;
        int at = html.indexOf(LIST_ITEM_TAG);
        while (at >= 0) {
            seen++;
            if (seen > SKIPPED_LIST_ITEMS) {
                String title = titleAt(html, at);
                if (!title.isEmpty()) {
                    titles.add(title);
                }
            }
            at = html.indexOf(LIST_ITEM_TAG, at + 1);
        }
        return titles;
    }

    /** Opens the "append {{Unreferenced}}" API request for a page in the default browser. */
    private static void openTagRequest(String title) throws IOException {
        // NOTE(review): the MediaWiki edit API is documented as requiring a
        // POST request with a CSRF token; confirm that opening this URL in a
        // logged-in browser still performs the edit.
        if (!Desktop.isDesktopSupported()
                || !Desktop.getDesktop().isSupported(Desktop.Action.BROWSE)) {
            throw new IOException("Cannot open a browser on this platform to tag " + title);
        }
        try {
            Desktop.getDesktop().browse(new URI(TAG_URL_PREFIX + title + TAG_URL_SUFFIX));
        } catch (URISyntaxException e) {
            // A single malformed title should not stop the remaining pages.
            System.err.println("Skipping page with invalid title '" + title + "': "
                    + e.getMessage());
        }
    }

    /**
     * Parses out an article title from the HTML in Special:NewPages.
     *
     * @param s a piece of the HTML of Special:NewPages that starts at a
     *        {@code <li>} tag
     * @return the text between 32 characters after the start of {@code s} and
     *         the next {@code &} sign, or an empty string if {@code s} is too
     *         short or contains no {@code &} after that offset
     */
    public static String parseFromNewPages(String s) {
        return titleAt(s, 0);
    }

    /** Extracts the title of the listing entry whose {@code <li>} starts at the given index. */
    private static String titleAt(String html, int listItemStart) {
        int start = listItemStart + TITLE_OFFSET;
        // indexOf returns -1 when start is past the end, so short input is safe.
        int end = html.indexOf('&', start);
        return end < 0 ? "" : html.substring(start, end);
    }

    /**
     * Gets the wiki markup content of an article from the HTML of the edit window.
     *
     * @param article the HTML of the edit window of an article
     * @return the text between the {@code "wpTextbox1">} marker and the next
     *         {@code </textarea>} (closing tag excluded) with basic HTML
     *         entities decoded, or an empty string if either delimiter is missing
     */
    public static String parseArticle(String article) {
        int marker = article.indexOf(TEXTBOX_MARKER);
        if (marker < 0) {
            return "";
        }
        int start = marker + TEXTBOX_MARKER.length();
        int end = article.indexOf(TEXTAREA_END, start);
        if (end < 0) {
            return "";
        }
        return unescapeHtml(article.substring(start, end));
    }

    /** Decodes the basic HTML entities; {@code &amp;} goes last to avoid double decoding. */
    private static String unescapeHtml(String html) {
        return html.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", "\"")
                .replace("&#039;", "'")
                .replace("&amp;", "&");
    }

    /**
     * Check if the bot should tag the page as unreferenced or not.
     *
     * <p>A page is eligible when it has none of: footnotes (a reflist together
     * with a {@code <ref>} tag), a references heading, any "http" link, a
     * further reading section, a speedy deletion ({{db...}}) tag, an existing
     * {{unreferenced}} tag, a {{bots}} template, or the word "disambiguation".
     * The comparison is case-insensitive.
     *
     * @param article the wiki markup of an article
     * @return true if the article should be tagged as unreferenced
     */
    public static boolean isEligibleForTagging(String article) {
        // Locale.ROOT keeps the lower-casing independent of the default locale.
        String text = article.toLowerCase(Locale.ROOT);
        // "<ref " also catches named references such as <ref name="x">.
        boolean hasFootnotes = text.contains("reflist")
                && (text.contains("<ref>") || text.contains("<ref "));
        boolean hasReferencesHeading = text.contains("==references==")
                || text.contains("== references ==");
        return !hasFootnotes
                && !hasReferencesHeading
                && !text.contains("http")
                && !text.contains("further reading")
                && !text.contains("{{db")
                && !text.contains("{{unreferenced")
                && !text.contains("{{bots}}")
                && !text.contains("disambiguation");
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement