View difference between Paste ID: miRVsakD and kx1qCYC8
SHOW: | | - or go back to the newest paste.
1
Jakebot code below...
2
package jakebot;
3
4
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
9
10
/**
11
 * A bot to tag new unreferenced articles on the English Wikipedia.
12
 * @author King jakob c 2
13
 */
14
public class Jakebot {

    /** Opening tag that precedes every entry on Special:NewPages. */
    private static final String LIST_ITEM_TAG = "<li>";

    /** The first five {@code <li>} tags on Special:NewPages are not article entries. */
    private static final int LEADING_NON_ARTICLE_LI_TAGS = 5;

    /** Number of characters between the start of an {@code <li>} tag and the article title. */
    private static final int TITLE_OFFSET = 32;

    /** Text that immediately precedes the wiki markup in the HTML of the edit window. */
    private static final String TEXTBOX_MARKER = "\"wpTextbox1\">";

    /** Tag that terminates the wiki markup in the HTML of the edit window. */
    private static final String TEXTAREA_END = "</textarea>";

    /**
     * Scans Special:NewPages, loads the edit window of every listed article and
     * opens the "append {{Unreferenced}}" API URL for each article that
     * {@link #isEligibleForTagging(String)} accepts.
     *
     * @param args the command line arguments (unused)
     * @throws MalformedURLException if one of the built URLs is malformed
     * @throws IOException if a page cannot be read or the browser cannot be started
     */
    public static void main(String[] args) throws MalformedURLException, IOException {
        //Special:NewPages
        URL newpages = new URL("https://en.wikipedia.org/w/index.php?title=Speci"
                + "al:NewPages&offset=&limit=500");
        String wholepage = fetchPage(newpages); //The HTML of Special:NewPages

        //Checking if each page is unreferenced and then tagging it if so.
        for (String title : extractNewPageTitles(wholepage)) {
            if (title.isEmpty()) {
                //No title could be parsed out of this <li> entry.
                continue;
            }

            //Loading up the edit window of a page to get the wiki markup.
            URL anewpage = new URL("https://en.wikipedia.org/w/index.php?ti"
                    + "tle=" + title + "&action=edit");

            //cleanarticle = the page with the wiki markup, not HTML.
            String cleanarticle = parseArticle(fetchPage(anewpage));
            if (cleanarticle.isEmpty()) {
                //Either the page is blank or the edit box could not be found;
                //never tag a page whose content could not be inspected.
                continue;
            }

            //Use the APISandbox to tag as unreferenced, assuming it is.
            if (isEligibleForTagging(cleanarticle)) {
                //NOTE(review): cmd.exe treats an unquoted '&' as a command
                //separator; confirm the whole URL reaches the browser intact.
                Process p = new ProcessBuilder("cmd", "/c", "start",
                        buildTagUrl(title)).start();
                p.destroy(); //and close the window
            }
        }
    }

    /**
     * Downloads a page and returns its content as a single string. The lines
     * are joined without any separator.
     * Adapted from
     * http://stackoverflow.com/questions/6188901/reading-the-content-of-web-page
     *
     * @param url the page to download
     * @return the content of the page with the line breaks removed
     * @throws IOException if the page cannot be read
     */
    private static String fetchPage(URL url) throws IOException {
        StringBuilder page = new StringBuilder();
        //try-with-resources so the connection is closed even when reading fails
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
        }
        return page.toString();
    }

    /**
     * Collects the article titles listed in the HTML of Special:NewPages.
     * Each {@code <li>} tag except for the first 5 precedes an article.
     *
     * @param html the HTML of Special:NewPages
     * @return one entry per article {@code <li>} tag, in page order; an entry
     *         is empty when no title could be parsed from that tag
     */
    private static List<String> extractNewPageTitles(String html) {
        List<String> titles = new ArrayList<String>();
        int litags = 0;
        int at = html.indexOf(LIST_ITEM_TAG);
        while (at >= 0) {
            litags++;
            if (litags > LEADING_NON_ARTICLE_LI_TAGS) {
                titles.add(parseFromNewPages(html.substring(at)));
            }
            at = html.indexOf(LIST_ITEM_TAG, at + 1);
        }
        return titles;
    }

    /**
     * Builds the API URL that appends {{Unreferenced}} to a page.
     * NOTE(review): the MediaWiki edit API normally expects a POST request with
     * an edit token; confirm that opening this URL in a browser saves the edit.
     *
     * @param title the article title, already in URL form as taken from
     *              Special:NewPages
     * @return the URL of the edit request; it contains no whitespace
     */
    private static String buildTagUrl(String title) {
        //The title parameter is the page title, not the page content, and the
        //spaces of the summary are percent-encoded so the URL stays one argument.
        return "https://en.wikipedia.org/w/api.php?action=edit&format="
                + "json&title=" + title + "&summary=Tagging%20unref"
                + "erenced%20article%20(%5B%5BWP%3ABOT%7CBot%20edit"
                + "%5D%5D)&bot=&appendtext={{Unreferenced}}&assert=bot&"
                + "prop=info";
    }

    /**
     * Parses out an article title from the HTML in Special:NewPages.
     *
     * @param s a piece of the HTML of Special:NewPages that starts at an
     *          {@code <li>} tag
     * @return the text between 32 characters after the start of {@code s} and
     *         the next {@code &} sign, or an empty string if {@code s} is
     *         {@code null}, too short, or has no {@code &} sign after the title
     */
    public static String parseFromNewPages(String s) {
        if (s == null || s.length() <= TITLE_OFFSET) {
            return "";
        }
        //There are 32 characters between the <li> and the start of the article
        //title; the title ends at the next & sign.
        int end = s.indexOf('&', TITLE_OFFSET);
        if (end < 0) {
            return ""; //this should not be reached
        }
        return s.substring(TITLE_OFFSET, end);
    }

    /**
     * Gets the wiki markup content of an article from the HTML of the edit
     * window. The markup is the text between {@code "wpTextbox1">} and the
     * following {@code </textarea>}; the closing tag is not part of the result
     * and HTML entities are turned back into the characters they stand for.
     *
     * @param article the HTML of the edit window of an article
     * @return wiki markup of an article, or an empty string if
     *         {@code article} is {@code null} or the edit box cannot be found
     */
    public static String parseArticle(String article) {
        if (article == null) {
            return "";
        }
        //Begin here.
        int marker = article.indexOf(TEXTBOX_MARKER);
        if (marker < 0) {
            return "";
        }
        int start = marker + TEXTBOX_MARKER.length();
        int end = article.indexOf(TEXTAREA_END, start);
        if (end < 0) {
            return "";
        }
        return unescapeHtml(article.substring(start, end));
    }

    /**
     * Replaces the HTML entities for {@code <}, {@code >}, {@code "},
     * {@code '} and {@code &} by the characters themselves.
     *
     * @param html text taken from an HTML document
     * @return the text with those entities decoded
     */
    private static String unescapeHtml(String html) {
        //&amp; is decoded last so that "&amp;lt;" becomes "&lt;" and not "<".
        return html.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", "\"")
                .replace("&#039;", "'")
                .replace("&amp;", "&");
    }

    /**
     * Check if the bot should tag the page as unreferenced or not.
     * The check ignores upper and lower case, so the {{Unreferenced}} tag that
     * the bot itself appends is recognised as well.
     *
     * @param article the wiki markup of an article
     * @return true if the article should be tagged as unreferenced; false if
     *         {@code article} is {@code null}, has footnotes with a reflist, a
     *         references section, external links or further reading, is already
     *         tagged as unreferenced, is up for deletion, carries {{bots}}, or
     *         is a disambiguation page
     */
    public static boolean isEligibleForTagging(String article) {
        if (article == null) {
            return false;
        }
        //Locale.ROOT keeps the lower-casing independent of the machine's locale.
        String text = article.toLowerCase(Locale.ROOT);

        boolean hasFootnotes = text.contains("reflist") && text.contains("<ref>");
        return !hasFootnotes
                && !text.contains("==references==")
                && !text.contains("http")
                && !text.contains("further reading")
                && !text.contains("{{db")
                && !text.contains("{{unreferenced")
                && !text.contains("{{bots}}")
                && !text.contains("disambiguation");
    }
}