// (pastebin artifact) This file was recovered from a pastebin diff view
// between paste IDs KKDWUvGm and KE48mJFV; the newer revision follows.
package jakebot;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * A bot to tag new unreferenced articles on the English Wikipedia.
 *
 * <p>It pulls the 500 newest mainspace articles from the MediaWiki
 * recent-changes API, fetches each article's wikitext through the edit
 * window, and — when the article shows no sign of referencing — fires an
 * edit-API request that prepends {{Unreferenced}} (or {{BLP unsourced}}
 * for articles about living people).
 *
 * @author King jakob c 2
 */
public class Jakebot {

    /**
     * Entry point: scans the newest articles and tags the unreferenced ones.
     *
     * @param args the command line arguments (unused)
     * @throws MalformedURLException if a query URL is malformed
     * @throws IOException if a network read fails
     */
    public static void main(String[] args) throws MalformedURLException, IOException {
        // API query for the 500 newest mainspace (rcnamespace=0) articles.
        URL newpages = new URL("https://en.wikipedia.org/w/api.php?action=query"
                + "&list=recentchanges&rctype=new&rcprop=title|timestamp&"
                + "rcnamespace=0&rclimit=500");
        BufferedReader newpagesreader = new BufferedReader(
                new InputStreamReader(newpages.openStream()));
        String inputLine;
        String wholepage = ""; // This will contain the raw new-pages API response.

        while ((inputLine = newpagesreader.readLine()) != null) {
            wholepage += inputLine;
        }
        newpagesreader.close(); // bug fix: the reader was never closed

        // The names of the 500 newest articles.
        String[] newpageslist = new String[500];
        int newpagesfilled = 0;

        // Each "title=" in the response precedes an article name.
        for (int i = 0; i + 5 < wholepage.length()
                && newpagesfilled < newpageslist.length; i++) {
            if (wholepage.charAt(i) == 't' && wholepage.charAt(i + 1) == 'i'
                    && wholepage.charAt(i + 2) == 't' && wholepage.charAt(i + 3) == 'l'
                    && wholepage.charAt(i + 4) == 'e' && wholepage.charAt(i + 5) == '=') {
                newpageslist[newpagesfilled] = parseFromNewPages(wholepage.substring(i));
                newpagesfilled++;
            }
        }

        // Checking if each page is unreferenced and then tagging it if it is.
        for (int i = 0; i < newpageslist.length; i++) {
            // Bug fix: slots past newpagesfilled hold real nulls, so the old
            // newpageslist[i].equals("null") check could throw NPE.
            if (newpageslist[i] == null || newpageslist[i].equals("null")) {
                continue;
            }
            // Loading up the edit window of a page to get the wiki markup.
            URL anewpage = new URL("https://en.wikipedia.org/w/index.php?ti"
                    + "tle=" + newpageslist[i] + "&action=edit");
            BufferedReader pagereader = new BufferedReader(
                    new InputStreamReader(anewpage.openStream()));
            String inputLine2;
            String article = "";

            while ((inputLine2 = pagereader.readLine()) != null) {
                article += inputLine2;
            }
            pagereader.close(); // bug fix: the reader was never closed

            // cleanarticle = the page's wiki markup, not HTML.
            String cleanarticle = parseArticle(article);

            // The eligibility / BLP / already-edited queries were previously
            // repeated per branch (duplicate network calls) and the BLP branch
            // was missing the !alreadyedited guard; hoisted and unified here.
            if (isEligibleForTagging(cleanarticle, newpageslist[i])
                    && !alreadyedited(newpageslist[i])) {
                // If it is a BLP, tag it with {{BLP unsourced}} instead.
                String template = noblpcat(newpageslist[i])
                        ? "{{Unreferenced}}"
                        : "{{BLP unsourced}}";
                // Bug fix: the non-BLP branch previously built the edit URL
                // with title=cleanarticle (the page CONTENT) instead of the
                // page title.
                tag(newpageslist[i], template);
            }
        }
    }

    /**
     * Fires the edit-API request that prepends {@code template} to the page.
     *
     * <p>The request is routed through the Windows URL handler, as the
     * original design did. NOTE(review): a direct HttpURLConnection with an
     * authenticated session would be more robust and portable.
     *
     * @param title the exact article title to edit
     * @param template the template wikitext to prepend
     * @throws IOException if the OS command cannot be started
     */
    private static void tag(String title, String template) throws IOException {
        String url = "https://en.wikipedia.org/w/api.php?action=edit&format="
                + "json&title=" + title + "&summary=Tagging%20short"
                + "%20article%20as%20stub%20(%5B%5BWP%3ABOT%7CBot%20edit"
                + "%5D%5D)&bot=&prependtext=" + template + "&assert=bot&"
                + "prop=info";
        // Bug fix: the original concatenated the URL directly onto
        // "...FileProtocolHandler" with no separating space, producing a
        // malformed command. The String[] overload avoids shell re-parsing.
        Runtime.getRuntime().exec(new String[]{
            "rundll32", "url.dll,FileProtocolHandler", url});
    }

    /**
     * Parses out an article title from a chunk of the API response.
     *
     * @param s response text beginning at a {@code title=} match
     * @return the text between index 1 and the first '&' (exclusive), or ""
     *         if no '&' is found
     */
    public static String parseFromNewPages(String s) {
        // NOTE(review): collection starts at index 1 — one character past the
        // 't' of "title=" — so the "itle=" prefix ends up in the result.
        // Confirm against the real API response whether index 6 was intended.
        int amp = s.indexOf('&', 1);
        if (amp < 0) {
            return ""; // no terminating '&' in this chunk
        }
        return s.substring(1, amp);
    }

    /**
     * Gets the wiki markup content of an article from the HTML of the edit
     * window.
     *
     * @param article the HTML of the edit window of an article
     * @return the textarea content up to and including "</textarea>"; the
     *         remainder of the input if no closing tag exists; "" if the
     *         edit box marker is absent
     */
    public static String parseArticle(String article) {
        // The wikitext begins right after the "wpTextbox1"> marker.
        int beginpage = article.indexOf('"' + "wpTextbox1" + '"' + ">");
        if (beginpage < 0) {
            // Bug fix: the original scanned from a bogus offset (and could
            // throw) when the marker was missing.
            return "";
        }
        int start = beginpage + 13; // 13 == length of "wpTextbox1"> marker
        // Bug fix: the original appended one char at a time and re-scanned
        // the accumulator with contains() (accidental O(n^2)), and ran off
        // the end of the string when no closing tag was present.
        int end = article.indexOf("</textarea>", start);
        if (end < 0) {
            return article.substring(start);
        }
        return article.substring(start, end + "</textarea>".length());
    }

    /**
     * Decides whether an article should be tagged as unreferenced.
     *
     * <p>If the wikitext shows no reflist, no ref tags, no {{sfn}}, no
     * references/notes/citations/sources section, no {@code <references>} or
     * {{refbegin}}/{{noteslist}}, and the page has no external links and no
     * existing unreferenced/BLP template keyword, it is eligible. Pages
     * carrying {{nobots}} are never touched.
     *
     * @param article the wiki markup of an article
     * @param title the article title (used for the external-links query)
     * @return true if the article should be tagged as unreferenced
     * @throws IOException if the external-links API query fails
     */
    public static boolean isEligibleForTagging(String article, String title) throws IOException {
        article = article.toLowerCase();
        // NOTE(review): "referneces" below is a misspelling carried over from
        // the original heading checks — confirm which spelling variants the
        // heading scan is actually meant to cover.
        return !article.contains("reflist")
                && !article.contains("<ref>") && !article.contains("<ref name") && !article.contains("<ref group")
                && !article.contains("{{sfn")
                && !article.contains("=referneces=") && !article.contains("= references =")
                && !article.contains("= referneces=") && !article.contains("=references =")
                && !article.contains("=notes=") && !article.contains("== notes ==")
                && !article.contains("=notes =") && !article.contains("== notes==")
                && !article.contains("=citations=") && !article.contains("= citations =")
                && !article.contains("=citations =") && !article.contains("= citations=")
                && !article.contains("=sources=") && !article.contains("= sources =")
                && !article.contains("=sources =") && !article.contains("= sources=")
                && !article.contains("<references") && !article.contains("{{refbegin")
                && !article.contains("{{noteslist")
                && noextlinks(title)
                && !noblpkeywords(article)
                && !article.contains("{{nobots}}");
    }

    /**
     * Uses a Wikipedia API query to check whether a page is a disambiguation
     * page (detected via Template:Dmbox in its template list).
     *
     * @param title article title
     * @return true if the page is not a disambiguation page
     * @throws MalformedURLException if the query URL is malformed
     * @throws IOException if the query fails
     */
    public static boolean nodabs(String title) throws MalformedURLException, IOException {
        URL u = new URL("https://en.wikipedia.org/w/api.php?action=query&prop="
                + "templates&title=" + title);
        BufferedReader dabsearch = new BufferedReader(
                new InputStreamReader(u.openStream()));
        String inputLine;
        String templates = ""; // accumulates the templates query response

        while ((inputLine = dabsearch.readLine()) != null) {
            templates += inputLine;
        }
        dabsearch.close(); // bug fix: the reader was never closed
        return !templates.contains("Template:Dmbox");
    }

    /**
     * Uses a Wikipedia API query to search for external links in an article.
     *
     * @param title article title
     * @return true if there are no external links
     * @throws MalformedURLException if the query URL is malformed
     * @throws IOException if the query fails
     */
    public static boolean noextlinks(String title) throws MalformedURLException, IOException {
        URL u = new URL("https://en.wikipedia.org/w/api.php?action=query&pr"
                + "op=extlinks&titles=" + title);
        BufferedReader linksearch = new BufferedReader(
                new InputStreamReader(u.openStream()));
        String inputLine;
        String links = ""; // accumulates the extlinks query response

        while ((inputLine = linksearch.readLine()) != null) {
            links += inputLine;
        }
        linksearch.close(); // bug fix: the reader was never closed
        return !links.contains("<el xml:space=\"preserve\">");
    }

    /**
     * Scans wikitext for template names indicating the page is already
     * tagged as unreferenced (or unreferenced-BLP), so it is not tagged twice.
     *
     * <p>NOTE(review): the caller passes a lower-cased article, but these
     * keywords are mixed-case and the comparison is case-sensitive, so most
     * of them can never match — confirm the intended casing before relying
     * on this filter.
     *
     * @param article wikitext to scan
     * @return true if any known unreferenced/BLP template keyword appears
     */
    public static boolean noblpkeywords(String article) {
        String[] keywords = {"UnsourcedBLP", "BLPunreferenced", "Unreferencedblp",
            "Blpunsourced", "BLPunsourced", "Unsourcedblp", "BLPUnreferenced",
            "Unsourced BLP", "BLP unreferenced", "Blpunref", "Unreferenced BLP",
            "Blpunreferenced", "UnreferencedBLP", "BLPUnsourced", "Unreferenced blp",
            "BLP Unreferenced", "Blp-unreferenced", "Userspace BLP", "Unreferenced-blp",
            "Unreferenced-BLP", "Blpnoref", "Blp unreferenced", "BLPnoref", "Unref BLP", "Blp unsourced", "Urblp", "Ublp", "Blp-unsourced", "BLPunref", "Unsourced-blp",
            "Noref-blp", "Unsourced blp",
            "Unsourced", "Unverified", "Unref", "References", "Uncited-article",
            "Citesources", "NR", "No references", "Unrefarticle", "Unreferenced article",
            "Noref", "Norefs", "Noreferences", "Cleanup-cite", "References needed",
            "Nr", "No refs", "UnreferencedArticle", "No ref", "Unreferenced stub",
            "Needs references", "Noreference", "No reference", "Refsneeded", "Refs needed",
            "Ref needed", "Nosources", "No sources", "UNref", "UNREF", "Unr"};
        for (String keyword : keywords) {
            if (article.contains(keyword)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Uses a categories API query to check whether a page is about a living
     * person.
     *
     * @param title article title
     * @return true if the page is NOT in Category:Living people
     * @throws MalformedURLException if the query URL is malformed
     * @throws IOException if the query fails
     */
    public static boolean noblpcat(String title) throws MalformedURLException, IOException {
        URL u = new URL("https://en.wikipedia.org/w/api.php?action=query&prop=categories&titles="
                + title);
        BufferedReader catsearch = new BufferedReader(
                new InputStreamReader(u.openStream()));
        String inputLine;
        String cats = ""; // accumulates the categories query response

        while ((inputLine = catsearch.readLine()) != null) {
            cats += inputLine;
        }
        catsearch.close(); // bug fix: the reader was never closed
        // Bug fix: the original required the spaced variant to be PRESENT
        // ("&& cats.contains(\"Category: Living people\")"), which made this
        // return false for nearly every page; both variants must be absent.
        return !cats.contains("Category:Living people")
                && !cats.contains("Category: Living people");
    }

    /**
     * Checks the most recent 500 revisions of a page for an edit by this bot.
     *
     * @param title article title
     * @return true if Jakebot appears among the recent revision authors
     * @throws IOException if the revisions query fails
     */
    public static boolean alreadyedited(String title) throws IOException {
        URL u = new URL("https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=user&rvlimit=500&titles="
                + title);
        BufferedReader botsearch = new BufferedReader(
                new InputStreamReader(u.openStream()));
        String inputLine;
        String users = ""; // accumulates the revisions query response

        while ((inputLine = botsearch.readLine()) != null) {
            users += inputLine;
        }
        botsearch.close(); // bug fix: the reader was never closed
        return users.contains("Jakebot");
    }
}