import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * A bot to tag new unreferenced articles on the English Wikipedia.
 * @author King jakob c 2
 */
public class Jakebot {

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws MalformedURLException, IOException {
        //Special:NewPages
        //The page-reading code below was shamelessly lifted and adapted from
        //http://stackoverflow.com/questions/6188901/reading-the-content-of-web-page
        URL newpages = new URL("https://en.wikipedia.org/w/index.php?title=Speci"
                + "al:NewPages&offset=50&limit=500");
        BufferedReader newpagesreader = new BufferedReader(
                new InputStreamReader(newpages.openStream()));
        String inputLine;
        String wholepage = ""; //This will contain the HTML of Special:NewPages

        while ((inputLine = newpagesreader.readLine()) != null) {
            wholepage += inputLine;
        }

        //The names of the 50th to 550th newest articles
        String[] newpageslist = new String[500]; //500 matches the limit requested above

        //Each <li> tag except for the first 5 <li> tags precedes an article
        int litags = 0;
        int newpagesfilled = 0;
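
        //Note: the scan below looks for a literal "<li>" with no attributes;
        //a list item emitted as e.g. <li class=...> would be missed. This is
        //the same assumption about the page's HTML that the 32-character
        //offset in parseFromNewPages relies on.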
        for (int i = 0; i < wholepage.length() - 4; i++) {
            if (wholepage.charAt(i) == '<' && wholepage.charAt(i + 1) == 'l'
                    && wholepage.charAt(i + 2) == 'i' && wholepage.charAt(i + 3) == '>') {
                litags++;

                if (litags > 5) {
                    //The content between 32 characters after the <li>, and the
                    //next & sign is the name of the article.
                    newpageslist[newpagesfilled] =
                            parseFromNewPages(wholepage.substring(i));
                    newpagesfilled++;
                }
            }
        }

        //Checking whether each page is unreferenced, and tagging it if so.
        for (int i = 0; i < newpageslist.length; i++) {
            //Slots that were never filled stay null; skip them.
            if (newpageslist[i] != null) {
                //Loading up the edit window of a page to get the wiki markup.
                URL anewpage = new URL("https://en.wikipedia.org/w/index.php?ti"
                        + "tle=" + newpageslist[i] + "&action=edit");
                BufferedReader pagereader = new BufferedReader(
                        new InputStreamReader(anewpage.openStream()));
                String inputLine2;
                String article = "";

                while ((inputLine2 = pagereader.readLine()) != null) {
                    article += inputLine2;
                }

                //cleanarticle = the page with the wiki markup, not HTML.
                String cleanarticle = parseArticle(article);
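
                //Note: the tagging below runs "cmd /c start", which hands the
                //API URL to the Windows shell to open in the default browser;
                //the edit presumably relies on that browser's logged-in bot
                //session (and on the bot running under Windows).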

                //Use the APISandbox to tag the article as unreferenced,
                //assuming it is one.
                if(isEligibleForTagging(cleanarticle, newpageslist[i]) && !cleanarticle.contains("[[Category:Living people]]")){                
                Process p=Runtime.getRuntime().exec("cmd /c start " + 
                        "https://en.wikipedia.org/w/api.php?action=edit&format="
                        + "json&title=" + newpageslist[i]+"&summary=Tagging%20unref"
                        + "erenced%20article%20(%5B%5BWP%3ABOT%7CBot%20edit"
                        + "%5D%5D)&bot=&appendtext={{Unreferenced}}&assert=bot&"
                        + "prop=info");
                p.destroy(); //and close the window
                }
                //If it is a BLP, tag it with {{BLP Unsourced}} instead.
                if(isEligibleForTagging(cleanarticle, newpageslist[i]) && cleanarticle.contains("[[Category:Living people]]")){                
                Process p=Runtime.getRuntime().exec("cmd /c start " + 
                        "https://en.wikipedia.org/w/api.php?action=edit&format="
                        + "json&title=" + newpageslist[i]+"&summary=Tagging%20unref"
                        + "erenced%20BLP%20(%5B%5BWP%3ABOT%7CBot%20edit"
                        + "%5D%5D)&bot=&appendtext={{BLP Unsourced}}&assert=bot&"
                        + "prop=info");
                p.destroy(); //and close the window
                }
            }
        }
    }

    /**
     * Parses out an article title from the HTML in Special:NewPages
     * @param s a piece of the HTML of Special:NewPages
     * @return A properly formatted article name
     */
    public static String parseFromNewPages(String s) {
        String cleanpagename = ""; //this will be returned
        //There are 32 characters between the <li> and the start of the article
        //title.
        for (int i = 32; i < s.length(); i++) {
            //Add characters to cleanpagename until we hit the & sign.
            if (s.charAt(i) == '&') {
                return cleanpagename;
            } else {
                cleanpagename += s.charAt(i);
            }
        }
        return ""; //this should not be reached
    }
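
    //For illustration: parseFromNewPages(wholepage.substring(i)) is fed text
    //that begins at an "<li>"; the fixed 32-character skip and the '&'
    //terminator are empirical assumptions about how Special:NewPages rendered
    //each entry at the time (title text followed by an HTML entity).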

    /**
     * Gets the wiki markup content of an article from the HTML of the edit window
     * @param article the HTML of the edit window of an article
     * @return wiki markup of an article
     */
    public static String parseArticle(String article) {
        String articlecontent = "";
        //Begin at the "wpTextbox1" textarea, which holds the wiki markup.
        int beginpage = article.indexOf('"' + "wpTextbox1" + '"' + ">");

        //Adding the wiki markup one character at a time, starting 13 characters
        //past the start of the match (just after the closing '>'). Stop once
        //the closing </textarea> tag has been copied, and strip it off.
        while (true) {
            articlecontent += article.charAt(beginpage + 13);
            beginpage++;
            if (articlecontent.contains("</textarea>")) {
                return articlecontent.substring(0,
                        articlecontent.length() - "</textarea>".length());
            }
        }
    }

    /**
     * Decides whether an article is unreferenced and safe for the bot to tag.
     * @param article the wiki markup of an article
     * @param title the article title
     * @return true if the article should be tagged as unreferenced
     * @throws IOException
     */
    public static boolean isEligibleForTagging(String article, String title) throws IOException{
        //If the article lacks a reflist (or variants), <ref> tags (or variants),
        //an {{sfn}} template, external links, further reading, a references section,
        //or a notes section, it is considered unreferenced. If it is a disambiguation
        //page, already tagged as unreferenced, or has {{nobots}}, it won't
        //be tagged.
        article = article.toLowerCase();
        if(!article.contains("reflist") && 
                !article.contains("<ref>") && !article.contains("<ref name") &&
                !article.contains("{{sfn") &&
                !article.contains("==references==") && !article.contains("== references ==") &&
                !article.contains("==notes==") && !article.contains("== notes ==") &&
                !article.contains("<references/>") && !article.contains("<references />") &&
                noextlinks(title)
                && !article.contains("further reading") && nodabs(title)
                && !article.contains("{{nobots}}")
                && !article.contains("{{unreferenced")){
            return true;
        }
        return false;
    }
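
    //(The string tests above are cheap; because && short-circuits, the
    //network queries inside noextlinks and nodabs only run when every
    //preceding text check has already passed.)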

    /**
     * Uses a Wikipedia API query to search for a dmbox template
     * @param title article title
     * @return true if the page is not a disambiguation page
     * @throws MalformedURLException
     * @throws IOException 
     */
    public static boolean nodabs(String title) throws MalformedURLException, IOException{
        URL u = new URL("https://en.wikipedia.org/w/api.php?action=query&prop="
                + "templates&titles="+title);
        BufferedReader dabsearch = new BufferedReader(
                new InputStreamReader(u.openStream()));
        String inputLine;
        String templates = "";

        while ((inputLine = dabsearch.readLine()) != null) {
            templates += inputLine;
        }
        return !templates.contains("Template:Dmbox");   
    }
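
    //(Disambiguation pages transclude {{dmbox}} via the standard dab
    //templates, so "Template:Dmbox" appearing in the template list marks a
    //dab page. Searching the raw response as text assumes the API's default
    //output format spells the template name out that way.)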

    /**
     * Uses a Wikipedia API query to search for external links in an article.
     * @param title article title
     * @return true if there are no external links
     * @throws MalformedURLException
     * @throws IOException 
     */
    public static boolean noextlinks(String title) throws MalformedURLException, IOException{
        URL u = new URL("https://en.wikipedia.org/w/api.php?action=query&pr"
                + "op=extlinks&titles="+title);
        BufferedReader linksearch = new BufferedReader(
                new InputStreamReader(u.openStream()));
        String inputLine;
        String links = "";

        while ((inputLine = linksearch.readLine()) != null) {
            links += inputLine;
        }
        return !links.contains("<el xml:space=\"preserve\">");   
    }
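
    //(Each external link shows up as an <el xml:space="preserve"> element in
    //the XML-style response, so its absence is taken to mean the page has no
    //external links. This again assumes the default output format of the
    //time.)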
}