SHOW:
|
|
- or go back to the newest paste.
1 | package jakebot; | |
2 | ||
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
8 | ||
/**
 * A bot to tag new unreferenced articles on the English Wikipedia.
 *
 * <p>The bot lists recently created main-namespace pages, downloads the edit
 * form of each one to obtain its wiki markup, and, for pages that show no sign
 * of sourcing, opens an API edit request that prepends a maintenance template.
 *
 * @author King jakob c 2
 */
public class Jakebot {

    /** Entry point of the MediaWiki action API on the English Wikipedia. */
    private static final String API = "https://en.wikipedia.org/w/api.php";

    /** Text that immediately precedes the wiki markup in the edit form HTML. */
    private static final String TEXTAREA_OPEN = "\"wpTextbox1\">";

    /** Text that immediately follows the wiki markup in the edit form HTML. */
    private static final String TEXTAREA_CLOSE = "</textarea>";

    /** Attribute prefix that precedes each page title in the XML API output. */
    private static final String TITLE_ATTRIBUTE = " title=\"";

    /** Lower-case markup fragments whose presence means the article cites something. */
    private static final String[] REFERENCE_MARKERS = {
        "reflist", "<ref>", "<ref name", "<ref group", "{{sfn",
        "<references", "{{refbegin", "{{noteslist"
    };

    /** Lower-case section names that count as a sourcing section. */
    private static final String[] REFERENCE_SECTIONS = {
        "references", "notes", "citations", "sources"
    };

    /**
     * Names (and redirects) of the "unreferenced" and "BLP unsourced"
     * maintenance templates. Matching is case-sensitive, which is why several
     * capitalisation variants of the same name are listed.
     */
    private static final String[] TAG_KEYWORDS = {
        "UnsourcedBLP", "BLPunreferenced", "Unreferencedblp",
        "Blpunsourced", "BLPunsourced", "Unsourcedblp", "BLPUnreferenced",
        "Unsourced BLP", "BLP unreferenced", "Blpunref", "Unreferenced BLP",
        "Blpunreferenced", "UnreferencedBLP", "BLPUnsourced", "Unreferenced blp",
        "BLP Unreferenced", "Blp-unreferenced", "Userspace BLP", "Unreferenced-blp",
        "Unreferenced-BLP", "Blpnoref", "Blp unreferenced", "BLPnoref", "Unref BLP",
        "Blp unsourced", "Urblp", "Ublp", "Blp-unsourced", "BLPunref", "Unsourced-blp",
        "Noref-blp", "Unsourced blp",
        // Canonical name of the template this bot itself inserts.
        "BLP unsourced",
        "Unsourced", "Unverified", "Unref", "References", "Uncited-article",
        "Citesources", "NR", "No references", "Unrefarticle", "Unreferenced article",
        "Noref", "Norefs", "Noreferences", "Cleanup-cite", "References needed",
        "Nr", "No refs", "UnreferencedArticle", "No ref", "Unreferenced stub",
        "Needs references", "Noreference", "No reference", "Refsneeded", "Refs needed",
        "Ref needed", "Nosources", "No sources", "UNref", "UNREF", "Unr"
    };

    /**
     * Lists the newest articles and opens a tagging request for every one that
     * appears to be unreferenced.
     *
     * @param args the command line arguments (unused)
     * @throws MalformedURLException if a request URL cannot be built
     * @throws IOException if a page or API response cannot be read
     */
    public static void main(String[] args) throws MalformedURLException, IOException {
        // format=xml is requested explicitly: the API's default output is an
        // HTML pretty-print meant for people, not for parsing.
        String recentChanges = fetch(API + "?action=query&format=xml"
                + "&list=recentchanges&rctype=new&rcprop=title|timestamp&"
                + "rcnamespace=0&rclimit=500");

        for (String rawTitle : extractTitles(recentChanges)) {
            // Encode once; every helper below concatenates the title verbatim.
            String title = URLEncoder.encode(rawTitle, "UTF-8");

            // Loading up the edit window of a page to get the wiki markup.
            String editForm = fetch("https://en.wikipedia.org/w/index.php?ti"
                    + "tle=" + title + "&action=edit");
            String markup = parseArticle(editForm);

            // Evaluate eligibility once (it performs network requests) and
            // never tag a page this bot has already edited.
            if (!isEligibleForTagging(markup, title) || alreadyedited(title)) {
                continue;
            }

            // If it is a BLP, tag it with {{BLP unsourced}} instead.
            String template = noblpcat(title) ? "{{Unreferenced}}" : "{{BLP unsourced}}";
            openEditRequest(title, template);
        }
    }

    /**
     * Extracts the first run of characters, starting at index 1, that precedes
     * an ampersand.
     *
     * <p>Index 0 is always skipped. {@link #main} no longer relies on this
     * method; it is kept unchanged in behaviour for existing callers.
     *
     * @param s a piece of the new-pages listing
     * @return the characters from index 1 up to (not including) the first
     *         {@code '&'} found at index 1 or later, or the empty string when
     *         there is no such ampersand
     */
    public static String parseFromNewPages(String s) {
        int end = s.indexOf('&', 1);
        return end < 0 ? "" : s.substring(1, end);
    }

    /**
     * Gets the wiki markup content of an article from the HTML of the edit
     * window.
     *
     * <p>The markup is the text between the opening {@code wpTextbox1}
     * textarea tag and the next {@code </textarea>}. HTML entities are decoded
     * so that markup such as {@code <ref>} can be recognised by later checks.
     *
     * @param article the HTML of the edit window of an article
     * @return the decoded wiki markup, without the closing tag; the empty
     *         string if the textarea is not present
     */
    public static String parseArticle(String article) {
        int marker = article.indexOf(TEXTAREA_OPEN);
        if (marker < 0) {
            return "";
        }
        int start = marker + TEXTAREA_OPEN.length();
        int end = article.indexOf(TEXTAREA_CLOSE, start);
        // A truncated page has no closing tag; use whatever was received.
        String escaped = end < 0 ? article.substring(start) : article.substring(start, end);
        return unescapeEntities(escaped);
    }

    /**
     * Decides whether an article should be tagged as unreferenced.
     *
     * <p>If the article does not have any reference tags, a reflist template,
     * an {{sfn}} template, external links, a references section, or a notes,
     * citations or sources section, it is considered unreferenced. If it is a
     * disambiguation page, already tagged as unreferenced, or has {{nobots}},
     * it won't be tagged.
     *
     * @param article the wiki markup of an article
     * @param title the article title, already URL-encoded for a query string
     * @return true if the article should be tagged as unreferenced
     * @throws IOException if an API request fails
     */
    public static boolean isEligibleForTagging(String article, String title) throws IOException {
        // The keyword list is case-sensitive, so it must see the original text
        // rather than the lower-cased copy used for the remaining checks.
        if (noblpkeywords(article)) {
            return false;
        }

        String lower = article.toLowerCase(Locale.ROOT);
        for (String marker : REFERENCE_MARKERS) {
            if (lower.contains(marker)) {
                return false;
            }
        }
        for (String section : REFERENCE_SECTIONS) {
            if (hasSectionHeading(lower, section)) {
                return false;
            }
        }
        if (lower.contains("{{nobots}}")) {
            return false;
        }

        // Network checks go last so they only run for pages that passed every
        // local check.
        return noextlinks(title) && nodabs(title);
    }

    /**
     * Uses a Wikipedia API query to check whether a page transcludes the
     * disambiguation message box.
     *
     * @param title article title, already URL-encoded for a query string
     * @return true if the page is not a disambiguation page
     * @throws MalformedURLException if the request URL cannot be built
     * @throws IOException if the API response cannot be read
     */
    public static boolean nodabs(String title) throws MalformedURLException, IOException {
        String templates = fetch(API + "?action=query&format=xml&prop="
                + "templates&titles=" + title);
        return !templates.contains("Template:Dmbox");
    }

    /**
     * Uses a Wikipedia API query to search for external links in an article.
     *
     * @param title article title, already URL-encoded for a query string
     * @return true if there are no external links
     * @throws MalformedURLException if the request URL cannot be built
     * @throws IOException if the API response cannot be read
     */
    public static boolean noextlinks(String title) throws MalformedURLException, IOException {
        // The marker below is raw XML, hence the explicit format=xml.
        String links = fetch(API + "?action=query&format=xml&pr"
                + "op=extlinks&titles=" + title);
        return !links.contains("<el xml:space=\"preserve\">");
    }

    /**
     * Checks whether the text mentions one of the known "unreferenced" or
     * "BLP unsourced" template names.
     *
     * <p>Despite the name, this returns {@code true} when a keyword IS
     * present. The match is a plain, case-sensitive substring search.
     *
     * @param article the text to search, in its original capitalisation
     * @return true if any known template name occurs in the text
     */
    public static boolean noblpkeywords(String article) {
        for (String keyword : TAG_KEYWORDS) {
            if (article.contains(keyword)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Uses a Wikipedia API query to check whether a page is categorised as a
     * biography of a living person.
     *
     * @param title article title, already URL-encoded for a query string
     * @return true if the page is in neither spelling of the living-people
     *         category
     * @throws MalformedURLException if the request URL cannot be built
     * @throws IOException if the API response cannot be read
     */
    public static boolean noblpcat(String title) throws MalformedURLException, IOException {
        String cats = fetch(API + "?action=query&format=xml&prop=categories&titles="
                + title);
        return !cats.contains("Category:Living people")
                && !cats.contains("Category: Living people");
    }

    /**
     * Uses a Wikipedia API query to check whether this bot appears among the
     * users of the page's most recent revisions.
     *
     * @param title article title, already URL-encoded for a query string
     * @return true if the response mentions "Jakebot"
     * @throws IOException if the API response cannot be read
     */
    public static boolean alreadyedited(String title) throws IOException {
        String users = fetch(API
                + "?action=query&format=xml&prop=revisions&rvprop=user&rvlimit=500&titles="
                + title);
        return users.contains("Jakebot");
    }

    /**
     * Downloads a URL as UTF-8 text. Lines are joined without separators.
     */
    private static String fetch(String address) throws IOException {
        StringBuilder body = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(address).openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line);
            }
        }
        return body.toString();
    }

    /**
     * Collects the decoded value of every {@code title="..."} attribute in an
     * XML API response, in document order.
     */
    private static List<String> extractTitles(String xml) {
        List<String> titles = new ArrayList<String>();
        int from = 0;
        while (true) {
            int start = xml.indexOf(TITLE_ATTRIBUTE, from);
            if (start < 0) {
                break;
            }
            start += TITLE_ATTRIBUTE.length();
            int end = xml.indexOf('"', start);
            if (end < 0) {
                break;
            }
            titles.add(unescapeEntities(xml.substring(start, end)));
            from = end + 1;
        }
        return titles;
    }

    /**
     * Reports whether lower-cased markup contains a heading with the given
     * name, with or without a space on either side of the name.
     */
    private static boolean hasSectionHeading(String lowerArticle, String name) {
        return lowerArticle.contains("=" + name + "=")
                || lowerArticle.contains("= " + name + " =")
                || lowerArticle.contains("=" + name + " =")
                || lowerArticle.contains("= " + name + "=");
    }

    /**
     * Decodes the entities that HTML/XML escaping introduces. The ampersand
     * is decoded last so that already-escaped entities are not decoded twice.
     */
    private static String unescapeEntities(String text) {
        return text.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", "\"")
                .replace("&#039;", "'")
                .replace("&#39;", "'")
                .replace("&amp;", "&");
    }

    /**
     * Opens the API edit request that prepends a template to a page, using
     * the Windows URL handler.
     *
     * <p>NOTE(review): the edit summary still reads "Tagging short article as
     * stub"; confirm whether that wording is intended for these templates.
     */
    private static void openEditRequest(String title, String template) throws IOException {
        String url = API + "?action=edit&format="
                + "json&title=" + title + "&summary=Tagging%20short"
                + "%20article%20as%20stub%20(%5B%5BWP%3ABOT%7CBot%20edit"
                + "%5D%5D)&bot=&prependtext=" + URLEncoder.encode(template, "UTF-8")
                + "&assert=bot&"
                + "prop=info";
        // Separate arguments keep the URL from being glued to the handler
        // name or split on whitespace.
        new ProcessBuilder("rundll32", "url.dll,FileProtocolHandler", url).start();
    }
}