// Jakebot — source recovered from a pastebin diff view; the paste-site
// chrome and diff markers have been stripped and the newest (+) revision
// of each hunk reconstructed below.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * A bot to tag new unreferenced articles on the English Wikipedia.
 *
 * <p>Reads Special:NewPages, extracts the newest article titles, fetches each
 * article's wiki markup through the edit window, and — when the article
 * appears unreferenced — opens a Wikipedia API edit URL that appends
 * {{Unreferenced}} (or {{BLP Unsourced}} for biographies of living people).
 *
 * @author King jakob c 2
 */
public class Jakebot {

    /**
     * Entry point: scans Special:NewPages for the 500 newest articles and
     * tags those that appear to be unreferenced.
     *
     * @param args the command line arguments (unused)
     * @throws MalformedURLException if a constructed URL is malformed
     * @throws IOException if reading any page fails
     */
    public static void main(String[] args) throws MalformedURLException, IOException {
        // Special:NewPages. Fetching adapted from
        // http://stackoverflow.com/questions/6188901/reading-the-content-of-web-page
        URL newpages = new URL("https://en.wikipedia.org/w/index.php?title=Speci"
                + "al:NewPages&offset=&limit=500");
        BufferedReader newpagesreader = new BufferedReader(
                new InputStreamReader(newpages.openStream()));
        String inputLine;
        String wholepage = ""; // This will contain the HTML of Special:NewPages

        while ((inputLine = newpagesreader.readLine()) != null) {
            wholepage += inputLine;
        }
        newpagesreader.close(); // fix: don't leak the connection

        // The names of the 500 newest articles.
        String[] newpageslist = new String[500];

        // Each <li> tag except for the first 5 <li> tags precedes an article.
        int litags = 0;
        int newpagesfilled = 0;
        for (int i = 0; i < wholepage.length() - 4; i++) {
            if (wholepage.charAt(i) == '<' && wholepage.charAt(i + 1) == 'l'
                    && wholepage.charAt(i + 2) == 'i' && wholepage.charAt(i + 3) == '>') {
                litags++;
                // fix: bounds guard — extra <li> tags in the page chrome must
                // not overflow the 500-slot array.
                if (litags > 5 && newpagesfilled < newpageslist.length) {
                    // The content between 32 characters after the <li> and the
                    // next & sign is the name of the article.
                    newpageslist[newpagesfilled] =
                            parseFromNewPages(wholepage.substring(i));
                    newpagesfilled++;
                }
            }
        }

        // Check whether each page is unreferenced, and tag it if so.
        for (int i = 0; i < newpageslist.length; i++) {
            // fix: skip unfilled slots (null) as well as the literal "null"
            // entries that show up in the parsed list.
            if (newpageslist[i] == null || newpageslist[i].equals("null")) {
                continue;
            }

            // Load the edit window of the page to get the wiki markup.
            URL anewpage = new URL("https://en.wikipedia.org/w/index.php?ti"
                    + "tle=" + newpageslist[i] + "&action=edit");
            BufferedReader pagereader = new BufferedReader(
                    new InputStreamReader(anewpage.openStream()));
            String inputLine2;
            String article = "";

            while ((inputLine2 = pagereader.readLine()) != null) {
                article += inputLine2;
            }
            pagereader.close(); // fix: don't leak the connection

            // cleanarticle = the page as wiki markup, not HTML.
            String cleanarticle = parseArticle(article);

            boolean eligible = isEligibleForTagging(cleanarticle, newpageslist[i]);
            boolean blp = cleanarticle.contains("[[Category:Living people]]");

            // Use the API to tag the article as unreferenced, assuming it is.
            // NOTE(review): shelling out via "cmd /c start" is Windows-only
            // and fragile; an HTTP client POST would be more robust.
            if (eligible && !blp) {
                // fix: the edit URL's title parameter must be the article
                // title (newpageslist[i]), not the article's entire markup
                // (cleanarticle), matching the BLP branch below.
                Process p = Runtime.getRuntime().exec("cmd /c start " +
                        "https://en.wikipedia.org/w/api.php?action=edit&format="
                        + "json&title=" + newpageslist[i] + "&summary=Tagging%20short"
                        + "%20article%20as%20stub%20(%5B%5BWP%3ABOT%7CBot%20edit"
                        + "%5D%5D)&bot=&appendtext={{Unreferenced}}&assert=bot&"
                        + "prop=info");
                p.destroy(); // and close the window
            }
            // If it is a BLP, tag it with {{BLP Unsourced}} instead.
            if (eligible && blp) {
                Process p = Runtime.getRuntime().exec("cmd /c start " +
                        "https://en.wikipedia.org/w/api.php?action=edit&format="
                        + "json&title=" + newpageslist[i] + "&summary=Tagging%20short"
                        + "%20article%20as%20stub%20(%5B%5BWP%3ABOT%7CBot%20edit"
                        + "%5D%5D)&bot=&appendtext={{BLP Unsourced}}&assert=bot&"
                        + "prop=info");
                p.destroy(); // and close the window
            }
        }
    }

    /**
     * Parses out an article title from the HTML in Special:NewPages.
     *
     * @param s a piece of the HTML of Special:NewPages, starting at an &lt;li&gt; tag
     * @return the article name — the text between 32 characters after the
     *     start of {@code s} and the next '&amp;' — or "" if no '&amp;' follows
     */
    public static String parseFromNewPages(String s) {
        // There are 32 characters between the <li> and the start of the
        // article title; the title runs until the next '&'.
        final int start = 32;
        if (s.length() <= start) {
            return "";
        }
        int amp = s.indexOf('&', start);
        // fix: indexOf/substring replaces the O(n^2) char-by-char append;
        // identical result, including "" when no '&' is found.
        return (amp < 0) ? "" : s.substring(start, amp);
    }

    /**
     * Gets the wiki markup content of an article from the HTML of the edit window.
     *
     * @param article the HTML of the edit window of an article
     * @return the wiki markup of the article — the text between the
     *     wpTextbox1 textarea's opening tag and the closing &lt;/textarea&gt; —
     *     or "" if the textarea is not present
     */
    public static String parseArticle(String article) {
        // The markup begins immediately after: "wpTextbox1"> (13 characters).
        final String marker = "\"wpTextbox1\"" + ">";
        int beginpage = article.indexOf(marker);
        if (beginpage < 0) {
            // fix: the original looped forever / indexed out of bounds when
            // the marker was missing.
            return "";
        }
        int start = beginpage + marker.length();
        int end = article.indexOf("</textarea>", start);
        // If the closing tag is absent, return everything after the marker.
        return (end < 0) ? article.substring(start) : article.substring(start, end);
    }

    /**
     * Decides whether an article should be tagged as unreferenced.
     *
     * <p>If the article lacks a reflist (or variants), &lt;ref&gt; tags (or
     * variants), an {{sfn}} template, external links, further reading, a
     * references section, and a notes section, it is considered unreferenced.
     * If it is a disambiguation page, already tagged as unreferenced, or has
     * {{nobots}}, it won't be tagged.
     *
     * @param article the article's wiki markup
     * @param title the article's title, used for API queries
     * @return true if the article should be tagged as unreferenced
     * @throws IOException if an API query fails
     */
    public static boolean isEligibleForTagging(String article, String title) throws IOException {
        article = article.toLowerCase();
        return !article.contains("reflist")
                && !article.contains("<ref>") && !article.contains("<ref name")
                && !article.contains("{{sfn")
                // fix: "==referneces==" was a typo that could never match a
                // real "==References==" heading.
                && !article.contains("==references==") && !article.contains("== references ==")
                && !article.contains("==notes==") && !article.contains("== notes ==")
                && !article.contains("<references/>") && !article.contains("<references />")
                && noextlinks(title)
                && !article.contains("further reading")
                && nodabs(title)
                && !article.contains("{{nobots}}")
                && !article.contains("{{unreferenced");
    }

    /**
     * Uses a Wikipedia API query to search for a dmbox template.
     *
     * @param title article title
     * @return true if the page is not a disambiguation page
     * @throws MalformedURLException if the query URL is malformed
     * @throws IOException if the query fails
     */
    public static boolean nodabs(String title) throws MalformedURLException, IOException {
        // NOTE(review): title is not URL-encoded here; titles containing
        // spaces or reserved characters may break the query — confirm that
        // upstream titles are already encoded.
        // fix: the query API's parameter is "titles" (as in noextlinks),
        // not "title"; with "title" the parameter was ignored and every
        // page looked like a non-disambiguation.
        URL u = new URL("https://en.wikipedia.org/w/api.php?action=query&prop="
                + "templates&titles=" + title);
        BufferedReader dabsearch = new BufferedReader(
                new InputStreamReader(u.openStream()));
        String inputLine;
        String templates = "";

        while ((inputLine = dabsearch.readLine()) != null) {
            templates += inputLine;
        }
        dabsearch.close(); // fix: don't leak the connection
        return !templates.contains("Template:Dmbox");
    }

    /**
     * Uses a Wikipedia API query to search for external links in an article.
     *
     * @param title article title
     * @return true if there are no external links
     * @throws MalformedURLException if the query URL is malformed
     * @throws IOException if the query fails
     */
    public static boolean noextlinks(String title) throws MalformedURLException, IOException {
        URL u = new URL("https://en.wikipedia.org/w/api.php?action=query&pr"
                + "op=extlinks&titles=" + title);
        BufferedReader linksearch = new BufferedReader(
                new InputStreamReader(u.openStream()));
        String inputLine;
        String links = "";

        while ((inputLine = linksearch.readLine()) != null) {
            links += inputLine;
        }
        linksearch.close(); // fix: don't leak the connection
        return !links.contains("<el xml:space=\"preserve\">");
    }

}