// Jakebot code below...
2 | package jakebot; | |
3 | ||
4 | import java.io.BufferedReader; | |
5 | import java.io.IOException; | |
6 | import java.io.InputStreamReader; | |
7 | import java.net.MalformedURLException; | |
8 | import java.net.URL; | |
9 | ||
/**
 * A bot that tags new unreferenced articles on the English Wikipedia.
 *
 * <p>It scrapes Special:NewPages for the newest article titles, fetches each
 * article's wiki markup through its edit window, and — when nothing in the
 * markup suggests the article is referenced — appends an {@code {{Unreferenced}}}
 * template through the MediaWiki API.
 *
 * @author King jakob c 2
 */
public class Jakebot {

    /** Number of entries requested from Special:NewPages. */
    private static final int MAX_NEW_PAGES = 500;

    /**
     * Entry point: scrape Special:NewPages, then check and tag each new page.
     *
     * @param args the command line arguments (unused)
     * @throws IOException if fetching Special:NewPages or an edit page fails
     */
    public static void main(String[] args) throws MalformedURLException, IOException {
        // Fetch the raw HTML of Special:NewPages (the 500 newest articles).
        URL newpages = new URL("https://en.wikipedia.org/w/index.php?title=Speci"
                + "al:NewPages&offset=&limit=500");
        String wholepage = readAll(newpages);

        // The names of the newest articles. Slots that never get filled
        // remain null and are skipped below.
        String[] newpageslist = new String[MAX_NEW_PAGES];

        // Each <li> tag except for the first 5 precedes an article title.
        int litags = 0;
        int newpagesfilled = 0;
        int idx = wholepage.indexOf("<li>");
        // Bound by the array length: the original code could overrun the
        // array if the page ever contained more than 505 <li> tags.
        while (idx >= 0 && newpagesfilled < newpageslist.length) {
            litags++;
            if (litags > 5) {
                // The content between 32 characters after the <li> and the
                // next '&' sign is the name of the article.
                newpageslist[newpagesfilled++] = parseFromNewPages(wholepage.substring(idx));
            }
            idx = wholepage.indexOf("<li>", idx + 1);
        }

        // Checking if each page is unreferenced and then tagging it if so.
        for (String title : newpageslist) {
            // Unfilled slots are Java null (the original .equals("null") check
            // threw a NullPointerException on them); the scrape can also yield
            // the literal string "null", which we skip as before.
            if (title == null || title.equals("null")) {
                continue;
            }

            // Loading up the edit window of a page to get the wiki markup.
            URL anewpage = new URL("https://en.wikipedia.org/w/index.php?ti"
                    + "tle=" + title + "&action=edit");
            String article = readAll(anewpage);

            // cleanarticle = the page's wiki markup, not HTML.
            String cleanarticle = parseArticle(article);

            // Use the APISandbox to tag as unreferenced, assuming it is.
            // NOTE(review): launching a browser via "cmd /c start" is
            // Windows-only and immediately destroying the process may race
            // the request — consider a proper HTTP POST instead.
            if (isEligibleForTagging(cleanarticle)) {
                Process p = Runtime.getRuntime().exec("cmd /c start "
                        + "https://en.wikipedia.org/w/api.php?action=edit&format="
                        + "json&title=" + cleanarticle + "&summary=Tagging unref"
                        + "erenced article(%5B%5BWP%3ABOT%7CBot%20edit"
                        + "%5D%5D)&bot=&appendtext={{Unreferenced}}&assert=bot&"
                        + "prop=info");
                p.destroy(); // and close the window
            }
        }
    }

    /**
     * Reads the entire contents of a URL into one string (lines concatenated
     * without separators, matching the original read loops).
     *
     * @param url the URL to fetch
     * @return the concatenated lines of the response body
     * @throws IOException if the stream cannot be opened or read
     */
    private static String readAll(URL url) throws IOException {
        StringBuilder content = new StringBuilder();
        // try-with-resources: the original never closed its readers.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(url.openStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        }
        return content.toString();
    }

    /**
     * Parses out an article title from the HTML in Special:NewPages.
     *
     * @param s a piece of the HTML of Special:NewPages, starting at "<li>"
     * @return the article name (the text between character 32 and the next
     *         '&'), or "" if no '&' is found
     */
    public static String parseFromNewPages(String s) {
        // There are 32 characters between the <li> and the start of the
        // article title; the title runs until the next '&' sign.
        StringBuilder cleanpagename = new StringBuilder();
        for (int i = 32; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == '&') {
                return cleanpagename.toString();
            }
            cleanpagename.append(c);
        }
        return ""; // no terminating '&' — malformed input
    }

    /**
     * Gets the wiki markup content of an article from the HTML of the edit
     * window (the contents of the "wpTextbox1" textarea).
     *
     * @param article the HTML of the edit window of an article
     * @return the wiki markup, including the trailing "&lt;/textarea&gt;" tag
     *         (preserving the original behavior), or "" if the textarea
     *         marker is absent
     */
    public static String parseArticle(String article) {
        final String marker = "\"wpTextbox1\">";
        final String endTag = "</textarea>";

        int begin = article.indexOf(marker);
        if (begin < 0) {
            // Original code indexed from -1 here and crashed; treat a page
            // with no edit box as having no markup.
            return "";
        }
        int start = begin + marker.length(); // marker is 13 chars long

        int end = article.indexOf(endTag, start);
        if (end < 0) {
            // Unterminated textarea: original looped until it ran off the end
            // of the string. Return what we have instead.
            return article.substring(start);
        }
        // Original behavior: the closing tag is included in the result.
        return article.substring(start, end + endTag.length());
    }

    /**
     * Check if the bot should tag the page as unreferenced or not.
     *
     * <p>Returns true only when nothing suggests the article is referenced
     * (no reflist with &lt;ref&gt; tags, no ==references== heading, no
     * external links, no "further reading") and nothing forbids tagging
     * (a speedy-deletion tag, an existing {{unreferenced}} tag, a {{bots}}
     * opt-out, or a disambiguation page).
     *
     * <p>NOTE(review): matching is case-sensitive, so "{{Reflist}}" or
     * "==References==" are NOT detected — confirm whether the markup should
     * be lowercased before this check.
     *
     * @param article the wiki markup of an article
     * @return true if the article should be tagged as unreferenced
     */
    public static boolean isEligibleForTagging(String article) {
        return !(article.contains("reflist") && article.contains("<ref>"))
                && !article.contains("==references==") // was misspelled "referneces"
                && !article.contains("http")
                && !article.contains("further reading")
                && !article.contains("{{db")
                && !article.contains("{{unreferenced")
                && !article.contains("{{bots}}")
                && !article.contains("disambiguation");
    }
}