Advertisement
Guest User

Jakebot code (3)

a guest
Dec 21st, 2013
117
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. package jakebot;
  2.  
  import java.io.BufferedReader;
  import java.io.IOException;
  import java.io.InputStreamReader;
  import java.net.MalformedURLException;
  import java.net.URL;
  import java.util.ArrayList;
  import java.util.List;
  import java.util.Locale;
  8.  
  /**
   * A bot to tag new unreferenced articles on the English Wikipedia.
   *
   * <p>The bot downloads the Special:NewPages listing, pulls the article titles
   * out of its HTML, loads the edit window of every article to get at the wiki
   * markup, and opens an API edit URL that appends {{Unreferenced}} (or
   * {{BLP Unsourced}} for living people) when no sign of referencing is found.
   *
   * @author King jakob c 2
   */
  public class Jakebot {

      /** Address of the Special:NewPages listing the article titles are read from. */
      private static final String NEW_PAGES_URL =
              "https://en.wikipedia.org/w/index.php?title=Speci"
              + "al:NewPages&offset=50&limit=500";

      /** Tag that starts every list entry in the Special:NewPages HTML. */
      private static final String LIST_ITEM_TAG = "<li>";

      /** Number of leading {@code <li>} tags that do not precede an article. */
      private static final int LEADING_NON_ARTICLE_ITEMS = 5;

      /**
       * Characters between an {@code <li>} and the start of the article title.
       * Presumably the length of {@code <li><a href="/w/index.php?title=};
       * verify against the live HTML if titles come out mangled.
       */
      private static final int TITLE_OFFSET = 32;

      /** End of the opening tag of the edit box; the wiki markup follows it. */
      private static final String TEXTBOX_MARKER = "\"wpTextbox1\">";

      /** Closing tag that ends the wiki markup in the edit window. */
      private static final String TEXTBOX_END = "</textarea>";

      /** Category whose presence makes the bot use the BLP tag instead. */
      private static final String LIVING_PEOPLE_CATEGORY = "[[Category:Living people]]";

      /**
       * Lower-case markup fragments that rule an article out: signs of
       * referencing (reflist, ref tags, {{sfn}}, references/notes sections,
       * further reading), an existing unreferenced tag, or {{nobots}}.
       */
      private static final String[] DISQUALIFYING_MARKERS = {
          "reflist",
          "<ref>", "<ref name",
          "{{sfn",
          "==references==", "== references ==",
          "==notes==", "== notes ==",
          "<references/>", "<references />",
          "further reading",
          "{{nobots}}",
          "{{unreferenced",
          // The bot itself appends this tag; without the check a second run
          // would tag the same biography again.
          "{{blp unsourced",
      };

      /**
       * Runs one pass over Special:NewPages and tags every eligible article.
       *
       * @param args the command line arguments (unused)
       * @throws MalformedURLException if one of the assembled URLs is invalid
       * @throws IOException if a page cannot be downloaded or the edit command
       *         cannot be started
       */
      public static void main(String[] args) throws MalformedURLException, IOException {
          String newPagesHtml = fetch(new URL(NEW_PAGES_URL));

          for (String title : extractNewPageTitles(newPagesHtml)) {
              // Loading up the edit window of a page to get the wiki markup.
              // The title is inserted as found in the listing; it appears to be
              // taken from a link target, so it is not encoded a second time.
              String editWindowHtml = fetch(new URL("https://en.wikipedia.org/w/index.php?ti"
                      + "tle=" + title + "&action=edit"));
              String markup = parseArticle(editWindowHtml);

              // No markup means the page is empty or the edit window could not
              // be parsed; either way there is nothing that should be tagged.
              if (markup.isEmpty()) {
                  continue;
              }
              // Evaluated once per article: the check performs two API requests.
              if (!isEligibleForTagging(markup, title)) {
                  continue;
              }

              // Biographies of living people get {{BLP Unsourced}} instead. The
              // space is percent-encoded because Runtime.exec(String) splits its
              // command on whitespace, which would cut the URL in two.
              String template = markup.contains(LIVING_PEOPLE_CATEGORY)
                      ? "{{BLP%20Unsourced}}"
                      : "{{Unreferenced}}";

              // NOTE(review): "cmd /c start" is Windows-only and cmd may treat the
              // '&' characters in the URL as command separators; API edits also
              // normally require a POST request with an edit token. Confirm that
              // this launch mechanism really performs the edit.
              Process p = Runtime.getRuntime().exec("cmd /c start " + buildEditUrl(title, template));
              p.destroy(); //and close the window
          }
      }

      /**
       * Parses out an article title from the HTML in Special:NewPages.
       *
       * @param s a piece of the HTML of Special:NewPages that starts at an
       *          {@code <li>} tag
       * @return the text between the 32nd character and the next {@code &} sign,
       *         or an empty string if the input is too short or has no {@code &}
       */
      public static String parseFromNewPages(String s) {
          return titleAt(s, 0);
      }

      /**
       * Gets the wiki markup content of an article from the HTML of the edit window.
       *
       * <p>The markup is the text between {@code "wpTextbox1">} and the following
       * {@code </textarea>}. HTML entities are decoded so that markup such as
       * {@code <ref>} is returned literally rather than as {@code &lt;ref&gt;}.
       *
       * @param article the HTML of the edit window of an article
       * @return wiki markup of an article, without the closing textarea tag, or
       *         an empty string if the edit box cannot be located
       */
      public static String parseArticle(String article) {
          int markerAt = article.indexOf(TEXTBOX_MARKER);
          if (markerAt < 0) {
              return "";
          }
          int contentStart = markerAt + TEXTBOX_MARKER.length();
          int contentEnd = article.indexOf(TEXTBOX_END, contentStart);
          if (contentEnd < 0) {
              return "";
          }
          return unescapeHtml(article.substring(contentStart, contentEnd));
      }

      /**
       * Check if the bot should tag the page as unreferenced or not.
       *
       * <p>An article is considered unreferenced if it lacks a reflist (or
       * variants), ref tags, an {{sfn}} template, external links, further
       * reading, a references section, and a notes section. Disambiguation
       * pages, pages already tagged as unreferenced, and pages with {{nobots}}
       * are never tagged. All checks on the markup run before the two API
       * requests, so an article ruled out by its markup costs no network access.
       *
       * @param article the wiki markup of an article
       * @param title the article title, as used in URLs
       * @return true if the article should be tagged as unreferenced
       * @throws IOException if one of the API requests fails
       */
      public static boolean isEligibleForTagging(String article, String title) throws IOException {
          // Locale.ROOT keeps the comparison independent of the platform locale.
          String markup = article.toLowerCase(Locale.ROOT);
          for (String marker : DISQUALIFYING_MARKERS) {
              if (markup.contains(marker)) {
                  return false;
              }
          }
          return noextlinks(title) && nodabs(title);
      }

      /**
       * Uses a Wikipedia API query to search for a dmbox template.
       *
       * @param title article title
       * @return true if the page is not a disambiguation page
       * @throws MalformedURLException if the query URL is invalid
       * @throws IOException if the API request fails
       */
      public static boolean nodabs(String title) throws MalformedURLException, IOException {
          // "titles" (plural) is the parameter name the sibling extlinks query
          // uses; XML output is requested explicitly because the check below
          // looks for a title inside the raw response.
          // NOTE(review): the templates list may be paged; confirm Dmbox is
          // always within the first batch.
          URL u = new URL("https://en.wikipedia.org/w/api.php?action=query&prop="
                  + "templates&format=xml&titles=" + title);
          return !fetch(u).contains("Template:Dmbox");
      }

      /**
       * Uses a Wikipedia API query to search for external links in an article.
       *
       * @param title article title
       * @return true if there are no external links
       * @throws MalformedURLException if the query URL is invalid
       * @throws IOException if the API request fails
       */
      public static boolean noextlinks(String title) throws MalformedURLException, IOException {
          // XML output is requested explicitly: the check below looks for an
          // XML element, which other output formats would not contain.
          URL u = new URL("https://en.wikipedia.org/w/api.php?action=query&pr"
                  + "op=extlinks&format=xml&titles=" + title);
          return !fetch(u).contains("<el xml:space=\"preserve\">");
      }

      /**
       * Downloads a page and returns its lines joined without separators.
       * Adapted from
       * http://stackoverflow.com/questions/6188901/reading-the-content-of-web-page
       */
      private static String fetch(URL url) throws IOException {
          StringBuilder content = new StringBuilder();
          // try-with-resources closes the connection even when reading fails.
          try (BufferedReader reader = new BufferedReader(
                  new InputStreamReader(url.openStream(), "UTF-8"))) {
              String line;
              while ((line = reader.readLine()) != null) {
                  content.append(line);
              }
          }
          return content.toString();
      }

      /** Collects the title following every {@code <li>} tag except the leading non-article ones. */
      private static List<String> extractNewPageTitles(String html) {
          List<String> titles = new ArrayList<>();
          int itemsSeen = 0;
          int itemAt = html.indexOf(LIST_ITEM_TAG);
          while (itemAt >= 0) {
              itemsSeen++;
              if (itemsSeen > LEADING_NON_ARTICLE_ITEMS) {
                  String title = titleAt(html, itemAt);
                  if (!title.isEmpty()) {
                      titles.add(title);
                  }
              }
              itemAt = html.indexOf(LIST_ITEM_TAG, itemAt + 1);
          }
          return titles;
      }

      /** Returns the title belonging to the {@code <li>} at {@code itemStart}, or "" if none can be read. */
      private static String titleAt(String html, int itemStart) {
          int titleStart = itemStart + TITLE_OFFSET;
          if (titleStart >= html.length()) {
              return "";
          }
          int titleEnd = html.indexOf('&', titleStart);
          return titleEnd < 0 ? "" : html.substring(titleStart, titleEnd);
      }

      /** Decodes the HTML entities used for markup characters; {@code &amp;} goes last to avoid double decoding. */
      private static String unescapeHtml(String text) {
          return text.replace("&lt;", "<")
                  .replace("&gt;", ">")
                  .replace("&quot;", "\"")
                  .replace("&#039;", "'")
                  .replace("&amp;", "&");
      }

      /** Builds the API edit URL that appends {@code template} to the page named {@code title}. */
      private static String buildEditUrl(String title, String template) {
          // NOTE(review): the summary says "stub" although the appended tag is an
          // unreferenced tag; left as written, confirm the intended wording.
          return "https://en.wikipedia.org/w/api.php?action=edit&format="
                  + "json&title=" + title + "&summary=Tagging%20short"
                  + "%20article%20as%20stub%20(%5B%5BWP%3ABOT%7CBot%20edit"
                  + "%5D%5D)&bot=&appendtext=" + template + "&assert=bot&"
                  + "prop=info";
      }

  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement