Advertisement
ranveer5289

webscrapping htmlcleaner

Oct 7th, 2011
231
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 2.29 KB | None | 0 0
  1. /*
  2.  * To change this template, choose Tools | Templates
  3.  * and open the template in the editor.
  4.  */
  5.  
  6. // Remove the package declaration below if you are not using the NetBeans IDE.
  7. package testscrapping;
  8.  
  9. /**
  10.  *
  11.  * @author ranveer
  12.  */
  13. import java.io.IOException;
  14. import java.io.InputStreamReader;
  15. import java.net.MalformedURLException;
  16. import java.net.URL;
  17. import java.net.URLConnection;
  18.  
  19.  
  20. import org.htmlcleaner.CleanerProperties;
  21. import org.htmlcleaner.HtmlCleaner;
  22. import org.htmlcleaner.TagNode;
  23. import org.htmlcleaner.XPatherException;
  24.  
  25. public class TestScrapping {
  26.  
  27.     public static void main(String[] args) throws MalformedURLException, IOException, XPatherException {
  28.  
  29.         // URL to be fetched in the below url u can replace s=cantabil with company of ur choice
  30.         String url_fetch = "http://in.finance.yahoo.com/lookup?s=cantabil&t=A&b=0&m=IN";
  31.        
  32.         //create tagnode object to traverse XML using xpath
  33.         TagNode node;
  34.         String info = null;
  35.  
  36.         //XPath of the data to be fetched.....use firefox's firepath addon or use firebug to fetch the required XPath.
  37.         //the below XPath will display the title of the company u have queried for
  38.         String name_xpath = "//div[1]/div[2]/div[2]/div[1]/div/div/div/div/table/tbody/tr[1]/td[2]/text()";
  39.  
  40.          // declarations related to the api
  41.         HtmlCleaner cleaner = new HtmlCleaner();
  42.         CleanerProperties props = new CleanerProperties();
  43.         props.setAllowHtmlInsideAttributes(true);
  44.         props.setAllowMultiWordAttributes(true);
  45.         props.setRecognizeUnicodeChars(true);
  46.         props.setOmitComments(true);
  47.  
  48.        
  49.         //creating url object
  50.         URL url = new URL(url_fetch);
  51.         URLConnection conn = url.openConnection(); //opening connection
  52.         node = cleaner.clean(new InputStreamReader(conn.getInputStream()));//reading input stream
  53.  
  54.         //storing the nodes belonging to the given xpath
  55.         Object[] info_nodes = node.evaluateXPath(name_xpath);
  56.  
  57.  
  58.  
  59. //checking if something returned or not....if XPath invalid info_nodes.length=0
  60.         if (info_nodes.length > 0) {
  61.             //info_nodes[0] will return string buffer
  62.             StringBuffer str = new StringBuffer();
  63.             str.append(info_nodes[0]);
  64.             System.out.println(str);
  65.  
  66.         }
  67.  
  68.  
  69.  
  70.  
  71.  
  72.     }
  73. }
  74.  
  75.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement