SHARE
TWEET

Untitled

a guest May 21st, 2017 54 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import java.io.*;
  2. import java.sql.*;
  3. import javax.servlet.*;
  4. import javax.servlet.http.*;
  5.  
  /**
   * Reads a dump of web-page sources from the working directory and stores one
   * record in the {@code DocInfoExtraction} table of an Oracle database on
   * 127.0.0.1.
   *
   * <p>The HTML extraction step is not wired in yet: the dump is read and split
   * into per-site chunks, but the inserted row still consists of fixed
   * placeholder values.
   */
  public class HTMLParse {

    /** Dump file name, resolved against the current working directory. */
    private static final String SOURCE_FILE = "webPagesSourceCodeFile.txt";

    /**
     * Regular expression used to cut the dump into per-site chunks.
     * NOTE(review): the leading '.' is an unescaped regex metacharacter, so any
     * character followed by "IIIII" matches. Confirm whether a literal dot was
     * intended before relying on the chunks.
     */
    private static final String SITE_DELIMITER = ".IIIII";

    /** Driver class, loaded by name so this file compiles without the vendor jar. */
    private static final String ORACLE_DRIVER = "oracle.jdbc.driver.OracleDriver";

    private static final String JDBC_URL = "jdbc:oracle:thin:@127.0.0.1:1521:orcl1";

    // NOTE(review): credentials are hard-coded in source; consider moving them
    // to external configuration.
    private static final String DB_USER = "ADMIN";
    private static final String DB_PASSWORD = "ltultu";

    /** Column order matches the bind order used in {@link #insertDocument}. */
    private static final String INSERT_SQL = "INSERT INTO DocInfoExtraction VALUES (?, ?, ?)";

    /**
     * Program entry point.
     *
     * @param args ignored
     * @throws SQLException declared for compatibility; database failures are
     *     reported on standard error and do not propagate
     * @throws FileNotFoundException if {@code webPagesSourceCodeFile.txt} cannot
     *     be opened for reading
     */
    public static void main(String[] args) throws SQLException, FileNotFoundException {
      String htmlCode = readTokens(new File(SOURCE_FILE));

      // Per-site chunks of the dump. Not consumed yet.
      // TODO: derive the three values below from each chunk. The earlier draft
      // took the text before the first '<' as the address, the text between
      // "<title" and "/title" as the title, and the markup with tags stripped
      // as the document text.
      String[] website = htmlCode.split(SITE_DELIMITER);

      String www = "testing2";
      String title = "titletest2";
      String docText = "DOC Text2";

      try {
        insertDocument(www, docText, title);
      } catch (SQLException | ClassNotFoundException ex) {
        // Best effort: a missing driver or an unreachable database is reported
        // but does not abort the program.
        ex.printStackTrace();
      }
    }

    /**
     * Concatenates every whitespace-delimited token of {@code file}. Whitespace
     * between tokens is dropped, so the result contains no spaces or line breaks.
     * The file is decoded with the platform default charset.
     *
     * @param file the file to read
     * @return all tokens joined without a separator; empty if the file has none
     * @throws FileNotFoundException if the file cannot be opened
     */
    private static String readTokens(File file) throws FileNotFoundException {
      StringBuilder tokens = new StringBuilder();
      // try-with-resources closes the scanner (and the underlying file) on every path.
      try (java.util.Scanner input = new java.util.Scanner(file)) {
        while (input.hasNext()) {
          tokens.append(input.next());
        }
      }
      return tokens.toString();
    }

    /**
     * Inserts one row into {@code DocInfoExtraction}. Values are bound as
     * parameters so quotes or other SQL syntax inside them cannot alter the
     * statement.
     *
     * @param www value for the first column
     * @param docText value for the second column
     * @param title value for the third column
     * @throws SQLException if connecting or inserting fails
     * @throws ClassNotFoundException if the Oracle driver is not on the classpath
     */
    private static void insertDocument(String www, String docText, String title)
        throws SQLException, ClassNotFoundException {
      // Loading the class lets the driver register itself with DriverManager.
      Class.forName(ORACLE_DRIVER);
      try (Connection con = DriverManager.getConnection(JDBC_URL, DB_USER, DB_PASSWORD);
          PreparedStatement insert = con.prepareStatement(INSERT_SQL)) {
        insert.setString(1, www);
        insert.setString(2, docText);
        insert.setString(3, title);
        insert.executeUpdate();
      }
    }
  }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top