Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.io.*;
- import java.sql.*;
- import javax.servlet.*;
- import javax.servlet.http.*;
/**
 * Stand-alone tool that reads a text dump of web-page sources and inserts one
 * row into the {@code DocInfoExtraction} table of a local Oracle database.
 *
 * <p>The per-page parsing (URL, title, tag stripping) is currently disabled, so
 * the inserted row is built from fixed placeholder strings. The input file is
 * still read and split into records on every run.
 */
public class HTMLParse {

    /** Input file name, resolved against the current working directory. */
    private static final String SOURCE_FILE = "webPagesSourceCodeFile.txt";

    /**
     * Regular expression used to split the dump into per-site records.
     * NOTE(review): the leading '.' is a regex wildcard, so the character in
     * front of "IIIII" is consumed as part of the separator. Presumably a
     * literal dot was meant; left unchanged because the disabled parsing
     * below may depend on the current offsets. Confirm before changing.
     */
    private static final String RECORD_SEPARATOR_REGEX = ".IIIII";

    /** Fully qualified name of the Oracle thin JDBC driver class. */
    private static final String ORACLE_DRIVER = "oracle.jdbc.driver.OracleDriver";

    // NOTE(review): connection settings, including the password, are hard-coded
    // in source. Consider moving them to external configuration.
    private static final String DB_URL = "jdbc:oracle:thin:@127.0.0.1:1521:orcl1";
    private static final String DB_USER = "ADMIN";
    private static final String DB_PASSWORD = "ltultu";

    /**
     * Positional insert: the values are bound in the order (www, docText, title).
     * NOTE(review): no column list is given, so this relies on the table's
     * column order. Verify against the DocInfoExtraction schema.
     */
    private static final String INSERT_SQL = "INSERT INTO DocInfoExtraction VALUES (?, ?, ?)";

    /**
     * Reads the source dump, splits it into records, and inserts one placeholder
     * row into the database.
     *
     * <p>Any failure while loading the driver, connecting, or inserting is
     * printed to standard error and does not propagate; only a missing input
     * file makes this method throw.
     *
     * @param args ignored
     * @throws FileNotFoundException if {@code webPagesSourceCodeFile.txt} cannot be opened
     * @throws SQLException declared for compatibility; database errors are caught and printed
     */
    public static void main(String[] args) throws SQLException, FileNotFoundException {
        String htmlCode = readTokens(new File(SOURCE_FILE));
        // Only the disabled parsing below consumes these records at the moment.
        String[] website = htmlCode.split(RECORD_SEPARATOR_REGEX);

        /* Disabled per-site parsing, kept for reference.
         * NOTE(review): in the first replaceAll below, "\1" inside a Java string
         * literal is an octal character escape, not a regex back-reference; the
         * back-reference would have to be written "\\1".

        for (int l = 1;l < website.length; l++)
        {
            // String www = website[l].substring(website[l].indexOf("http"));
            String www = website[l].substring(1, website[l].indexOf('<'));
            System.out.println(www);
            String title = website[l].substring(website[l].indexOf("<title") + 7,website[l].indexOf("/title") - 1);
            System.out.println(title);
            //String HTMLCode = website[1].substring(website[1].indexOf("/title") + 7);
            String HTMLCode = website[l].substring(website[l].indexOf('<'));
            //System.out.println(HTMLCode);
            System.out.println();
        }
        String HTMLCode = website[1].substring(website[1].indexOf('<'));
        String DOCText = HTMLCode.replaceAll("<(script|style).*?</\1>", "");
        DOCText = DOCText.replaceAll("<.*?>", "");
        DOCText = DOCText.replaceAll(";.*?", "");
        DOCText = DOCText.replaceAll("==.*?", "");
        */

        // Placeholder values standing in for the parsed fields.
        String www = "testing2";
        String title = "titletest2";
        String docText = "DOC Text2";

        try {
            insertDocInfo(www, docText, title);
        } catch (Exception ex) {
            // Best effort: report the failure and let the program finish normally.
            ex.printStackTrace();
        }
    }

    /**
     * Concatenates every whitespace-delimited token of {@code file} into one string.
     * Because tokens are joined directly, all whitespace in the file is dropped.
     * The scanner is closed before returning, even when reading fails.
     *
     * @param file the file to read
     * @return the file's tokens joined without separators; empty if the file has none
     * @throws FileNotFoundException if {@code file} cannot be opened for reading
     */
    private static String readTokens(File file) throws FileNotFoundException {
        StringBuilder tokens = new StringBuilder();
        try (java.util.Scanner input = new java.util.Scanner(file)) {
            while (input.hasNext()) {
                tokens.append(input.next());
            }
        }
        return tokens.toString();
    }

    /**
     * Inserts one row into {@code DocInfoExtraction} using bound parameters, so
     * quotes or other SQL metacharacters in the values cannot alter the statement.
     * The connection and statement are closed on both success and failure.
     *
     * @param www     first value of the row
     * @param docText second value of the row
     * @param title   third value of the row
     * @throws ClassNotFoundException if the Oracle JDBC driver is not on the classpath
     * @throws SQLException if connecting or inserting fails
     */
    private static void insertDocInfo(String www, String docText, String title)
            throws ClassNotFoundException, SQLException {
        // Loading the class by name lets the driver register itself with
        // DriverManager and avoids a compile-time dependency on the vendor jar.
        Class.forName(ORACLE_DRIVER);
        try (Connection con = DriverManager.getConnection(DB_URL, DB_USER, DB_PASSWORD);
             PreparedStatement insert = con.prepareStatement(INSERT_SQL)) {
            insert.setString(1, www);
            insert.setString(2, docText);
            insert.setString(3, title);
            insert.executeUpdate();
        }
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement