Advertisement
Guest User

Untitled

a guest
Dec 3rd, 2018
133
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 2.95 KB | None | 0 0
  1.  
  2. import java.io.File;
  3. import java.io.IOException;
  4. import java.sql.Connection;
  5. import java.sql.DriverManager;
  6. import java.sql.PreparedStatement;
  7. import java.sql.SQLException;
  8. import java.sql.Statement;
  9. import org.apache.pdfbox.pdmodel.PDDocument;
  10. import org.apache.pdfbox.text.PDFTextStripper;
  11.  
  12.  
  13. public class SO53538039
  14. {
  15.     static PreparedStatement preparedStatement;
  16.     static Connection con1;
  17.  
  18.     public static void main(String[] args) throws IOException, SQLException
  19.     {
  20.         con1 = DriverManager.getConnection("jdbc:mysql://127.0.0.1/XXXXX", "XXXX", "XXXX");
  21.         con1.setAutoCommit(false);
  22.  
  23.         Statement stmt = con1.createStatement();
  24.         stmt.execute("drop table if exists indextable");
  25.         stmt.execute("create table indextable "
  26.                 + "("
  27.                 + "path varchar(300), "
  28.                 + "word varchar(300), "
  29.                 + "primary key(path,word)"
  30.                 + ")");
  31.  
  32.         // Anzahl Seiten: 18519
  33.         try (PDDocument document = PDDocument.load(new File("10-million-password-list-top-1000000.pdf")))
  34.         {
  35.             PDFTextStripper tStripper = new PDFTextStripper();
  36.             for (int p = 1; p <= document.getNumberOfPages(); ++p)
  37.             {
  38.                 tStripper.setStartPage(p);
  39.                 tStripper.setEndPage(p);
  40.                 String pdfFileInText = tStripper.getText(document);
  41.                 processText(pdfFileInText);
  42.                 System.out.println("page " + p + " done");
  43.             }
  44.         }
  45.        
  46.         con1.commit();
  47.     }
  48.  
  49.     static public void processText(String text) throws SQLException
  50.     {
  51.         String lines[] = text.split("\\r?\\n");
  52.         for (String line : lines)
  53.         {
  54.             String[] words = line.split(" ");
  55.  
  56.             String sql = "insert IGNORE into indextable values (?,?);";
  57.  
  58.             preparedStatement = con1.prepareStatement(sql);
  59.             int i = 0;
  60.             for (String word : words)
  61.             {
  62.                 // check if one or more special characters at end of string then remove OR
  63.                 // check special characters in beginning of the string then remove
  64.                 // insert every word directly to table db
  65.                 word = word.replaceAll("([\\W]+$)|(^[\\W]+)", "");
  66.                 preparedStatement.setString(1, "path1");
  67.                 preparedStatement.setString(2, word);
  68.                 preparedStatement.addBatch();
  69.  
  70.                 i++;
  71.                 if (i % 1000 == 0)
  72.                 {
  73.                     preparedStatement.executeBatch();
  74.  
  75.                     System.out.println("Add Thousand");
  76.                 }
  77.             }
  78.  
  79.             if (i > 0)
  80.             {
  81.                 preparedStatement.executeBatch();
  82.  
  83.                 //System.out.println("Add Remaining");
  84.             }
  85.         }
  86.         preparedStatement.close();
  87.         //System.out.println("Successfully commited changes to the database!");
  88.     }
  89. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement