Advertisement
Guest User

Untitled

a guest
Feb 9th, 2017
177
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.93 KB | None | 0 0
  1. package edu.wiki.demo;
  2.  
  3. import edu.wiki.api.concept.IConceptIterator;
  4. import edu.wiki.api.concept.IConceptVector;
  5. import edu.wiki.search.ESASearcher;
  6.  
  7. import java.io.*;
  8. import java.sql.Connection;
  9. import java.sql.DriverManager;
  10. import java.sql.SQLException;
  11. import java.sql.Statement;
  12. import java.util.*;
  13. import java.util.logging.Logger;
  14.  
  15. public class TestGeneralESAVectors {
  16.  
  17. static Connection connection;
  18. static Statement stmtQuery;
  19. private ESASearcher searcher;
  20. private static final Logger LOGGER = Logger.getLogger(TestGeneralESAVectors.class.getName());
  21. private static boolean isDone = false;
  22.  
  23. public static void initDB() throws ClassNotFoundException, SQLException, IOException {
  24. // Load the JDBC driver
  25. String driverName = "com.mysql.jdbc.Driver"; // MySQL Connector
  26. Class.forName(driverName);
  27.  
  28. // read DB config
  29. InputStream is = ESASearcher.class.getResourceAsStream("/config/db.conf");
  30. BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
  31. String serverName = br.readLine();
  32. String mydatabase = br.readLine();
  33. String username = br.readLine();
  34. String password = "123456";
  35. br.close();
  36.  
  37. // Create a connection to the database
  38. String url = "jdbc:mysql://" + serverName + "/" + mydatabase; // a JDBC url
  39. connection = DriverManager.getConnection(url, username, password);
  40. stmtQuery = connection.createStatement();
  41. stmtQuery.setFetchSize(100);
  42. }
  43.  
  44. /**
  45. * @throws IOException
  46. * @throws SQLException
  47. * @throws ClassNotFoundException
  48. */
  49. public String getVector(String text) throws ClassNotFoundException, SQLException, IOException {
  50. if (!isDone) {
  51. searcher = new ESASearcher();
  52. initDB();
  53. isDone = true;
  54. }
  55. int limit = 500000; //set limit to big value to get all concepts
  56. IConceptVector cvBase = searcher.getConceptVector(text);
  57. IConceptVector cvNormal = searcher.getNormalVector(cvBase, limit);
  58. if (cvNormal == null) {
  59. LOGGER.info("empty concept vector => " + text);
  60. return "";
  61. }
  62.  
  63. IConceptIterator it = cvNormal.orderedIterator();
  64.  
  65. int count = 0;
  66. TreeMap<Integer, Double> tree = new TreeMap<>();
  67. while (it.next() && count < limit) {
  68. tree.put(it.getId(), it.getValue());
  69. count++;
  70. }
  71.  
  72. String ret = "";
  73. for (Map.Entry<Integer, Double> entry : tree.entrySet()) {
  74. ret += " " + entry.getKey() + ":" + entry.getValue();
  75. }
  76.  
  77. return ret;
  78. }
  79.  
  80. private String getLine(String line) throws SQLException, IOException, ClassNotFoundException {
  81. String arr[] = line.split("\\s");
  82. String s = arr[1];
  83. for (int i = 2; i < arr.length; i++) {
  84. s += " " + arr[i];
  85. }
  86. String vector = getVector(s);
  87. return (arr[0].substring(0, arr[0].length() - 1) + vector);
  88. }
  89.  
  90. private void start(String inputFile, String outputFile) throws IOException, SQLException, ClassNotFoundException {
  91. PrintWriter writer = new PrintWriter(outputFile);
  92. Scanner input = new Scanner(new File(inputFile));
  93. int cnt = 0;
  94. while (input.hasNextLine()) {
  95. String line = input.nextLine();
  96. String res = getLine(line);
  97. writer.println(res);
  98. cnt++;
  99. if (cnt % 1000 == 0)
  100. LOGGER.info(cnt + " lines are done");
  101. }
  102. writer.close();
  103. input.close();
  104. }
  105.  
  106. public static void main(String args[]) throws IOException, SQLException, ClassNotFoundException {
  107. TestGeneralESAVectors test = new TestGeneralESAVectors();
  108. test.start("/home/moustah/PycharmProjects/twitter/twitter API/training/partitions/part_0.txt", "part_0_output.txt");
  109. }
  110. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement