Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package edu.wiki.demo;
- import edu.wiki.api.concept.IConceptIterator;
- import edu.wiki.api.concept.IConceptVector;
- import edu.wiki.search.ESASearcher;
- import java.io.*;
- import java.sql.Connection;
- import java.sql.DriverManager;
- import java.sql.SQLException;
- import java.sql.Statement;
- import java.util.*;
- import java.util.logging.Logger;
- public class TestGeneralESAVectors {
- static Connection connection;
- static Statement stmtQuery;
- private ESASearcher searcher;
- private static final Logger LOGGER = Logger.getLogger(TestGeneralESAVectors.class.getName());
- private static boolean isDone = false;
- public static void initDB() throws ClassNotFoundException, SQLException, IOException {
- // Load the JDBC driver
- String driverName = "com.mysql.jdbc.Driver"; // MySQL Connector
- Class.forName(driverName);
- // read DB config
- InputStream is = ESASearcher.class.getResourceAsStream("/config/db.conf");
- BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
- String serverName = br.readLine();
- String mydatabase = br.readLine();
- String username = br.readLine();
- String password = "123456";
- br.close();
- // Create a connection to the database
- String url = "jdbc:mysql://" + serverName + "/" + mydatabase; // a JDBC url
- connection = DriverManager.getConnection(url, username, password);
- stmtQuery = connection.createStatement();
- stmtQuery.setFetchSize(100);
- }
- /**
- * @throws IOException
- * @throws SQLException
- * @throws ClassNotFoundException
- */
- public String getVector(String text) throws ClassNotFoundException, SQLException, IOException {
- if (!isDone) {
- searcher = new ESASearcher();
- initDB();
- isDone = true;
- }
- int limit = 500000; //set limit to big value to get all concepts
- IConceptVector cvBase = searcher.getConceptVector(text);
- IConceptVector cvNormal = searcher.getNormalVector(cvBase, limit);
- if (cvNormal == null) {
- LOGGER.info("empty concept vector => " + text);
- return "";
- }
- IConceptIterator it = cvNormal.orderedIterator();
- int count = 0;
- TreeMap<Integer, Double> tree = new TreeMap<>();
- while (it.next() && count < limit) {
- tree.put(it.getId(), it.getValue());
- count++;
- }
- String ret = "";
- for (Map.Entry<Integer, Double> entry : tree.entrySet()) {
- ret += " " + entry.getKey() + ":" + entry.getValue();
- }
- return ret;
- }
- private String getLine(String line) throws SQLException, IOException, ClassNotFoundException {
- //1,2,3, text tweet_id
- //arr[0] arr[1] arr[arr.length-1]
- String arr[] = line.split("\\s");
- String label = arr[0];
- String tweetId = arr[arr.length - 1];
- String text = "";
- for (int i = 1; i < arr.length - 1; i++) {
- text += " " + arr[i];
- }
- String vector = getVector(text);
- //removing the last comma from label
- return (label.substring(0, label.length() - 1) + vector + " #" + tweetId);
- }
- private void start(String inputFile, String outputFile) throws IOException, SQLException, ClassNotFoundException {
- PrintWriter writer = new PrintWriter(outputFile);
- BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile), "UTF-8"));
- int cnt = 0;
- String line;
- while ((line = br.readLine()) != null) {
- String res = getLine(line);
- writer.println(res);
- cnt++;
- if (cnt % 1000 == 0)
- LOGGER.info(cnt + " lines are done");
- }
- writer.close();
- }
- public static void main(String args[]) throws IOException, SQLException, ClassNotFoundException {
- if (args != null && args.length == 2) {
- TestGeneralESAVectors test = new TestGeneralESAVectors();
- LOGGER.info("source file " + args[0] + " " + "destination file " + args[1]);
- test.start(args[0], args[1]);
- } else {
- LOGGER.info("wrong number of args -> system exit with -1");
- System.exit(-1);
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement