Advertisement
Guest User

Untitled

a guest
Jan 17th, 2018
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 18.58 KB | None | 0 0
  1. package it.dlms.connectors.mongoDb.manager;
  2.  
  3. import static com.mongodb.client.model.Filters.eq;
  4.  
  5. import java.io.Serializable;
  6. import java.util.ArrayList;
  7. import java.util.Collections;
  8. import java.util.Comparator;
  9. import java.util.HashMap;
  10. import java.util.LinkedHashMap;
  11. import java.util.LinkedList;
  12. import java.util.List;
  13. import java.util.Map;
  14. import java.util.Map.Entry;
  15. import java.util.regex.Pattern;
  16. import java.util.Set;
  17.  
  18. import org.bson.Document;
  19. import org.slf4j.Logger;
  20. import org.slf4j.LoggerFactory;
  21.  
  22. import com.mongodb.BasicDBObject;
  23. import com.mongodb.DBCursor;
  24. import com.mongodb.DBObject;
  25. import com.mongodb.MongoClient;
  26. import com.mongodb.client.FindIterable;
  27. import com.mongodb.client.MongoCollection;
  28. import com.mongodb.client.MongoCursor;
  29. import com.mongodb.client.MongoDatabase;
  30.  
  31. import it.dlms.metadataCatalog.MetadataManager;
  32. import it.dlms.metadataCatalog.metadata.DatasetProfile;
  33. import it.dlms.metadataCatalog.metadata.GlobalDatasetMetadata;
  34. import scala.Tuple2;
  35.  
  36. public class MongoDBGlobalMetadataConnector implements Serializable {
  37.  
  38. private final static Logger log = LoggerFactory.getLogger(MetadataManager.class);
  39. private static final long serialVersionUID = 1L;
  40. private static MongoDBGlobalMetadataConnector instance;
  41. private static MongoClient MONGOCLIENT;
  42. private static MongoDatabase MONGODATABASE;
  43. private static MongoDatabase GDMENTITYDATABASE;
  44. private static MongoDatabase GDMONTOLOGYDATABASE;
  45. private static MongoCollection<Document> MONGOCOLLECTION;
  46. private static MongoCollection<Document> ENTITIESMONGOCOLLECTION;
  47. private static MongoCollection<Document> ONTOLOGIESMONGOCOLLECTION;
  48.  
  49. public static synchronized MongoDBGlobalMetadataConnector getInstance() {
  50. if(instance == null) {
  51. instance = new MongoDBGlobalMetadataConnector();
  52. }
  53. return instance;
  54. }
  55.  
  56. private MongoDBGlobalMetadataConnector (){
  57. if(MONGOCLIENT == null) {
  58. MONGOCLIENT = new MongoClient( "localhost" , 27017 );
  59. }
  60. MONGODATABASE = MONGOCLIENT.getDatabase("MetaLakeTest");//.getDatabase("MetaLake");
  61. GDMENTITYDATABASE = MONGOCLIENT.getDatabase("entities");
  62. GDMONTOLOGYDATABASE = MONGOCLIENT.getDatabase("ontologies");
  63. MONGOCOLLECTION = MONGODATABASE.getCollection("GlobalMetadata");
  64. ENTITIESMONGOCOLLECTION = MONGODATABASE.getCollection("entities");
  65. ONTOLOGIESMONGOCOLLECTION = MONGODATABASE.getCollection("ontologies");
  66. }
  67.  
  68. public static MongoCollection<Document> getMONGOCOLLECTION() {
  69. return MONGOCOLLECTION;
  70. }
  71.  
  72. public static MongoCollection<Document> getENTITIESMONGOCOLLECTION() {
  73. return ENTITIESMONGOCOLLECTION;
  74. }
  75.  
  76. public static MongoCollection<Document> getONTOLOGIESMONGOCOLLECTION() {
  77. return ONTOLOGIESMONGOCOLLECTION;
  78. }
  79.  
  80. public static MongoDatabase getGDMENTITYDATABASE() {
  81. return GDMENTITYDATABASE;
  82. }
  83.  
  84. public static MongoDatabase getGDMONTOLOGYDATABASE() {
  85. return GDMONTOLOGYDATABASE;
  86. }
  87.  
  88. public static void resetConnection() {
  89. MONGOCLIENT.close();
  90. MONGOCLIENT = new MongoClient( "localhost" , 27017 );
  91. MONGODATABASE = MONGOCLIENT.getDatabase("MetaLakeTest");//.getDatabase("MetaLake");
  92. GDMENTITYDATABASE = MONGOCLIENT.getDatabase("entities");
  93. GDMONTOLOGYDATABASE = MONGOCLIENT.getDatabase("ontologies");
  94. MONGOCOLLECTION = MONGODATABASE.getCollection("GlobalMetadata");
  95. ENTITIESMONGOCOLLECTION = MONGODATABASE.getCollection("entities");
  96. ONTOLOGIESMONGOCOLLECTION = MONGODATABASE.getCollection("ontologies");
  97. }
  98.  
  99.  
  100. public void resetGlobalMetadataMongoDBCollection() {
  101. MongoDBGlobalMetadataConnector.getMONGOCOLLECTION().drop();
  102. MongoDBGlobalMetadataConnector.getENTITIESMONGOCOLLECTION().drop();
  103. MongoDBGlobalMetadataConnector.getONTOLOGIESMONGOCOLLECTION().drop();
  104. MongoDBGlobalMetadataConnector.getGDMENTITYDATABASE().drop();
  105. MongoDBGlobalMetadataConnector.getGDMONTOLOGYDATABASE().drop();
  106. }
  107.  
  108. public void removeDocumentFromEntities(String fileName, Map<String, Map<String, Long>> frequencyOfTopFrequentItems) {
  109. MongoCollection<Document> document = getENTITIESMONGOCOLLECTION();
  110. log.info("filename: " + fileName);
  111.  
  112. for (Entry<String, Map<String, Long>> entry : frequencyOfTopFrequentItems.entrySet()) {
  113. String columnName = entry.getKey();
  114. String datasetColumnKey = fileName.replace(".", " ") + "|%#-#%|" + columnName;
  115. Set<String> entriesToRemove = entry.getValue().keySet();
  116. for (String entryToRemove : entriesToRemove) {
  117. BasicDBObject searchQuery = new BasicDBObject();
  118. searchQuery.put("entity", entryToRemove);
  119. FindIterable<Document> cursor = document.find(searchQuery);
  120. if (cursor.first() != null) {
  121. Map<String, Integer> datasetColumnsEntries = (Map<String, Integer>) cursor.first().get("map");
  122. log.info("dck: " + datasetColumnKey);
  123. datasetColumnsEntries.remove(datasetColumnKey);
  124. if (datasetColumnsEntries.size() == 0) {
  125. document.findOneAndDelete(searchQuery);
  126. }
  127. else {
  128. Document newDocument = new Document();
  129. newDocument.put("entity", entryToRemove);
  130. newDocument.put("map", datasetColumnsEntries);
  131. document.findOneAndReplace(searchQuery, newDocument);
  132. }
  133.  
  134. }
  135. }
  136. }
  137. }
  138.  
  139. public void removeElementsFromEntitiesMap(String fileName, Map<String, Map<String, Long>> frequencyOfTopFrequentItems) {
  140. MongoCollection<Document> doc = getMONGOCOLLECTION();
  141. log.info("filename: " + fileName);
  142. GlobalDatasetMetadata gdm = new GlobalDatasetMetadata(doc.find().first());
  143. Map<String, Map<String, Integer>> entitiesMap = gdm.getEntitiesMap();
  144. for (Entry<String, Map<String, Long>> entry : frequencyOfTopFrequentItems.entrySet()) {
  145. String columnName = entry.getKey();
  146. String datasetColumnKey = fileName.replace(".", " ") + "|%#-#%|" + columnName;
  147. Set<String> entriesToRemove = entry.getValue().keySet();
  148. for (String entryToRemove : entriesToRemove) {
  149. if (entitiesMap.containsKey(entryToRemove)) {
  150. Map<String, Integer> datasetColumnsEntries = entitiesMap.get(entryToRemove);
  151. log.info("dck: " + datasetColumnKey);
  152. if (datasetColumnsEntries.containsKey(datasetColumnKey)) {
  153. datasetColumnsEntries.remove(datasetColumnKey);
  154. if (datasetColumnsEntries.size() == 0) {
  155. entitiesMap.remove(entryToRemove);
  156. }
  157. else {
  158. entitiesMap.put(entryToRemove, datasetColumnsEntries);
  159. }
  160. }
  161. }
  162. }
  163. }
  164. gdm.setEntitiesMap(entitiesMap);
  165. Document updatedDoc = gdm.toMongoParser();
  166. MongoDBGlobalMetadataConnector.replaceGlobalMetadata(updatedDoc, doc);
  167. }
  168.  
  169. public void removeElementsFromOntologiesMap(String fileName,
  170. Map<String, Map<String, Integer>> ontologiesToRemoveMap) {
  171. MongoCollection<Document> doc = getMONGOCOLLECTION();
  172. GlobalDatasetMetadata gdm = new GlobalDatasetMetadata(doc.find().first());
  173. Map<String, Map<String, Integer>> ontologiesMap = gdm.getOntologiesMap();
  174. for (Entry<String, Map<String, Integer>> entry : ontologiesToRemoveMap.entrySet()) {
  175. String columnName = entry.getKey();
  176. String datasetColumnKey = fileName.replace("."," ") + "|%#-#%|" + columnName;
  177. Set<String> ontologiesToRemove = entry.getValue().keySet();
  178. for (String entryToRemove : ontologiesToRemove) {
  179. if (ontologiesMap.containsKey(entryToRemove)) {
  180. Map<String, Integer> datasetColumnsEntries = ontologiesMap.get(entryToRemove);
  181. log.info("dck: " + datasetColumnKey);
  182. if (datasetColumnsEntries.containsKey(datasetColumnKey)) {
  183. datasetColumnsEntries.remove(datasetColumnKey);
  184. if (datasetColumnsEntries.size() == 0) {
  185. ontologiesMap.remove(entryToRemove);
  186. }
  187. else {
  188. ontologiesMap.put(entryToRemove, datasetColumnsEntries);
  189. }
  190. }
  191. }
  192. }
  193. }
  194. gdm.setOntologiesMap(ontologiesMap);
  195. Document updatedDoc = gdm.toMongoParser();
  196. MongoDBGlobalMetadataConnector.replaceGlobalMetadata(updatedDoc, doc);
  197. }
  198.  
  199. public static void replaceGlobalMetadata(Document globalProfileDoc,
  200. MongoCollection<Document> globalMetadataCollection) {
  201. globalMetadataCollection.replaceOne(globalMetadataCollection.find().first(), globalProfileDoc);
  202. log.info("Global document replaced successfully");
  203. }
  204.  
  205. public static void insertEntitiesInGDM(Map<String, Map<String, Long>> frequencyOfTopFrequentItems,
  206. String fileName, long rowsCount) {
  207. int count = 0;
  208. Map<String, List<Document>> docMap = new HashMap<>();
  209. //List<Tuple2<String, Document>> docList = new LinkedList<Tuple2<String, Document>>();
  210. for (Entry<String, Map<String, Long>> entry : frequencyOfTopFrequentItems.entrySet()) {
  211. String columnName = entry.getKey();
  212. String datasetColumnKey = fileName.replace(".",",") + "|%#-#%|" + columnName;
  213. Map<String, Long> topFrequentMap = entry.getValue();
  214. count += topFrequentMap.size();
  215. List<Document> docList = new LinkedList<>();
  216. for (Entry<String, Long> topFrequentEntry : topFrequentMap.entrySet()) {
  217. Integer topFrequentCount = topFrequentMap.entrySet().size();
  218. String entityName = topFrequentEntry.getKey();
  219. Long entityOccurrencies = topFrequentEntry.getValue();
  220. Document datasetAndColumnDocument = new Document("datasetAndColumn", datasetColumnKey);
  221. Document details = new Document("occurrencies", entityOccurrencies)
  222. .append("topFrequentCount", topFrequentCount);
  223. datasetAndColumnDocument.append("details", details);
  224. docMap.putIfAbsent(entityName, new LinkedList<>());
  225. docMap.get(entityName).add(details);
  226. //docList.add(new Tuple2<>(entityName, datasetAndColumnDocument));
  227.  
  228.  
  229. }
  230. }
  231. log.info("Inserimento in mongo per : " + fileName + " di " + Integer.toString(docMap.size()) + " entita " + count);
  232.  
  233. int i = 0;
  234. for (Entry<String, List<Document>> entry : docMap.entrySet()) {
  235. i++;
  236. log.info("Entita numero : " + Integer.toString(i));
  237. if (i > 200) {
  238. resetConnection();
  239. i = 0;
  240. }
  241. String entityName = entry.getKey();
  242. List<Document> listDocs = entry.getValue();
  243.  
  244. GDMENTITYDATABASE.getCollection(entityName).insertMany(listDocs);
  245. //MongoCollection<Document> entityCollection = GDMENTITYDATABASE.getCollection(element._1());
  246. //entityCollection.insertMany(element._2());
  247. }
  248. }
  249.  
  250. public static void insertOntologiesInGDM(Map<String, Map<String, Integer>> ontologyMap,
  251. String fileName) {
  252. for (Entry<String, Map<String, Integer>> entry : ontologyMap.entrySet()) {
  253. String columnName = entry.getKey();
  254. String datasetColumnKey = fileName.replace("."," ") + "|%#-#%|" + columnName;
  255. Map<String, Integer> datasetOntologiesMap = entry.getValue();
  256. for (Entry<String, Integer> ontologyEntry : datasetOntologiesMap.entrySet()) {
  257. String ontologyName = ontologyEntry.getKey();
  258. Integer ontologyOccurrencies = ontologyEntry.getValue();
  259. MongoCollection<Document> ontologyCollection = GDMONTOLOGYDATABASE.getCollection(ontologyName);
  260. Document datasetAndColumnDocument = new Document("datasetAndColumn", datasetColumnKey);
  261. Document details = new Document("occurrencies", ontologyOccurrencies);
  262. datasetAndColumnDocument.append("details", details);
  263. ontologyCollection.insertOne(datasetAndColumnDocument);
  264. }
  265. }
  266.  
  267. }
  268.  
  269. public void removeEntitiesFromGDM(String fileName, Map<String, Map<String, Long>> frequencyOfTopFrequentItems) {
  270. for (Entry<String, Map<String, Long>> entry : frequencyOfTopFrequentItems.entrySet()) {
  271. String columnName = entry.getKey();
  272. String datasetColumnKey = fileName.replace(".", " ") + "|%#-#%|" + columnName;
  273. Set<String> entriesToRemove = entry.getValue().keySet();
  274. for (String entryToRemove : entriesToRemove) {
  275. MongoCollection<Document> entityCollection = MONGOCLIENT.getDatabase("entities").getCollection(entryToRemove);
  276. entityCollection.findOneAndDelete(eq("datasetAndColumn", datasetColumnKey));
  277. }
  278. }
  279. }
  280.  
  281. public void removeOntologiesFromGDM(String fileName, Map<String, Map<String, Integer>> ontologiesToRemoveMap) {
  282. for (Entry<String, Map<String, Integer>> entry : ontologiesToRemoveMap.entrySet()) {
  283. String columnName = entry.getKey();
  284. String datasetColumnKey = fileName.replace("."," ") + "|%#-#%|" + columnName;
  285. Set<String> ontologiesToRemove = entry.getValue().keySet();
  286. for (String entryToRemove : ontologiesToRemove) {
  287. MongoCollection<Document> ontologyCollection = GDMONTOLOGYDATABASE.getCollection(entryToRemove);
  288. ontologyCollection.findOneAndDelete(eq("datasetAndColumn", datasetColumnKey));
  289. }
  290. }
  291. }
  292.  
  293. public static Map<String, Map<String, Double>> findEntitiesMatchingInGDM(DatasetProfile profile) {
  294. String fileName = profile.getFileName().replace(".csv","");
  295. Map<String, Map<String, Double>> matchingDatasets = new LinkedHashMap<>();
  296. Map<String, Map<String, Long>> frequencyOfTopFrequentItems = profile.getContentMetadata().getFrequencyOfTopFrequentItems();
  297. for (Entry<String, Map<String, Long>> frequencyOfTopFrequentItemsEntry : frequencyOfTopFrequentItems.entrySet()) {
  298. String columnName = frequencyOfTopFrequentItemsEntry.getKey();
  299. matchingDatasets.put(columnName, new LinkedHashMap<>());
  300. for (Entry<String, Long> singleItemEntry : frequencyOfTopFrequentItemsEntry.getValue().entrySet()) {
  301. Double topFrequentCount = (double) frequencyOfTopFrequentItemsEntry.getValue().size();
  302. MongoCollection<Document> entityCollection = MONGOCLIENT.getDatabase("entities").getCollection(singleItemEntry.getKey());
  303. MongoCursor<Document> cursor = entityCollection.find().iterator();
  304. try {
  305. while (cursor.hasNext()) {
  306. Document doc = cursor.next();
  307. String key = (String) doc.get("datasetAndColumn");
  308. Document datasetDetails = (Document) doc.get("details");
  309. Long occurrencies = (Long) datasetDetails.get("occurrencies");
  310. Integer occurrencyTopFrequentCount = (Integer) datasetDetails.get("topFrequentCount");
  311. if (!key.split(Pattern.quote("|%#-#%|"))[0].equals(fileName)) {
  312. Double value = 2*1/(topFrequentCount + (double) occurrencyTopFrequentCount);
  313. Map<String, Double> matchingMap = matchingDatasets.get(columnName);
  314. if (matchingMap.containsKey(key)) {
  315. Double prevValue = matchingMap.get(key);
  316. prevValue = prevValue + value;
  317. matchingMap.put(key, prevValue);
  318. }
  319. else {
  320. matchingMap.put(key, value);
  321. }
  322. matchingDatasets.put(columnName, matchingMap);
  323. }
  324.  
  325. }
  326. } finally {
  327. cursor.close();
  328. }
  329.  
  330. }
  331. }
  332. //sorting column's map
  333. for (Map.Entry<String, Map<String, Double>> entry : matchingDatasets.entrySet()) {
  334. Map<String, Double> unsortedMap = entry.getValue();
  335. List<Map.Entry<String, Double>> freqList = new ArrayList<>(unsortedMap.entrySet());
  336. Comparator<Map.Entry<String, Double>> freqComparator =
  337. (entry1, entry2) -> entry1.getValue().compareTo(entry2.getValue());
  338. Collections.sort(freqList, freqComparator.reversed());
  339. Map<String, Double> sortedMap = new LinkedHashMap<>();
  340. for(Entry<String, Double> sortedEntry : freqList) {
  341. sortedMap.put(sortedEntry.getKey(), sortedEntry.getValue());
  342. }
  343. matchingDatasets.put(entry.getKey(), sortedMap);
  344. }
  345. return matchingDatasets;
  346. }
  347.  
  348. public static Map<String, Map<String, Map<String, Long>>> findOntologiesMatchingInGDM(DatasetProfile profile) {
  349. String fileName = profile.getFileName().replace(".csv","");
  350. Map<String, Map<String, Map<String, Long>>> matchingDatasets = new LinkedHashMap<>();
  351. if (profile.getOntologyMetadata() == null) {
  352. log.info("Couldn't find ontology metadata for the given dataset " + fileName);
  353. return null;
  354. } else {
  355. Map<String, Map<String, Integer>> ontologiesMap = profile.getOntologyMetadata().getOntologies();
  356. for (Entry<String, Map<String, Integer>> ontologiesMapEntry : ontologiesMap.entrySet()) {
  357. String columnName = ontologiesMapEntry.getKey();
  358. matchingDatasets.put(columnName, new LinkedHashMap<>());
  359. for (Entry<String, Integer> singleOntologyEntry : ontologiesMapEntry.getValue().entrySet()) {
  360. MongoCollection<Document> ontologyCollection = GDMONTOLOGYDATABASE.getCollection(singleOntologyEntry.getKey());
  361. MongoCursor<Document> cursor = ontologyCollection.find().iterator();
  362. try {
  363. while (cursor.hasNext()) {
  364. Document doc = cursor.next();
  365. String key = (String) doc.get("datasetAndColumn");
  366. Document datasetDetails = (Document) doc.get("details");
  367. Long occurrencies = (Long) datasetDetails.get("occurrencies");
  368. if (!key.split(Pattern.quote("|%#-#%|"))[0].equals(fileName)) {
  369. Long value = (long) occurrencies + (long) singleOntologyEntry.getValue();
  370. Map<String, Map<String, Long>> matchingMap = matchingDatasets.get(columnName);
  371. if (matchingMap.containsKey(key)) {
  372. if (matchingMap.get(key).containsKey(singleOntologyEntry.getKey())) {
  373. Long prevValue = matchingMap.get(key).get(singleOntologyEntry.getKey());
  374. prevValue = prevValue + value;
  375. matchingMap.get(key).put(singleOntologyEntry.getKey(), prevValue);
  376. }
  377. else {
  378. matchingMap.get(key).put(singleOntologyEntry.getKey(), value);
  379. }
  380. }
  381. else {
  382. Map<String, Long> newMapOfEntologies = new LinkedHashMap<>();
  383. newMapOfEntologies.put(singleOntologyEntry.getKey(), value);
  384. matchingMap.put(key, newMapOfEntologies);
  385. }
  386. matchingDatasets.put(columnName, matchingMap);
  387. }
  388. }
  389. } finally {
  390. cursor.close();
  391. }
  392. }
  393. }
  394. }
  395. //sorting column's map
  396. for (Entry<String, Map<String, Map<String, Long>>> entry : matchingDatasets.entrySet()) {
  397. Map<String, Map<String, Long>> columnMatchings = entry.getValue();
  398. for (Entry<String, Map<String, Long>> matchings : columnMatchings.entrySet()) {
  399. Map<String, Long> unsortedMap = matchings.getValue();
  400. List<Map.Entry<String, Long>> ontologiesList = new ArrayList<>(unsortedMap.entrySet());
  401. Comparator<Map.Entry<String, Long>> freqComparator =
  402. (entry1, entry2) -> entry1.getValue().compareTo(entry2.getValue());
  403. Collections.sort(ontologiesList, freqComparator.reversed());
  404. Map<String, Long> sortedMap = new LinkedHashMap<>();
  405. for(Entry<String, Long> sortedEntry : ontologiesList) {
  406. sortedMap.put(sortedEntry.getKey(), sortedEntry.getValue());
  407. }
  408. columnMatchings.put(matchings.getKey(), sortedMap);
  409. }
  410. matchingDatasets.put(entry.getKey(), columnMatchings);
  411. }
  412. return matchingDatasets;
  413. }
  414.  
  415. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement