Advertisement
Guest User

Untitled

a guest
Apr 26th, 2018
186
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.10 KB | None | 0 0
  1. package org.aksw.ocelot.core.indexsearch;
  2.  
  3. import java.util.HashMap;
  4. import java.util.HashSet;
  5. import java.util.Iterator;
  6. import java.util.Map;
  7. import java.util.Map.Entry;
  8. import java.util.Set;
  9. import java.util.concurrent.CompletionService;
  10. import java.util.concurrent.ConcurrentHashMap;
  11. import java.util.concurrent.ExecutionException;
  12. import java.util.concurrent.ExecutorCompletionService;
  13. import java.util.concurrent.ExecutorService;
  14. import java.util.concurrent.Executors;
  15. import java.util.concurrent.Future;
  16. import java.util.concurrent.TimeUnit;
  17. import java.util.regex.Pattern;
  18.  
  19. import org.aksw.ocelot.data.Const;
  20. import org.aksw.ocelot.share.EnumSolrWikiIndex;
  21. import org.apache.log4j.LogManager;
  22. import org.apache.log4j.Logger;
  23. import org.apache.solr.common.SolrDocument;
  24.  
  25. /**
  26. *
  27. * @author Ren&eacute; Speck <speck@informatik.uni-leipzig.de>
  28. *
  29. */
  30. public class WikipediaCorpus implements ICorpus {
  31.  
  32. protected static Logger LOG = LogManager.getLogger(WikipediaCorpus.class);
  33.  
  34. protected static String replace = "https://en.wikipedia.org/wiki?curid=";
  35. protected WikipediaIndex wikipediaIndex = new WikipediaIndex();
  36.  
  37. @Override
  38. public String getSentence(final String id) {
  39. final Set<SolrDocument> set = wikipediaIndex.search(replace + id, EnumSolrWikiIndex.ID);
  40. if (set.size() > 1) {
  41. LOG.warn("Found more than one result to the given id!");
  42. }
  43. if (set.size() > 0) {
  44. return (String) set.iterator().next().getFieldValue(EnumSolrWikiIndex.SENTENCE.getName());
  45. }
  46. return null;
  47. }
  48.  
  49. @Override
  50. public CorpusElement getSolrDocument(final String id) {
  51. final Set<SolrDocument> set = wikipediaIndex.search(replace + id, EnumSolrWikiIndex.ID);
  52. if (set.size() > 1) {
  53. LOG.warn("Found more than one result to the given id!");
  54. }
  55. if (set.size() > 0) {
  56. return new CorpusElement(set.iterator().next());
  57. }
  58. return null;
  59. }
  60.  
  61. @Override
  62. public Set<String> sentenceIDs(final String sf, final String domain, final String range) {
  63. final Set<SolrDocument> set = wikipediaIndex.searchCandidate(sf, domain, range);
  64. return getIdsToDocs(set).keySet();
  65. }
  66.  
  67. @Override
  68. public Map<String, Set<String>> sentenceIDs(final Set<String> sfs, final String domain,
  69. final String range) {
  70.  
  71. final int size = sfs.size();
  72. int current = 0;
  73. LOG.info("Surfaceforms size: " + sfs.size());
  74.  
  75. final Map<String, Set<String>> sfToSentenceIDs = new ConcurrentHashMap<>();
  76. // search each surfaceform in the index
  77. final ExecutorService executorService = Executors.newFixedThreadPool(Const.searchThreadsSF);
  78. final CompletionService<Set<String>> completionService =
  79. new ExecutorCompletionService<>(executorService);
  80.  
  81. final Map<Future<Set<String>>, String> futures = new HashMap<>();
  82.  
  83. int i = 0;
  84. for (final String sf : sfs) {
  85. if (sfToSentenceIDs.get(sf) == null) {
  86. futures.put(completionService.submit(() -> {
  87. return sentenceIDs(sf, domain, range);
  88. }), sf);
  89. i++;
  90. }
  91. }
  92. executorService.shutdown();
  93.  
  94. for (int iii = 0; iii < i; ++iii) {
  95. try {
  96. final Future<Set<String>> future =
  97. completionService.poll(Const.searchTimeoutSF, TimeUnit.SECONDS);
  98. if (future == null) {
  99. LOG.warn("Timeout ...");
  100. } else {
  101. final String sf = futures.get(future);
  102. final Set<String> sentenceIDs = future.get();
  103. sfToSentenceIDs.put(sf, sentenceIDs);
  104. ++current;
  105.  
  106. if ((current % (size / 4)) == 0) {
  107. LOG.info(current + "/" + size + " surfaceforms done.");
  108. }
  109. }
  110. } catch (InterruptedException | ExecutionException e) {
  111. LOG.error(e.getLocalizedMessage(), e);
  112. }
  113. }
  114. // remove empty once
  115. for (final Iterator<Entry<String, Set<String>>> iter =
  116. sfToSentenceIDs.entrySet().iterator(); iter.hasNext();) {
  117. if (iter.next().getValue().isEmpty()) {
  118. iter.remove();
  119. }
  120. }
  121. LOG.info("sfToSentenceIDs: " + sfToSentenceIDs.size());
  122. return sfToSentenceIDs;
  123. }
  124.  
  125. public Set<String> getDocumentIds(final Set<SolrDocument> docs) {
  126. final Set<String> ids = new HashSet<>();
  127. docs.forEach(doc -> ids.add(((String) doc.getFieldValue(EnumSolrWikiIndex.ID.getName()))
  128. .replaceAll(Pattern.quote(replace), "")));
  129. return ids;
  130. }
  131.  
  132. public Map<String, SolrDocument> getIdsToDocs(final Set<SolrDocument> docs) {
  133. final Map<String, SolrDocument> ids = new HashMap<>();
  134. docs.forEach(doc -> ids.put(((String) doc.getFieldValue(EnumSolrWikiIndex.ID.getName()))
  135. .replaceAll(Pattern.quote(replace), ""), doc));
  136. return ids;
  137. }
  138.  
  139. /**
  140. * Searches for a surfaceform in the index to find candidate sentences. <br>
  141. * Returns a map of document IDs and document candidates.
  142. *
  143. * @param sf surfaceform
  144. * @return map of document IDs and candidates
  145. */
  146. public Map<String, SolrDocument> candidatesMap(final String sf) {
  147. final Set<SolrDocument> set = wikipediaIndex.search(sf, EnumSolrWikiIndex.SENTENCE);
  148. return getIdsToDocs(set);
  149. }
  150. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement