import drqa.tokenizers
drqa.tokenizers.set_default('corenlp_classpath', '/content/DrQA/coreNLP/*')

from drqa.tokenizers import CoreNLPTokenizer
tok = CoreNLPTokenizer()
print(tok.tokenize('hello world').words())  # Should complete immediately

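# The smoke test above should print ['hello', 'world'] once the CoreNLP JVM
# has started. An extra hedged check (Tokens.ngrams() and its arguments are
# assumed from DrQA's tokenizer interface, not shown in this paste):
# print(tok.tokenize('hello world').ngrams(n=2, as_strings=True))
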
#!/usr/bin/env python3
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Various retriever utilities."""

import regex
import unicodedata
import numpy as np
import scipy.sparse as sp
from sklearn.utils import murmurhash3_32


# ------------------------------------------------------------------------------
# Sparse matrix saving/loading helpers.
# ------------------------------------------------------------------------------


def save_sparse_csr(filename, matrix, metadata=None):
    data = {
        'data': matrix.data,
        'indices': matrix.indices,
        'indptr': matrix.indptr,
        'shape': matrix.shape,
        'metadata': metadata,
    }
    np.savez(filename, **data)

def load_sparse_csr(filename):
    # allow_pickle=True is needed on NumPy >= 1.16.3 because the metadata
    # entry is stored as a pickled Python object.
    loader = np.load(filename, allow_pickle=True)
    matrix = sp.csr_matrix((loader['data'], loader['indices'],
                            loader['indptr']), shape=loader['shape'])
    return matrix, loader['metadata'].item(0) if 'metadata' in loader else None

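# Quick round-trip sketch (not part of the original module; the /tmp path and
# toy matrix are made up for illustration): save a small CSR matrix together
# with a metadata dict, reload it, and check that both survive intact.
_demo = sp.csr_matrix(np.eye(3))
save_sparse_csr('/tmp/drqa_demo', _demo, metadata={'note': 'demo'})
_loaded, _meta = load_sparse_csr('/tmp/drqa_demo.npz')
assert (_loaded != _demo).nnz == 0 and _meta == {'note': 'demo'}
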

# ------------------------------------------------------------------------------
# Token hashing.
# ------------------------------------------------------------------------------


def hash(token, num_buckets):
    """Unsigned 32 bit murmurhash for feature hashing."""
    return murmurhash3_32(token, positive=True) % num_buckets

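# Illustrative sanity check (not in the original file): the hash is
# deterministic, so any unigram or bigram string always maps to the same
# bucket of a fixed-width feature vector. 2 ** 24 = 16777216 matches the
# hash=16777216 setting in the prebuilt TF-IDF filename loaded at the bottom
# of this paste.
assert hash('nobel prize', 2 ** 24) == hash('nobel prize', 2 ** 24)
assert 0 <= hash('nobel prize', 2 ** 24) < 2 ** 24
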

# ------------------------------------------------------------------------------
# Text cleaning.
# ------------------------------------------------------------------------------


STOPWORDS = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
    'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
    'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
    'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can',
    'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've',
    'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven',
    'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren',
    'won', 'wouldn', "'ll", "'re", "'ve", "n't", "'s", "'d", "'m", "''", "``"
}


def normalize(text):
    """Resolve different types of unicode encodings."""
    return unicodedata.normalize('NFD', text)


def filter_word(text):
    """Take out English stopwords, punctuation, and compound endings."""
    text = normalize(text)
    if regex.match(r'^\p{P}+$', text):
        return True
    if text.lower() in STOPWORDS:
        return True
    return False

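# A few illustrative cases (not in the original file): stopwords and pure
# punctuation are filtered out, ordinary content words are kept.
assert filter_word('the') is True        # stopword
assert filter_word('...') is True        # punctuation only
assert filter_word('Einstein') is False  # content word survives
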

def filter_ngram(gram, mode='any'):
    """Decide whether to keep or discard an n-gram.

    Args:
        gram: list of tokens (length N)
        mode: Option to throw out ngram if
          'any': any single token passes filter_word
          'all': all tokens pass filter_word
          'ends': book-ended by filterable tokens
    """
    filtered = [filter_word(w) for w in gram]
    if mode == 'any':
        return any(filtered)
    elif mode == 'all':
        return all(filtered)
    elif mode == 'ends':
        return filtered[0] or filtered[-1]
    else:
        raise ValueError('Invalid mode: %s' % mode)

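# Illustrative behaviour of the three modes (these checks are not in the
# original file): 'the cat' contains and is book-ended by a stopword, while
# 'nobel prize' passes every mode.
assert filter_ngram(['the', 'cat'], mode='any') is True    # discard
assert filter_ngram(['the', 'cat'], mode='all') is False   # keep
assert filter_ngram(['the', 'cat'], mode='ends') is True   # discard
assert filter_ngram(['nobel', 'prize'], mode='any') is False

# In DrQA's TF-IDF builder this function is typically supplied as the
# filter_fn of the tokenizer's ngrams() helper; the exact ngrams() signature
# is assumed here rather than shown in this paste:
# grams = tok.tokenize('the Nobel Prize in Physics').ngrams(
#     n=2, uncased=True, filter_fn=filter_ngram)
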
# load_sparse_csr returns a (matrix, metadata) tuple for the prebuilt
# Wikipedia TF-IDF index.
matrix, metadata = load_sparse_csr("drqa/data/wikipedia/docs-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz")
print(matrix)
print(metadata)