Guest User

Untitled

a guest
Jan 18th, 2018
101
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.94 KB | None | 0 0
  1. import h5py
  2. import os
  3. import pickle
  4.  
  5. try:
  6. import gensim
  7. except ImportError:
  8. gensim = None
  9.  
  10.  
  class Word2VecLookup(object):
      """Disk-backed word -> vector lookup.

      A lookup directory holds two files:

      * ``db.h5py``    -- an HDF5 file with one dataset named ``word2vec``.
      * ``lookup.pkl`` -- a pickled dict mapping each word to its row index
        in that dataset.

      Use :meth:`create_db` to build such a directory from a word2vec binary
      file, then index an instance with a word or an iterable of words.
      """

      def __init__(self, dbpath):
          """Open the lookup directory at *dbpath*.

          Raises:
              TypeError: if either ``db.h5py`` or ``lookup.pkl`` is missing
                  from *dbpath*.
          """
          self.h5file = os.path.join(dbpath, "db.h5py")
          self.lookupfile = os.path.join(dbpath, "lookup.pkl")
          if not (os.path.exists(self.h5file) and
                  os.path.exists(self.lookupfile)):
              # TypeError is kept for backward compatibility with callers
              # that already catch it; the message now travels with the
              # exception instead of being printed.
              raise TypeError(
                  "Word2VecLookup directory is malformed. Please recreate "
                  "using Word2VecLookup.create_db"
              )
          # NOTE: pickle.load can execute arbitrary code; only open lookup
          # directories that come from a trusted source.
          with open(self.lookupfile, 'rb') as fd:
              self.lookup = pickle.load(fd)

      def _known_indices(self, items):
          """Return the dataset row index of every known word in *items*.

          A single ``str``/``bytes`` is treated as a one-word list. Words
          missing from ``self.lookup`` are skipped; order and duplicates of
          the remaining words are preserved.
          """
          if isinstance(items, (str, bytes)):
              items = [items]
          # Compare against None explicitly: row index 0 is a valid index
          # and must not be discarded as "falsy".
          return [
              idx for idx in map(self.lookup.get, items) if idx is not None
          ]

      def __getitem__(self, items):
          """Return the vectors for *items* (a word or an iterable of words).

          Unknown words are silently skipped, so the result has one row per
          *known* word, in the order the words were given (duplicates
          included). If no word is known, an empty slice of the dataset is
          returned.
          """
          row_ids = self._known_indices(items)
          # h5py list selection wants unique indices in increasing order, so
          # read the de-duplicated, sorted rows and restore order afterwards.
          unique_sorted = sorted(set(row_ids))
          with h5py.File(self.h5file, 'r') as f:
              dataset = f['word2vec']
              if not unique_sorted:
                  # Avoid depending on how h5py treats an empty index list.
                  return dataset[0:0]
              vectors = dataset[unique_sorted]
          position = {row: pos for pos, row in enumerate(unique_sorted)}
          return vectors[[position[row] for row in row_ids], ...]

      @staticmethod
      def create_db(word2vec_bin, dbpath):
          """Build a lookup directory at *dbpath* from a word2vec binary file.

          Writes ``lookup.pkl`` (word -> row index) and ``db.h5py`` (dataset
          ``word2vec`` holding the model's vector matrix). *dbpath* is
          created if it does not exist.

          Raises:
              ImportError: if gensim is not installed.
          """
          if gensim is None:
              # Previously this only printed and then failed with an
              # AttributeError on ``None.models``.
              raise ImportError(
                  "Cannot create h5db from word2vec binary file "
                  "without gensim installed"
              )
          model = gensim.models.KeyedVectors.load_word2vec_format(
              word2vec_bin,
              binary=True
          )
          os.makedirs(dbpath, exist_ok=True)
          h5file = os.path.join(dbpath, "db.h5py")
          lookupfile = os.path.join(dbpath, "lookup.pkl")

          # gensim >= 4 exposes ``key_to_index``/``vectors``; older releases
          # expose ``vocab``/``syn0``. Prefer the new names, fall back to the
          # old ones.
          key_to_index = getattr(model, "key_to_index", None)
          if key_to_index is not None:
              lookup = dict(key_to_index)
          else:
              lookup = {w: d.index for w, d in model.vocab.items()}
          vectors = getattr(model, "vectors", None)
          if vectors is None:
              vectors = model.syn0

          with open(lookupfile, 'wb') as fd:
              pickle.dump(lookup, fd)

          with h5py.File(h5file, 'w') as f:
              f.create_dataset("word2vec", data=vectors)
Add Comment
Please, Sign In to add comment