Guest User

Untitled

a guest
Oct 22nd, 2018
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.45 KB | None | 0 0
  1. from nltk.tokenize import word_tokenize
  2. from itertools import combinations
  3. from collections import Counter
  4.  
  5. sentences = ['i go to london', 'you do not go to london','but london goes to you']
  6. vocab = set(word_tokenize(' '.join(sentences)))
  7. print('Vocabulary:n',vocab,'n')
  8. token_sent_list = [word_tokenize(sen) for sen in sentences]
  9. print('Each sentence in token form:n',token_sent_list,'n')
  10.  
  11. co_occ = {ii:Counter({jj:0 for jj in vocab if jj!=ii}) for ii in vocab}
  12. k=2
  13.  
  14. for sen in token_sent_list:
  15. for ii in range(len(sen)):
  16. if ii < k:
  17. c = Counter(sen[0:ii+k+1])
  18. del c[sen[ii]]
  19. co_occ[sen[ii]] = co_occ[sen[ii]] + c
  20. elif ii > len(sen)-(k+1):
  21. c = Counter(sen[ii-k::])
  22. del c[sen[ii]]
  23. co_occ[sen[ii]] = co_occ[sen[ii]] + c
  24. else:
  25. c = Counter(sen[ii-k:ii+k+1])
  26. del c[sen[ii]]
  27. co_occ[sen[ii]] = co_occ[sen[ii]] + c
  28.  
  29. # Having final matrix in dict form lets you convert it to different python data structures
  30. co_occ = {ii:dict(co_occ[ii]) for ii in vocab}
  31. display(co_occ)
  32.  
  33. Vocabulary:
  34. {'london', 'but', 'goes', 'i', 'do', 'you', 'go', 'not', 'to'}
  35.  
  36. Each sentence in token form:
  37. [['i', 'go', 'to', 'london'], ['you', 'do', 'not', 'go', 'to', 'london'], ['but', 'london', 'goes', 'to', 'you']]
  38.  
  39. {'london': {'go': 2, 'to': 3, 'but': 1, 'goes': 1},
  40. 'but': {'london': 1, 'goes': 1},
  41. 'goes': {'london': 1, 'but': 1, 'you': 1, 'to': 1},
  42. 'i': {'go': 1, 'to': 1},
  43. 'do': {'you': 1, 'go': 1, 'not': 1},
  44. 'you': {'do': 1, 'not': 1, 'goes': 1, 'to': 1},
  45. 'go': {'london': 2, 'i': 1, 'to': 2, 'do': 1, 'not': 1},
  46. 'not': {'do': 1, 'you': 1, 'go': 1, 'to': 1},
  47. 'to': {'london': 3, 'i': 1, 'go': 2, 'not': 1, 'goes': 1, 'you': 1}}
  48.  
  49. import numpy as np
  50. import pandas as pd
  51.  
  52. ctxs = [
  53. 'krayyem like candy crush more then coffe',
  54. 'krayyem plays candy crush all days',
  55. 'krayyem do not invite his friends to play candy crush',
  56. 'krayyem is smart',
  57. ]
  58.  
  59. l_unique = list(set((' '.join(ctxs)).split(' ')))
  60. mat = np.zeros((len(l_unique), len(l_unique)))
  61.  
  62. nei = []
  63. nei_size = 3
  64.  
  65. for ctx in ctxs:
  66. for word in ctx.split(' '):
  67. nei.append(word)
  68. if len(nei) > nei_size:
  69. nei.pop(0)
  70. for word_1 in nei:
  71. for word_2 in nei:
  72. # if word_1 != word_2 -> to avoid diagonal
  73. mat[l_unique.index(word_1), l_unique.index(word_2)] += 1
  74.  
  75. mat = pd.DataFrame(mat)
  76. mat.index = l_unique
  77. mat.columns = l_unique
  78. display(mat)
Add Comment
Please, Sign In to add comment