Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from nltk.tokenize import word_tokenize
- from itertools import combinations
- from collections import Counter
# Build a word co-occurrence table: for every vocabulary word, count how often
# each other word appears within `k` tokens of it inside the same sentence.
sentences = ['i go to london', 'you do not go to london', 'but london goes to you']

token_sent_list = [word_tokenize(sen) for sen in sentences]

# Derive the vocabulary from the per-sentence tokens that are actually used as
# lookup keys below. Tokenising the joined text instead can yield different
# tokens (e.g. around sentence-final punctuation) and cause a KeyError.
vocab = {token for sen in token_sent_list for token in sen}

print('Vocabulary:\n', vocab, '\n')
print('Each sentence in token form:\n', token_sent_list, '\n')

# Every vocabulary word gets an entry, even if it never gains a neighbour.
co_occ = {word: Counter() for word in vocab}

k = 2  # context radius: number of tokens considered on each side of the centre word
for sen in token_sent_list:
    for ii, centre in enumerate(sen):
        # Slicing clamps at the end of the list, so only the left edge needs
        # an explicit bound (a negative start would wrap around).
        c = Counter(sen[max(0, ii - k):ii + k + 1])
        # A word is never counted as its own neighbour; this also drops any
        # repeated occurrence of the same word inside the window.
        del c[centre]
        co_occ[centre].update(c)

# Having final matrix in dict form lets you convert it to different python data structures
co_occ = {word: dict(co_occ[word]) for word in vocab}

# `display` only exists inside IPython/Jupyter; fall back to print elsewhere.
try:
    show = display
except NameError:
    show = print
show(co_occ)
- Vocabulary:
- {'london', 'but', 'goes', 'i', 'do', 'you', 'go', 'not', 'to'}
- Each sentence in token form:
- [['i', 'go', 'to', 'london'], ['you', 'do', 'not', 'go', 'to', 'london'], ['but', 'london', 'goes', 'to', 'you']]
- {'london': {'go': 2, 'to': 3, 'but': 1, 'goes': 1},
- 'but': {'london': 1, 'goes': 1},
- 'goes': {'london': 1, 'but': 1, 'you': 1, 'to': 1},
- 'i': {'go': 1, 'to': 1},
- 'do': {'you': 1, 'go': 1, 'not': 1},
- 'you': {'do': 1, 'not': 1, 'goes': 1, 'to': 1},
- 'go': {'london': 2, 'i': 1, 'to': 2, 'do': 1, 'not': 1},
- 'not': {'do': 1, 'you': 1, 'go': 1, 'to': 1},
- 'to': {'london': 3, 'i': 1, 'go': 2, 'not': 1, 'goes': 1, 'you': 1}}
- import numpy as np
- import pandas as pd
# Build a square co-occurrence matrix over all distinct words in `ctxs`.
# Each time a word is read, every ordered pair of words currently inside the
# sliding window (the last `nei_size` words of the same context) is incremented,
# including the diagonal (a word paired with itself).
ctxs = [
    'krayyem like candy crush more then coffe',
    'krayyem plays candy crush all days',
    'krayyem do not invite his friends to play candy crush',
    'krayyem is smart',
]
nei_size = 3

# Sorted labels give the same row/column order on every run (plain set
# iteration order for strings varies between interpreter runs).
l_unique = sorted({word for ctx in ctxs for word in ctx.split()})
# Word -> row/column position, so lookups are O(1) instead of list.index scans.
word_pos = {word: pos for pos, word in enumerate(l_unique)}

mat = np.zeros((len(l_unique), len(l_unique)))
for ctx in ctxs:
    # Start a fresh window per context so the tail of one sentence does not
    # co-occur with the head of the next.
    nei = []
    for word in ctx.split():
        nei.append(word)
        if len(nei) > nei_size:
            nei.pop(0)
        positions = [word_pos[w] for w in nei]
        for row in positions:
            for col in positions:
                # add `if row != col` here to leave the diagonal at zero
                mat[row, col] += 1

mat = pd.DataFrame(mat, index=l_unique, columns=l_unique)

# `display` only exists inside IPython/Jupyter; fall back to print elsewhere.
try:
    show = display
except NameError:
    show = print
show(mat)
Add Comment
Please, Sign In to add comment