Untitled

from nltk.tokenize import word_tokenize
from itertools import combinations
from collections import Counter

# Build a word co-occurrence table for a few toy sentences: for every word,
# count how often each other word appears within `k` tokens on either side.

# `display` is only injected into the namespace by IPython/Jupyter; fall back
# to `print` so this cell also runs as a plain script.
try:
    display
except NameError:
    display = print

sentences = ['i go to london', 'you do not go to london', 'but london goes to you']
vocab = set(word_tokenize(' '.join(sentences)))
print('Vocabulary:\n', vocab, '\n')
token_sent_list = [word_tokenize(sen) for sen in sentences]
print('Each sentence in token form:\n', token_sent_list, '\n')

# One Counter per vocabulary word, keyed by every *other* vocabulary word.
co_occ = {ii: Counter({jj: 0 for jj in vocab if jj != ii}) for ii in vocab}
k = 2  # number of neighbours considered on each side of the centre word

for sen in token_sent_list:
    for pos, word in enumerate(sen):
        # Slicing clamps at the end of the sentence by itself; only the start
        # has to be clamped by hand so that a negative index does not wrap.
        window = Counter(sen[max(0, pos - k):pos + k + 1])
        # A word never counts as co-occurring with itself.
        del window[word]
        # Counter addition keeps only positive counts, so the zero entries
        # created above vanish for every word that has at least one neighbour.
        co_occ[word] += window

# Having final matrix in dict form lets you convert it to different python data structures
co_occ = {ii: dict(co_occ[ii]) for ii in vocab}
display(co_occ)

Vocabulary:
 {'london', 'but', 'goes', 'i', 'do', 'you', 'go', 'not', 'to'}

Each sentence in token form:
 [['i', 'go', 'to', 'london'], ['you', 'do', 'not', 'go', 'to', 'london'], ['but', 'london', 'goes', 'to', 'you']]

{'london': {'go': 2, 'to': 3, 'but': 1, 'goes': 1},
 'but': {'london': 1, 'goes': 1},
 'goes': {'london': 1, 'but': 1, 'you': 1, 'to': 1},
 'i': {'go': 1, 'to': 1},
 'do': {'you': 1, 'go': 1, 'not': 1},
 'you': {'do': 1, 'not': 1, 'goes': 1, 'to': 1},
 'go': {'london': 2, 'i': 1, 'to': 2, 'do': 1, 'not': 1},
 'not': {'do': 1, 'you': 1, 'go': 1, 'to': 1},
 'to': {'london': 3, 'i': 1, 'go': 2, 'not': 1, 'goes': 1, 'you': 1}}

import numpy as np
import pandas as pd

# Build a symmetric word-by-word co-occurrence matrix over a trailing sliding
# window and show it as a labelled DataFrame.

# `display` is only injected into the namespace by IPython/Jupyter; fall back
# to `print` so this cell also runs as a plain script.
try:
    display
except NameError:
    display = print


def build_cooccurrence_matrix(contexts, window_size):
    """Count sliding-window co-occurrences of words within each context.

    Every context is split on whitespace. For each word position the window
    consists of that word and up to ``window_size - 1`` words before it, and
    every ordered pair of words in the window (a word paired with itself
    included, so the diagonal is populated) is incremented by one. Pairs that
    stay together across consecutive windows are therefore counted more than
    once. Windows never span two contexts.

    Args:
        contexts: Iterable of whitespace-separated strings.
        window_size: Maximum number of words in a window; values below 1
            yield an all-zero matrix.

    Returns:
        A square ``pandas.DataFrame`` of float counts whose index and columns
        are the distinct words in order of first appearance.
    """
    tokenized = [ctx.split() for ctx in contexts]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # row/column layout is the same on every run (a set would not be).
    words = list(dict.fromkeys(w for tokens in tokenized for w in tokens))
    position = {w: i for i, w in enumerate(words)}
    counts = np.zeros((len(words), len(words)))

    for tokens in tokenized:
        for end in range(1, len(tokens) + 1):
            # Slicing per context means the window restarts at every context.
            window = [position[w] for w in tokens[max(0, end - window_size):end]]
            for row in window:
                for col in window:
                    counts[row, col] += 1

    return pd.DataFrame(counts, index=words, columns=words)


ctxs = [
    'krayyem like candy crush more then coffe',
    'krayyem plays candy crush all days',
    'krayyem do not invite his friends to play candy crush',
    'krayyem is smart',
]

nei_size = 3

mat = build_cooccurrence_matrix(ctxs, nei_size)
l_unique = list(mat.index)
display(mat)