Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
"""Compare the numerical rank of word-embedding matrices built from
schizophrenia-group vs. control-group transcripts and test whether the
mean ranks differ (independent two-sample t-test).
"""
import json

import nltk
import numpy as np
from nltk.corpus import stopwords
from scipy.stats import ttest_ind

# Parsed transcript data; entry 3 is one language subset of the
# inter-language analysis -- TODO confirm which language index 3 is.
with open('inter_language_analysis/parsed_data/parsed_data.json') as f:
    data = json.load(f)
data_br = data[3]

# Tokenize every document in each group ({'schiz': {...}, 'control': {...}}).
schiz = [nltk.word_tokenize(data_br['schiz'][k]) for k in data_br['schiz']]
control = [nltk.word_tokenize(data_br['control'][k]) for k in data_br['control']]

stopwords_en = set(stopwords.words('english'))

# Vocabulary over both groups; look up embeddings in MongoDB.
# NOTE(review): `col` is never defined in this script -- it must be a pymongo
# collection bound before this code runs; its documents appear to have the
# shape {'word': str, 'vector': list[float]}. Verify against the caller.
words = {w for doc in control + schiz for w in doc}
dic_word_vector = {r['word']: r['vector']
                   for r in col.find({'word': {'$in': sorted(words)}})}


def _doc_vectors(doc):
    """Embedding rows for a document: words sorted, stopwords and
    out-of-vocabulary words dropped."""
    return [np.array(dic_word_vector[w])
            for w in sorted(doc)
            if w in dic_word_vector and w not in stopwords_en]


control_vecs = [_doc_vectors(d) for d in control]
schiz_vecs = [_doc_vectors(d) for d in schiz]

# Numerical rank of (up to) the first 100 embedding rows per document.
# NOTE(review): a document with zero in-vocabulary words would make
# np.array(m) 1-D and the [:100, :] slice raise -- confirm inputs are non-empty.
rank_schiz = [np.linalg.matrix_rank(np.array(m)[:100, :], tol=.1)
              for m in schiz_vecs]
rank_control = [np.linalg.matrix_rank(np.array(m)[:100, :], tol=.1)
                for m in control_vecs]

# BUG FIX: the original called ttest_ind(norm_schiz, norm_control), but no
# `norm_*` variables exist anywhere in the script (NameError). The t-test is
# meant to compare the rank lists computed just above.
print('schiz,control,pval',
      np.mean(rank_schiz), np.mean(rank_control),
      ttest_ind(rank_schiz, rank_control)[1])
Advertisement
Add Comment
Please sign in to add a comment
Advertisement