Advertisement
Guest User

idea_codigo_rango_matrix

a guest
Jan 24th, 2020
111
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.16 KB | None | 0 0
  1. import nltk
  2. import json
  3. from nltk.corpus import stopwords
  4. import numpy as np
  5. from scipy.stats import ttest_ind
  6.  
  7. data = json.load(open('inter_language_analysis/parsed_data/parsed_data.json'))
  8.  
  9. data_br= data[3]
  10. schiz = [nltk.word_tokenize(data_br['schiz'][k]) for k in data_br['schiz']]
  11. control = [nltk.word_tokenize(data_br['control'][k]) for k in data_br['control']]
  12.  
  13. from nltk.corpus import stopwords
  14. stopwords_en = set(stopwords.words('english'))
  15.  
  16. words = set([w for c in control+schiz for w in c])
  17. dic_word_vector = {r['word']:r['vector'] for r in col.find({'word':{'$in':sorted(words)}})}
  18.  
  19.  
  20.  
  21. control_vecs = [ [np.array(dic_word_vector[w]) for w in sorted((c)) if w in dic_word_vector and not w in stopwords_en]for c in control]
  22.  
  23.  
  24. schiz_vecs = [ [np.array(dic_word_vector[w]) for w in sorted((c)) if w in dic_word_vector and not w in stopwords_en]for c in schiz]
  25.  
  26.  
  27.  
  28. rank_schiz = [np.linalg.matrix_rank(np.array(m)[:100,:],tol=.1) for m in schiz_vecs]
  29.  
  30. rank_control = [np.linalg.matrix_rank(np.array(m)[:100,:],tol=.1) for m in control_vecs]
  31.  
  32. print('schiz,control,pval', np.mean(rank_schiz), np.mean(rank_control),ttest_ind(norm_schiz,norm_control)[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement