# Centroid Algorithm

a guest
Feb 12th, 2012
33
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
function ypred = predict(Xtrain, ytrain, Xtest, threshold)
%PREDICT Multi-label centroid classifier built on TF*IDF-weighted training data.
%
%   ypred = predict(Xtrain, ytrain, Xtest)
%   ypred = predict(Xtrain, ytrain, Xtest, threshold)
%
%   Inputs:
%     Xtrain    - docs-by-terms matrix of training term weights/counts.
%     ytrain    - docs-by-classes label matrix; document i belongs to class k
%                 when ytrain(i,k) > 0.5.
%     Xtest     - matrix with one row per test document and the same columns
%                 as Xtrain.
%     threshold - optional cut-off applied to the value returned by cossim;
%                 defaults to 0.825 when omitted or empty.
%
%   Output:
%     ypred     - size(Xtest,1)-by-classes matrix of 0/1. ypred(i,k) is 1 when
%                 cossim(test row i, centroid k) < threshold, and also for the
%                 class with the smallest cossim value, so every test row gets
%                 at least one class.
%
%   Requires the helper function cossim(a, b) (two column vectors), which is
%   not defined in this file.

if nargin < 4 || isempty(threshold)
    threshold = 0.825;
end

docs = size(Xtrain,1);
terms = size(Xtrain,2);
classes = size(ytrain,2);

%% Compute the classic TF*IDF matrix

fprintf('computing TF*IDF... '); tic;

% Document frequency of every term; terms that never occur keep idf = 0.
docFreq = full(sum(Xtrain > 0, 1));
idf = zeros(1,terms);
seen = docFreq > 0;
idf(seen) = log(docs ./ docFreq(seen));

% Row-normalise each document (TF) and weight it by IDF in a single pass.
TFIDF = Xtrain;
for i = 1:docs
    rowTotal = sum(Xtrain(i,:));
    if rowTotal ~= 0
        TFIDF(i,:) = (Xtrain(i,:) / rowTotal) .* idf;
    else
        % An empty document would otherwise become 0/0 = NaN and poison the
        % centroid of every class it belongs to; give it zero weight instead.
        TFIDF(i,:) = 0;
    end
end

toc;

%% Compute the class centroids

fprintf('computing centroids... '); tic;

centroids = zeros(classes,terms);
for k = 1:classes
    members = ytrain(:,k) > 0.5;
    % Sum explicitly along dimension 1: with a single member the selection is
    % a row vector and a plain sum() would collapse it to a scalar.
    % A class with no members still yields a NaN centroid (0/0).
    centroids(k,:) = sum(TFIDF(members,:), 1) / sum(members);
end

% Free the (potentially large) weighted training matrix before classifying.
clear TFIDF;

% Store centroids column-wise so each one is handed to cossim as a column
% vector without transposing inside the inner loop.
centroidCols = sparse(centroids)';
clear centroids;

toc;

%% Assign test vectors to class centroids that are close by the cosine measure

fprintf('classifying... '); tic;

numTest = size(Xtest,1);
ypred = zeros(numTest,classes);
dists = zeros(classes,1);
for i = 1:numTest
    % NOTE(review): Xtest rows are compared as-is, without TF*IDF weighting.
    testVec = Xtest(i,:)';
    for k = 1:classes
        dists(k) = cossim(testVec, centroidCols(:,k));
    end
    % NOTE(review): the rule below treats the cossim value as a distance
    % (smaller = closer). If cossim returns a similarity, the comparison and
    % the min() should be inverted - confirm against cossim's definition.
    ypred(i, dists < threshold) = 1;
    [~, nearest] = min(dists);
    ypred(i, nearest) = 1; % always assign at least one class
end

toc;
RAW Paste Data