Advertisement
Guest User

Centroid Algorithm

a guest
Feb 12th, 2012
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
MatLab 1.79 KB | None | 0 0
  function ypred = predict(Xtrain, ytrain, Xtest, threshold)
  %PREDICT Multi-label centroid classifier over TF*IDF-weighted documents.
  %   YPRED = PREDICT(XTRAIN, YTRAIN, XTEST) builds one centroid per class
  %   from the TF*IDF-weighted rows of XTRAIN and labels every row of XTEST
  %   with each class whose centroid is closer than the default threshold
  %   (0.825). Every test row receives at least one label: the class with
  %   the smallest distance.
  %
  %   YPRED = PREDICT(XTRAIN, YTRAIN, XTEST, THRESHOLD) uses THRESHOLD
  %   instead of the default. Passing [] selects the default.
  %
  %   Inputs:
  %     Xtrain    - docs-by-terms matrix of term weights/counts.
  %     ytrain    - docs-by-classes indicator matrix; an entry > 0.5 marks
  %                 the document as a member of that class.
  %     Xtest     - matrix with one row per document to classify and the
  %                 same number of columns as Xtrain.
  %     threshold - (optional) scalar cut-off applied to the values returned
  %                 by COSSIM.
  %
  %   Output:
  %     ypred     - size(Xtest,1)-by-classes matrix of zeros and ones.
  %
  %   Relies on the external helper COSSIM(a, b), called with two column
  %   vectors. This function treats its result as a distance (smaller means
  %   closer).
  %   NOTE(review): the helper's name suggests a similarity; confirm that
  %   it really returns a distance, otherwise the comparisons below are
  %   inverted.
  %   NOTE(review): test rows are passed to COSSIM as given (no IDF
  %   weighting) while the centroids live in TF*IDF space; confirm this is
  %   intended.

  if nargin < 4 || isempty(threshold)
      threshold = 0.825;
  end

  docs = size(Xtrain,1);
  terms = size(Xtrain,2);
  classes = size(ytrain,2);

  % A logical index shorter than the matrix is silently accepted by MATLAB,
  % so a mismatched ytrain would otherwise produce wrong centroids quietly.
  if size(ytrain,1) ~= docs
      error('predict:sizeMismatch', ...
            'ytrain must have %d rows (one per row of Xtrain), got %d.', ...
            docs, size(ytrain,1));
  end
  if ~isempty(Xtest) && size(Xtest,2) ~= terms
      error('predict:sizeMismatch', ...
            'Xtest must have %d columns (same as Xtrain), got %d.', ...
            terms, size(Xtest,2));
  end

  %% Compute the classic TF*IDF matrix

  fprintf('computing TF*IDF... '); tic;

  % Inverse document frequency; terms that occur in no document get 0.
  docFreq = full(sum(Xtrain > 0, 1));
  idf = zeros(1,terms);
  seen = docFreq > 0;
  idf(seen) = log(docs ./ docFreq(seen));

  TFIDF = Xtrain;
  for i = 1:docs
      total = full(sum(Xtrain(i,:)));
      if total == 0
          % Empty document: keep an all-zero row instead of the 0/0 = NaN
          % row that would turn every centroid containing it into NaN.
          TFIDF(i,:) = 0;
      else
          TFIDF(i,:) = (Xtrain(i,:) / total) .* idf;
      end
  end

  clear idf docFreq seen total i;

  toc;

  %% Compute the class centroids

  fprintf('computing centroids... '); tic;

  centroids = zeros(classes,terms);
  hasDocs = false(classes,1);
  for k = 1:classes
      klass = ytrain(:,k) > 0.5;
      members = nnz(klass);
      if members > 0
          % Sum explicitly along dimension 1: with a single member the
          % operand is a row vector and a plain sum() would collapse it to
          % a scalar.
          centroids(k,:) = sum(TFIDF(klass,:), 1) / members;
          hasDocs(k) = true;
      end
  end
  % Store one centroid per column so the classification loop slices
  % columns and does not transpose on every call.
  centroidCols = sparse(centroids)';

  % Release the large intermediates before the classification loop.
  clear TFIDF centroids klass members k;

  toc;

  %% Assign vectors to the class centroids that are close by cosine measure

  fprintf('classifying... '); tic;

  ypred = zeros(size(Xtest,1),classes);
  dists = zeros(classes,1);
  for i = 1:size(Xtest,1)
      doc = Xtest(i,:)';
      for k = 1:classes
          if hasDocs(k)
              dists(k) = cossim(doc, centroidCols(:,k));
          else
              % Class without training documents: NaN never passes the
              % threshold test and is ignored by min().
              dists(k) = NaN;
          end
      end
      ypred(i,dists<threshold)=1;
      [~,I] = min(dists);
      ypred(i,I)=1; % at least one class for sure
  end

  toc;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement