Guest User

Centroid Algorithm

a guest
Feb 12th, 2012
33
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. function ypred = predict(Xtrain, ytrain, Xtest)
  2.  
  3. docs = size(Xtrain,1);
  4. terms = size(Xtrain,2);
  5. classes = size(ytrain,2);
  6.  
  7. %% Находим классическую TF*IDF матрицу
  8.  
  9. fprintf('computing TF*IDF... '); tic;
  10.  
  11. tf = Xtrain;
  12. for i=1:docs    
  13.     tf(i,:) = tf(i,:)/sum(tf(i,:));
  14.     %fprintf('%d out of %d tf lines computed\n', i, docs);
  15. end
  16.  
  17. idf = zeros(1,terms);
  18. for i=1:terms
  19.     ni = sum(Xtrain(:,i)>0);
  20.     if (ni==0)
  21.         idf(i) = 0;
  22.     else
  23.         idf(i) = log(docs/ni);
  24.     end
  25. end
  26.  
  27. TFIDF = tf;
  28. for i=1:docs
  29.     TFIDF(i,:) = tf(i,:).*idf(1,:);
  30.     %fprintf('%d of %d TFIDF lines computed\n',i,docs);
  31. end
  32.  
  33. clear tf i idf ni;
  34.  
  35. toc;
  36.  
  37. %% Находим центры классов
  38.  
  39. fprintf('computing centroids... '); tic;
  40.  
  41. centroids = zeros(classes,terms);
  42. for k=1:classes
  43.     klass = ytrain(:,k)>0.5;
  44.     centroids(k,:) = sum(TFIDF(klass,:))/sum(klass);
  45.     %fprintf('%d out of %d centroids made\n', k, classes);
  46. end
  47. centroids = sparse(centroids);
  48.  
  49. clear k klass;
  50. clear TFIDF;
  51.  
  52. toc;
  53.  
  54. %% Относим вектора к близким по косинусной мере ценрам классов
  55.  
  56. threshold = 0.825;
  57.  
  58. fprintf('classifying... '); tic;
  59.  
  60. ypred = zeros(size(Xtest,1),classes);
  61. dists = zeros(classes,1);
  62. for i=1:size(Xtest,1)
  63.     for k=1:classes
  64.         dists(k) = cossim(Xtest(i,:)',centroids(k,:)');
  65.     end
  66.     ypred(i,dists<threshold)=1;
  67.     [~,I] = min(dists);
  68.     ypred(i,I)=1; % как минимум к одному точно
  69.     %fprintf('%d out of %d processed (%d)\n', i, size(Xtest,1), sum(ypred(i,:)));
  70. end
  71.  
  72. clear threshold dists i k I;
  73. clear centroids;
  74.  
  75. toc;
  76.  
  77. %% Прибираем за собой (имело смысл в скрипте до выделения функции)
  78. clear docs terms classes;
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound, or popup ads, we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×