Advertisement
Guest User

Untitled

a guest
Nov 26th, 2014
150
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 21.33 KB | None | 0 0
  1. % for thisFold = 1:5
  2. % cd('C:\Users\ryan fitchett\Desktop\IR cw');
  3. % %define directory
  4. % thisDir = ['fold' num2str(thisFold)];
  5. % cmd = ['cd ' thisDir];
  6. % eval(cmd);
  7. %
  8. % %load training for given fold
  9. % temp = dir('Counts-Folds*.mat');
  10. % countsFile = temp.name;
  11. % load(countsFile);
  12. %
  13. % % get names of true and deceptive files.
  14. % trueFiles = dir('t_*.oneline');
  15. % decepFiles = dir('d_*.oneline');
  16. %
  17. % truthTable = table(trainTerms', 'VariableNames', {'Terms'});
  18. % falseTable = table(trainTerms', 'VariableNames', {'Terms'});
  19. %
  20. % for f = 1:size(trueFiles, 1)
  21. % % get the filename
  22. % thisFile = trueFiles(f).name;
  23. %
  24. % % open the file
  25. % fileName = fopen(thisFile);
  26. %
  27. % % get the first line(one term per line)
  28. % line = fgetl(fileName);
  29. %
  30. % % add first term to vector
  31. % trueTerms = {};
  32. % trueTerms{end+1} = line;
  33. %
  34. % % while there are more lines(terms) in the file
  35. % while ischar(line)
  36. % % get the next line(term)
  37. % line = fgetl(fileName);
  38. %
  39. % % if the line isn't blank and the end hasn't been reached
  40. % add the term to the term vector.
  41. % if(ischar(line) & (line ~= ' '))
  42. % trueTerms{end+1} = line;
  43. % end
  44. % end
  45. %
  46. % [trueTerms, ~, a] = unique(trueTerms);
  47. % occurrenceTrue = accumarray(a,1);
  48. % tempTable = table(trueTerms', occurrenceTrue, 'VariableNames', {'Terms', 'Count'});
  49. % truthTable = outerjoin(truthTable, tempTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  50. %
  51. % %close the file
  52. % fclose(fileName);
  53. % end
  54. %
  55. % for f = 1:size(decepFiles, 1)
  56. % % get the filename
  57. % thisFile = decepFiles(f).name;
  58. %
  59. % % open the file
  60. % fileName = fopen(thisFile);
  61. %
  62. % % get the first line(one term per line)
  63. % line = fgetl(fileName);
  64. %
  65. % % add first term to vector
  66. % falseTerms = {};
  67. % falseTerms{end+1} = line;
  68. %
  69. % % while there are more lines(terms) in the file
  70. % while ischar(line)
  71. % % get the next line(term)
  72. % line = fgetl(fileName);
  73. %
  74. % % if the line isn't blank and the end hasn't been reached
  75. % add the term to the term vector.
  76. % if(ischar(line) & (line ~= ' '))
  77. % falseTerms{end+1} = line;
  78. % end
  79. % end
  80. %
  81. % [falseTerms, ~, a] = unique(falseTerms);
  82. % occurrenceFalse = accumarray(a,1);
  83. % tempTable = table(falseTerms', occurrenceFalse, 'VariableNames', {'Terms', 'Count'});
  84. % falseTable = outerjoin(falseTable, tempTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  85. %
  86. % %close the file
  87. % fclose(fileName);
  88. % end
  89. %
  90. % cd('C:\Users\ryan fitchett\Desktop\IR cw');
  91. % fileName = ['trueFold' num2str(thisFold) '.mat'];
  92. % save(fileName, 'truthTable');
  93. % fileName = ['decepFold' num2str(thisFold) '.mat'];
  94. % save(fileName, 'falseTable');
  95. % end
  96. % clear
  97. count = 0;
  98.  
  99. for thisFold = 1:5
  100.  
  101.  
  102. %define directory
  103. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  104. thisDir = ['fold' num2str(thisFold)];
  105. cmd = ['cd ' thisDir];
  106. eval(cmd);
  107.  
  108. %load training for given fold
  109. temp = dir('Counts-Folds*.mat');
  110. countsFile = temp.name;
  111. load(countsFile);
  112.  
  113. truthTrainTable = table(trainTerms', 'VariableNames', {'Terms'});
  114. falseTrainTable = table(trainTerms', 'VariableNames', {'Terms'});
  115. truthTestTable = table(trainTerms', 'VariableNames', {'Terms'});
  116. falseTestTable = table(trainTerms', 'VariableNames', {'Terms'});
  117.  
  118. switch thisFold
  119.  
  120. case 1
  121. %Load in truthful information
  122.  
  123. %load in test data
  124. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  125. load('trueFold1.mat');
  126. truthTestTable = truthTable;
  127. %load in training data
  128. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  129. load('trueFold2.mat');
  130. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  131. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  132. load('trueFold3.mat');
  133. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  134. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  135. load('trueFold4.mat');
  136. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  137. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  138. load('trueFold5.mat');
  139. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  140.  
  141.  
  142. %Load in decepful information
  143.  
  144. %load in test data
  145. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  146. load('decepFold1.mat');
  147. falseTestTable = falseTable;
  148. %load in training data
  149. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  150. load('decepFold2.mat');
  151. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  152. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  153. load('decepFold3.mat');
  154. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  155. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  156. load('decepFold4.mat');
  157. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  158. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  159. load('decepFold5.mat');
  160. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  161.  
  162. case 2
  163. %load in test data
  164. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  165. load('trueFold2.mat');
  166. truthTestTable = truthTable;
  167. %load in training data
  168. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  169. load('trueFold1.mat');
  170. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  171. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  172. load('trueFold3.mat');
  173. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  174. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  175. load('trueFold4.mat');
  176. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  177. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  178. load('trueFold5.mat');
  179. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  180.  
  181. %Load in decepful information
  182.  
  183. %load in test data
  184. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  185. load('decepFold2.mat');
  186. falseTestTable = falseTable;
  187. %load in training data
  188. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  189. load('decepFold1.mat');
  190. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  191. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  192. load('decepFold3.mat');
  193. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  194. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  195. load('decepFold4.mat');
  196. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  197. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  198. load('decepFold5.mat');
  199. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  200. case 3
  201. %load in test data
  202. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  203. load('trueFold3.mat');
  204. truthTestTable = truthTable;
  205. %load in training data
  206. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  207. load('trueFold1.mat');
  208. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  209. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  210. load('trueFold2.mat');
  211. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  212. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  213. load('trueFold4.mat');
  214. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  215. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  216. load('trueFold5.mat');
  217. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  218.  
  219. %Load in decepful information
  220.  
  221. %load in test data
  222. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  223. load('decepFold3.mat');
  224. falseTestTable = falseTable;
  225. %load in training data
  226. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  227. load('decepFold1.mat');
  228. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  229. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  230. load('decepFold2.mat');
  231. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  232. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  233. load('decepFold4.mat');
  234. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  235. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  236. load('decepFold5.mat');
  237. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  238. case 4
  239. %load in test data
  240. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  241. load('trueFold4.mat');
  242. truthTestTable = truthTable;
  243. %load in training data
  244. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  245. load('trueFold1.mat');
  246. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  247. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  248. load('trueFold2.mat');
  249. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  250. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  251. load('trueFold3.mat');
  252. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  253. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  254. load('trueFold5.mat');
  255. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  256.  
  257. %Load in decepful information
  258.  
  259. %load in test data
  260. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  261. load('decepFold4.mat');
  262. falseTestTable = falseTable;
  263. %load in training data
  264. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  265. load('decepFold1.mat');
  266. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  267. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  268. load('decepFold2.mat');
  269. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  270. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  271. load('decepFold3.mat');
  272. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  273. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  274. load('decepFold5.mat');
  275. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  276. case 5
  277. %load in test data
  278. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  279. load('trueFold5.mat');
  280. truthTestTable = truthTable;
  281. %load in training data
  282. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  283. load('trueFold1.mat');
  284. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  285. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  286. load('trueFold2.mat');
  287. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  288. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  289. load('trueFold3.mat');
  290. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  291. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  292. load('trueFold4.mat');
  293. truthTrainTable = outerjoin(truthTrainTable, truthTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  294.  
  295. %Load in decepful information
  296.  
  297. %load in test data
  298. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  299. load('decepFold5.mat');
  300. falseTestTable = falseTable;
  301. %load in training data
  302. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  303. load('decepFold1.mat');
  304. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  305. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  306. load('decepFold2.mat');
  307. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  308. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  309. load('decepFold3.mat');
  310. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  311. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  312. load('decepFold4.mat');
  313. falseTrainTable = outerjoin(falseTrainTable, falseTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  314. end
  315.  
  316. %find all the terms accross all 5 folds.
  317. %testTerms are the terms from the current fold
  318. %trainTerms are the terms from the other 4 folds.
  319. allTermsTable = unique(vertcat(truthTrainTable(:,1), falseTrainTable(:,1), truthTestTable(:,1), falseTestTable(:,1)));
  320.  
  321. %TrueTraining
  322. truthTrainSet = outerjoin(truthTrainTable, allTermsTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  323. truthTrainSet(:,1) = [];
  324. truthTrainSet = cell2mat(table2cell(truthTrainSet));
  325. truthTrainSet(isnan(truthTrainSet)) = 0;
  326.  
  327. idx = truthTrainSet ~= 0;
  328. trueCounts = sum(idx,2);
  329.  
  330. %FalseTraining
  331. falseTrainSet = outerjoin(falseTrainTable, allTermsTable, 'Keys', {'Terms'}, 'MergeKeys', true);
  332. falseTrainSet(:,1) = [];
  333. falseTrainSet = cell2mat(table2cell(falseTrainSet));
  334. falseTrainSet(isnan(falseTrainSet)) = 0;
  335.  
  336. idx = falseTrainSet ~= 0;
  337. falseCounts = sum(idx,2);
  338.  
  339. allTrainSet = table();
  340. allTrainSet(:,end+1) = allTermsTable(:,1);
  341. allTrainSet(:,end+1) = num2cell(trueCounts(:,1));
  342. allTrainSet(:,end+1) = num2cell(falseCounts(:,1));
  343.  
  344. noTrueDocs = size(truthTrainSet,2) - 1;
  345. noFalseDocs = size(falseTrainSet,2) - 1;
  346. %classifications in here
  347.  
  348. %define directory
  349. cd('C:\Users\ryan fitchett\Desktop\IR cw');
  350. thisDir = ['fold' num2str(thisFold)];
  351. cmd = ['cd ' thisDir];
  352. eval(cmd);
  353.  
  354. % get names of true and deceptive files.
  355. trueFiles = dir('t_*.oneline');
  356. decepFiles = dir('d_*.oneline');
  357.  
  358.  
  359. %read contents of true files.
  360. for f = 1:size(trueFiles, 1)
  361.  
  362. Qtrue = [];
  363. Qfalse = [];
  364.  
  365. thisFile = trueFiles(f).name;
  366. %open the file to be read
  367. fileName = fopen(thisFile);
  368. %get the first line (one term per line)
  369. line = fgetl(fileName);
  370.  
  371. %get the value representing true in the training set for the given
  372. %word of the document.
  373. temp2 = cell2table({line});
  374. temp2.Properties.VariableNames{1} = 'Terms';
  375. temp = find(ismember(allTermsTable, temp2, 'rows'),1);
  376. temp2 = cell2mat(table2cell(allTrainSet(temp,2)));
  377.  
  378. pt = (temp2 + 1) / (noTrueDocs + 2);
  379. if(temp2 == 0)
  380. Qtrue{end+1,1} = (1-pt);
  381. else
  382. Qtrue{end+1,1} = pt;
  383. end
  384.  
  385. temp2 = cell2mat(table2cell(allTrainSet(temp,3)));
  386. pf = (temp2 + 1) / (noFalseDocs + 2);
  387. if(temp2 == 0)
  388. Qfalse{end+1, 1} = (1-pf);
  389. else
  390. Qfalse{end+1,1} = pf;
  391. end
  392. %while there are more lines(terms) in the file
  393. while ischar(line)
  394. %get the next line(term)
  395. line = fgetl(fileName);
  396. %if the line isn't blank and the end hasn't been reached
  397. if(ischar(line) & (line ~= ' '))
  398. temp2 = cell2table({line});
  399. temp2.Properties.VariableNames{1} = 'Terms';
  400. temp = find(ismember(allTermsTable, temp2, 'rows'),1);
  401. temp2 = cell2mat(table2cell(allTrainSet(temp,2)));
  402.  
  403. pt = (temp2 + 1) / (noTrueDocs + 2);
  404. if(temp2 == 0)
  405. Qtrue{end+1,1} = (1-pt);
  406. else
  407. Qtrue{end+1,1} = pt;
  408. end
  409.  
  410. temp2 = cell2mat(table2cell(allTrainSet(temp,3)));
  411. pf = (temp2 + 1) / (noFalseDocs + 2);
  412. if(temp2 == 0)
  413. Qfalse{end+1, 1} = (1-pf);
  414. else
  415. Qfalse{end+1,1} = pf;
  416. end
  417. end
  418. end
  419. %close the file
  420. fclose(fileName);
  421.  
  422. pTrue = prod(cell2mat(Qtrue));
  423. pFalse = prod(cell2mat(Qfalse));
  424.  
  425. if(pTrue >= pFalse)
  426. count = count + 1;
  427. end
  428. end
  429.  
  430. %read contents of false files.
  431. for f = 1:size(decepFiles, 1)
  432.  
  433. Qtrue = [];
  434. Qfalse = [];
  435.  
  436. thisFile = decepFiles(f).name;
  437. %open the file to be read
  438. fileName = fopen(thisFile);
  439. %get the first line (one term per line)
  440. line = fgetl(fileName);
  441.  
  442. %get the value representing true in the training set for the given
  443. %word of the document.
  444. temp2 = cell2table({line});
  445. temp2.Properties.VariableNames{1} = 'Terms';
  446. temp = find(ismember(allTermsTable, temp2, 'rows'),1);
  447. temp2 = cell2mat(table2cell(allTrainSet(temp,2)));
  448.  
  449. pt = (temp2 + 1) / (noTrueDocs + 2);
  450. Qtrue{end + 1,1} = pt;
  451.  
  452. temp2 = cell2mat(table2cell(allTrainSet(temp,3)));
  453. pf = (temp2 + 1) / (noFalseDocs + 2);
  454. Qfalse{end+1, 1} = pf;
  455. %while there are more lines(terms) in the file
  456. while ischar(line)
  457. %get the next line(term)
  458. line = fgetl(fileName);
  459. %if the line isn't blank and the end hasn't been reached
  460. if(ischar(line) & (line ~= ' '))
  461. temp2 = cell2table({line});
  462. temp2.Properties.VariableNames{1} = 'Terms';
  463. temp = find(ismember(allTermsTable, temp2, 'rows'),1);
  464. temp2 = cell2mat(table2cell(allTrainSet(temp,2)));
  465.  
  466. pt = (temp2 + 1) / (noTrueDocs + 2);
  467. Qtrue{end + 1, 1} = pt;
  468.  
  469. temp2 = cell2mat(table2cell(allTrainSet(temp,3)));
  470. pf = (temp2 + 1) / (noFalseDocs + 2);
  471. Qfalse{end+1, 1} = pf;
  472. end
  473. end
  474. %close the file
  475. fclose(fileName);
  476.  
  477.  
  478. pTrue = prod(cell2mat(Qtrue));
  479. pFalse = prod(cell2mat(Qfalse));
  480.  
  481. if(pFalse >= pTrue)
  482. count = count + 1;
  483. end
  484. end
  485.  
  486.  
  487. %clear to stop corruption across folds.
  488. clearvars -except count
  489. end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement