Advertisement
Guest User

Untitled

a guest
Jun 24th, 2017
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.71 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <math.h>
  4. #include <string.h>
  5. #include <time.h>
  6. #include <unistd.h>
  7.  
  /*
   * Compile-time configuration.
   *
   * ROWS and COLS are the dimensions of the in-memory integer table,
   * BUFSIZE is the initial size (in bytes) of the line buffer used while
   * reading the input file, and CORR_THRESHOLD is the smallest correlation
   * value that gets written to the output file.
   */
  enum {
      ROWS    = 6500,
      COLS    = 5002,
      BUFSIZE = 1000000
  };

  /* Kept as a macro because an enum cannot hold a floating-point value. */
  #define CORR_THRESHOLD 0.97
  /* Bar glyphs and bar width (in characters) used by printProgress(). */
  #define PBSTR "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"
  #define PBWIDTH 60

  /**
   * Redraw a single-line text progress bar on stdout.
   *
   * The line starts with a carriage return so that successive calls
   * overwrite each other, followed by the percentage and a bar of PBWIDTH
   * characters between square brackets. stdout is flushed on every call
   * because the line never ends in a newline.
   *
   * @param percentage  completed fraction; values outside [0, 1] (and NaN)
   *                    are clamped so the bar never under- or overflows.
   */
  void printProgress (double percentage)
  {
      /*
       * Clamp first: a negative pad would turn "%*s" into a left-justified
       * field and a negative precision would print the whole of PBSTR.
       * The negated comparison also routes NaN to 0, avoiding an undefined
       * double-to-int conversion below.
       */
      if (!(percentage >= 0.0)) {
          percentage = 0.0;
      } else if (percentage > 1.0) {
          percentage = 1.0;
      }

      int val = (int) (percentage * 100);
      int lpad = (int) (percentage * PBWIDTH);   /* filled part of the bar  */
      int rpad = PBWIDTH - lpad;                 /* blank remainder         */

      printf ("\r%3d%% [%.*s%*s]", val, lpad, PBSTR, rpad, "");
      fflush (stdout);
  }
  23.  
  24. double correlation(int arr[ROWS][COLS],int col1, int col2)
  25. {
  26. int N = ROWS;
  27. int i;
  28. double sigma_x = 0;
  29. double sigma_y = 0;
  30. double sigma_x2 = 0;
  31. double sigma_y2 = 0;
  32. double sigma_xy = 0;
  33. for(i=0;i<ROWS;i++)
  34. {
  35. sigma_x2 += pow(arr[i][col1],2);
  36. sigma_y2 += pow(arr[i][col2],2);
  37. sigma_xy += arr[i][col1] * arr[i][col2];
  38. sigma_x += arr[i][col1];
  39. sigma_y += arr[i][col2];
  40. }
  41. //printf("sigma_x = %f\nsigma_y = %f\nsigma_x2 = %f\nsigma_y2 = %f\nsigma_xy = %f\n",sigma_x,sigma_y,sigma_x2,sigma_y2,sigma_xy);
  42. double corr = ((N*sigma_xy)-(sigma_x*sigma_y))/(sqrt( (N*sigma_x2 - pow(sigma_x,2))*(N*sigma_y2 - pow(sigma_y,2)) ));
  43. return corr;
  44. }
  45.  
  /* One element of a singly linked list of integers. */
  struct node
  {
      int data;            /* value carried by this element            */
      struct node *next;   /* following element, or NULL at list end   */
  };
  51.  
  /**
   * Larger of two doubles.
   *
   * @return a when a is strictly greater than b, otherwise b.
   */
  double max(double a, double b)
  {
      if (a > b) {
          return a;
      }
      return b;
  }
  56.  
  57. int isInList(struct node *head, int val)
  58. {
  59. struct node *temp;
  60. int flag = 0;
  61. for(temp=head;temp!=NULL;temp=temp->next)
  62. {
  63. if(temp->data == val)
  64. {
  65. flag = 1;
  66. break;
  67. }
  68. }
  69. return flag;
  70. }
  71.  
  72. int main()
  73. {
  74. FILE *fp = fopen("TrainingData2.csv","r");
  75. FILE *out = fopen("Correlations1.csv","w");
  76. if(fp==NULL || out==NULL)
  77. {
  78. printf("File open error\n");
  79. return 0;
  80. }
  81. struct timespec start, end;
  82. clock_gettime(CLOCK_MONOTONIC, &start);
  83.  
  84. size_t buffer_size = BUFSIZE;
  85.  
  86. static int table[ROWS][COLS];
  87. for(int i=0;i<ROWS;i++)
  88. {
  89. for(int j=0;j<COLS;j++)
  90. table[i][j]=0;
  91. }
  92. int num;
  93. char *buffer = malloc(sizeof(char)*buffer_size);
  94. bzero(buffer,buffer_size);
  95. char *p;
  96. int rowCounter = -1;
  97. int colCounter;
  98. //load the data set into an array
  99. while(-1 != getline(&buffer, &buffer_size, fp))
  100. {
  101.  
  102. if(rowCounter == -1)
  103. {
  104. rowCounter++;
  105. printf("Skipping first row\n");
  106. }
  107. else
  108. {
  109.  
  110. colCounter = 0;
  111. p=strtok(buffer,",");
  112. while(p!=NULL)
  113. {
  114. //printf("colCounter = %d\n",colCounter);
  115. num = (int)strtol(p,NULL,10);
  116. table[rowCounter][colCounter] = num;
  117. p = strtok(NULL,",");
  118. colCounter++;
  119. }
  120. bzero(buffer,buffer_size);
  121. rowCounter++;
  122. }
  123. }
  124. clock_gettime(CLOCK_MONOTONIC, &end);
  125. double total_time = (end.tv_sec - start.tv_sec);
  126. total_time += (end.tv_nsec - start.tv_nsec)/1000000000.0;
  127. printf("array loaded, time = %f\n",total_time);
  128. struct node *temp = NULL, *tail = NULL, *head = NULL;
  129. struct node *irrelevantList = NULL, *irrelevantListTail = NULL;
  130. int flag, distinct;
  131. int irrelevantListLength = 0;
  132.  
  133. //note down columns with only one distinct value
  134. for(int j=0;j<COLS;j++)
  135. {
  136. distinct = 0;
  137. for(int i=0; i<ROWS; i++)
  138. {
  139. if(head == NULL)
  140. {
  141. head = malloc(sizeof(struct node));
  142. head->data = table[i][j];
  143. head->next = NULL;
  144. tail = head;
  145. distinct++;
  146. }
  147. else
  148. {
  149. flag = 0;
  150. for(temp = head; temp!= NULL; temp=temp->next)
  151. {
  152. if(temp->data == table[i][j])
  153. {
  154. flag = 1;
  155. break;
  156. }
  157. }
  158. if(flag == 1)
  159. {
  160. continue;
  161. }
  162. else
  163. {
  164. temp = malloc(sizeof(struct node));
  165. temp->data = table[i][j];
  166. temp->next = NULL;
  167. distinct++;
  168. tail->next = temp;
  169. tail = temp;
  170. }
  171.  
  172. }
  173. }
  174. temp = head;
  175. while(temp!=NULL)
  176. {
  177. tail = temp;
  178. temp=temp->next;
  179. free(tail);
  180. }
  181. head = NULL;
  182. temp = NULL;
  183. tail = NULL;
  184. if(distinct == 1)
  185. {
  186. if(irrelevantList == NULL)
  187. {
  188. irrelevantList = malloc(sizeof(struct node));
  189. irrelevantList->data = j;
  190. irrelevantList->next = NULL;
  191. irrelevantListTail = irrelevantList;
  192. irrelevantListLength++;
  193. }
  194. else
  195. {
  196. temp = malloc(sizeof(struct node));
  197. temp->data = j;
  198. temp->next = NULL;
  199. irrelevantListTail->next = temp;
  200. irrelevantListTail = temp;
  201. irrelevantListLength++;
  202. }
  203. }
  204.  
  205. }
  206. printf("Irrelevant List length = %d\n",irrelevantListLength);
  207. double corr;
  208.  
  209. printf("corr(53,class) = %f\n",correlation(table,53,5001));
  210. fprintf(out,"Attrib1,Attrib2,corr\n");
  211. printf("Loop test\n");
  212. double progress = 0;
  213.  
  214. printf("Correlation testing started\n");
  215. //compute the correlations of all relevant pairs
  216. for(int i=1;i<COLS-2;i++)
  217. {
  218. if(isInList(irrelevantList,i))
  219. continue;
  220. progress = (double)i/(COLS-2);
  221. printProgress(progress);
  222. for(int j=(i+1);j<(COLS-1);j++)
  223. {
  224. if(isInList(irrelevantList,j))
  225. continue;
  226. corr = correlation(table,i,j);
  227.  
  228. if(corr >= CORR_THRESHOLD) //note down pairs that have a high correlation
  229. fprintf(out,"%d,%d,%f\n",i,j,corr);
  230. }
  231. }
  232. clock_gettime(CLOCK_MONOTONIC, &end);
  233. total_time = (end.tv_sec - start.tv_sec);
  234. total_time += (end.tv_nsec - start.tv_nsec)/1000000000.0;
  235. printf("\nWrite complete. total time = %f\n",total_time);
  236.  
  237. fclose(fp);
  238. fclose(out);
  239. while(irrelevantList!=NULL) //free the list
  240. {
  241. irrelevantListTail = irrelevantList;
  242. irrelevantList = irrelevantList->next;
  243. free(irrelevantListTail);
  244. }
  245. return 0;
  246. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement