Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #include <math.h>
- #include <string.h>
- #include <time.h>
- #include <unistd.h>
- #define ROWS 6500
- #define COLS 5002
- #define BUFSIZE 1000000
- #define CORR_THRESHOLD 0.97
- #define PBSTR "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"
- #define PBWIDTH 60
- void printProgress (double percentage)
- {
- int val = (int) (percentage * 100);
- int lpad = (int) (percentage * PBWIDTH);
- int rpad = PBWIDTH - lpad;
- printf ("\r%3d%% [%.*s%*s]", val, lpad, PBSTR, rpad, "");
- fflush (stdout);
- }
- double correlation(int arr[ROWS][COLS],int col1, int col2)
- {
- int N = ROWS;
- int i;
- double sigma_x = 0;
- double sigma_y = 0;
- double sigma_x2 = 0;
- double sigma_y2 = 0;
- double sigma_xy = 0;
- for(i=0;i<ROWS;i++)
- {
- sigma_x2 += pow(arr[i][col1],2);
- sigma_y2 += pow(arr[i][col2],2);
- sigma_xy += arr[i][col1] * arr[i][col2];
- sigma_x += arr[i][col1];
- sigma_y += arr[i][col2];
- }
- //printf("sigma_x = %f\nsigma_y = %f\nsigma_x2 = %f\nsigma_y2 = %f\nsigma_xy = %f\n",sigma_x,sigma_y,sigma_x2,sigma_y2,sigma_xy);
- double corr = ((N*sigma_xy)-(sigma_x*sigma_y))/(sqrt( (N*sigma_x2 - pow(sigma_x,2))*(N*sigma_y2 - pow(sigma_y,2)) ));
- return corr;
- }
/* One element of a singly linked list of int values. */
struct node {
    int data;           /* value stored in this element */
    struct node *next;  /* following element, or NULL at the end of the list */
};
/*
 * Larger of two doubles.
 *
 * Returns a only when a compares strictly greater than b; in every other
 * case (including equality) b is returned.
 */
double max(double a, double b)
{
    if (a > b) {
        return a;
    }
    return b;
}
- int isInList(struct node *head, int val)
- {
- struct node *temp;
- int flag = 0;
- for(temp=head;temp!=NULL;temp=temp->next)
- {
- if(temp->data == val)
- {
- flag = 1;
- break;
- }
- }
- return flag;
- }
- int main()
- {
- FILE *fp = fopen("TrainingData2.csv","r");
- FILE *out = fopen("Correlations1.csv","w");
- if(fp==NULL || out==NULL)
- {
- printf("File open error\n");
- return 0;
- }
- struct timespec start, end;
- clock_gettime(CLOCK_MONOTONIC, &start);
- size_t buffer_size = BUFSIZE;
- static int table[ROWS][COLS];
- for(int i=0;i<ROWS;i++)
- {
- for(int j=0;j<COLS;j++)
- table[i][j]=0;
- }
- int num;
- char *buffer = malloc(sizeof(char)*buffer_size);
- bzero(buffer,buffer_size);
- char *p;
- int rowCounter = -1;
- int colCounter;
- //load the data set into an array
- while(-1 != getline(&buffer, &buffer_size, fp))
- {
- if(rowCounter == -1)
- {
- rowCounter++;
- printf("Skipping first row\n");
- }
- else
- {
- colCounter = 0;
- p=strtok(buffer,",");
- while(p!=NULL)
- {
- //printf("colCounter = %d\n",colCounter);
- num = (int)strtol(p,NULL,10);
- table[rowCounter][colCounter] = num;
- p = strtok(NULL,",");
- colCounter++;
- }
- bzero(buffer,buffer_size);
- rowCounter++;
- }
- }
- clock_gettime(CLOCK_MONOTONIC, &end);
- double total_time = (end.tv_sec - start.tv_sec);
- total_time += (end.tv_nsec - start.tv_nsec)/1000000000.0;
- printf("array loaded, time = %f\n",total_time);
- struct node *temp = NULL, *tail = NULL, *head = NULL;
- struct node *irrelevantList = NULL, *irrelevantListTail = NULL;
- int flag, distinct;
- int irrelevantListLength = 0;
- //note down columns with only one distinct value
- for(int j=0;j<COLS;j++)
- {
- distinct = 0;
- for(int i=0; i<ROWS; i++)
- {
- if(head == NULL)
- {
- head = malloc(sizeof(struct node));
- head->data = table[i][j];
- head->next = NULL;
- tail = head;
- distinct++;
- }
- else
- {
- flag = 0;
- for(temp = head; temp!= NULL; temp=temp->next)
- {
- if(temp->data == table[i][j])
- {
- flag = 1;
- break;
- }
- }
- if(flag == 1)
- {
- continue;
- }
- else
- {
- temp = malloc(sizeof(struct node));
- temp->data = table[i][j];
- temp->next = NULL;
- distinct++;
- tail->next = temp;
- tail = temp;
- }
- }
- }
- temp = head;
- while(temp!=NULL)
- {
- tail = temp;
- temp=temp->next;
- free(tail);
- }
- head = NULL;
- temp = NULL;
- tail = NULL;
- if(distinct == 1)
- {
- if(irrelevantList == NULL)
- {
- irrelevantList = malloc(sizeof(struct node));
- irrelevantList->data = j;
- irrelevantList->next = NULL;
- irrelevantListTail = irrelevantList;
- irrelevantListLength++;
- }
- else
- {
- temp = malloc(sizeof(struct node));
- temp->data = j;
- temp->next = NULL;
- irrelevantListTail->next = temp;
- irrelevantListTail = temp;
- irrelevantListLength++;
- }
- }
- }
- printf("Irrelevant List length = %d\n",irrelevantListLength);
- double corr;
- printf("corr(53,class) = %f\n",correlation(table,53,5001));
- fprintf(out,"Attrib1,Attrib2,corr\n");
- printf("Loop test\n");
- double progress = 0;
- printf("Correlation testing started\n");
- //compute the correlations of all relevant pairs
- for(int i=1;i<COLS-2;i++)
- {
- if(isInList(irrelevantList,i))
- continue;
- progress = (double)i/(COLS-2);
- printProgress(progress);
- for(int j=(i+1);j<(COLS-1);j++)
- {
- if(isInList(irrelevantList,j))
- continue;
- corr = correlation(table,i,j);
- if(corr >= CORR_THRESHOLD) //note down pairs that have a high correlation
- fprintf(out,"%d,%d,%f\n",i,j,corr);
- }
- }
- clock_gettime(CLOCK_MONOTONIC, &end);
- total_time = (end.tv_sec - start.tv_sec);
- total_time += (end.tv_nsec - start.tv_nsec)/1000000000.0;
- printf("\nWrite complete. total time = %f\n",total_time);
- fclose(fp);
- fclose(out);
- while(irrelevantList!=NULL) //free the list
- {
- irrelevantListTail = irrelevantList;
- irrelevantList = irrelevantList->next;
- free(irrelevantListTail);
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement