// Untitled

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <math.h>
#include <ctype.h>
#define MAX_LETTERS 64
#define MAX_WORDS 1000
/*
 * Counts the whitespace-separated tokens in "collection.txt" that contain
 * the substring "url".
 *
 * Returns the number of matching tokens, or 0 (after printing an error
 * message to stdout) when the file cannot be opened.
 *
 * Tokens are read in pieces of at most MAX_LETTERS - 1 characters, so a
 * longer token is seen as several consecutive pieces and each piece is
 * tested on its own.
 */
int countUrls(void) {
    char token[MAX_LETTERS];
    int count = 0;
    FILE *myfile = fopen("collection.txt", "r");

    if (myfile == NULL) { //no file found
        printf("Error: cannot open file \n");
        return 0;
    }
    // The field width must stay equal to MAX_LETTERS - 1: it is what keeps
    // fscanf from writing past the end of `token`.
    while (fscanf(myfile, "%63s", token) == 1) {
        if (strstr(token, "url") != NULL) { //check it is a url
            count++;
        }
    }
    fclose(myfile);
    return count;
}
/*
 * Copies the URL names listed in "collection.txt" into `list`.
 *
 * A token counts as a URL when it contains the substring "url"; the text
 * from that substring to the end of the token is what gets stored.
 *
 * nUrls: number of entries available in `list`; at most this many URLs
 *        are stored, any further ones in the file are ignored.
 * list:  caller-allocated array of nUrls writable buffers, each with room
 *        for at least MAX_LETTERS bytes. Entries beyond the number of
 *        URLs actually found are left untouched.
 *
 * Prints an error message to stdout and returns without touching `list`
 * when the file cannot be opened.
 */
void getAllUrls(int nUrls, char *list[MAX_LETTERS]) {
    char token[MAX_LETTERS];
    int found = 0;
    FILE *myfile;

    // Nothing to fill: return before opening the file so no handle leaks.
    if (nUrls <= 0) {
        return;
    }
    myfile = fopen("collection.txt", "r");
    if (myfile == NULL) {
        printf("Error: cannot open file \n");
        return;
    }
    // The field width must stay equal to MAX_LETTERS - 1 so that a long
    // token cannot overflow `token`. Stopping at nUrls keeps the writes
    // inside the caller's array even if the file holds more URLs.
    while (found < nUrls && fscanf(myfile, "%63s", token) == 1) {
        const char *isURL = strstr(token, "url"); //check it is a url

        if (isURL == NULL) {
            continue;
        }
        snprintf(list[found], MAX_LETTERS, "%s", isURL);
        found++;
    }
    fclose(myfile);
}

    //for each url loop to get words
int getWords(int nUrls, char dictionary[MAX_WORDS][MAX_LETTERS], char *urlList[MAX_LETTERS]) {
    char filename[MAX_LETTERS];
    char reopen[MAX_LETTERS];
    int localTotal = 0;
    int wordIndex;
    int dictIndex;
    int letterIndex;
    int flag;
    int nWords;
    int urlIndex;
    for(urlIndex = 0; urlIndex < nUrls; urlIndex++) { //iterate for all urls
        FILE *fp;
        snprintf(filename, sizeof(filename), "%s.txt", urlList[urlIndex]);
        fp = fopen(filename, "r"); //open each url in urlList
        //to count words
        char ch[MAX_LETTERS];
        while(fscanf(fp,"%s",ch)!=EOF && strcmp(ch, "Section-2") != 0){
        }
        nWords = 0;
        while(fscanf(fp, "%s", ch) != EOF && strcmp("#end", ch) != 0) {
            nWords++;
        }
        fclose(fp);
        if(nWords == 0) continue; //if no words are in this url, skip

        //////////////////////////////////////////////////////////////////////////////

        //once we have nWords for this URL, make array of normalised words in this url

        //step 1: re-open file for reading words in this time
        FILE *fpp;
        snprintf(reopen, sizeof(reopen), "%s.txt", urlList[urlIndex]);
        fpp = fopen(reopen, "r");
        char c[MAX_LETTERS];
        //move file pointer to word after "Section-2"
        while(fscanf(fpp, "%s", c) != EOF && strcmp(c, "Section-2") != 0) {
        }

        char original[MAX_LETTERS]; //just fscanf all characters raw
        char normalised[MAX_LETTERS]; //pass in normalise() function and fill in
        wordIndex = 0;
        //until end of this file
        while(wordIndex < nWords && fscanf(fpp, "%s", &original) != EOF && strcmp("#end", original) != 0) {
            //for every word
            memset(normalised, 0, sizeof(normalised));
            //normalise word
            letterIndex = 0;
            while(original[letterIndex] != '\0') {
                if (original[letterIndex] == '.' || original[letterIndex] == ';' || original[letterIndex] == ',' || original[letterIndex] == '?' || original[letterIndex] == ' ') {
                    normalised[letterIndex] = '\0';
                }
                else {
                    if(!islower(original[letterIndex])){
                        normalised[letterIndex] = tolower(original[letterIndex]);
                    }
                    else {
                        normalised[letterIndex] = original[letterIndex];
                    }
                }
                letterIndex++;
            }
            //copying (if necessary) into dictionary
            dictIndex = 0;
            if(localTotal == 0) { //for empty dictionary i.e. first word must be copied in
                strcpy(dictionary[dictIndex], normalised);
                localTotal = 1;
            }
            else{ //for non-empty dictionary, only copy if it is a new word
                flag = 0;
                while(dictIndex<localTotal && flag == 0) {
                    if(strcmp(dictionary[dictIndex], normalised) == 0) {
                        flag = 1;
                        }
                    dictIndex++;
                }
                //checked all words in dictionary and wordsInThis[wordIndex] doesn't exist yet
                if(dictIndex == localTotal && flag == 0) {
                    strcpy(dictionary[dictIndex], normalised);
                    localTotal++;
                }
            }
            //end copying (if necessary) into dictionary
            wordIndex++;
        }
        fclose(fpp);
    }
    return localTotal;
}
/* qsort comparator: each element is one MAX_LETTERS-wide dictionary row. */
static int compareDictionaryRows(const void *lhs, const void *rhs) {
    return strcmp((const char *)lhs, (const char *)rhs);
}

/*
 * Sorts the first nWords entries of `dictionary` into ascending strcmp()
 * order, in place.
 *
 * nWords:     number of valid entries at the start of `dictionary`.
 * dictionary: table of NUL-terminated words, one per row.
 *
 * Prints "enters function" to stdout on entry; that trace line is kept so
 * the program's existing output does not change.
 */
void sortWords(int nWords, char dictionary[MAX_WORDS][MAX_LETTERS]) {
    printf("enters function\n");
    // Zero or one entry is already sorted; the guard also keeps a negative
    // count from being converted to a huge size_t.
    if (nWords > 1) {
        qsort(dictionary, (size_t)nWords, sizeof(dictionary[0]), compareDictionaryRows);
    }
}
/*
 * Collects the URLs whose page file contains `word`.
 *
 * For each urlList[i] the file "<urlList[i]>.txt" is opened, every token up
 * to and including "Section-2" is skipped, and the tokens that follow are
 * normalised (lower-cased and cut at the first '.', ';', ',', '?' or ' ')
 * and compared with `word` until a match, "#end" or end of file.
 *
 * nUrls:       number of entries in urlList.
 * word:        word to look for; compared as-is against normalised tokens.
 * urlList:     URL names (without the ".txt" suffix).
 * urlsPerWord: caller-allocated output buffers; entry k receives a copy of
 *              the k-th matching URL name, so each buffer must be large
 *              enough for a URL name plus its terminator.
 *
 * Returns the number of matching URLs written to urlsPerWord. A page file
 * that cannot be opened is treated as not containing the word.
 *
 * Tokens are read in pieces of at most MAX_LETTERS - 1 characters.
 */
int getUrlsPerWord(int nUrls, char *word, char *urlList[MAX_LETTERS], char *urlsPerWord[MAX_LETTERS]){
    // Large enough for a full-length URL name plus the ".txt" suffix.
    char filename[MAX_LETTERS + sizeof(".txt")];
    int urlsPerWordIndex = 0;

    for (int urlIndex = 0; urlIndex < nUrls; urlIndex++) {
        char original[MAX_LETTERS];   //raw token as read from the file
        char normalised[MAX_LETTERS]; //lower-cased, punctuation-trimmed copy
        FILE *fp;

        snprintf(filename, sizeof(filename), "%s.txt", urlList[urlIndex]);
        fp = fopen(filename, "r");
        if (fp == NULL) {
            // Skipped quietly: this function runs once per dictionary word,
            // so reporting here would repeat the same message many times.
            continue;
        }

        //move file pointer to word after "Section-2"
        // (field width must stay equal to MAX_LETTERS - 1)
        while (fscanf(fp, "%63s", original) == 1 && strcmp(original, "Section-2") != 0) {
        }

        //until "#end", end of this file, or the first match
        while (fscanf(fp, "%63s", original) == 1 && strcmp(original, "#end") != 0) {
            //normalise word; the unsigned char cast is required because
            //tolower() is undefined for negative char values.
            int len = 0;
            while (original[len] != '\0' && strchr(".;,? ", original[len]) == NULL) {
                normalised[len] = (char)tolower((unsigned char)original[len]);
                len++;
            }
            normalised[len] = '\0';

            //checking if inputted "word" = "normalised" word
            if (strcmp(word, normalised) == 0) {
                strcpy(urlsPerWord[urlsPerWordIndex], urlList[urlIndex]);
                urlsPerWordIndex++;
                break; //one hit is enough to list this url
            }
        }
        fclose(fp);
    }
    return urlsPerWordIndex;
}
/*
 * Builds "invertedIndex.txt": one line per dictionary word, in sorted
 * order, followed by the names of the URLs whose page contains that word.
 *
 * Progress lines ("nUrls ...", "open successful", "total words in
 * dictionary ...") are printed to stdout. Returns 0 on success and 1 when
 * memory cannot be allocated or the output file cannot be written.
 */
int main(void){
    int status = 1;
    int i, j;
    int nWordsInDict;
    FILE *finalFile = NULL;
    char **urlsarray = NULL;
    char **urlsPerWord = NULL;
    char dictarray[MAX_WORDS][MAX_LETTERS];

    //set up
    int nUrls = countUrls();
    printf("nUrls %d\n", nUrls);

    // Allocate at least one slot so a zero count does not become calloc(0).
    size_t slots = nUrls > 0 ? (size_t)nUrls : 1;
    urlsarray = calloc(slots, sizeof *urlsarray);
    urlsPerWord = calloc(slots, sizeof *urlsPerWord);
    if (urlsarray == NULL || urlsPerWord == NULL) {
        fprintf(stderr, "Error: out of memory\n");
        goto cleanup;
    }
    for (i = 0; i < nUrls; i++) {
        // Both arrays hold URL names (getUrlsPerWord copies entries of
        // urlsarray into urlsPerWord), so both need name-sized buffers.
        // calloc leaves every buffer as a valid empty string even if
        // getAllUrls fills fewer than nUrls entries.
        urlsarray[i] = calloc(MAX_LETTERS + 1, 1);
        urlsPerWord[i] = calloc(MAX_LETTERS + 1, 1);
        if (urlsarray[i] == NULL || urlsPerWord[i] == NULL) {
            fprintf(stderr, "Error: out of memory\n");
            goto cleanup;
        }
    }
    getAllUrls(nUrls, urlsarray);
    //end set up

    //create dictionary
    nWordsInDict = getWords(nUrls, dictarray, urlsarray);
    sortWords(nWordsInDict, dictarray);
    //end create dictionary

    //write to text file
    finalFile = fopen("invertedIndex.txt","w");
    if (finalFile == NULL) {
        fprintf(stderr, "Error: cannot open invertedIndex.txt for writing\n");
        goto cleanup;
    }
    printf("open successful\n");
    printf("total words in dictionary %d\n", nWordsInDict);
    for (i = 0; i < nWordsInDict; i++) {
        int n = getUrlsPerWord(nUrls, dictarray[i], urlsarray, urlsPerWord);

        fprintf(finalFile, "%s", dictarray[i]);
        for (j = 0; j < n; j++) {
            fprintf(finalFile, " %s", urlsPerWord[j]);
        }
        fprintf(finalFile, "\n");
    }
    // fclose flushes buffered output, so a failure here means data loss.
    if (fclose(finalFile) != 0) {
        fprintf(stderr, "Error: failed to write invertedIndex.txt\n");
        goto cleanup;
    }
    status = 0;

cleanup:
    for (i = 0; i < nUrls; i++) {
        if (urlsarray != NULL) {
            free(urlsarray[i]);
        }
        if (urlsPerWord != NULL) {
            free(urlsPerWord[i]);
        }
    }
    free(urlsarray);
    free(urlsPerWord);
    return status;
}