Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #include <assert.h>
- #include <string.h>
- #include <math.h>
- #include <ctype.h>
- #define MAX_LETTERS 64
- #define MAX_WORDS 1000
- int countUrls() {
- char ch[MAX_LETTERS];
- int i = 0;
- FILE *myfile;
- myfile = fopen("collection.txt","r");
- if (myfile== NULL){ //no file found
- printf("Error: cannot open file \n");
- return 0;
- }
- while (fscanf(myfile,"%s",ch)==1){ //while strings are still being read from txt file
- char* isURL = strstr(ch, "url"); //check it is a url
- if(isURL == NULL) {
- continue;
- }
- i++;
- }
- fclose(myfile);
- return i;
- }
- //make list all urls in collection.txt
- void getAllUrls(int nUrls, char *list[MAX_LETTERS]) {
- char ch[MAX_LETTERS];
- int i = 0;
- int n = 0;
- FILE *myfile;
- myfile = fopen("collection.txt","r");
- if (nUrls == 0) {
- return;
- }
- //start reading urls from collection
- char** urlList = malloc(nUrls * sizeof(char*));
- for(i=0; i<nUrls; i++ ) {
- urlList[i] = (char*)malloc(MAX_LETTERS+1);
- }
- while (fscanf(myfile,"%s",ch)== 1){ //while strings are still being read from txt file
- char* isURL = strstr(ch, "url"); //check it is a url
- if (isURL == NULL) {
- continue;
- }
- urlList[n] = strdup(isURL); //duplicate string into array
- n++;
- }
- nUrls = n;
- i = 0;
- while(i < nUrls) {
- strcpy(list[i],urlList[i]); //duplicate string to global array
- i++;
- }
- fclose(myfile);
- //finish reading urls from collection
- }
- //for each url loop to get words
- int getWords(int nUrls, char dictionary[MAX_WORDS][MAX_LETTERS], char *urlList[MAX_LETTERS]) {
- char filename[MAX_LETTERS];
- char reopen[MAX_LETTERS];
- int localTotal = 0;
- int wordIndex;
- int dictIndex;
- int letterIndex;
- int flag;
- int nWords;
- int urlIndex;
- for(urlIndex = 0; urlIndex < nUrls; urlIndex++) { //iterate for all urls
- FILE *fp;
- snprintf(filename, sizeof(filename), "%s.txt", urlList[urlIndex]);
- fp = fopen(filename, "r"); //open each url in urlList
- //to count words
- char ch[MAX_LETTERS];
- while(fscanf(fp,"%s",ch)!=EOF && strcmp(ch, "Section-2") != 0){
- }
- nWords = 0;
- while(fscanf(fp, "%s", ch) != EOF && strcmp("#end", ch) != 0) {
- nWords++;
- }
- fclose(fp);
- if(nWords == 0) continue; //if no words are in this url, skip
- //////////////////////////////////////////////////////////////////////////////
- //once we have nWords for this URL, make array of normalised words in this url
- //step 1: re-open file for reading words in this time
- FILE *fpp;
- snprintf(reopen, sizeof(reopen), "%s.txt", urlList[urlIndex]);
- fpp = fopen(reopen, "r");
- char c[MAX_LETTERS];
- //move file pointer to word after "Section-2"
- while(fscanf(fpp, "%s", c) != EOF && strcmp(c, "Section-2") != 0) {
- }
- char original[MAX_LETTERS]; //just fscanf all characters raw
- char normalised[MAX_LETTERS]; //pass in normalise() function and fill in
- wordIndex = 0;
- //until end of this file
- while(wordIndex < nWords && fscanf(fpp, "%s", &original) != EOF && strcmp("#end", original) != 0) {
- //for every word
- memset(normalised, 0, sizeof(normalised));
- //normalise word
- letterIndex = 0;
- while(original[letterIndex] != '\0') {
- if (original[letterIndex] == '.' || original[letterIndex] == ';' || original[letterIndex] == ',' || original[letterIndex] == '?' || original[letterIndex] == ' ') {
- normalised[letterIndex] = '\0';
- }
- else {
- if(!islower(original[letterIndex])){
- normalised[letterIndex] = tolower(original[letterIndex]);
- }
- else {
- normalised[letterIndex] = original[letterIndex];
- }
- }
- letterIndex++;
- }
- //copying (if necessary) into dictionary
- dictIndex = 0;
- if(localTotal == 0) { //for empty dictionary i.e. first word must be copied in
- strcpy(dictionary[dictIndex], normalised);
- localTotal = 1;
- }
- else{ //for non-empty dictionary, only copy if it is a new word
- flag = 0;
- while(dictIndex<localTotal && flag == 0) {
- if(strcmp(dictionary[dictIndex], normalised) == 0) {
- flag = 1;
- }
- dictIndex++;
- }
- //checked all words in dictionary and wordsInThis[wordIndex] doesn't exist yet
- if(dictIndex == localTotal && flag == 0) {
- strcpy(dictionary[dictIndex], normalised);
- localTotal++;
- }
- }
- //end copying (if necessary) into dictionary
- wordIndex++;
- }
- fclose(fpp);
- }
- return localTotal;
- }
- void sortWords(int nWords, char dictionary[MAX_WORDS][MAX_LETTERS]) {
- char temp[MAX_LETTERS];
- int curr;
- int next;
- printf("enters function\n");
- curr = 0;
- while (curr < nWords){
- next = curr+1;
- while (curr+next < nWords) {
- if (strcmp(dictionary[curr], dictionary[next]) > 0) {
- strcpy(temp, dictionary[curr]);
- strcpy(dictionary[curr], dictionary[next]);
- strcpy(dictionary[next], temp);
- }
- next++;
- }
- curr++;
- }
- }
- //TODO
- //function that outputs list of urls a given word appears in
- //takes in word from main, all urls, then spits out urls that this word occurs in
- int getUrlsPerWord(int nUrls, char *word, char *urlList[MAX_LETTERS], char *urlsPerWord[MAX_LETTERS]){
- int urlIndex = 0;
- char filename[MAX_LETTERS];
- int urlsPerWordIndex = 0;
- int flag;
- int letterIndex;
- while (urlIndex < nUrls) {
- FILE *fp;
- snprintf(filename, sizeof(filename), "%s.txt", urlList[urlIndex]);
- fp = fopen(filename, "r");
- char ch[MAX_LETTERS];
- //move file pointer to word after "Section-2"
- while(fscanf(fp, "%s", ch) != EOF && strcmp(ch, "Section-2") != 0) {
- }
- char original[MAX_LETTERS]; //just fscanf all characters raw
- char normalised[MAX_LETTERS];
- //until end of this file
- flag = 0;
- while(flag == 0 && fscanf(fp, "%s", &original) != EOF && strcmp("#end", original) != 0) {
- //for every word
- memset(normalised, 0, sizeof(normalised));
- //normalise word
- letterIndex = 0;
- while(original[letterIndex] != '\0') {
- if (original[letterIndex] == '.' || original[letterIndex] == ';' || original[letterIndex] == ',' || original[letterIndex] == '?' || original[letterIndex] == ' ') {
- normalised[letterIndex] = '\0';
- }
- else {
- if(!islower(original[letterIndex])){
- normalised[letterIndex] = tolower(original[letterIndex]);
- }
- else {
- normalised[letterIndex] = original[letterIndex];
- }
- }
- letterIndex++;
- }
- //checking if inputted "word" = "normalised" word
- if(strcmp(word, normalised) == 0) {
- flag = 1;
- strcpy(urlsPerWord[urlsPerWordIndex], urlList[urlIndex]);
- urlsPerWordIndex++;
- }
- }
- urlIndex++;
- fclose(fp);
- }
- return urlsPerWordIndex;
- }
- int main(void){
- //set up
- int nUrls = countUrls();
- printf("nUrls %d\n", nUrls);
- int i, j;
- char** urlsarray = malloc(nUrls * sizeof(char*));
- for(i=0; i<nUrls; i++ ) {
- urlsarray[i] = (char*)malloc(MAX_LETTERS+1);
- }
- getAllUrls(nUrls, urlsarray);
- char dictarray[MAX_WORDS][MAX_LETTERS];
- char** urlsPerWord = malloc(MAX_WORDS * sizeof(char*));
- for(i=0; i<nUrls; i++ ) {
- urlsPerWord[i] = (char*)malloc(nUrls+1);
- }
- //end set up
- //create dictionary
- getWords(nUrls, dictarray, urlsarray);
- int nWordsInDict = getWords(nUrls, dictarray, urlsarray);
- sortWords(nWordsInDict, dictarray);
- //end create dictionary
- //write to text file
- FILE *finalFile;
- finalFile = fopen("invertedIndex.txt","w");
- printf("open successful\n");
- i = 0;
- printf("total words in dictionary %d\n", nWordsInDict);
- while (i < nWordsInDict) {
- fprintf(finalFile, "%s", dictarray[i]);
- getUrlsPerWord(nUrls, dictarray[i], urlsarray, urlsPerWord);
- int n = getUrlsPerWord(nUrls, dictarray[i], urlsarray, urlsPerWord);
- j = 0;
- while (j < n) {
- fprintf(finalFile, " %s", urlsPerWord[j]);
- j++;
- }
- fprintf(finalFile, "\n");
- i++;
- }
- fclose(finalFile);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement