Advertisement
Guest User

Untitled

a guest
Oct 22nd, 2017
66
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.31 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <assert.h>
  4. #include <string.h>
  5. #include <math.h>
  6. #include <ctype.h>
  7. #define MAX_LETTERS 64
  8. #define MAX_WORDS 1000
  // Count the whitespace-separated tokens in "collection.txt" that contain
  // the substring "url".
  //
  // Returns the number of such tokens, or 0 when the file cannot be opened
  // (an error message is printed to stdout in that case).
  int countUrls(void) {
      char token[MAX_LETTERS];
      char format[16];
      int count = 0;

      FILE *myfile = fopen("collection.txt", "r");
      if (myfile == NULL) { // no file found
          printf("Error: cannot open file \n");
          return 0;
      }

      // Limit every read to MAX_LETTERS - 1 characters so an over-long token
      // cannot overflow `token`; such a token is simply read in several pieces.
      snprintf(format, sizeof format, "%%%ds", MAX_LETTERS - 1);

      while (fscanf(myfile, format, token) == 1) {
          if (strstr(token, "url") != NULL) { // only tokens that look like urls
              count++;
          }
      }

      fclose(myfile);
      return count;
  }
  // Fill `list` with the urls named in "collection.txt".
  //
  // Every whitespace-separated token containing "url" is treated as a url; the
  // text stored is the token from its first "url" occurrence onwards.
  //
  // nUrls: number of entries available in `list`; at most this many urls are
  //        stored, any further urls in the file are ignored.
  // list:  array of nUrls caller-allocated buffers, each able to hold at least
  //        MAX_LETTERS bytes. Entries beyond the urls found are left untouched.
  //
  // If nUrls is not positive, or the file cannot be opened, `list` is left
  // unchanged.
  void getAllUrls(int nUrls, char *list[MAX_LETTERS]) {
      char token[MAX_LETTERS];
      char format[16];
      int stored = 0;

      if (nUrls <= 0) {
          return; // nothing to fill, so do not even open the file
      }

      FILE *myfile = fopen("collection.txt", "r");
      if (myfile == NULL) {
          fprintf(stderr, "Error: cannot open collection.txt\n");
          return;
      }

      // Bounded read: never more than MAX_LETTERS - 1 characters per token.
      snprintf(format, sizeof format, "%%%ds", MAX_LETTERS - 1);

      while (stored < nUrls && fscanf(myfile, format, token) == 1) {
          char *isURL = strstr(token, "url"); // check it is a url
          if (isURL == NULL) {
              continue;
          }
          // The url is shorter than MAX_LETTERS, so it fits the caller's buffer.
          strcpy(list[stored], isURL);
          stored++;
      }

      fclose(myfile);
  }
  63.  
  // Build the dictionary of distinct normalised words found in the pages.
  //
  // For every entry of `urlList` the file "<url>.txt" is opened, everything up
  // to and including the first "Section-2" token is skipped, and the tokens
  // that follow are read until "#end" or end of file. Each token is normalised
  // by lower-casing it and cutting it at its first '.', ';', ',', '?' or ' '.
  // Normalised words that are not yet in `dictionary` are appended to it, in
  // order of first appearance. Tokens that normalise to an empty string are
  // ignored.
  //
  // nUrls:      number of entries in `urlList`.
  // dictionary: output table; receives at most MAX_WORDS words.
  // urlList:    url names (without the ".txt" suffix).
  //
  // Returns the number of words stored in `dictionary`. Pages that cannot be
  // opened are reported on stderr and skipped.
  int getWords(int nUrls, char dictionary[MAX_WORDS][MAX_LETTERS], char *urlList[MAX_LETTERS]) {
      static const char punctuation[] = ".;,? ";
      char filename[MAX_LETTERS + sizeof ".txt"]; // room for the url plus ".txt"
      char token[MAX_LETTERS];                    // raw token as read
      char normalised[MAX_LETTERS];               // lower-cased, punctuation cut
      char format[16];
      int localTotal = 0;

      // Bounded read: never more than MAX_LETTERS - 1 characters per token.
      snprintf(format, sizeof format, "%%%ds", MAX_LETTERS - 1);

      for (int urlIndex = 0; urlIndex < nUrls; urlIndex++) { // iterate for all urls
          snprintf(filename, sizeof filename, "%s.txt", urlList[urlIndex]);
          FILE *fp = fopen(filename, "r");
          if (fp == NULL) {
              fprintf(stderr, "Error: cannot open %s\n", filename);
              continue;
          }

          // move file pointer to the word after "Section-2"
          while (fscanf(fp, format, token) == 1 && strcmp(token, "Section-2") != 0) {
          }

          while (fscanf(fp, format, token) == 1 && strcmp(token, "#end") != 0) {
              // normalise word; <ctype.h> functions need an unsigned char value
              size_t len = 0;
              while (token[len] != '\0' && strchr(punctuation, token[len]) == NULL) {
                  normalised[len] = (char)tolower((unsigned char)token[len]);
                  len++;
              }
              normalised[len] = '\0';
              if (len == 0) {
                  continue; // pure punctuation: not a word
              }

              // only copy into the dictionary if it is a new word
              int known = 0;
              for (int dictIndex = 0; dictIndex < localTotal && !known; dictIndex++) {
                  known = (strcmp(dictionary[dictIndex], normalised) == 0);
              }
              if (known) {
                  continue;
              }

              if (localTotal == MAX_WORDS) {
                  // no room left: stop rather than write past the table
                  fprintf(stderr, "Error: dictionary full (%d words)\n", MAX_WORDS);
                  fclose(fp);
                  return localTotal;
              }
              strcpy(dictionary[localTotal], normalised);
              localTotal++;
          }
          fclose(fp);
      }
      return localTotal;
  }
  // qsort comparator: orders two dictionary rows as NUL-terminated strings.
  static int compareDictionaryRows(const void *lhs, const void *rhs) {
      return strcmp((const char *)lhs, (const char *)rhs);
  }

  // Sort the first nWords entries of `dictionary` into ascending strcmp order.
  //
  // nWords:     number of valid words at the start of `dictionary`.
  // dictionary: table of NUL-terminated words, sorted in place.
  //
  // Fewer than two words means there is nothing to do.
  void sortWords(int nWords, char dictionary[MAX_WORDS][MAX_LETTERS]) {
      if (nWords < 2) {
          return; // also keeps a negative count away from the size_t conversion
      }
      // Each row is one fixed-size element of MAX_LETTERS bytes.
      qsort(dictionary, (size_t)nWords, sizeof dictionary[0], compareDictionaryRows);
  }
  // Collect the urls whose page contains a given word.
  //
  // For every entry of `urlList` the file "<url>.txt" is opened, everything up
  // to and including the first "Section-2" token is skipped, and the tokens
  // that follow are read until "#end" or end of file. Each token is normalised
  // (lower-cased and cut at its first '.', ';', ',', '?' or ' ') and compared
  // with `word`; on the first match the url is recorded and the rest of that
  // page is not read.
  //
  // nUrls:       number of entries in `urlList`.
  // word:        word to look for; compared as-is against the normalised tokens.
  // urlList:     url names (without the ".txt" suffix).
  // urlsPerWord: output; array of caller-allocated buffers, one per possible
  //              match, each large enough to hold any entry of `urlList`.
  //
  // Returns the number of urls written to `urlsPerWord`. Pages that cannot be
  // opened are reported on stderr and skipped.
  int getUrlsPerWord(int nUrls, char *word, char *urlList[MAX_LETTERS], char *urlsPerWord[MAX_LETTERS]){
      static const char punctuation[] = ".;,? ";
      char filename[MAX_LETTERS + sizeof ".txt"]; // room for the url plus ".txt"
      char token[MAX_LETTERS];                    // raw token as read
      char normalised[MAX_LETTERS];               // lower-cased, punctuation cut
      char format[16];
      int urlsPerWordIndex = 0;

      // Bounded read: never more than MAX_LETTERS - 1 characters per token.
      snprintf(format, sizeof format, "%%%ds", MAX_LETTERS - 1);

      for (int urlIndex = 0; urlIndex < nUrls; urlIndex++) {
          snprintf(filename, sizeof filename, "%s.txt", urlList[urlIndex]);
          FILE *fp = fopen(filename, "r");
          if (fp == NULL) {
              fprintf(stderr, "Error: cannot open %s\n", filename);
              continue;
          }

          // move file pointer to the word after "Section-2"
          while (fscanf(fp, format, token) == 1 && strcmp(token, "Section-2") != 0) {
          }

          while (fscanf(fp, format, token) == 1 && strcmp(token, "#end") != 0) {
              // normalise word; <ctype.h> functions need an unsigned char value
              size_t len = 0;
              while (token[len] != '\0' && strchr(punctuation, token[len]) == NULL) {
                  normalised[len] = (char)tolower((unsigned char)token[len]);
                  len++;
              }
              normalised[len] = '\0';

              // checking if inputted "word" = "normalised" word
              if (strcmp(word, normalised) == 0) {
                  strcpy(urlsPerWord[urlsPerWordIndex], urlList[urlIndex]);
                  urlsPerWordIndex++;
                  break; // one hit per url is enough
              }
          }
          fclose(fp);
      }
      return urlsPerWordIndex;
  }
  // Build an inverted index: read the urls from the collection, gather and
  // sort the distinct words of all pages, then write one line per word to
  // "invertedIndex.txt" consisting of the word followed by every url whose
  // page contains it.
  //
  // Returns EXIT_SUCCESS when the index was written, EXIT_FAILURE when memory
  // could not be allocated or the output file could not be written.
  int main(void){
      // static: 64000 bytes is better kept off the stack
      static char dictarray[MAX_WORDS][MAX_LETTERS];
      char **urlsarray = NULL;
      char **urlsPerWord = NULL;
      FILE *finalFile = NULL;
      int status = EXIT_FAILURE;
      int nWordsInDict;
      int i, j;

      //set up
      int nUrls = countUrls();
      printf("nUrls %d\n", nUrls);

      // At least one slot so a zero url count does not turn into calloc(0, ...).
      size_t slots = nUrls > 0 ? (size_t)nUrls : 1;
      urlsarray = calloc(slots, sizeof *urlsarray);
      urlsPerWord = calloc(slots, sizeof *urlsPerWord);
      if (urlsarray == NULL || urlsPerWord == NULL) {
          fprintf(stderr, "Error: out of memory\n");
          goto cleanup;
      }
      for (i = 0; i < nUrls; i++) {
          // A word can occur in every page, so the result list needs one
          // url-sized buffer per url.
          urlsarray[i] = calloc(MAX_LETTERS + 1, 1);
          urlsPerWord[i] = calloc(MAX_LETTERS + 1, 1);
          if (urlsarray[i] == NULL || urlsPerWord[i] == NULL) {
              fprintf(stderr, "Error: out of memory\n");
              goto cleanup;
          }
      }
      getAllUrls(nUrls, urlsarray);
      //end set up

      //create dictionary
      nWordsInDict = getWords(nUrls, dictarray, urlsarray);
      sortWords(nWordsInDict, dictarray);
      //end create dictionary

      //write to text file
      finalFile = fopen("invertedIndex.txt", "w");
      if (finalFile == NULL) {
          fprintf(stderr, "Error: cannot open invertedIndex.txt for writing\n");
          goto cleanup;
      }
      printf("open successful\n");
      printf("total words in dictionary %d\n", nWordsInDict);

      for (i = 0; i < nWordsInDict; i++) {
          int n = getUrlsPerWord(nUrls, dictarray[i], urlsarray, urlsPerWord);
          fprintf(finalFile, "%s", dictarray[i]);
          for (j = 0; j < n; j++) {
              fprintf(finalFile, " %s", urlsPerWord[j]);
          }
          fprintf(finalFile, "\n");
      }

      // fclose flushes the buffered output, so a failure here means data loss.
      if (fclose(finalFile) != 0) {
          fprintf(stderr, "Error: failed to write invertedIndex.txt\n");
          goto cleanup;
      }
      status = EXIT_SUCCESS;

  cleanup:
      for (i = 0; i < nUrls; i++) {
          if (urlsarray != NULL) {
              free(urlsarray[i]);
          }
          if (urlsPerWord != NULL) {
              free(urlsPerWord[i]);
          }
      }
      free(urlsarray);
      free(urlsPerWord);
      return status;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement