Advertisement
Guest User

Untitled

a guest
Oct 22nd, 2017
357
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.41 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <string.h>
  4. #include <math.h>
  5. #include "readData.h"
  6. #define MAX_LENGTH 1000
  7. //Written By Christopher Bebawi (z5162243@unsw.edu.au)
  8. //Modified by Mohammad Abdullah (z5162395@unsw.edu.au)
  9. typedef struct URL_search {
  10. char url[MAXBUF];
  11. int out_links;
  12. double pageRankWeight;
  13. int num;
  14. } URL_search;
  15.  
  16. URL_search *matched_url_list;
  17. int numLinks = 0;
  18. int numWords = 0;
  19.  
  20. URL_search *findMatchedUrls(char *, char **, int);
  21. int cmpfuncSearch(const void *, const void *);
  22. int doc_appearance(FILE *doc, char *term);
  23. int in_string(char *string_to_search_in, char *string_to_search_for);
  24. double Tf_Value(int tf, int idf);
  25. double Idf_Value(int numb_documents, int numb_documents_term_appears_in);
  26. double Tf_Idf_Value(double tf, double idf);
  27. int InSearchTerms(char *line, char **searchTerms, int numTerms);
  28.  
  29.  
  30.  
  31.  
  32. int main(int argc, char *argv[]) {
  33. if (argc == 1){
  34. fprintf(stderr,"No Search Terms given, terminating programme.....\n");
  35. return EXIT_FAILURE;
  36. }
  37. //FILE *invertedIndex = fopen("invertedIndex.txt", "r", argc);
  38. matched_url_list = findMatchedUrls("invertedIndex.txt", argv, argc);
  39.  
  40. FILE *pageRankList = fopen("pagerankList.txt", "r");
  41. char line[MAXBUF];
  42. while (fscanf(pageRankList, "%s", line) == 1) {
  43. if (line[0] == 'u' && line[1] == 'r' && line[2] == 'l'
  44. && line[3] <= '9' && line[3] >= '0') {
  45. char url[MAXBUF];
  46. strcpy(url, line);
  47. for (int i = 0; i < MAXBUF; i++) {
  48. if (url[i] == ',') {
  49. url[i] = 0;
  50. break;
  51. }
  52. }
  53. // line = pagerank after these two lines
  54. fscanf(pageRankList, "%s", line);
  55. fscanf(pageRankList, "%s", line);
  56.  
  57. for (int k = 0; k < numLinks; k++) {
  58. if (!strcmp(url, matched_url_list[k].url)) {
  59. matched_url_list[k].pageRankWeight = atof(line);
  60. break;
  61. }
  62. }
  63. }
  64. }
  65. fclose(pageRankList);
  66.  
  67. qsort(matched_url_list, numLinks, sizeof(URL_search), cmpfuncSearch);
  68. if (numLinks > 30) numLinks = 30;
  69. int i;
  70. //printf("numLinks = %d\n", numLinks);
  71. for (i = 0; i < numLinks; i++) {
  72. if (matched_url_list[i].num == numWords)
  73. printf("%s\n", matched_url_list[i].url);
  74. }
  75. return 0;
  76. }
  77.  
  78. int cmpfuncSearch(const void *a, const void *b) {
  79. const URL_search *a_URL = (URL_search *)a;
  80. const URL_search *b_URL = (URL_search *)b;
  81. if (a_URL->pageRankWeight < b_URL->pageRankWeight)
  82. return 1;
  83. else if (a_URL->pageRankWeight > b_URL->pageRankWeight)
  84. return -1;
  85. else
  86. return 0;
  87. }
  88.  
  89. URL_search *newURLSearch() {
  90. URL_search *new = malloc(sizeof(URL_search));
  91. //new->url = NULL;
  92. new->out_links = 0;
  93. new->pageRankWeight = 0;
  94. new->num = 0;
  95. return new;
  96. }
  97.  
  98. int doc_appearance(FILE *doc, char *term){//how many times does "term" pop up in the file "doc"??
  99. int appearances = 0;
  100. char line[MAX_LENGTH];
  101. while (fgets(line,MAX_LENGTH,doc) != NULL){//I think something along the lines of "fscanf(doc, "%s", &line)" would've been equivalent to this
  102. appearances += in_string(line,term);
  103. }
  104. return appearances;
  105.  
  106. }
  107.  
  108. int in_string(char *string_to_search_in, char *string_to_search_for){//returns 1 if the string to search in finds the string we're looking for in it, otherwise it returns a 0.
  109. if (strlen(string_to_search_for) > strlen(string_to_search_in)){//if we're looking for a word of length 20 in a word of length 2, we'll fail every time, no matter how hard we try
  110. return 0;
  111. }
  112. int i = 0;
  113. int j = 0;
  114. while (i < strlen(string_to_search_in)){
  115. if (string_to_search_for[j] == string_to_search_in[i]){
  116. ++j;
  117. ++i;
  118. if ((string_to_search_for[j] == '\0') && ((string_to_search_in[i] == '\0') || (string_to_search_in[i] == ' '))) {//if we've reached the end of the string to search for, it means ALL the characters were found IN ORDER in the string to search in!! We've found it!! BUT! We want the occurence of the word, not if the word is a subset of another, eg if we search for "ants", we don't want a website with the word "pants" in it, hence the second condition!
  119. return 1;
  120. }
  121. }
  122. else{
  123. while ((string_to_search_in[i] != '\0') && (string_to_search_in[i] != ' ')){
  124. ++i;
  125. }
  126. if (string_to_search_in[i] == ' '){
  127. ++i;
  128. }
  129. j = 0;//This is for the case where we say, search for "Sim" in "Symbol", the first letters match, yes, but you need to reset j, otherwise you'll try comparing
  130. }
  131.  
  132. }
  133.  
  134. return (!(strcmp(string_to_search_for,string_to_search_in)));//if we've reached this far, then we'll return if they're the same or not, if they are, then technically the string is IN the string to seach for, if they're not, we've tried everything else, so we return 0.
  135.  
  136.  
  137. //"I hate ants", "ants are what I hate", "ants", "pants", " ants ", "antsy" have all been tested to work with this.
  138. }
  139.  
  140.  
  141. int InSearchTerms(char *line, char **searchTerms, int numTerms) {
  142. int j;
  143. for (j = 1; j < numTerms; j++) {
  144. if (!strcmp(searchTerms[j], line)) {
  145. return 1;
  146. }
  147. }
  148. return 0;
  149. }
  150.  
  151. double Tf_Value(int tf, int idf){
  152. return (idf * tf);//TODO: fill this in later
  153. }
  154.  
  155. double Idf_Value(int numb_documents, int numb_documents_term_appears_in){
  156. return log10(numb_documents/(1 + numb_documents_term_appears_in)) ;//TODO: fill this in later
  157. }
  158.  
  159. double Tf_Idf_Value(double tf, double idf){
  160. return (idf * tf);//TODO: fill this in later
  161. }
  162.  
  163.  
  164.  
  165.  
  166. URL_search *findMatchedUrls(char *file, char **searchTerms, int numTerms) {
  167. FILE *f = fopen(file, "r");
  168. if (f == NULL) {
  169. printf("Couldn't open file\n");
  170. exit(EXIT_FAILURE);
  171. }
  172. int i = 0;
  173. URL_search *urls;
  174.  
  175. urls = malloc(sizeof(URL_search)*1024);
  176. //URL *url_list = newURL();
  177. //URL *index = url_list->first;
  178. char line[MAXBUF];
  179. while (fscanf(f, "%s", line) == 1) {
  180. //printf("%s\n", urls[i].url);
  181. if (!InSearchTerms(line, searchTerms, numTerms)) {
  182. //memset(line, 0, MAXBUF);
  183. continue;
  184. }
  185. numWords++;
  186. while (fscanf(f, "%s", line) == 1) {
  187. // break once it reaches another word instead of a url
  188. //printf("line = %s\n", line);
  189. if (line[0] != 'u' || line[1] != 'r' || line[2] != 'l'
  190. || line[3] > '9' || line[3] < '0') break;
  191. int found = 0;
  192. int k;
  193. for (k = 0; k < i; k++) {
  194. //printf("k = %d\n", k);
  195. // printf("line = %s\n", line);
  196. // printf("url = %s\n", urls[k].url);
  197. if (!strcmp(urls[k].url, line)) {
  198. found = 1;
  199. break;
  200. }
  201. }
  202. if (found) {
  203. urls[k].num++;
  204. } else {
  205. URL_search *new = newURLSearch();
  206. urls[i] = *new;
  207. strcpy(urls[i].url, line);
  208. urls[i].num++;
  209. //printf("urls[i].url = %s\n", urls[i].url);
  210. i++;
  211. }
  212. //memset(line, 0, MAXBUF);
  213. }
  214. //memset(line, 0, MAXBUF);
  215. }
  216. //printf("numWords = %d\n", numWords);
  217. numLinks = i;
  218. fclose(f);
  219. return urls;
  220. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement