Advertisement
pgapr14

assn-4 buggy 28oct/10:51

Oct 28th, 2015
122
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 20.94 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <assert.h>
  4. #include <string.h>
  5. #include <ctype.h>
  6.  
  7. #include "hashset.h"
  8. #include "url.h"
  9. #include "bool.h"
  10. #include "urlconnection.h"
  11. #include "streamtokenizer.h"
  12. #include "html-utils.h"
  13.  
/* Forward declarations for the aggregator pipeline. The void* parameters
   carry the shared rssDB (stop words, indices, seen articles) by address. */
static void Welcome(const char *welcomeTextFileName);
static void BuildIndices(const char *feedsFileName, void* rssDB);
static void ProcessFeed(const char *remoteDocumentName, void* DB);
static void PullAllNewsItems(urlconnection *urlconn, void* DB);
static bool GetNextItemTag(streamtokenizer *st);
static void ProcessSingleNewsItem(streamtokenizer *st, void* DB);
static void ExtractElement(streamtokenizer *st, const char *htmlTag, char dataBuffer[], int bufferLength);
static void ParseArticle(const char *articleTitle, const char *articleDescription, const char *articleURL, void* DB);
static void ScanArticle(streamtokenizer *st, const char *articleTitle, const char *unused, const char *articleURL);
static void QueryIndices();
static void ProcessResponse(const char *word);
static bool WordIsWellFormed(const char *word);
  26.  
/* Aggregate database threaded (by address, as void*) through the
   processing functions. */
typedef struct{
hashset stopWordSet;     /* set of char* words excluded from indexing */
hashset indices;         /* word -> articles mapping (to be built) */
vector indexedArticles;  /* articles already seen/indexed (to be built) */
} rssDB;
  32.  
  33.  
  34. static int strCompare(const void* elemAddr1, const void* elemAddr2){
  35. int z = strcmp(*(char**)elemAddr1, *(char**)elemAddr2);
  36. return z;
  37. }
  38.  
  39. void strFree(void* elemAddr){
  40. free(*(char**)elemAddr);
  41. // printf("freeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee\n");
  42. }
  43.  
  44. static const signed long kHashMultiplier = -1664117991L;
  45. static int StringHash(const void *s, int numBuckets)
  46. {
  47. int i;
  48. unsigned long hashcode = 0;
  49.  
  50. for (i = 0; i < strlen((char*)s); i++)
  51. hashcode = hashcode * kHashMultiplier + tolower(((char*)s)[i]);
  52.  
  53. return hashcode % numBuckets;
  54. }
  55.  
  56. static const int stopWord_num_buckets = 733;
  57. void createStopWordSet(void* stopWordSet){
  58. HashSetNew(stopWordSet, sizeof(char*), stopWord_num_buckets, StringHash, strCompare, strFree);
  59. // printf("3\n");
  60. }
  61.  
  62. static const char *const kWhiteSpaceCharacters = " \t\n\r";
  63. void readStopword(void* stopWordSet, const char *const kDefaultStopWordFile){
  64. FILE* infile = fopen(kDefaultStopWordFile,"r");
  65. streamtokenizer st;
  66. char word[128]; // reasonable upper limit on length of all English words..
  67. STNew(&st, infile, kWhiteSpaceCharacters, true);
  68.  
  69. while (STNextToken(&st, word, sizeof(word))) {
  70. char* wordDup = strdup(word);
  71. HashSetEnter(stopWordSet, &wordDup);
  72. printf("entering word -> %s \n", wordDup);
  73. char** tmp = HashSetLookup(stopWordSet, &wordDup);
  74. printf("%s \n", *tmp );
  75. }
  76.  
  77. printf("FINISHED1\n");
  78. STDispose(&st);
  79. printf("FINISHED2\n");
  80.  
  81. }
  82.  
  83.  
  84.  
  85. /**
  86. * Function: main
  87. * --------------
  88. * Serves as the entry point of the full application.
  89. * You'll want to update main to declare several hashsets--
  90. * one for stop words, another for previously seen urls, etc--
  91. * and pass them (by address) to BuildIndices and QueryIndices.
  92. * In fact, you'll need to extend many of the prototypes of the
  93. * supplied helpers functions to take one or more hashset *s.
  94. *
  95. * Think very carefully about how you're going to keep track of
  96. * all of the stop words, how you're going to keep track of
  97. * all the previously seen articles, and how you're going to
  98. * map words to the collection of news articles where that
  99. * word appears.
  100. */
  101.  
  102. void mapfn(void* elem, void* auxDat){
  103. printf("%s \n", *(char**)elem);
  104. }
  105.  
/* Debug helper: prints every string stored in the hashset addressed by a. */
void printAll(void* a){
HashSetMap(a, mapfn, NULL);
}
  109.  
/* Configuration file paths that travel with the application. */
static const char *const kWelcomeTextFile = "data/welcome.txt";
static const char *const kDefaultFeedsFile = "data/philly.txt";
static const char *const kDefaultStopWordFile = "data/stop-words.txt";
/* NOTE(review): debug probe key used by main's HashSetLookup sanity check;
   remove once the stop-word set is verified. */
static const char *const word = "z";
  114.  
  115. int main(int argc, char **argv)
  116. {
  117. Welcome(kWelcomeTextFile);
  118. // hashset stopWordSet;
  119. rssDB DB;
  120. createStopWordSet(& (DB.stopWordSet));
  121.  
  122. readStopword(& (DB.stopWordSet), kDefaultStopWordFile);
  123. printf("asdasdsa\n");
  124.  
  125. // printAll(&DB.stopWordSet);
  126.  
  127. printf("1 \n");
  128. char** z= HashSetLookup(& (DB.stopWordSet), &word);
  129. printf("2 \n");
  130. printf("%i\n", *z);
  131. printf("3 \n");
  132. // BuildIndices((argc == 1) ? kDefaultFeedsFile : argv[1], &DB);
  133. /* QueryIndices();
  134. */
  135.  
  136.  
  137. return 0;
  138. }
  139.  
  140. /**
  141. * Function: Welcome
  142. * -----------------
  143. * Displays the contents of the specified file, which
  144. * holds the introductory remarks to be printed every time
  145. * the application launches. This type of overhead may
  146. * seem silly, but by placing the text in an external file,
  147. * we can change the welcome text without forcing a recompilation and
  148. * build of the application. It's as if welcomeTextFileName
  149. * is a configuration file that travels with the application.
  150. */
  151.  
  152. static const char *const kNewLineDelimiters = "\r\n";
  153. static void Welcome(const char *welcomeTextFileName)
  154. {
  155. FILE *infile;
  156. streamtokenizer st;
  157. char buffer[1024];
  158.  
  159. infile = fopen(welcomeTextFileName, "r");
  160. assert(infile != NULL);
  161.  
  162. STNew(&st, infile, kNewLineDelimiters, true);
  163. while (STNextToken(&st, buffer, sizeof(buffer))) {
  164. printf("%s\n", buffer);
  165. }
  166.  
  167. printf("\n");
  168. STDispose(&st); // remember that STDispose doesn't close the file, since STNew doesn't open one..
  169. fclose(infile);
  170. }
  171.  
  172. /**
  173. * Function: BuildIndices
  174. * ----------------------
  175. * As far as the user is concerned, BuildIndices needs to read each and every
  176. * one of the feeds listed in the specied feedsFileName, and for each feed parse
  177. * content of all referenced articles and store the content in the hashset of indices.
  178. * Each line of the specified feeds file looks like this:
  179. *
* <feed name>: <URL of remote xml document>
  181. *
  182. * Each iteration of the supplied while loop parses and discards the feed name (it's
  183. * in the file for humans to read, but our aggregator doesn't care what the name is)
  184. * and then extracts the URL. It then relies on ProcessFeed to pull the remote
  185. * document and index its content.
  186. */
  187.  
  188. static void BuildIndices(const char *feedsFileName, void* DB)
  189. {
  190. printf("1\n");
  191. FILE *infile;
  192. streamtokenizer st;
  193. char remoteFileName[1024];
  194.  
  195. printf("2\n");
  196. infile = fopen(feedsFileName, "r");
  197. assert(infile != NULL);
  198. STNew(&st, infile, kNewLineDelimiters, true);
  199. printf("3\n");
  200. while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first selicolon of the line
  201. STSkipOver(&st, ": "); // now ignore the semicolon and any whitespace directly after it
  202. STNextToken(&st, remoteFileName, sizeof(remoteFileName));
  203. printf("4\n");
  204. ProcessFeed(remoteFileName, DB);
  205. }
  206. printf("LAST\n");
  207. STDispose(&st);
  208. fclose(infile);
  209. printf("\n");
  210. }
  211.  
  212.  
  213. /**
  214. * Function: ProcessFeed
  215. * ---------------------
  216. * ProcessFeed locates the specified RSS document, and if a (possibly redirected) connection to that remote
  217. * document can be established, then PullAllNewsItems is tapped to actually read the feed. Check out the
  218. * documentation of the PullAllNewsItems function for more information, and inspect the documentation
  219. * for ParseArticle for information about what the different response codes mean.
  220. */
  221.  
  222. static void ProcessFeed(const char *remoteDocumentName, void* DB)
  223. {
  224. url u;
  225. urlconnection urlconn;
  226. printf("%s \n", remoteDocumentName);
  227.  
  228. URLNewAbsolute(&u, remoteDocumentName);
  229.  
  230. printf("%s \n", u.fullName );
  231.  
  232. URLConnectionNew(&urlconn, &u);
  233.  
  234. printf("5\n");
  235. switch (urlconn.responseCode) {
  236. case 0: printf("Unable to connect to \"%s\". Ignoring...", u.serverName);
  237. break;
  238. case 200: PullAllNewsItems(&urlconn, DB);
  239. break;
  240. case 301:
  241. case 302: ProcessFeed(urlconn.newUrl, DB);
  242. break;
  243. default: printf("Connection to \"%s\" was established, but unable to retrieve \"%s\". [response code: %d, response message:\"%s\"]\n",
  244. u.serverName, u.fileName, urlconn.responseCode, urlconn.responseMessage);
  245. break;
  246. };
  247.  
  248. URLConnectionDispose(&urlconn);
  249. URLDispose(&u);
  250. }
  251.  
  252. /**
  253. * Function: PullAllNewsItems
  254. * --------------------------
  255. * Steps though the data of what is assumed to be an RSS feed identifying the names and
  256. * URLs of online news articles. Check out "datafiles/sample-rss-feed.txt" for an idea of what an
* RSS feed from www.nytimes.com (or any other server that syndicates its stories) looks like.
  258. *
  259. * PullAllNewsItems views a typical RSS feed as a sequence of "items", where each item is detailed
  260. * using a generalization of HTML called XML. A typical XML fragment for a single news item will certainly
  261. * adhere to the format of the following example:
  262. *
  263. * <item>
  264. * <title>At Installation Mass, New Pope Strikes a Tone of Openness</title>
  265. * <link>http://www.nytimes.com/2005/04/24/international/worldspecial2/24cnd-pope.html</link>
  266. * <description>The Mass, which drew 350,000 spectators, marked an important moment in the transformation of Benedict XVI.</description>
  267. * <author>By IAN FISHER and LAURIE GOODSTEIN</author>
  268. * <pubDate>Sun, 24 Apr 2005 00:00:00 EDT</pubDate>
  269. * <guid isPermaLink="false">http://www.nytimes.com/2005/04/24/international/worldspecial2/24cnd-pope.html</guid>
  270. * </item>
  271. *
  272. * PullAllNewsItems reads and discards all characters up through the opening <item> tag (discarding the <item> tag
* as well, because once it's read and identified, it's been pulled), and then hands the state of the stream to
  274. * ProcessSingleNewsItem, which handles the job of pulling and analyzing everything up through and including the </item>
  275. * tag. PullAllNewsItems processes the entire RSS feed and repeatedly advancing to the next <item> tag and then allowing
  276. * ProcessSingleNewsItem do process everything up until </item>.
  277. */
  278.  
  279. static const char *const kTextDelimiters = " \t\n\r\b!@$%^*()_+={[}]|\\'\":;/?.>,<~`";
  280. static void PullAllNewsItems(urlconnection *urlconn, void* DB)
  281. {
  282. streamtokenizer st;
  283. STNew(&st, urlconn->dataStream, kTextDelimiters, false);
  284. while (GetNextItemTag(&st)) { // if true is returned, then assume that <item ...> has just been read and pulled from the data stream
  285. ProcessSingleNewsItem(&st, DB);
  286. }
  287.  
  288. STDispose(&st);
  289. }
  290.  
  291. /**
  292. * Function: GetNextItemTag
  293. * ------------------------
  294. * Works more or less like GetNextTag below, but this time
  295. * we're searching for an <item> tag, since that marks the
  296. * beginning of a block of HTML that's relevant to us.
  297. *
  298. * Note that each tag is compared to "<item" and not "<item>".
  299. * That's because the item tag, though unlikely, could include
  300. * attributes and perhaps look like any one of these:
  301. *
  302. * <item>
  303. * <item rdf:about="Latin America reacts to the Vatican">
  304. * <item requiresPassword=true>
  305. *
  306. * We're just trying to be as general as possible without
  307. * going overboard. (Note that we use strncasecmp so that
  308. * string comparisons are case-insensitive. That's the case
  309. * throughout the entire code base.)
  310. */
  311.  
  312. static const char *const kItemTagPrefix = "<item";
  313. static bool GetNextItemTag(streamtokenizer *st)
  314. {
  315. char htmlTag[1024];
  316. while (GetNextTag(st, htmlTag, sizeof(htmlTag))) {
  317. if (strncasecmp(htmlTag, kItemTagPrefix, strlen(kItemTagPrefix)) == 0) {
  318. return true;
  319. }
  320. }
  321. return false;
  322. }
  323.  
  324. /**
  325. * Function: ProcessSingleNewsItem
  326. * -------------------------------
  327. * Code which parses the contents of a single <item> node within an RSS/XML feed.
  328. * At the moment this function is called, we're to assume that the <item> tag was just
  329. * read and that the streamtokenizer is currently pointing to everything else, as with:
  330. *
  331. * <title>Carrie Underwood takes American Idol Crown</title>
  332. * <description>Oklahoma farm girl beats out Alabama rocker Bo Bice and 100,000 other contestants to win competition.</description>
  333. * <link>http://www.nytimes.com/frontpagenews/2841028302.html</link>
  334. * </item>
  335. *
  336. * ProcessSingleNewsItem parses everything up through and including the </item>, storing the title, link, and article
  337. * description in local buffers long enough so that the online new article identified by the link can itself be parsed
  338. * and indexed. We don't rely on <title>, <link>, and <description> coming in any particular order. We do asssume that
  339. * the link field exists (although we can certainly proceed if the title and article descrption are missing.) There
  340. * are often other tags inside an item, but we ignore them.
  341. */
  342.  
  343. static const char *const kItemEndTag = "</item>";
  344. static const char *const kTitleTagPrefix = "<title";
  345. static const char *const kDescriptionTagPrefix = "<description";
  346. static const char *const kLinkTagPrefix = "<link";
  347. static void ProcessSingleNewsItem(streamtokenizer *st, void* DB)
  348. {
  349. char htmlTag[1024];
  350. char articleTitle[1024];
  351. char articleDescription[1024];
  352. char articleURL[1024];
  353. articleTitle[0] = articleDescription[0] = articleURL[0] = '\0';
  354.  
  355. while (GetNextTag(st, htmlTag, sizeof(htmlTag)) && (strcasecmp(htmlTag, kItemEndTag) != 0)) {
  356. if (strncasecmp(htmlTag, kTitleTagPrefix, strlen(kTitleTagPrefix)) == 0) ExtractElement(st, htmlTag, articleTitle, sizeof(articleTitle));
  357. if (strncasecmp(htmlTag, kDescriptionTagPrefix, strlen(kDescriptionTagPrefix)) == 0) ExtractElement(st, htmlTag, articleDescription, sizeof(articleDescription));
  358. if (strncasecmp(htmlTag, kLinkTagPrefix, strlen(kLinkTagPrefix)) == 0) ExtractElement(st, htmlTag, articleURL, sizeof(articleURL));
  359. }
  360.  
  361. if (strncmp(articleURL, "", sizeof(articleURL)) == 0) return; // punt, since it's not going to take us anywhere
  362.  
  363.  
  364.  
  365. ParseArticle(articleTitle, articleDescription, articleURL, DB);
  366. }
  367.  
  368. /**
  369. * Function: ExtractElement
  370. * ------------------------
  371. * Potentially pulls text from the stream up through and including the matching end tag. It assumes that
  372. * the most recently extracted HTML tag resides in the buffer addressed by htmlTag. The implementation
  373. * populates the specified data buffer with all of the text up to but not including the opening '<' of the
  374. * closing tag, and then skips over all of the closing tag as irrelevant. Assuming for illustration purposes
* that htmlTag addresses a buffer containing "<description" followed by other text, these three scenarios are
  376. * handled:
  377. *
  378. * Normal Situation: <description>http://some.server.com/someRelativePath.html</description>
  379. * Uncommon Situation: <description></description>
  380. * Uncommon Situation: <description/>
  381. *
  382. * In each of the second and third scenarios, the document has omitted the data. This is not uncommon
* for the description data to be missing, so we need to cover all three scenarios (I've actually seen all three.)
  384. * It would be quite unusual for the title and/or link fields to be empty, but this handles those possibilities too.
  385. */
  386.  
/* Copies the text between the just-read opening tag (in htmlTag) and its
   closing tag into dataBuffer, then discards the closing tag itself.
   Handles <tag>text</tag>, the empty <tag></tag>, and the self-closing
   <tag/> forms; in the empty cases dataBuffer is left as "". The stream
   operations below are order-sensitive: read up to '<', then skip the
   remainder of the closing tag. */
static void ExtractElement(streamtokenizer *st, const char *htmlTag, char dataBuffer[], int bufferLength)
{
assert(htmlTag[strlen(htmlTag) - 1] == '>');
if (htmlTag[strlen(htmlTag) - 2] == '/') return; // e.g. <description/> would state that a description is not being supplied
STNextTokenUsingDifferentDelimiters(st, dataBuffer, bufferLength, "<"); // pull everything up to (not including) the closing tag's '<'
RemoveEscapeCharacters(dataBuffer);
if (dataBuffer[0] == '<') strcpy(dataBuffer, ""); // e.g. <description></description> also means there's no description
STSkipUntil(st, ">"); // discard the rest of the closing tag as irrelevant
STSkipOver(st, ">");
}
  397.  
  398. /**
  399. * Function: ParseArticle
  400. * ----------------------
  401. * Attempts to establish a network connect to the news article identified by the three
  402. * parameters. The network connection is either established of not. The implementation
  403. * is prepared to handle a subset of possible (but by far the most common) scenarios,
  404. * and those scenarios are categorized by response code:
  405. *
  406. * 0 means that the server in the URL doesn't even exist or couldn't be contacted.
  407. * 200 means that the document exists and that a connection to that very document has
  408. * been established.
  409. * 301 means that the document has moved to a new location
  410. * 302 also means that the document has moved to a new location
  411. * 4xx and 5xx (which are covered by the default case) means that either
  412. * we didn't have access to the document (403), the document didn't exist (404),
  413. * or that the server failed in some undocumented way (5xx).
  414. *
  415. * The are other response codes, but for the time being we're punting on them, since
  416. * no others appears all that often, and it'd be tedious to be fully exhaustive in our
  417. * enumeration of all possibilities.
  418. */
  419.  
/* Connects to the article URL and dispatches on the HTTP response code:
   0 => server unreachable, 200 => scan (and eventually index) the article,
   301/302 => recurse on the redirected URL so indexing happens under the
   new URL, anything else (4xx/5xx) => report and punt. */
static void ParseArticle(const char *articleTitle, const char *articleDescription, const char *articleURL, void* DB)
{
url u;
urlconnection urlconn;
streamtokenizer st;
 
URLNewAbsolute(&u, articleURL);
URLConnectionNew(&urlconn, &u);
 
switch (urlconn.responseCode) {
case 0: printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
break;
case 200: printf("Scanning \"%s\" from \"http://%s\"\n", articleTitle, u.serverName);
STNew(&st, urlconn.dataStream, kTextDelimiters, false);
ScanArticle(&st, articleTitle, articleDescription, articleURL);
STDispose(&st);
break;
case 301:
case 302: // just pretend we have the redirected URL all along, though index using the new URL and not the old one...
ParseArticle(articleTitle, articleDescription, urlconn.newUrl, DB);
break;
default: printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, u.serverName, urlconn.responseCode);
break;
}
 
URLConnectionDispose(&urlconn);
URLDispose(&u);
}
  448.  
  449. /**
  450. * Function: ScanArticle
  451. * ---------------------
  452. * Parses the specified article, skipping over all HTML tags, and counts the numbers
  453. * of well-formed words that could potentially serve as keys in the set of indices.
  454. * Once the full article has been scanned, the number of well-formed words is
  455. * printed, and the longest well-formed word we encountered along the way
  456. * is printed as well.
  457. *
  458. * This is really a placeholder implementation for what will ultimately be
  459. * code that indexes the specified content.
  460. */
  461.  
  462. static void ScanArticle(streamtokenizer *st, const char *articleTitle, const char *unused, const char *articleURL)
  463. {
  464. int numWords = 0;
  465. char word[1024];
  466. char longestWord[1024] = {'\0'};
  467.  
  468. while (STNextToken(st, word, sizeof(word))) {
  469. if (strcasecmp(word, "<") == 0) {
  470. SkipIrrelevantContent(st); // in html-utls.h
  471. } else {
  472. RemoveEscapeCharacters(word);
  473. if (WordIsWellFormed(word)) {
  474.  
  475. numWords++;
  476. if (strlen(word) > strlen(longestWord))
  477. strcpy(longestWord, word);
  478. }
  479. }
  480. }
  481.  
  482. printf("\tWe counted %d well-formed words [including duplicates].\n", numWords);
  483. printf("\tThe longest word scanned was \"%s\".", longestWord);
  484. if (strlen(longestWord) >= 15 && (strchr(longestWord, '-') == NULL))
  485. printf(" [Ooooo... long word!]");
  486. printf("\n");
  487. }
  488.  
  489. /**
  490. * Function: QueryIndices
  491. * ----------------------
  492. * Standard query loop that allows the user to specify a single search term, and
  493. * then proceeds (via ProcessResponse) to list up to 10 articles (sorted by relevance)
  494. * that contain that word.
  495. */
  496.  
  497. static void QueryIndices()
  498. {
  499. char response[1024];
  500. while (true) {
  501. printf("Please enter a single query term that might be in our set of indices [enter to quit]: ");
  502. fgets(response, sizeof(response), stdin);
  503. response[strlen(response) - 1] = '\0';
  504. if (strcasecmp(response, "") == 0) break;
  505. ProcessResponse(response);
  506. }
  507. }
  508.  
  509. /**
  510. * Function: ProcessResponse
  511. * -------------------------
  512. * Placeholder implementation for what will become the search of a set of indices
  513. * for a list of web documents containing the specified word.
  514. */
  515.  
  516. static void ProcessResponse(const char *word)
  517. {
  518. if (WordIsWellFormed(word)) {
  519. printf("\tWell, we don't have the database mapping words to online news articles yet, but if we DID have\n");
  520. printf("\tour hashset of indices, we'd list all of the articles containing \"%s\".\n", word);
  521. } else {
  522. printf("\tWe won't be allowing words like \"%s\" into our set of indices.\n", word);
  523. }
  524. }
  525.  
  526. /**
  527. * Predicate Function: WordIsWellFormed
  528. * ------------------------------------
  529. * Before we allow a word to be inserted into our map
  530. * of indices, we'd like to confirm that it's a good search term.
  531. * One could generalize this function to allow different criteria, but
  532. * this version hard codes the requirement that a word begin with
  533. * a letter of the alphabet and that all letters are either letters, numbers,
  534. * or the '-' character.
  535. */
  536.  
  537. static bool WordIsWellFormed(const char *word)
  538. {
  539. int i;
  540. if (strlen(word) == 0) return true;
  541. if (!isalpha((int) word[0])) return false;
  542. for (i = 1; i < strlen(word); i++)
  543. if (!isalnum((int) word[i]) && (word[i] != '-')) return false;
  544.  
  545. return true;
  546. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement