Advertisement
amousa1990

crawler

Jan 17th, 2016
436
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.11 KB | None | 0 0
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <curl/curl.h>
//#include <curl/types.h>
#include <curl/easy.h>
  6.  
  7. int i;
  8.  
  9. void append(char * string,char ch);
  10. char * getHtml();
  11. char * getWords(char * htmlCode);
  12. // Define our struct for accepting LCs output
  13. struct BufferStruct
  14. {
  15. char * buffer;
  16. size_t size;
  17. };
  18.  
  19. // This is the function we pass to LC, which writes the output to a BufferStruct
  20. static size_t WriteMemoryCallback (void *ptr, size_t size, size_t nmemb, void *data)
  21. {
  22. size_t realsize = size * nmemb;
  23.  
  24. struct BufferStruct * mem = (struct BufferStruct *) data;
  25.  
  26. mem->buffer = realloc(mem->buffer, mem->size + realsize + 1);
  27.  
  28. if (mem->buffer == NULL)
  29. return 0;
  30.  
  31. if ( mem->buffer )
  32. {
  33. memcpy( &( mem->buffer[ mem->size ] ), ptr, realsize );
  34. mem->size += realsize;
  35. mem->buffer[ mem->size ] = 0;
  36. }
  37. return realsize;
  38. }
  39.  
  40.  
  41. int main()
  42. {
  43. curl_global_init( CURL_GLOBAL_ALL );
  44. CURL * myHandle;
  45. CURLcode result; // We’ll store the result of CURL’s webpage retrieval, for simple error checking.
  46. myHandle = curl_easy_init ( ) ;
  47.  
  48. char * output;
  49. output = getHtml("http://www.facebook.com");
  50. char * htmlCode = output;
  51. getWords(htmlCode);
  52. while (htmlCode) {
  53. char * nextLine = strchr(htmlCode, '\n');
  54. if (nextLine)
  55. *nextLine = '\0';
  56. if(strstr(htmlCode,"href")!=NULL) { //href in this line
  57. char * pos = strstr(htmlCode,"href");
  58. int position = pos - htmlCode;
  59. i = position + 6;
  60. while (i < strlen(htmlCode)) {
  61. if (htmlCode[i] == '"')
  62. i = strlen(htmlCode);
  63. printf("%c",htmlCode[i]);
  64. i++;
  65. }
  66. printf("\n");
  67. }
  68. if (nextLine)
  69. *nextLine = '\n';
  70. htmlCode = nextLine ? (nextLine+1) : NULL;
  71. }
  72.  
  73.  
  74. return 0;
  75. }
  76.  
  77. void append(char * string,char ch)
  78. {
  79. int size=strlen(string);
  80. char temp[size+1];
  81. strcpy(temp,string);
  82. temp[size]=ch;
  83. temp[size+1]='\0';
  84. strcpy(string,temp);
  85. }
  86.  
  87. char * getHtml(char *url){
  88. struct BufferStruct buffer;
  89. buffer.buffer = NULL;
  90. buffer.size = 0;
  91. CURLcode result;
  92. CURL * myHandle = curl_easy_init();
  93. curl_easy_setopt(myHandle, CURLOPT_HEADER, 0);
  94. curl_easy_setopt(myHandle, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); // Passing the function pointer to LC
  95. curl_easy_setopt(myHandle, CURLOPT_WRITEDATA, (void *)&buffer); // Passing our BufferStruct to LC
  96. curl_easy_setopt(myHandle, CURLOPT_URL, url);
  97. result = curl_easy_perform( myHandle );
  98. curl_easy_cleanup( myHandle );
  99. return buffer.buffer;
  100. }
  101.  
  102. char * getWords(char * htmlCode) {
  103. htmlCode = strstr(htmlCode, "</head>");
  104. while (htmlCode) {
  105. char * nextLine = strchr(htmlCode, '\n');
  106. if (nextLine)
  107. *nextLine = '\0';
  108. int i = 0;
  109. int tags = 0;
  110. while (i<strlen(htmlCode)) {
  111. if (htmlCode[i] == '<')
  112. tags++;
  113. if(tags < 1 && htmlCode[i]!=' ' || tags < 1 && htmlCode[i+1]!=' ')
  114. printf("%c",htmlCode[i]);
  115. if (htmlCode[i] == '>')
  116. tags--;
  117. i++;
  118. }
  119. if (nextLine)
  120. *nextLine = '\n';
  121. htmlCode = nextLine ? (nextLine+1) : NULL;
  122. }
  123. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement