Advertisement
Guest User

Untitled

a guest
Sep 3rd, 2015
59
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.86 KB | None | 0 0
  <?php
  // Download the raw HTML of a sample article.
  // NOTE(review): $html is assigned again inside <body> before this value is
  // ever read, so this request looks redundant - confirm before removing it.
  $html = file_get_contents('http://bbcsite.com/news/123');
  ?>

  <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  <html>
  <head>
  <title>!</title>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
  </head>
  <body dir="rtl">
  <?php
  include_once 'Readability.php';

  // URL of the page to run through Readability.
  // (change this URL to whatever you'd like to test)
  $url = 'http://';
  $html = file_get_contents($url);

  // Note: PHP Readability expects UTF-8 encoded content.
  // If your content is not UTF-8 encoded, convert it
  // first before passing it to PHP Readability.
  // Both iconv() and mb_convert_encoding() can do this.

  // Stays false unless the download succeeds and Readability finds content.
  $result = false;

  // file_get_contents() returns false when the request fails; in that case
  // there is nothing to parse, so skip straight to the failure message.
  if ($html !== false) {
      // If we've got Tidy, let's clean up input.
      // This step is highly recommended - PHP's default HTML parser
      // often doesn't do a great job and results in strange output.
      if (function_exists('tidy_parse_string')) {
          $tidy = tidy_parse_string($html, array(), 'UTF8');
          $tidy->cleanRepair();
          $html = $tidy->value;
      }

      // give it to Readability
      $readability = new Readability($html, $url);
      // print debug output?
      // useful to compare against Arc90's original JS version -
      // simply click the bookmarklet with FireBug's console window open
      $readability->debug = false;
      // convert links to footnotes?
      $readability->convertLinksToFootnotes = true;
      // process it
      $result = $readability->init();
  }

  // does it look like we found what we wanted?
  if ($result) {
      echo "== Title =====================================\n";
      echo $readability->getTitle()->textContent, "\n\n";
      echo "== Body ======================================\n";
      $content = $readability->getContent()->innerHTML;
      // if we've got Tidy, let's clean it up for output
      if (function_exists('tidy_parse_string')) {
          $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
          $tidy->cleanRepair();
          $content = $tidy->value;
      }
      echo $content;
  } else {
      // The apostrophe must be escaped inside a single-quoted string.
      echo 'Looks like we couldn\'t find the content. :(';
  }
  ?>
  </body>
  </html>
  64.  
  /**
   * Download the body of a URL with cURL, presenting a browser-like user agent.
   *
   * The URL is trimmed and URL-decoded, and HTML-escaped ampersands ("&amp;")
   * are turned back into "&". Redirects are followed (at most 10), and both the
   * connect phase and the whole transfer are limited to 5 seconds. cURL is given
   * a temporary cookie-jar file, which is deleted again before returning.
   *
   * @param string $url Address to fetch.
   * @return string|false Response body, or false when the handle could not be
   *                      created or the transfer failed.
   */
  function getData($url) {
      // Undo HTML escaping of the query-string separator. Searching for a bare
      // '&' and replacing it with '&' would be a no-op.
      $url = str_replace('&amp;', '&', urldecode(trim($url)));
      $timeout = 5;

      $ch = curl_init();
      if ($ch === false) {
          return false;
      }

      $cookie = tempnam('/tmp', 'CURLCOOKIE');

      curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1');
      curl_setopt($ch, CURLOPT_URL, $url);
      // Only point cURL at the jar when the temporary file was really created.
      if ($cookie !== false) {
          curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie);
      }
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
      curl_setopt($ch, CURLOPT_ENCODING, '');
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      curl_setopt($ch, CURLOPT_AUTOREFERER, true);
      curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
      curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
      curl_setopt($ch, CURLOPT_MAXREDIRS, 10);

      $content = curl_exec($ch);

      // Close and drop the handle first so cURL is finished with the jar,
      // then remove the file; otherwise every call leaves one behind.
      curl_close($ch);
      unset($ch);
      if ($cookie !== false && file_exists($cookie)) {
          unlink($cookie);
      }

      return $content;
  }
  84.  
  // Page to process (placeholder - replace with a complete URL).
  $url = 'http://';
  //$html = file_get_contents($url);
  // Download through the cURL-based helper instead of file_get_contents().
  $html = getData($url);

  // Let Tidy repair the markup first, when the extension is loaded.
  if (function_exists('tidy_parse_string')) {
      $tidy = tidy_parse_string($html, [], 'UTF8');
      tidy_clean_repair($tidy);
      $html = $tidy->value;
  }

  // Hand the (possibly repaired) HTML and its source URL to Readability.
  $readability = new Readability($html, $url);
  96.  
  97. //...
  98.  
  // Download the page; file_get_contents() returns false when the request fails.
  $html = file_get_contents('http://coder-dz.com');

  // "~" is the pattern delimiter so the "/" in "</li>" needs no escaping; with
  // "/" as delimiter the pattern would end at "</" and fail to compile.
  // The "s" modifier lets "." match newlines, so multi-line items are captured.
  $matches = array();
  if ($html !== false && preg_match_all('~<li>(.*?)</li>~s', $html, $matches)) {
      // $matches[1] holds whatever was captured between each <li> and </li>.
      foreach ($matches[1] as $mytitle) {
          echo $mytitle . "<br/>";
      }
  }
  105.  
  // Import the Goose client under a local alias. The class lives in the Goose
  // namespace; aliasing the bare name "GooseClient" to itself has no effect.
  use Goose\Client as GooseClient;

  // Extract the article found at the given URL.
  $goose = new GooseClient();
  $article = $goose->extractContent('http://url.to/article');

  // Read the individual fields off the extracted article.
  $title = $article->getTitle();
  $metaDescription = $article->getMetaDescription();
  $metaKeywords = $article->getMetaKeywords();
  $canonicalLink = $article->getCanonicalLink();
  $domain = $article->getDomain();
  $tags = $article->getTags();
  $links = $article->getLinks();
  $movies = $article->getMovies();
  $articleText = $article->getCleanedArticleText();
  $entities = $article->getPopularWords();
  $image = $article->getTopImage();
  $allImages = $article->getAllImages();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement