Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
<?php
// Download the raw markup of the article page into $html.
// file_get_contents() returns false (and raises a warning) when the request fails.
$html = file_get_contents('http://bbcsite.com/news/123');
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>!</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body dir="rtl">
<?php
// Downloads one page, runs it through PHP Readability and prints the
// extracted title and body inside this XHTML page.
include_once 'Readability.php';

// Page to extract (change this URL to whatever you'd like to test).
$url = 'http://';
$html = file_get_contents($url);

// Stays false when the download fails, so the "couldn't find the content"
// branch below also covers fetch errors instead of parsing a boolean.
$result = false;

if ($html !== false) {
    // Note: PHP Readability expects UTF-8 encoded content.
    // If your content is not UTF-8 encoded, convert it first with
    // iconv() or mb_convert_encoding() before passing it on.

    // If we've got Tidy, clean up the input. This step is highly
    // recommended - PHP's default HTML parser often doesn't do a great
    // job and results in strange output.
    if (function_exists('tidy_parse_string')) {
        $tidy = tidy_parse_string($html, array(), 'UTF8');
        $tidy->cleanRepair();
        $html = $tidy->value;
    }

    // Give it to Readability.
    $readability = new Readability($html, $url);
    // Debug output is useful to compare against Arc90's original JS
    // version; it is switched off here.
    $readability->debug = false;
    // Convert links to footnotes.
    $readability->convertLinksToFootnotes = true;
    // Process it.
    $result = $readability->init();
}

// Does it look like we found what we wanted?
if ($result) {
    echo "== Title =====================================\n";
    // The title is plain text taken from a remote page, so escape it
    // before writing it into this HTML document.
    echo htmlspecialchars($readability->getTitle()->textContent, ENT_QUOTES, 'UTF-8'), "\n\n";
    echo "== Body ======================================\n";
    $content = $readability->getContent()->innerHTML;
    // If we've got Tidy, clean the extracted markup up for output.
    if (function_exists('tidy_parse_string')) {
        $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
        $tidy->cleanRepair();
        $content = $tidy->value;
    }
    echo $content;
} else {
    echo 'Looks like we couldn\'t find the content. :(';
}
?>
</body>
</html>
/**
 * Download a URL with cURL, sending a browser-style user agent, following
 * redirects and accepting any supported content encoding.
 *
 * @param string $url Address to fetch. It is trimmed, URL-decoded, and any
 *                    HTML-escaped "&amp;" is turned back into "&".
 *
 * @return string|false The response body, or false when curl_exec() fails.
 */
function getData($url) {
    // URLs copied out of HTML source carry "&amp;" between query parameters.
    $url = str_replace('&amp;', '&', urldecode(trim($url)));
    // Used for both the connect timeout and the whole-transfer timeout (seconds).
    $timeout = 5;
    // Throw-away cookie jar so cookies set during redirects are kept for this request.
    $cookie = tempnam(sys_get_temp_dir(), 'CURLCOOKIE');

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1');
    curl_setopt($ch, CURLOPT_URL, $url);
    if ($cookie !== false) {
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie);
    }
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    // An empty string asks cURL to offer every encoding it supports.
    curl_setopt($ch, CURLOPT_ENCODING, '');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);

    $content = curl_exec($ch);

    // Release the handle before deleting the jar: the jar is written when the
    // handle is closed (or, on PHP 8+, destroyed), which would otherwise
    // recreate the file after the unlink.
    curl_close($ch);
    unset($ch);
    if ($cookie !== false && is_file($cookie)) {
        unlink($cookie);
    }

    return $content;
}
// Same Readability set-up as the file_get_contents() variant, but the page
// is downloaded through getData() (cURL with a browser user agent).
$url = 'http://';
$html = getData($url);

// getData() hands back curl_exec()'s result, which is false on failure;
// stop here rather than feeding a boolean to Tidy and Readability.
if ($html === false) {
    throw new RuntimeException('Could not download ' . $url);
}

// Clean the markup with Tidy when the extension is available.
if (function_exists('tidy_parse_string')) {
    $tidy = tidy_parse_string($html, array(), 'UTF8');
    $tidy->cleanRepair();
    $html = $tidy->value;
}

$readability = new Readability($html, $url);
//...
// Print the inner markup of every <li> element on the page, one per line.
$html = file_get_contents('http://coder-dz.com');

// Shape preg_match_all() produces when nothing matches; kept as the default
// so $matches is defined even when the download fails.
$matches = array(array(), array());

// "#" is used as the delimiter because the pattern itself contains "/"
// (in "</li>"); with "/" as delimiter the pattern would end there.
// The "s" modifier lets "." span line breaks inside an item.
if ($html !== false && preg_match_all('#<li>(.*?)</li>#s', $html, $matches) > 0) {
    foreach ($matches[1] as $mytitle) {
        echo $mytitle."<br/>";
    }
}
// Extract article data with the Goose library. The client class lives in
// the Goose namespace and is aliased to GooseClient for this snippet.
use Goose\Client as GooseClient;

$goose = new GooseClient();
$article = $goose->extractContent('http://url.to/article');

// Page metadata.
$title = $article->getTitle();
$metaDescription = $article->getMetaDescription();
$metaKeywords = $article->getMetaKeywords();
$canonicalLink = $article->getCanonicalLink();
$domain = $article->getDomain();

// Collections reported by the extractor.
$tags = $article->getTags();
$links = $article->getLinks();
$movies = $article->getMovies();

// Article text and the words getPopularWords() reports for it.
$articleText = $article->getCleanedArticleText();
$entities = $article->getPopularWords();

// Images.
$image = $article->getTopImage();
$allImages = $article->getAllImages();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement