Advertisement
Guest User

scrapers.php

a guest
Apr 20th, 2018
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 5.76 KB | None | 0 0
  1. <?php
  // Declare the response as UTF-8 encoded HTML. The second argument spells out
  // header()'s default: replace any Content-type header that was set earlier.
  header('Content-type: text/html; charset=utf-8', true);
  4.  
  /**
   * Scrapes an Avas.mv article with Simple HTML DOM and prints its title,
   * image URL, author, publication date and first two paragraphs.
   *
   * The article URL is read from the `article` query-string parameter.
   * file_get_html() must already be loaded by the including script.
   * When the URL is missing, is not an http(s) URL, or cannot be fetched,
   * a short error message is printed instead of the article fields.
   *
   * @return void
   */
  function Avas()
      {
      // Accept only a plain string parameter; anything else is treated as missing.
      $article = (isset($_GET["article"]) && is_string($_GET["article"])) ? trim($_GET["article"]) : "";

      // The string is handed to file_get_html() as-is, so reject anything that is
      // not an absolute http(s) URL (e.g. local paths or other stream wrappers).
      // NOTE(review): consider also restricting the host to the site's own domain.
      $scheme = strtolower((string) parse_url($article, PHP_URL_SCHEME));
      if ($scheme !== "http" && $scheme !== "https")
          {
          echo "Invalid or missing article url.";
          return;
          }

      $html = file_get_html($article);
      if (!$html)
          {
          echo "The article could not be loaded.";
          return;
          }

      // Read a property from the first node matching a selector, falling back to
      // "" when nothing matches instead of dereferencing a missing node.
      $pick = function ($selector, $property) use ($html)
          {
          $node = $html->find($selector, 0);
          return $node ? (string) $node->$property : "";
          };

      // Escape scraped text before printing it. double_encode is off so entities
      // already present in the scraped text are not escaped a second time.
      $esc = function ($value)
          {
          return htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8", false);
          };

      $title  = $esc($pick("section[class='content'] h1", "plaintext"));
      $image  = $esc($pick("meta[name='image']", "content"));
      $author = $esc($pick("div[class='post-author-details']", "plaintext"));
      $date   = $esc($pick("time", "datetime"));

      // The first two <p> nodes are joined through string conversion, exactly as
      // before, so they are printed with whatever markup the nodes render to.
      // NOTE(review): this is remote markup echoed unescaped - confirm that is acceptable.
      $content = $html->find('p', 0) . $html->find('p', 1);

      echo "Title: $title <br>";
      echo "Image: $image <br>";
      echo "Author: $author <br>";
      echo "Date Published: $date <br>";
      echo "Content: $content";
      }
  23.  
  /**
   * Scrapes a Mihaaru.com article with Simple HTML DOM and prints its title,
   * image URL, author, publication date and article text.
   *
   * The article URL is read from the `article` query-string parameter.
   * file_get_html() must already be loaded by the including script.
   * When the URL is missing, is not an http(s) URL, or cannot be fetched,
   * a short error message is printed instead of the article fields.
   *
   * @return void
   */
  function Mihaaru()
      {
      // Accept only a plain string parameter; anything else is treated as missing.
      $article = (isset($_GET["article"]) && is_string($_GET["article"])) ? trim($_GET["article"]) : "";

      // The string is handed to file_get_html() as-is, so reject anything that is
      // not an absolute http(s) URL (e.g. local paths or other stream wrappers).
      // NOTE(review): consider also restricting the host to the site's own domain.
      $scheme = strtolower((string) parse_url($article, PHP_URL_SCHEME));
      if ($scheme !== "http" && $scheme !== "https")
          {
          echo "Invalid or missing article url.";
          return;
          }

      $html = file_get_html($article);
      if (!$html)
          {
          echo "The article could not be loaded.";
          return;
          }

      // Read a property from the first node matching a selector, falling back to
      // "" when nothing matches instead of dereferencing a missing node.
      $pick = function ($selector, $property) use ($html)
          {
          $node = $html->find($selector, 0);
          return $node ? (string) $node->$property : "";
          };

      // Escape scraped text before printing it. double_encode is off so entities
      // already present in the scraped text are not escaped a second time.
      $esc = function ($value)
          {
          return htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8", false);
          };

      $title   = $esc($pick('h1', "plaintext"));
      $image   = $esc($pick("img[data-index='1']", "src"));
      $author  = $esc($pick("address", "plaintext"));
      $date    = $esc($pick("span[class='date-time']", "plaintext"));
      $content = $esc($pick("article", "plaintext"));

      echo "Title: $title <br>";
      echo "Image: $image <br>";
      echo "Author: $author <br>";
      echo "Date Published: $date <br>";
      echo "Content: $content";
      }
  41.  
  /**
   * Placeholder for the Sun.mv scraper: it only prints a notice that the
   * site is currently unavailable.
   *
   * @return void
   */
  function Sun()
      {
      $notice = "Sun.mv is down at the moment";
      print $notice;
      }
  47.  
  /**
   * Scrapes a Vaguthu.mv article with Simple HTML DOM and prints its title,
   * image URL, author, publication date and article text.
   *
   * The article URL is read from the `article` query-string parameter.
   * file_get_html() must already be loaded by the including script.
   * When the URL is missing, is not an http(s) URL, or cannot be fetched,
   * a short error message is printed instead of the article fields.
   *
   * @return void
   */
  function Vaguthu()
      {
      // Accept only a plain string parameter; anything else is treated as missing.
      $article = (isset($_GET["article"]) && is_string($_GET["article"])) ? trim($_GET["article"]) : "";

      // The string is handed to file_get_html() as-is, so reject anything that is
      // not an absolute http(s) URL (e.g. local paths or other stream wrappers).
      // NOTE(review): consider also restricting the host to the site's own domain.
      $scheme = strtolower((string) parse_url($article, PHP_URL_SCHEME));
      if ($scheme !== "http" && $scheme !== "https")
          {
          echo "Invalid or missing article url.";
          return;
          }

      $html = file_get_html($article);
      if (!$html)
          {
          echo "The article could not be loaded.";
          return;
          }

      // Read a property from the first node matching a selector, falling back to
      // "" when nothing matches instead of dereferencing a missing node.
      $pick = function ($selector, $property) use ($html)
          {
          $node = $html->find($selector, 0);
          return $node ? (string) $node->$property : "";
          };

      // Escape scraped text before printing it. double_encode is off so entities
      // already present in the scraped text are not escaped a second time.
      $esc = function ($value)
          {
          return htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8", false);
          };

      $title  = $esc($pick('h1', "plaintext"));
      $image  = $esc($pick("meta[property='og:image']", "content"));
      $author = $esc($pick("span[class='single-article--meta-author']", "plaintext"));
      $date   = $esc($pick('time', "datetime"));

      // Some articles carry an "ADVERTISEMENT" marker inside the body text; strip it.
      $content = str_replace("ADVERTISEMENT", "", $pick("div[class='single-article--content']", "plaintext"));

      echo "Title: $title <br>";
      echo "Image: $image <br>";
      echo "Author: $author <br>";
      echo "Date Published: $date <br>";
      echo "Content: " . $esc($content);
      }
  65.  
  /**
   * Placeholder for the Dhuvas.mv scraper: it only prints a notice that the
   * site is currently unavailable.
   *
   * @return void
   */
  function Dhuvas()
      {
      $notice = "Dhuvas is down at the moment";
      print $notice;
      }
  71.  
  /**
   * Scrapes an AdduLive.com article with Simple HTML DOM and prints its title,
   * image URL, author, publication date and article text.
   *
   * The article URL is read from the `article` query-string parameter.
   * file_get_html() must already be loaded by the including script.
   * When the URL is missing, is not an http(s) URL, or cannot be fetched,
   * a short error message is printed instead of the article fields.
   *
   * @return void
   */
  function AdduLive()
      {
      // Accept only a plain string parameter; anything else is treated as missing.
      $article = (isset($_GET["article"]) && is_string($_GET["article"])) ? trim($_GET["article"]) : "";

      // The string is handed to file_get_html() as-is, so reject anything that is
      // not an absolute http(s) URL (e.g. local paths or other stream wrappers).
      // NOTE(review): consider also restricting the host to the site's own domain.
      $scheme = strtolower((string) parse_url($article, PHP_URL_SCHEME));
      if ($scheme !== "http" && $scheme !== "https")
          {
          echo "Invalid or missing article url.";
          return;
          }

      $html = file_get_html($article);
      if (!$html)
          {
          echo "The article could not be loaded.";
          return;
          }

      // Read a property from the first node matching a selector, falling back to
      // "" when nothing matches instead of dereferencing a missing node.
      $pick = function ($selector, $property) use ($html)
          {
          $node = $html->find($selector, 0);
          return $node ? (string) $node->$property : "";
          };

      // Escape scraped text before printing it. double_encode is off so entities
      // already present in the scraped text are not escaped a second time.
      $esc = function ($value)
          {
          return htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8", false);
          };

      $title  = $esc($pick("div[class='post-title']", "plaintext"));
      $image  = $esc($pick("div[class='post-featured-image'] img", "src"));
      $author = $esc($pick("div[class='post-author']", "plaintext"));
      $date   = $esc($pick("div[class='post-published-time']", "plaintext"));

      // The selector used to end in a quoted 'p' token, which is not valid selector
      // syntax; it has been dropped so the post-body container itself is selected.
      // NOTE(review): the share-widget text removed below suggests the container's
      // full text is what was being captured - confirm against a live article.
      $content = $pick("div[class='post-body-left col-xs-12 col-sm-9']", "plaintext");

      // Remove the social-sharing widget text that is embedded in the post body.
      $content = str_replace("  Click to share on Facebook (Opens in new window)  Click to share on Twitter (Opens in new window)  Click to print (Opens in new window)  Click to email this to a friend (Opens in new window)                        ", "", $content);

      echo "Title: $title <br>";
      echo "Image: $image <br>";
      echo "Author: $author <br>";
      echo "Published Date: $date <br>";
      echo "Content: " . $esc($content);
      }
  90.  
  /**
   * Scrapes an mvsports.mv article with Simple HTML DOM and prints its title,
   * image URL, author, publication date and article text.
   *
   * The article URL is read from the `article` query-string parameter.
   * file_get_html() must already be loaded by the including script.
   * When the URL is missing, is not an http(s) URL, or cannot be fetched,
   * a short error message is printed instead of the article fields.
   *
   * @return void
   */
  function MVSports()
      {
      // Accept only a plain string parameter; anything else is treated as missing.
      $article = (isset($_GET["article"]) && is_string($_GET["article"])) ? trim($_GET["article"]) : "";

      // The string is handed to file_get_html() as-is, so reject anything that is
      // not an absolute http(s) URL (e.g. local paths or other stream wrappers).
      // NOTE(review): consider also restricting the host to the site's own domain.
      $scheme = strtolower((string) parse_url($article, PHP_URL_SCHEME));
      if ($scheme !== "http" && $scheme !== "https")
          {
          echo "Invalid or missing article url.";
          return;
          }

      $html = file_get_html($article);
      if (!$html)
          {
          echo "The article could not be loaded.";
          return;
          }

      // Read a property from the first node matching a selector, falling back to
      // "" when nothing matches instead of dereferencing a missing node.
      $pick = function ($selector, $property) use ($html)
          {
          $node = $html->find($selector, 0);
          return $node ? (string) $node->$property : "";
          };

      // Escape scraped text before printing it. double_encode is off so entities
      // already present in the scraped text are not escaped a second time.
      $esc = function ($value)
          {
          return htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8", false);
          };

      $title  = $esc($pick("header[class='td-post-title'] h1", "plaintext"));
      $image  = $esc($pick("div[class='td-post-featured-image'] img", "src"));
      $author = $esc($pick("div[class='td-post-author-name'] a", "plaintext"));
      $date   = $esc($pick("div[class='td-post-date'] time", "plaintext"));

      // The original selector was missing the closing quote after td-post-content
      // and ended in a quoted 'p' token, which is not valid selector syntax. Both
      // are corrected here so the post-content container itself is selected.
      // NOTE(review): if only paragraph text is wanted, iterate over
      // find("div[class='td-post-content'] p") instead - confirm against a live article.
      $content = $esc($pick("div[class='td-post-content']", "plaintext"));

      echo "Title: $title <br>";
      echo "Image: $image <br>";
      echo "Author: $author <br>";
      echo "Published Date: $date <br>";
      echo "Content: $content";
      }
  108.  
  /**
   * Scrapes an adduonline.com article with Simple HTML DOM.
   *
   * Only the title is extracted so far; it is printed in the same
   * "Title: ... <br>" format the other scrapers in this file use.
   *
   * The article URL is read from the `article` query-string parameter.
   * file_get_html() must already be loaded by the including script.
   * When the URL is missing, is not an http(s) URL, or cannot be fetched,
   * a short error message is printed instead.
   *
   * @return void
   */
  function AdduOnline()
      {
      // Accept only a plain string parameter; anything else is treated as missing.
      $article = (isset($_GET["article"]) && is_string($_GET["article"])) ? trim($_GET["article"]) : "";

      // The string is handed to file_get_html() as-is, so reject anything that is
      // not an absolute http(s) URL (e.g. local paths or other stream wrappers).
      // NOTE(review): consider also restricting the host to the site's own domain.
      $scheme = strtolower((string) parse_url($article, PHP_URL_SCHEME));
      if ($scheme !== "http" && $scheme !== "https")
          {
          echo "Invalid or missing article url.";
          return;
          }

      $html = file_get_html($article);
      if (!$html)
          {
          echo "The article could not be loaded.";
          return;
          }

      // Guard against a selector that matches nothing before reading plaintext.
      // (The original statement also lacked its terminating semicolon, which made
      // the whole file fail to parse.)
      $titleNode = $html->find("div[class='post-title'] h2", 0);
      $title = $titleNode ? (string) $titleNode->plaintext : "";

      // Escape before printing; double_encode is off so entities already present
      // in the scraped text are not escaped a second time.
      $title = htmlspecialchars($title, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8", false);

      echo "Title: $title <br>";
      }
  116.    
  /**
   * Prints the error shown for an unsupported article URL, followed by the
   * list of site names the scraper accepts.
   *
   * @return void
   */
  function InvalidUrl()
      {
      $parts = array(
          "The url specified is not acceptable in our scraper. Please use a correct url. Accepted urls are listed in below <br>",
          "Sites scrapable are: Avas, Mihaaru, Vaguthu, Sun, MVSports, AdduLive, Dhuvas",
      );

      // Both parts are written back to back, with no separator between them.
      echo implode("", $parts);
      }
  123. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement