scrapers.php

<?php
//Response header: declare the output as HTML encoded in UTF-8.
//The second argument (replace any earlier header of the same name) is the default, spelled out here.
//NOTE(review): header() only takes effect if nothing has been sent to the client yet - confirm no output precedes the opening PHP tag.
header('Content-type: text/html; charset=utf-8', true);

//Function Avas: Scraping Avas.mv using Simple HTML DOM.
//Reads the article URL from the "article" query parameter, loads the page with
//file_get_html() and prints the title, image URL, author, publication date and the
//first two paragraphs. Takes no arguments and returns nothing; all results are echoed.
//Prints a single error line and returns early when the URL is missing, is not an
//absolute http(s) URL, or the page cannot be loaded.
function Avas()
{
    //Only absolute http(s) URLs are accepted: the value comes straight from the request,
    //and file_get_html() would otherwise be handed local paths or other stream wrappers.
    $article = (isset($_GET['article']) && is_string($_GET['article'])) ? trim($_GET['article']) : '';
    $parts = ($article === '') ? false : parse_url($article);
    $scheme = (is_array($parts) && isset($parts['scheme'], $parts['host'])) ? strtolower($parts['scheme']) : '';
    if ($scheme !== 'http' && $scheme !== 'https') {
        echo 'Error: a valid http(s) article URL is required.';
        return;
    }

    //file_get_html() does not return a document object when the fetch or parse fails.
    $html = file_get_html($article);
    if (!$html) {
        echo 'Error: the article could not be retrieved.';
        return;
    }

    //Value of $property on the first node matching $selector, or '' when nothing matches
    //(find() gives null then, and reading a property of null raises a warning).
    $first = function ($selector, $property) use ($html) {
        $node = $html->find($selector, 0);
        return $node ? (string) $node->$property : '';
    };

    //Scraped text is third-party data printed into an HTML response, so it is escaped.
    //double_encode is off so entities already present in the text are not encoded twice.
    $esc = function ($text) {
        return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
    };

    $title = $first("section[class='content'] h1", 'plaintext');           //Title of the article
    $image = $first("meta[name='image']", 'content');                       //Image URL
    $author = $first("div[class='post-author-details']", 'plaintext');      //Author name
    $date = $first('time', 'datetime');                                     //Published time/date

    //The first two <p> nodes are concatenated as markup (a missing node contributes '').
    //NOTE(review): this is unescaped third-party HTML, kept as-is so the paragraphs still
    //render; sanitise it if this output is shown to users.
    $content = $html->find('p', 0) . $html->find('p', 1);

    echo 'Title: ' . $esc($title) . ' <br>';
    echo 'Image: ' . $esc($image) . ' <br>';
    echo 'Author: ' . $esc($author) . ' <br>';
    echo 'Date Published: ' . $esc($date) . ' <br>';
    echo 'Content: ' . $content;
}

//Function Mihaaru: Scraping Mihaaru.com using Simple HTML DOM.
//Reads the article URL from the "article" query parameter, loads the page with
//file_get_html() and prints the title, image URL, author, date/time and article text.
//Takes no arguments and returns nothing; all results are echoed.
//Prints a single error line and returns early when the URL is missing, is not an
//absolute http(s) URL, or the page cannot be loaded.
function Mihaaru()
{
    //Only absolute http(s) URLs are accepted: the value comes straight from the request,
    //and file_get_html() would otherwise be handed local paths or other stream wrappers.
    $article = (isset($_GET['article']) && is_string($_GET['article'])) ? trim($_GET['article']) : '';
    $parts = ($article === '') ? false : parse_url($article);
    $scheme = (is_array($parts) && isset($parts['scheme'], $parts['host'])) ? strtolower($parts['scheme']) : '';
    if ($scheme !== 'http' && $scheme !== 'https') {
        echo 'Error: a valid http(s) article URL is required.';
        return;
    }

    //file_get_html() does not return a document object when the fetch or parse fails.
    $html = file_get_html($article);
    if (!$html) {
        echo 'Error: the article could not be retrieved.';
        return;
    }

    //Value of $property on the first node matching $selector, or '' when nothing matches
    //(find() gives null then, and reading a property of null raises a warning).
    $first = function ($selector, $property) use ($html) {
        $node = $html->find($selector, 0);
        return $node ? (string) $node->$property : '';
    };

    //Scraped text is third-party data printed into an HTML response, so it is escaped.
    //double_encode is off so entities already present in the text are not encoded twice.
    $esc = function ($text) {
        return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
    };

    $title = $first('h1', 'plaintext');                         //Title of the article
    $image = $first("img[data-index='1']", 'src');              //Image URL
    $author = $first('address', 'plaintext');                   //Author name
    $date = $first("span[class='date-time']", 'plaintext');     //Date/time
    $content = $first('article', 'plaintext');                  //Text of the article

    echo 'Title: ' . $esc($title) . ' <br>';
    echo 'Image: ' . $esc($image) . ' <br>';
    echo 'Author: ' . $esc($author) . ' <br>';
    echo 'Date Published: ' . $esc($date) . ' <br>';
    echo 'Content: ' . $esc($content);
}

//Function Sun: stand-in for the Sun.mv scraper.
//Nothing is fetched; it only prints a notice that the site is unavailable.
function Sun()
{
    print 'Sun.mv is down at the moment';
}

//Function Vaguthu: Scraping Vaguthu.mv using Simple HTML DOM
//Reads the article URL from the "article" query parameter, loads the page with
//file_get_html() and prints the title, image URL, author, date/time and article text
//(with the "ADVERTISEMENT" marker removed). Takes no arguments and returns nothing;
//all results are echoed.
//Prints a single error line and returns early when the URL is missing, is not an
//absolute http(s) URL, or the page cannot be loaded.
function Vaguthu()
{
    //Only absolute http(s) URLs are accepted: the value comes straight from the request,
    //and file_get_html() would otherwise be handed local paths or other stream wrappers.
    $article = (isset($_GET['article']) && is_string($_GET['article'])) ? trim($_GET['article']) : '';
    $parts = ($article === '') ? false : parse_url($article);
    $scheme = (is_array($parts) && isset($parts['scheme'], $parts['host'])) ? strtolower($parts['scheme']) : '';
    if ($scheme !== 'http' && $scheme !== 'https') {
        echo 'Error: a valid http(s) article URL is required.';
        return;
    }

    //file_get_html() does not return a document object when the fetch or parse fails.
    $html = file_get_html($article);
    if (!$html) {
        echo 'Error: the article could not be retrieved.';
        return;
    }

    //Value of $property on the first node matching $selector, or '' when nothing matches
    //(find() gives null then, and reading a property of null raises a warning).
    $first = function ($selector, $property) use ($html) {
        $node = $html->find($selector, 0);
        return $node ? (string) $node->$property : '';
    };

    //Scraped text is third-party data printed into an HTML response, so it is escaped.
    //double_encode is off so entities already present in the text are not encoded twice.
    $esc = function ($text) {
        return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
    };

    $title = $first('h1', 'plaintext');                                             //Title of the article
    $image = $first("meta[property='og:image']", 'content');                        //Image URL
    $author = $first("span[class='single-article--meta-author']", 'plaintext');     //Author name
    $date = $first('time', 'datetime');                                             //Date/time
    $content = $first("div[class='single-article--content']", 'plaintext');         //Text of the article

    //Some articles carry an "ADVERTISEMENT" marker inside the content; drop it before escaping.
    $content = str_replace('ADVERTISEMENT', '', $content);

    echo 'Title: ' . $esc($title) . ' <br>';
    echo 'Image: ' . $esc($image) . ' <br>';
    echo 'Author: ' . $esc($author) . ' <br>';
    echo 'Date Published: ' . $esc($date) . ' <br>';
    echo 'Content: ' . $esc($content);
}

//Function Dhuvas: stand-in for the Dhuvas.mv scraper.
//Nothing is fetched; it only prints a notice that the site is unavailable.
function Dhuvas()
{
    print 'Dhuvas is down at the moment';
}

//Function AdduLive: Scraping AdduLive.com using Simple HTML DOM
//Reads the article URL from the "article" query parameter, loads the page with
//file_get_html() and prints the title, image URL, author, date/time and article text
//(with the social-sharing widget text removed). Takes no arguments and returns nothing;
//all results are echoed.
//Prints a single error line and returns early when the URL is missing, is not an
//absolute http(s) URL, or the page cannot be loaded.
function AdduLive()
{
    //Only absolute http(s) URLs are accepted: the value comes straight from the request,
    //and file_get_html() would otherwise be handed local paths or other stream wrappers.
    $article = (isset($_GET['article']) && is_string($_GET['article'])) ? trim($_GET['article']) : '';
    $parts = ($article === '') ? false : parse_url($article);
    $scheme = (is_array($parts) && isset($parts['scheme'], $parts['host'])) ? strtolower($parts['scheme']) : '';
    if ($scheme !== 'http' && $scheme !== 'https') {
        echo 'Error: a valid http(s) article URL is required.';
        return;
    }

    //file_get_html() does not return a document object when the fetch or parse fails.
    $html = file_get_html($article);
    if (!$html) {
        echo 'Error: the article could not be retrieved.';
        return;
    }

    //Value of $property on the first node matching $selector, or '' when nothing matches
    //(find() gives null then, and reading a property of null raises a warning).
    $first = function ($selector, $property) use ($html) {
        $node = $html->find($selector, 0);
        return $node ? (string) $node->$property : '';
    };

    //Scraped text is third-party data printed into an HTML response, so it is escaped.
    //double_encode is off so entities already present in the text are not encoded twice.
    $esc = function ($text) {
        return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
    };

    $title = $first("div[class='post-title']", 'plaintext');                //Title of the article
    $image = $first("div[class='post-featured-image'] img", 'src');         //Image URL
    $author = $first("div[class='post-author']", 'plaintext');              //Author name
    $date = $first("div[class='post-published-time']", 'plaintext');        //Date/time

    //The selector targets the whole post-body container. It previously ended in a stray
    //quoted 'p' token, which is not valid selector syntax; the share-widget text stripped
    //below sits in the container text, not in a single paragraph.
    $content = $first("div[class='post-body-left col-xs-12 col-sm-9']", 'plaintext');

    //Remove the social-sharing widget text before escaping.
    $content = str_replace("  Click to share on Facebook (Opens in new window)  Click to share on Twitter (Opens in new window)  Click to print (Opens in new window)  Click to email this to a friend (Opens in new window)                        ", "", $content);

    echo 'Title: ' . $esc($title) . ' <br>';
    echo 'Image: ' . $esc($image) . ' <br>';
    echo 'Author: ' . $esc($author) . ' <br>';
    echo 'Published Date: ' . $esc($date) . ' <br>';
    echo 'Content: ' . $esc($content);
}

//Function MVSports: Scraping mvsports.mv using Simple HTML DOM
//Reads the article URL from the "article" query parameter, loads the page with
//file_get_html() and prints the title, image URL, author, date and article text.
//Takes no arguments and returns nothing; all results are echoed.
//Prints a single error line and returns early when the URL is missing, is not an
//absolute http(s) URL, or the page cannot be loaded.
function MVSports()
{
    //Only absolute http(s) URLs are accepted: the value comes straight from the request,
    //and file_get_html() would otherwise be handed local paths or other stream wrappers.
    $article = (isset($_GET['article']) && is_string($_GET['article'])) ? trim($_GET['article']) : '';
    $parts = ($article === '') ? false : parse_url($article);
    $scheme = (is_array($parts) && isset($parts['scheme'], $parts['host'])) ? strtolower($parts['scheme']) : '';
    if ($scheme !== 'http' && $scheme !== 'https') {
        echo 'Error: a valid http(s) article URL is required.';
        return;
    }

    //file_get_html() does not return a document object when the fetch or parse fails.
    $html = file_get_html($article);
    if (!$html) {
        echo 'Error: the article could not be retrieved.';
        return;
    }

    //Value of $property on the first node matching $selector, or '' when nothing matches
    //(find() gives null then, and reading a property of null raises a warning).
    $first = function ($selector, $property) use ($html) {
        $node = $html->find($selector, 0);
        return $node ? (string) $node->$property : '';
    };

    //Scraped text is third-party data printed into an HTML response, so it is escaped.
    //double_encode is off so entities already present in the text are not encoded twice.
    $esc = function ($text) {
        return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
    };

    $title = $first("header[class='td-post-title'] h1", 'plaintext');           //Title of the article
    $image = $first("div[class='td-post-featured-image'] img", 'src');          //Image URL
    $author = $first("div[class='td-post-author-name'] a", 'plaintext');        //Author name
    $date = $first("div[class='td-post-date'] time", 'plaintext');              //Published date

    //The selector targets the post-content container. The earlier string had an unterminated
    //quote in the class value and a stray quoted 'p' token, neither of which is valid
    //selector syntax.
    $content = $first("div[class='td-post-content']", 'plaintext');

    echo 'Title: ' . $esc($title) . ' <br>';
    echo 'Image: ' . $esc($image) . ' <br>';
    echo 'Author: ' . $esc($author) . ' <br>';
    echo 'Published Date: ' . $esc($date) . ' <br>';
    echo 'Content: ' . $esc($content);
}

//Function AdduOnline: Scraping adduonline.com using Simple HTML DOM
//Reads the article URL from the "article" query parameter, loads the page with
//file_get_html() and prints the article title. Only the title is scraped so far.
//Takes no arguments and returns nothing; the result is echoed.
//Prints a single error line and returns early when the URL is missing, is not an
//absolute http(s) URL, or the page cannot be loaded.
function AdduOnline()
{
    //Only absolute http(s) URLs are accepted: the value comes straight from the request,
    //and file_get_html() would otherwise be handed local paths or other stream wrappers.
    $article = (isset($_GET['article']) && is_string($_GET['article'])) ? trim($_GET['article']) : '';
    $parts = ($article === '') ? false : parse_url($article);
    $scheme = (is_array($parts) && isset($parts['scheme'], $parts['host'])) ? strtolower($parts['scheme']) : '';
    if ($scheme !== 'http' && $scheme !== 'https') {
        echo 'Error: a valid http(s) article URL is required.';
        return;
    }

    //file_get_html() does not return a document object when the fetch or parse fails.
    $html = file_get_html($article);
    if (!$html) {
        echo 'Error: the article could not be retrieved.';
        return;
    }

    //find() gives null when the page has no matching node; fall back to an empty title.
    $node = $html->find("div[class='post-title'] h2", 0);
    $title = $node ? (string) $node->plaintext : '';

    //Scraped text is third-party data printed into an HTML response, so it is escaped.
    //double_encode is off so entities already present in the text are not encoded twice.
    echo 'Title: ' . htmlspecialchars($title, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false) . ' <br>';
}

//Function InvalidUrl: prints the rejection notice for an unsupported URL,
//followed by the list of sites the scraper accepts.
function InvalidUrl()
{
    $notice = array(
        "The url specified is not acceptable in our scraper. Please use a correct url. Accepted urls are listed in below <br>",
        "Sites scrapable are: Avas, Mihaaru, Vaguthu, Sun, MVSports, AdduLive, Dhuvas",
    );

    //Both parts are emitted back to back with no separator.
    echo implode('', $notice);
}
?>