ACA 20201126

<?php

/**
 * Stateless helpers used by the bots: calendar arithmetic, HTTP download
 * via cURL, hyperlink extraction from HTML and suffix-based filtering.
 */
class AmUtil{
    /** Value returned by numberOfDaysInMonth() when the month is not 1..12. */
    const IMPOSSIBLE_MONTH = -1;

    /** User-Agent string sent with every request made by consumeUrl(). */
    const BOT_SIGNATURE = "For educational tests only";

    /** Key under which a hyperlink pair stores the value of the href attribute. */
    const KEY_HREF = "HREF";

    /** Key under which a hyperlink pair stores the (trimmed) anchor text. */
    const KEY_ANCHOR = "ANCHOR";

    /** File extensions that identify an href as pointing to an image. */
    const IMAGE_FILTERS = [
        ".jpg", ".png", ".jp2", ".gif",
        ".gifv", ".bmp", ".svg"
    ];

    /**
     * Tells whether a year is a leap year in the Gregorian calendar.
     *
     * @param int $pY the year
     * @return bool true when the year is divisible by 400, or by 4 but not by 100
     */
    public static function leapYear(
        $pY
    ){
        return ($pY%400 === 0) || ($pY%4===0 && ($pY%100!==0));
    }//leapYear

    /**
     * Returns the number of days of a month in a given year.
     *
     * @param int $pY the year (only relevant for February)
     * @param int $pM the month, 1 (January) to 12 (December)
     * @return int 28..31, or self::IMPOSSIBLE_MONTH when $pM is not a valid month
     */
    public static function numberOfDaysInMonth(
        $pY,
        $pM
    ){
        switch($pM){
            case 1:
            case 3:
            case 5:
            case 7:
            case 8:
            case 10:
            case 12:
                return 31;

            case 4:
            case 6:
            case 9:
            case 11:
                return 30;

            case 2:
                return self::leapYear($pY) ? 29 : 28;

            default:
                return self::IMPOSSIBLE_MONTH;
        }//switch
    }//numberOfDaysInMonth

    /**
     * Downloads whatever is available at a URL using HTTP GET.
     *
     * @param string $pUrl the address to consume; can be an HTML page, a JPG, ...
     * @return string|false the bytes received, or false when the cURL handle
     *                      could not be created or the transfer failed
     */
    public static function consumeUrl(
        $pUrl //can be an HTML page, can be a JPG, ...
    ){
        $ch = curl_init($pUrl);
        if ($ch === false){
            return false;
        }//if

        //makes it explicit that the request will happen using HTTP GET
        curl_setopt($ch, CURLOPT_HTTPGET, true);

        /*
         * keeps the verification of the peer's SSL certificate ENABLED
         * (requires a usable CA bundle, e.g. cacert.pem)
         */
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);

        //sets a user agent string for our software
        curl_setopt($ch, CURLOPT_USERAGENT, self::BOT_SIGNATURE);

        /*
         * makes curl_exec return the data consumed at the URL,
         * instead of just true/false
         * (CURLOPT_BINARYTRANSFER is intentionally no longer set:
         * it has no effect and is deprecated)
         */
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

        //empty string: accept, and automatically decode, every supported encoding
        curl_setopt($ch, CURLOPT_ENCODING, "");

        return curl_exec($ch);
    }//consumeUrl

    /**
     * Receives HTML source code and returns a collection of all "a"
     * elements found, each one structured as a pair keyed by
     * self::KEY_HREF and self::KEY_ANCHOR.
     *
     * E.g. if this is the input:
     * <html><body><a href="URL1">anchor1</a></body></html>
     * the output is:
     * [
     *  [ "HREF" => "URL1", "ANCHOR" => "anchor1" ]
     * ]
     *
     * @param string $pStrHtmlSourceCode the HTML to parse
     * @return array list of pairs; empty when the input is empty or has no "a" elements
     */
    public static function extractHyperlinksFromHtmlSourceCode(
        string $pStrHtmlSourceCode
    ) /*: array */
    {
        $aRet = []; //the collection of all "a" elements found

        //DOMDocument::loadHTML rejects an empty string (ValueError on PHP 8)
        if ($pStrHtmlSourceCode === ""){
            return $aRet;
        }//if

        $oDom = new DOMDocument();

        /*
         * real-world HTML is rarely well-formed: collect the parser
         * complaints internally instead of emitting warnings, then
         * restore whatever setting the caller had
         */
        $bPreviousSetting = libxml_use_internal_errors(true);
        $bLoaded = $oDom->loadHTML($pStrHtmlSourceCode);
        libxml_clear_errors();
        libxml_use_internal_errors($bPreviousSetting);

        if (!$bLoaded){
            return $aRet;
        }//if

        foreach ($oDom->getElementsByTagName('a') as $someAElement){
            $aRet[] = [
                self::KEY_HREF => trim($someAElement->getAttribute('href')),
                self::KEY_ANCHOR => trim($someAElement->nodeValue)
            ];
        }//foreach

        return $aRet;
    }//extractHyperlinksFromHtmlSourceCode

    /**
     * Tool to filter hyperlinks, keeping only those whose href ends in one
     * of the given terminations (case insensitive).
     *
     * E.g. with filters [".jpg"], the input
     * [
     *  ["HREF" => "page.xpto", "ANCHOR" => "?"],
     *  ["HREF" => "//host/bla.jpg", "ANCHOR" => "pic"]
     * ]
     * results in
     * [
     *  ["HREF" => "https://host/bla.jpg", "ANCHOR" => "pic"]
     * ]
     *
     * @param array $paHyperlinksAsPairsAnchorsHref pairs as built by
     *        extractHyperlinksFromHtmlSourceCode
     * @param array|string|null $paFilters accepted href terminations; an empty
     *        array or null means "no filtering" (the input is returned as is);
     *        a single string is treated as a one-element list
     * @param string $pStrURLPrefixIfSchemaIsMissing prepended to kept hrefs
     *        that start with "//" (protocol-relative URLs)
     * @return array the pairs that were kept, in their original order
     */
    public static function filterHyperlinksKeepingOnlyThoseWithHrefsEndingIn(
        $paHyperlinksAsPairsAnchorsHref,
        $paFilters = [], //no filters, by default!
        $pStrURLPrefixIfSchemaIsMissing = "https:"
    )
    {
        $aFilters = (array)$paFilters; //null => [], "x" => ["x"]
        if (count($aFilters) === 0){
            return $paHyperlinksAsPairsAnchorsHref;
        }//if

        //if there are filters
        $aRet = [];
        foreach ($paHyperlinksAsPairsAnchorsHref as $aPair){
            $strHref = $aPair[self::KEY_HREF] ?? null;

            //a pair without a usable href can never match a filter
            if (!is_string($strHref)){
                continue;
            }//if

            if (!self::stringEndsInOneOfTheFollowing($strHref, $aFilters)){
                continue;
            }//if

            //protocol-relative URL ("//host/path"): complete it with the prefix
            if (strncmp($strHref, "//", 2) === 0){
                $aPair[self::KEY_HREF] =
                    $pStrURLPrefixIfSchemaIsMissing.$strHref;
            }//if

            $aRet[] = $aPair;
        }//foreach

        return $aRet;
    }//filterHyperlinksKeepingOnlyThoseWithHrefsEndingIn

    /**
     * Tells whether a string ends in at least one of the given terminations.
     *
     * stringEndsInOneOfTheFollowing ("Artur", ["ab", "r"]) => true
     * stringEndsInOneOfTheFollowing ("pic.png", [".png", "jpg"]) => true
     *
     * An empty termination matches every string.
     *
     * @param string $pStr the string to test
     * @param array $paTerminations the candidate terminations
     * @param bool $pbCaseInsensitive true (default) to ignore case
     * @return bool true when $pStr ends in one of the terminations
     */
    public static function stringEndsInOneOfTheFollowing(
        string $pStr,
        array $paTerminations,
        bool $pbCaseInsensitive = true
    ){
        $iLength = strlen($pStr);

        foreach($paTerminations as $someTermination){
            $strTermination = (string)$someTermination;
            $iTerminationLength = strlen($strTermination);

            //every string ends in the empty string
            if ($iTerminationLength === 0){
                return true;
            }//if

            //a termination longer than the string can not be its ending
            if ($iTerminationLength > $iLength){
                continue;
            }//if

            //compares only the last $iTerminationLength bytes of $pStr
            $iComparison = substr_compare(
                $pStr,
                $strTermination,
                -$iTerminationLength,
                $iTerminationLength,
                $pbCaseInsensitive
            );

            if ($iComparison === 0){
                return true;
            }//if
        }//foreach

        return false;
    }//stringEndsInOneOfTheFollowing
}//AmUtil

//**

<?php

/*
 * testing with
 * https://boards.4chan.org/wg/
 * https://boards.4chan.org/wg/2
 * ...
 * https://boards.4chan.org/wg/10
 * 404 for .../1 and for URLs ending in numbers >10
 */

require_once "AmUtil.php";

/**
 * Crawls every page of one 4chan board and collects, per page, the
 * hyperlinks that point to images.
 *
 * All the work happens at construction time: the page URLs are computed,
 * each page is downloaded and its image hyperlinks are extracted.
 */
class FourChanBot {
    const BASE_URL = "https://boards.4chan.org";
    const MIN_PAGE = 1;
    const MAX_PAGE = 10;

    //data members
    private $mBoardName; //e.g. "wg"

    //e.g. ["https://.../wg/" ... "https://.../wg/10"]
    private $mBoardValidUrls = [];

    /*
     * URL => HTML received for that URL (false when the download failed)
     * e.g. [
     *  "https://.../wg/" => "<html>...</html>",
     *  ...
     *  "https://.../wg/10" => "<html>... <a href="i1.jpg">...</a></html>"
     * ]
     */
    private $mBoardHtmlForValidUrls = [];

    /*
     * URL => image hyperlinks found in the HTML of that URL
     * e.g. [
     *  ...
     *  "https://.../wg/10" => [["HREF"=>"https:...", "ANCHOR"=>"??"], ...]
     * ]
     */
    private $mHyperlinksInBoardValidUrls = [];

    /**
     * Builds the bot and immediately crawls the whole board.
     *
     * @param string $pStrBoardName the board's short name, e.g. "wg"
     */
    public function __construct(
        string $pStrBoardName
    )
    {
        $this->mBoardName = $pStrBoardName;

        //this costs no time
        $this->mBoardValidUrls = $this->buildAllValidBoardUrls();

        //this can take time: it will consume each and all of the board's pages
        //the method returns nothing, it fills the data member instead
        $this->buildHtmlOfAllBoardPages();

        $this->buildHyperlinksForBoardValidUrls();
    }//__construct

    /**
     * For every downloaded page, extracts the hyperlinks to images and
     * stores them under the page's URL. Prints the URL being processed
     * and dumps the hyperlinks found.
     *
     * @return array URL => list of hyperlink pairs
     */
    private function buildHyperlinksForBoardValidUrls(){
        foreach ($this->mBoardHtmlForValidUrls as $url=>$html){
            echo "current URL: $url".PHP_EOL;

            //a failed or empty download has no hyperlinks to offer
            $asForImages = [];
            if (is_string($html) && $html !== ""){
                $asForImages =
                    AmUtil::filterHyperlinksKeepingOnlyThoseWithHrefsEndingIn(
                        AmUtil::extractHyperlinksFromHtmlSourceCode($html),
                        AmUtil::IMAGE_FILTERS
                    );
            }//if
            var_dump($asForImages);

            $this->mHyperlinksInBoardValidUrls[$url] = $asForImages;
        }//foreach

        return $this->mHyperlinksInBoardValidUrls;
    }//buildHyperlinksForBoardValidUrls

    /**
     * Computes the URLs of all the pages of the board, from MIN_PAGE to
     * MAX_PAGE. No request is made.
     *
     * @return array list of URLs, e.g. ["https://.../wg/", "https://.../wg/2", ...]
     */
    public function buildAllValidBoardUrls() : array
    {
        $aRet = [];

        for(
            $iPage=self::MIN_PAGE;
            $iPage<=self::MAX_PAGE;
            $iPage++
        ){
            //the first page has no number in its URL (".../1" answers 404)
            $strPageSuffix = ($iPage===1) ? "" : $iPage;

            $aRet[] = sprintf(
                "%s/%s/%s",
                self::BASE_URL,
                $this->mBoardName,
                $strPageSuffix
            );
        }//for

        return $aRet;
    }//buildAllValidBoardUrls

    /**
     * Downloads every valid URL of the board and keeps the result,
     * indexed by URL, in the corresponding data member, e.g.
     * ["https://boards.4chan.org/wg/2" => "<html>...</html>"]
     *
     * Returns nothing.
     */
    public function buildHtmlOfAllBoardPages(){
        foreach($this->mBoardValidUrls as $strOneValidUrl){
            $this->mBoardHtmlForValidUrls[$strOneValidUrl] =
                AmUtil::consumeUrl($strOneValidUrl);
        }//foreach
    }//buildHtmlOfAllBoardPages

    /** @return string the board's short name */
    public function getMBoardName()
    {
        return $this->mBoardName;
    }//getMBoardName

    /** @return array the list of URLs of the board's pages */
    public function getMBoardValidUrls()
    {
        return $this->mBoardValidUrls;
    }//getMBoardValidUrls

    /** @return array URL => downloaded HTML (false for failed downloads) */
    public function getMBoardHtmlForValidUrls()
    {
        return $this->mBoardHtmlForValidUrls;
    }//getMBoardHtmlForValidUrls

    /** @return array URL => image hyperlinks found at that URL */
    public function getMHyperlinksInBoardValidUrls()
    {
        return $this->mHyperlinksInBoardValidUrls;
    }//getMHyperlinksInBoardValidUrls
}//FourChanBot

/*
 * Entry point: constructing the bot is enough to crawl the "wg" board,
 * because the constructor downloads every page and dumps the image
 * hyperlinks it finds.
 */
$bot = new FourChanBot('wg');

/*
 * Scratch calls kept for reference (some of these methods do not exist yet):
 * $bot->consumePage(1);
 * $bot->downloadResourcesAtPage(1);
 * var_dump($bot->buildAllValidBoardUrls());
 * var_dump($bot->getMBoardHtmlForValidUrls());
 */