am_dot_com

ACA 20201126

Nov 26th, 2020 (edited)
131
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 10.79 KB | None | 0 0
  1. <?php
  2.  
  /*
   * AmUtil
   * Stateless toolbox of static helpers:
   *  - calendar arithmetic (leap years, days in a month)
   *  - downloading the resource behind a URL with cURL
   *  - extracting and filtering the hyperlinks of an HTML document
   */
  class AmUtil{
      //returned by numberOfDaysInMonth when the month is not in 1..12
      const IMPOSSIBLE_MONTH = -1;
      //user agent string sent with every request made by consumeUrl
      const BOT_SIGNATURE = "For educational tests only";

      //keys of every hyperlink pair built by extractHyperlinksFromHtmlSourceCode
      const KEY_HREF = "HREF";
      const KEY_ANCHOR = "ANCHOR";

      //href endings accepted as "image" by callers that filter hyperlinks
      const IMAGE_FILTERS = [
          ".jpg", ".png", ".jp2", ".gif",
          ".gifv", ".bmp", ".svg"
      ];

      /*
       * Gregorian leap year rule: divisible by 400,
       * or divisible by 4 but not by 100.
       *
       * $pY - the year, as an integer
       * returns true for leap years, false otherwise
       */
      public static function leapYear(
          $pY
      ){
          return ($pY%400 === 0) || ($pY%4 === 0 && $pY%100 !== 0);
      }//leapYear

      /*
       * $pY - the year (only relevant for February)
       * $pM - the month, 1 (January) to 12 (December)
       * returns the number of days in that month,
       * or self::IMPOSSIBLE_MONTH when $pM is not a valid month
       */
      public static function numberOfDaysInMonth(
          $pY,
          $pM
      ){
          switch($pM){
              case 1: case 3: case 5: case 7: case 8: case 10: case 12:
                  return 31;
              case 4: case 6: case 9: case 11:
                  return 30;
              case 2:
                  return self::leapYear($pY) ? 29 : 28;
              default:
                  return self::IMPOSSIBLE_MONTH;
          }//switch
      }//numberOfDaysInMonth

      /*
       * Downloads, via HTTP GET, whatever lives at the given URL.
       *
       * $pUrl - can be an HTML page, can be a JPG, ...
       * returns the bytes received, as a string,
       * or false when the handle could not be created or configured,
       * or when the transfer itself failed
       */
      public static function consumeUrl(
          $pUrl //can be an HTML page, can be a JPG, ...
      ){
          $ch = curl_init($pUrl);
          if (!$ch){
              return false;
          }

          $bConfigured = curl_setopt_array(
              $ch,
              [
                  //makes it explicit that the request will happen using HTTP GET
                  CURLOPT_HTTPGET => true,

                  //keeps the verification of the peer's SSL certificate ON;
                  //it would have to be false to work without a CA bundle
                  //(cacert.pem), at the cost of accepting any certificate
                  CURLOPT_SSL_VERIFYPEER => true,

                  //sets a user agent string for our software
                  CURLOPT_USERAGENT => self::BOT_SIGNATURE,

                  //curl_exec will return the data consumed at the URL
                  //instead of just true/false
                  CURLOPT_RETURNTRANSFER => true,

                  //"" = accept every encoding cURL supports and decode
                  //the response automatically
                  CURLOPT_ENCODING => ""

                  //CURLOPT_BINARYTRANSFER is intentionally not set: it has
                  //no effect (RETURNTRANSFER always returns the raw bytes)
                  //and the constant is deprecated
              ]
          );

          if (!$bConfigured){
              return false;
          }

          return curl_exec($ch);
      }//consumeUrl

      /*
       * receives HTML source code
       * returns a collection of all "a" elements found,
       * structured as pairs keyed by self::KEY_HREF and self::KEY_ANCHOR
       *
       * E.g.
       * if this is the input:
       * <html><body><a href="URL1">anchor1</a></body></html>
       * the output should be:
       * [
       *  [ "HREF" => "URL1", "ANCHOR" => "anchor1" ]
       * ]
       *
       * An empty input yields an empty collection.
       */
      public static function extractHyperlinksFromHtmlSourceCode(
          string $pStrHtmlSourceCode
      ) /*: array */
      {
          $aRet = []; //the collection of all "a" elements found

          //DOMDocument::loadHTML rejects an empty document (it throws on
          //PHP 8, where the @ silencer can not help), so answer right away
          if ($pStrHtmlSourceCode === ''){
              return $aRet;
          }

          //real world HTML is rarely well formed: collect the parser
          //complaints internally instead of emitting them as warnings
          $bPreviousLibxmlSetting = libxml_use_internal_errors(true);
          try{
              $oDom = new DOMDocument();
              $bLoaded = $oDom->loadHTML($pStrHtmlSourceCode);
          }
          finally{
              //leave libxml exactly as we found it
              libxml_clear_errors();
              libxml_use_internal_errors($bPreviousLibxmlSetting);
          }

          if (!$bLoaded){
              return $aRet;
          }

          foreach ($oDom->getElementsByTagName('a') as $someAElement){
              $aRet[] = [
                  self::KEY_HREF => trim($someAElement->getAttribute('href')),
                  self::KEY_ANCHOR => trim($someAElement->nodeValue)
              ];
          }//foreach

          return $aRet;
      }//extractHyperlinksFromHtmlSourceCode

      /*
       * tool to filter Hyperlinks,
       * keeping only those with certain href endings
       * e.g.
       * input [
       *  ["HREF"=>".xpto", "ANCHOR"=>?],
       *  ["HREF"=>"//host/bla.jpg", "ANCHOR"=>"pic"]
       * ]
       * filters [".jpg"]
       * output [
       *  ["HREF"=>"https://host/bla.jpg", "ANCHOR"=>"pic"]
       * ]
       *
       * $paHyperlinksAsPairsAnchorsHref - pairs, as built by
       *  extractHyperlinksFromHtmlSourceCode
       * $paFilters - accepted href endings (case insensitive);
       *  an empty array means "no filtering": the input is returned as is
       * $pStrURLPrefixIfSchemaIsMissing - prepended to the kept hrefs
       *  that start with "//" (no schema)
       * returns the pairs that were kept
       */
      public static function
      filterHyperlinksKeepingOnlyThoseWithHrefsEndingIn(
          $paHyperlinksAsPairsAnchorsHref,
          $paFilters = [], //no filters, by default!
          $pStrURLPrefixIfSchemaIsMissing = "https:"
      )
      {
          //no filters: nothing to do
          if ($paFilters === []){
              return $paHyperlinksAsPairsAnchorsHref;
          }

          $aRet = [];
          foreach ($paHyperlinksAsPairsAnchorsHref as $aPair){
              $strHref = $aPair[self::KEY_HREF];

              $bKeep = self::stringEndsInOneOfTheFollowing(
                  $strHref,
                  $paFilters
              );
              if (!$bKeep){
                  continue;
              }

              //"//host/path" has no schema: complete it
              $bUrlIsMissingSchema = strncmp($strHref, "//", 2) === 0;
              if ($bUrlIsMissingSchema){
                  $aPair[self::KEY_HREF] =
                      $pStrURLPrefixIfSchemaIsMissing.$strHref;
              }

              $aRet[] = $aPair;
          }//foreach

          return $aRet;
      }//filterHyperlinksKeepingOnlyThoseWithHrefsEndingIn

      /*
       * stringEndsInOneOfTheFollowing ("Artur", ["ab", "r"]) => true
       * stringEndsInOneOfTheFollowing ("pic.png", [".png", "jpg"]) => true
       * stringEndsInOneOfTheFollowing ("pic.PNG", [".png"], false) => false
       *
       * $pStr - the string to test
       * $paTerminations - the candidate endings
       * $pbCaseInsensitive - case INSENSITIVE by default!
       * returns true as soon as one termination matches the end of $pStr,
       * false when none does; an empty termination always matches
       */
      public static function stringEndsInOneOfTheFollowing(
          string $pStr,
          array $paTerminations,
          bool $pbCaseInsensitive = true
      ){
          $iLength = strlen($pStr);

          foreach($paTerminations as $someTermination){
              $strTermination = (string)$someTermination;
              $iTerminationLength = strlen($strTermination);

              //every string ends in the empty string
              //(and substr($pStr, -0) would NOT return an empty tail)
              if ($iTerminationLength === 0){
                  return true;
              }

              //a termination longer than the string can never match
              if ($iTerminationLength > $iLength){
                  continue;
              }

              $strTail = substr($pStr, -$iTerminationLength);
              $bExactlyAtTheEnd = $pbCaseInsensitive ?
                  strcasecmp($strTail, $strTermination) === 0
                  :
                  $strTail === $strTermination;

              if ($bExactlyAtTheEnd) return true;
          }//foreach

          return false;
      }//stringEndsInOneOfTheFollowing
  }//AmUtil
  242.  
  243. //**
  244.  
  245. <?php
  246.  
  247. /*
  248.  * testing with
  249.  * https://boards.4chan.org/wg/
  250.  * https://boards.4chan.org/wg/2
  251.  * ...
  252.  * https://boards.4chan.org/wg/10
  253.  * 404 for .../1 and for URLs ending in numbers >10
  254.  */
  255.  
  256. require_once "AmUtil.php";
  257.  
  /*
   * FourChanBot
   * Consumes every page of one 4chan board and collects, per page,
   * the hyperlinks that point to images.
   *
   * All the work happens at construction time: the page URLs are built,
   * each page is downloaded, and the image hyperlinks found in each page
   * are extracted (and dumped to the output).
   */
  class FourChanBot {
      const BASE_URL = "https://boards.4chan.org";
      const MIN_PAGE = 1;
      const MAX_PAGE = 10;

      //data members
      private $mBoardName; //e.g. "wg"

      //e.g. ["https://.../wg/" ... "https://.../wg/10"]
      private $mBoardValidUrls = [];

      /*
       * e.g. [
       *  "https://.../wg/" => "<html>...</html>"
       *  , ...
       *  "https://.../wg/10" => "<html>... <a href="i1.jpg">...</a></html>"
       * ]
       * the value is false for a page that could not be downloaded
       */
      private $mBoardHtmlForValidUrls = [];

      /*
       * [ ..."https://.../wg/10" => [["ANCHOR"=>"??", "HREF"=>"https:..."], ...]
       * the value is an empty array for a page that could not be downloaded
       */
      private $mHyperlinksInBoardValidUrls = [];

      /*
       * $pStrBoardName - the name of the board, e.g. "wg"
       */
      public function __construct(
          string $pStrBoardName
      )
      {
          $this->mBoardName = $pStrBoardName;

          //this costs no time
          $this->mBoardValidUrls =
              $this->buildAllValidBoardUrls();

          //this can take time: it will consume each and all of the board's pages
          //the method returns null, but it builds the data member with the proper values
          $this->buildHtmlOfAllBoardPages();

          $this->buildHyperlinksForBoardValidUrls();
      }//__construct

      /*
       * For every downloaded page, extracts its hyperlinks, keeps only
       * those whose href ends like an image (AmUtil::IMAGE_FILTERS) and
       * stores them in mHyperlinksInBoardValidUrls, keyed by page URL.
       * Echoes the URL being processed and dumps the links kept for it.
       *
       * A page whose download failed (no HTML, or empty HTML) gets an
       * empty list, instead of being handed to the HTML parser.
       *
       * returns the complete URL => hyperlinks map
       */
      private function buildHyperlinksForBoardValidUrls(){
          foreach ($this->mBoardHtmlForValidUrls as $url=>$html){
              echo "current URL: $url".PHP_EOL;

              $asForImages = [];

              //AmUtil::consumeUrl answers false on failure: nothing to parse
              $bThereIsHtmlToParse = is_string($html) && $html !== '';
              if ($bThereIsHtmlToParse){
                  $as = AmUtil::extractHyperlinksFromHtmlSourceCode($html);

                  $asForImages = AmUtil::filterHyperlinksKeepingOnlyThoseWithHrefsEndingIn(
                      $as,
                      AmUtil::IMAGE_FILTERS
                  );
              }//if

              var_dump($asForImages);

              $this->mHyperlinksInBoardValidUrls[$url] = $asForImages;
          }//foreach
          return $this->mHyperlinksInBoardValidUrls;
      }//buildHyperlinksForBoardValidUrls

      /*
       * Builds the URLs of pages MIN_PAGE..MAX_PAGE of the board.
       * Page 1 is the board's root URL (with no page number); every
       * other page ends in its own number.
       *
       * returns e.g. ["https://.../wg/", "https://.../wg/2", ...]
       */
      public function buildAllValidBoardUrls() : array
      {
          $aRet = [];

          for(
              $iPage=self::MIN_PAGE;
              $iPage<=self::MAX_PAGE;
              $iPage++
          ){
              $aRet[] = sprintf(
                  "%s/%s/%s",
                  self::BASE_URL,
                  $this->mBoardName,
                  $iPage===1 ? "" : $iPage
              );
          }//for

          return $aRet;
      }//buildAllValidBoardUrls

      /*
       * Downloads every URL in mBoardValidUrls and stores what was
       * received in mBoardHtmlForValidUrls, keyed by URL, e.g.
       * [
       *  "https://boards.4chan.org/wg/2" => "<html>...</html>"
       * ]
       * Returns nothing: the result is available via
       * getMBoardHtmlForValidUrls.
       */
      public function buildHtmlOfAllBoardPages(){
          foreach($this->mBoardValidUrls as $strOneValidUrl){
              $this->mBoardHtmlForValidUrls[$strOneValidUrl] =
                  AmUtil::consumeUrl($strOneValidUrl);
          }//foreach
      }//buildHtmlOfAllBoardPages

      //returns the board name, e.g. "wg"
      public function getMBoardName()
      {
          return $this->mBoardName;
      }

      //returns the list of page URLs of the board
      public function getMBoardValidUrls()
      {
          return $this->mBoardValidUrls;
      }

      //returns the map page URL => downloaded HTML (false for a failed download)
      public function getMBoardHtmlForValidUrls()
      {
          return $this->mBoardHtmlForValidUrls;
      }

      //returns the map page URL => image hyperlinks found in that page
      public function getMHyperlinksInBoardValidUrls()
      {
          return $this->mHyperlinksInBoardValidUrls;
      }
  }//FourChanBot
  370.  
  // Entry point of the script. Constructing the bot is what does the work:
  // the constructor builds the page URLs of the "wg" board, downloads each
  // page and dumps the image hyperlinks found in it.
  $bot = new FourChanBot("wg");

  // Earlier experiments, kept for reference. consumePage and
  // downloadResourcesAtPage are not defined by the FourChanBot class above.
  //$bot->consumePage(1);
  //$bot->downloadResourcesAtPage(1);
  //$allValidUrls = $bot->buildAllValidBoardUrls();
  //var_dump($allValidUrls);

  //var_dump($bot->getMBoardHtmlForValidUrls());
  378.  
  379.  
  380.  
Advertisement
Add Comment
Please, Sign In to add comment