Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
/**
 * Small collection of static helpers: calendar arithmetic, downloading a URL
 * through cURL, and extracting / filtering the hyperlinks found in HTML.
 */
class AmUtil
{
    /** Returned by numberOfDaysInMonth() when the month is not one of 1..12. */
    const IMPOSSIBLE_MONTH = -1;

    /** User-agent string sent with every request made by consumeUrl(). */
    const BOT_SIGNATURE = "For educational tests only";

    /** Key under which a hyperlink pair stores the value of its href attribute. */
    const KEY_HREF = "HREF";

    /** Key under which a hyperlink pair stores its (trimmed) anchor text. */
    const KEY_ANCHOR = "ANCHOR";

    /** Href endings used to recognise hyperlinks that point to image files. */
    const IMAGE_FILTERS = [
        ".jpg", ".png", ".jp2", ".gif",
        ".gifv", ".bmp", ".svg"
    ];

    /**
     * Tells whether a year is a leap year in the Gregorian calendar.
     *
     * @param int $pY the year
     * @return bool true when the year is divisible by 400, or divisible by 4
     *              but not by 100
     */
    public static function leapYear(
        $pY
    ): bool
    {
        if ($pY % 400 === 0) {
            return true;
        }

        return $pY % 4 === 0 && $pY % 100 !== 0;
    }//leapYear

    /**
     * Number of days in a month of a given year.
     *
     * @param int $pY the year (only relevant for February)
     * @param int $pM the month, 1 (January) to 12 (December)
     * @return int 28, 29, 30 or 31; self::IMPOSSIBLE_MONTH for any other month
     */
    public static function numberOfDaysInMonth(
        $pY,
        $pM
    ): int
    {
        // switch (loose comparison) is kept on purpose so that numeric strings
        // such as "2" keep being accepted as months
        switch ($pM) {
            case 1:
            case 3:
            case 5:
            case 7:
            case 8:
            case 10:
            case 12:
                return 31;
            case 4:
            case 6:
            case 9:
            case 11:
                return 30;
            case 2:
                return self::leapYear($pY) ? 29 : 28;
            default:
                return self::IMPOSSIBLE_MONTH;
        }//switch
    }//numberOfDaysInMonth

    /**
     * Downloads whatever is available at a URL using an HTTP GET request.
     *
     * @param string $pUrl the address to consume; can be an HTML page, a JPG, ...
     * @return string|false the bytes received, or false when the cURL handle
     *                      could not be created or the transfer failed
     */
    public static function consumeUrl(
        $pUrl //can be an HTML page, can be a JPG, ...
    ){
        $ch = curl_init($pUrl);
        if ($ch === false) {
            return false;
        }

        curl_setopt_array($ch, [
            // makes it explicit that the request will happen using HTTP GET
            CURLOPT_HTTPGET => true,
            // keeps the verification of the peer's SSL certificate explicitly
            // ENABLED; cURL needs access to a CA bundle (e.g. cacert.pem)
            CURLOPT_SSL_VERIFYPEER => true,
            // sets a user agent string for our software
            CURLOPT_USERAGENT => self::BOT_SIGNATURE,
            // curl_exec will return the data consumed at the URL
            // instead of just true/false
            CURLOPT_RETURNTRANSFER => true,
            // empty string: automatic handling of every supported encoding
            CURLOPT_ENCODING => ""
        ]);
        // CURLOPT_BINARYTRANSFER is intentionally not set: it is deprecated
        // and has no effect, CURLOPT_RETURNTRANSFER already returns raw bytes

        return curl_exec($ch);
    }//consumeUrl

    /**
     * Receives HTML source code and returns a collection of all "a" elements
     * found, each one structured as a pair keyed by self::KEY_HREF and
     * self::KEY_ANCHOR (both values are trimmed).
     *
     * E.g. if this is the input:
     *   <html><body><a href="URL1">anchor1</a></body></html>
     * the output is:
     *   [
     *     [ "HREF" => "URL1", "ANCHOR" => "anchor1" ]
     *   ]
     *
     * @param string $pStrHtmlSourceCode the HTML to parse; may be malformed
     * @return array list of pairs, empty when the input is empty, cannot be
     *               parsed or contains no "a" elements
     */
    public static function extractHyperlinksFromHtmlSourceCode(
        string $pStrHtmlSourceCode
    ): array
    {
        $aRet = []; //the collection of all "a" elements found

        // DOMDocument::loadHTML rejects an empty source (ValueError on PHP 8),
        // and there is nothing to extract from it anyway
        if ($pStrHtmlSourceCode === "") {
            return $aRet;
        }

        $oDom = new DOMDocument();

        // real-world HTML is rarely valid: collect the parser complaints
        // internally instead of emitting warnings, then restore the setting
        $bPreviousSetting = libxml_use_internal_errors(true);
        $bLoaded = $oDom->loadHTML($pStrHtmlSourceCode);
        libxml_clear_errors();
        libxml_use_internal_errors($bPreviousSetting);

        if ($bLoaded === false) {
            return $aRet;
        }

        foreach ($oDom->getElementsByTagName('a') as $someAElement) {
            $aRet[] = [
                self::KEY_HREF => trim($someAElement->getAttribute('href')),
                self::KEY_ANCHOR => trim($someAElement->nodeValue)
            ];
        }//foreach

        return $aRet;
    }//extractHyperlinksFromHtmlSourceCode

    /**
     * Tool to filter hyperlinks, keeping only those whose href ends in one of
     * the given terminations (compared case-insensitively). Kept hrefs that
     * start with "//" (schema missing) are prefixed with
     * $pStrURLPrefixIfSchemaIsMissing.
     *
     * E.g. with the filters [".jpg"], the input
     *   [
     *     ["ANCHOR" => "?",   "HREF" => ".xpto"],
     *     ["ANCHOR" => "pic", "HREF" => "//host/bla.jpg"]
     *   ]
     * produces
     *   [
     *     ["ANCHOR" => "pic", "HREF" => "https://host/bla.jpg"]
     *   ]
     *
     * @param array  $paHyperlinksAsPairsAnchorsHref pairs as built by
     *               extractHyperlinksFromHtmlSourceCode()
     * @param array  $paFilters accepted href endings; when empty, the input is
     *               returned untouched
     * @param string $pStrURLPrefixIfSchemaIsMissing prepended to "//host/..." hrefs
     * @return array the pairs that passed the filter; pairs without a string
     *               href are skipped
     */
    public static function filterHyperlinksKeepingOnlyThoseWithHrefsEndingIn(
        $paHyperlinksAsPairsAnchorsHref,
        $paFilters = [], //no filters, by default!
        $pStrURLPrefixIfSchemaIsMissing = "https:"
    )
    {
        $bShouldDoNothing = is_array($paFilters) && count($paFilters) === 0;
        if ($bShouldDoNothing) {
            return $paHyperlinksAsPairsAnchorsHref;
        }

        //if there are filters
        $aRet = [];
        foreach ($paHyperlinksAsPairsAnchorsHref as $aPair) {
            // a pair without a usable href can never match a filter
            if (!isset($aPair[self::KEY_HREF]) || !is_string($aPair[self::KEY_HREF])) {
                continue;
            }

            $strHref = $aPair[self::KEY_HREF];
            if (!self::stringEndsInOneOfTheFollowing($strHref, $paFilters)) {
                continue;
            }

            $bUrlIsMissingSchema = strpos($strHref, "//") === 0;
            if ($bUrlIsMissingSchema) {
                $aPair[self::KEY_HREF] = $pStrURLPrefixIfSchemaIsMissing . $strHref;
            }

            $aRet[] = $aPair;
        }//foreach

        return $aRet;
    }//filterHyperlinksKeepingOnlyThoseWithHrefsEndingIn

    /**
     * Tells whether a string ends in at least one of the given terminations.
     *
     * stringEndsInOneOfTheFollowing ("Artur", ["ab", "r"]) => true
     * stringEndsInOneOfTheFollowing ("pic.png", [".png", "jpg"]) => true
     * Case INSENSITIVE by default!
     *
     * @param string $pStr the string to check
     * @param array  $paTerminations candidate endings; an empty termination
     *               matches every string
     * @param bool   $pbCaseInsensitive false to compare case-sensitively
     * @return bool true as soon as one termination matches, false otherwise
     */
    public static function stringEndsInOneOfTheFollowing(
        string $pStr,
        array $paTerminations,
        bool $pbCaseInsensitive = true
    ): bool
    {
        $iStrLength = strlen($pStr);

        foreach ($paTerminations as $someTermination) {
            $iTerminationLength = strlen($someTermination);

            if ($iTerminationLength === 0) {
                return true; //every string ends in the empty string
            }
            if ($iTerminationLength > $iStrLength) {
                continue; //too long to possibly be an ending of $pStr
            }

            // compares only the last $iTerminationLength bytes of $pStr
            $iComparison = substr_compare(
                $pStr,
                $someTermination,
                -$iTerminationLength,
                $iTerminationLength,
                $pbCaseInsensitive
            );
            if ($iComparison === 0) {
                return true;
            }
        }//foreach

        return false;
    }//stringEndsInOneOfTheFollowing
}//AmUtil
- //**
- <?php
- /*
- * testing with
- * https://boards.4chan.org/wg/
- * https://boards.4chan.org/wg/2
- * ...
- * https://boards.4chan.org/wg/10
- * 404 for .../1 and for URLs ending in numbers >10
- */
- require_once "AmUtil.php";
/**
 * Bot that consumes every page of one 4chan board and collects, per page,
 * the hyperlinks that point to image files.
 *
 * All the work happens in the constructor: the page URLs are built, each
 * page is downloaded and its hyperlinks are extracted and filtered.
 */
class FourChanBot
{
    const BASE_URL = "https://boards.4chan.org";
    const MIN_PAGE = 1;
    const MAX_PAGE = 10;

    //data members
    private $mBoardName; //e.g. "wg"

    //e.g. ["https://.../wg/" ... "https://.../wg/10"]
    private $mBoardValidUrls = [];

    //e.g. [
    //  "https://.../wg/" => "<html>...</html>"
    //  , ...
    //  "https://.../wg/10" => "<html>... <a href="i1.jpg">...</a></html>"
    //]
    //the value is false for a page that could not be downloaded
    private $mBoardHtmlForValidUrls = [];

    //e.g. [ ..."https://.../wg/10" => [["ANCHOR"=>"??", "HREF"=>"https:..."], ...] ]
    private $mHyperlinksInBoardValidUrls = [];

    /**
     * @param string $pStrBoardName the board to consume, e.g. "wg"
     */
    public function __construct(
        string $pStrBoardName
    )
    {
        $this->mBoardName = $pStrBoardName;

        //this costs no time
        $this->mBoardValidUrls = $this->buildAllValidBoardUrls();

        //this can take time: it will consume each and all of the board's pages
        //the method returns null, but it fills the data member with the proper values
        $this->buildHtmlOfAllBoardPages();

        $this->buildHyperlinksForBoardValidUrls();
    }//__construct

    /**
     * Extracts the image hyperlinks of every downloaded page, printing the
     * URL being handled and a dump of the hyperlinks kept for it.
     *
     * @return array map of page URL => list of hyperlink pairs; a page whose
     *               download failed (or came back empty) maps to an empty list
     */
    private function buildHyperlinksForBoardValidUrls()
    {
        foreach ($this->mBoardHtmlForValidUrls as $url => $html) {
            echo "current URL: $url".PHP_EOL;

            $asForImages = [];
            // consumeUrl answers false on failure: there is no HTML to parse then
            if (is_string($html) && $html !== "") {
                $as = AmUtil::extractHyperlinksFromHtmlSourceCode($html);
                $asForImages = AmUtil::filterHyperlinksKeepingOnlyThoseWithHrefsEndingIn(
                    $as,
                    AmUtil::IMAGE_FILTERS
                );
            }

            var_dump($asForImages);
            $this->mHyperlinksInBoardValidUrls[$url] = $asForImages;
        }//foreach

        return $this->mHyperlinksInBoardValidUrls;
    }//buildHyperlinksForBoardValidUrls

    /**
     * Builds the URL of every page of the board, from MIN_PAGE to MAX_PAGE.
     *
     * @return array list of URLs, e.g. ["https://.../wg/", "https://.../wg/2", ...]
     */
    public function buildAllValidBoardUrls() : array
    {
        $aRet = [];
        for (
            $iPage = self::MIN_PAGE;
            $iPage <= self::MAX_PAGE;
            $iPage++
        ) {
            $aRet[] = sprintf(
                "%s/%s/%s",
                self::BASE_URL,
                $this->mBoardName,
                // the first page has no number: ".../wg/1" answers 404
                $iPage === 1 ? "" : $iPage
            );
        }//for

        return $aRet;
    }//buildAllValidBoardUrls

    /**
     * Downloads every valid board URL, storing the result in the data member
     * that maps each URL to its HTML, e.g.
     *   [ "https://boards.4chan.org/wg/2" => "<html>...</html>" ]
     * Returns nothing.
     */
    public function buildHtmlOfAllBoardPages()
    {
        foreach ($this->mBoardValidUrls as $strOneValidUrl) {
            $this->mBoardHtmlForValidUrls[$strOneValidUrl] =
                AmUtil::consumeUrl($strOneValidUrl);
        }//foreach
    }//buildHtmlOfAllBoardPages

    /** @return string the board name received at construction time */
    public function getMBoardName()
    {
        return $this->mBoardName;
    }

    /** @return array the list of page URLs of the board */
    public function getMBoardValidUrls()
    {
        return $this->mBoardValidUrls;
    }

    /** @return array map of page URL => downloaded HTML (false on failure) */
    public function getMBoardHtmlForValidUrls()
    {
        return $this->mBoardHtmlForValidUrls;
    }

    /** @return array map of page URL => list of image hyperlink pairs */
    public function getMHyperlinksInBoardValidUrls()
    {
        return $this->mHyperlinksInBoardValidUrls;
    }
}//FourChanBot
// Script entry point: constructing the bot consumes every page of the "wg"
// board and prints, per page, the image hyperlinks that were found.
$bot = new FourChanBot("wg");
Advertisement
Add Comment
Please, Sign In to add comment