Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
- //*** This script takes input from a radio show website, scrapes the links and info of its episodes,
- // then outputs an iTunes-compatible podcast .xml file (RSS feed)! /l、
- // (miau?) ゙(゚、 。 7
- // Notes: l、゙ ~ヽ
- // ---------- じしf_, )ノ
- // Whenever this script is accessed, it will save the .xml file to the server. Thus, to update
- // the feed, a link to this script must be queried (or run by a server cron job).
- // This script is optimized to scrape XRAY.FM radio show web-pages. Some tailoring of this code
- // may be necessary for your chosen site:
- // XRAY.FM sites consist of a parent page and broadcast/episode sub-pages. We will grab over-
- // arching info from the parent page, including links to broadcasts sub-pages, then grab data
- // from each sub-page as well.
- // If a file is not found, iTunes will ignore the rss data for that entry/item.
- //Debugger
- // require('php_error.php');
- // \php_error\reportErrors();
- //Variables
// ---------------------------------------------------------------------------
// Main script: scrape the show page and its broadcast sub-pages, build the
// RSS/iTunes feed, save it to feed.xml and print a short HTML report.
// ---------------------------------------------------------------------------

//Variables
$scrape_this_url = 'http://xray.fm/shows/gks'; //site to make into iTunes podcast feed
$baseurl = 'http://xray.fm'; //this is redundant, but needed to make the script work right
$owner = 'pops@sharklasers.com'; //email of the feed's owner
$tags_string = 'greasykidstuff,radio'; //iTunes search tags separated by commas
$chsummary = "This is a fan-made / unofficial podcast link to one of my favorite radio shows.";

// The user must also define the XPATH query strings and $currenturl below.
// Hint: Google Chrome has a built-in XPATH query tool in the developer
// tools section / inspect element section.
// Google Chrome XPATH query syntax: $x("XPATHQUERYHERE")

// Returns the first match of an xpath_on_url() result, or '' when the query
// matched nothing (the result may be null or an empty array in that case).
$first_match = function ($matches) {
    $matches = (array) $matches;
    return isset($matches[0]) ? $matches[0] : '';
};

//XPATH Queries - Parent page
$homepagecontents = file_get_contents($scrape_this_url); // Get main page of the radio show
if ($homepagecontents === false) {
    // Stop here so that a previously generated feed.xml is not overwritten
    // with an empty feed when the site is unreachable.
    exit('ERROR: could not download ' . htmlspecialchars($scrape_this_url, ENT_QUOTES, 'UTF-8'));
}

$maintitle = $first_match(xpath_on_url($homepagecontents, '//div[@class="content-center clearfix"]/h1/text()'));
$maindescription = $first_match(xpath_on_url($homepagecontents, '//div[@class="full-description clearfix"]'));

// Attribute matches come back serialized as ` src="..."`; strip the wrapper.
$mainimglink = str_replace(
    array(' src="', '"'),
    '',
    $first_match(xpath_on_url($homepagecontents, '//div[@class="info-container clearfix"]/a/img/@src'))
);

//Get links to broadcasts/sub-pages
// Note: There is no need to scrape any sub-pages found on the "next page" of the
//       parent page since streams are only kept up on XRAY.FM for two weeks anyway.
$broadcastlinks = (array) xpath_on_url($homepagecontents, '//div[@class="title"]/a[contains(@href, "broadcasts")]/@href');

//XPATH Queries - Sub-pages (broadcast pages)
$episodes = array(); // one entry per successfully downloaded broadcast page
foreach ($broadcastlinks as $rawlink) {
    // Matches look like ` href="/broadcasts/123"`; strip the wrapper, then make the link absolute.
    $trunclink = str_replace(array(' href="', '"'), '', $rawlink);
    $currenturl = $baseurl . $trunclink;

    $urlcontents = file_get_contents($currenturl); //grab all HTML data from sub-page link
    if ($urlcontents === false) {
        continue; // sub-page unreachable: leave this episode out of the feed
    }

    $episodes[] = array(
        'title'       => $first_match(xpath_on_url($urlcontents, '//div[@class="content-center clearfix"]/h1/text()')),
        'pubDate'     => $first_match(xpath_on_url($urlcontents, '//div[@class="date"]/text()')),
        // first list link on the page, again serialized as ` href="..."`
        'link'        => str_replace(
            array(' href="', '"'),
            '',
            $first_match(xpath_on_url($urlcontents, '//ul/li/a/@href'))
        ),
        'description' => $first_match(xpath_on_url($urlcontents, '//div[@class="tracks-container"]')),
    );
}

//Create Feed
$doc_rss = new_rss(new DOMDocument()); //create RSS tags
$feed = new_channel(new DOMDocument(), $maintitle, $maindescription, $scrape_this_url, $mainimglink, $owner, $tags_string, $chsummary); //create channel with info
$feed = doc_combine($doc_rss, $feed, 'rss', 'channel'); //wrap channel inside of RSS tags

foreach ($episodes as $episode) { //process each episode into feed
    //make an <item>...</item> XML for each episode
    $doc_item = new_item(new DOMDocument(), $episode['pubDate'], $episode['title'], $episode['link'], $episode['description'], $owner, $tags_string);
    $feed = doc_combine($feed, $doc_item, 'channel', 'item'); //insert that <item></> into <channel></>
}

//Export Feed to XML Document
$feed->formatOutput = true;
$saved = $feed->save("feed.xml"); // save first so the report below reflects what really happened

echo "<pre>";
if ($saved !== false) {
    echo "feed.xml UPDATED!<br>";
} else {
    echo "feed.xml could NOT be written!<br>";
}
echo "-----------------<br><br>";
// The report is HTML, so the XML must be escaped or the browser swallows the tags.
echo htmlspecialchars($feed->saveXML(), ENT_QUOTES, 'UTF-8');
echo "</pre>";
- // ========================================================================
- //Reference material
- // http://xray.fm/shows/gks
- // http://192.168.1.129:8080/print.php
- // https://www.youtube.com/watch?v=SIPGkrlM3R8
- //Functions
/**
 * Parses an HTML string and returns every node matched by an XPath query,
 * each serialized back to an HTML string.
 *
 * Despite the name, the first argument is the page *contents*, not a URL.
 * Attribute matches are serialized together with their name, which is why
 * callers strip wrappers such as ` href="` from the results.
 *
 * @param string|false $urlcontents HTML source to search (e.g. the result of file_get_contents()).
 * @param string       $query       XPath expression to evaluate.
 *
 * @return array List of serialized matches in document order; an empty array
 *               when the input is empty/unusable, the query is invalid, or
 *               nothing matches.
 */
function xpath_on_url($urlcontents, $query)
{
    $results = array();

    // A failed download (false) or an empty page cannot be parsed;
    // DOMDocument::loadHTML() rejects empty input.
    if (!is_string($urlcontents) || trim($urlcontents) === '') {
        return $results;
    }

    // Real-world HTML is rarely valid. Collect libxml's parse warnings
    // internally instead of silencing everything with "@", then restore the
    // previous error mode so other code is not affected.
    $previousErrorMode = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML($urlcontents);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    // XPATH scrape page's classes/tags/objects/etc
    $xpath = new DOMXPath($dom);
    $dom_node_list = $xpath->query($query);
    if ($dom_node_list === false) {
        return $results; // malformed XPath expression
    }

    // Convert the node list into a string array: copy every match into a
    // scratch document and serialize it from there.
    $temp_dom = new DOMDocument();
    foreach ($dom_node_list as $matched_node) {
        $imported_node = $temp_dom->importNode($matched_node, true);
        $results[] = $temp_dom->saveHTML($imported_node);
    }

    return $results;
}
/**
 * Builds the <channel> element of the podcast feed inside $doc.
 *
 * Resulting structure (in this order):
 *   <channel>
 *     <title>, <link>, <description>,
 *     <itunes:owner><itunes:email/></itunes:owner>,
 *     <itunes:explicit>no</itunes:explicit>,
 *     <itunes:image href="..."/>,
 *     <itunes:keywords>, <itunes:subtitle> (empty), <itunes:summary>,
 *     <itunes:category text="Music"/>
 *   </channel>
 *
 * All values are inserted as text nodes / attribute values so that special
 * characters (notably "&" in titles and URLs) are escaped correctly. Assigning
 * them through nodeValue makes libxml treat "&" as the start of an entity
 * reference and truncates the value.
 *
 * @param DOMDocument $doc                Document to write into; <channel> becomes a child of it.
 * @param string      $channeltitle       Show title.
 * @param string      $channeldescription Show description (may contain HTML markup; it is escaped).
 * @param string      $channelurl         URL of the show's web page.
 * @param string      $channelimgurl      URL of the show's artwork.
 * @param string      $owner              E-mail address of the feed owner.
 * @param string      $tags_string        Comma-separated iTunes keywords.
 * @param string      $chsummary          iTunes summary text.
 *
 * @return DOMDocument The same $doc, now containing the <channel> element.
 */
function new_channel($doc, $channeltitle, $channeldescription, $channelurl, $channelimgurl, $owner, $tags_string, $chsummary)
{
    // Appends <$name>$text</$name> to $parent and returns the new element.
    // An empty $text yields an empty element.
    $add_text_element = function ($parent, $name, $text) use ($doc) {
        $element = $doc->createElement($name);
        $text = (string) $text;
        if ($text !== '') {
            $element->appendChild($doc->createTextNode($text));
        }
        $parent->appendChild($element);
        return $element;
    };

    //<channel>
    $channel = $doc->createElement("channel");

    //<title>, <link>, <description>
    $add_text_element($channel, "title", $channeltitle);
    $add_text_element($channel, "link", $channelurl);
    $add_text_element($channel, "description", $channeldescription);

    //<itunes:owner><itunes:email>owner@example.com</itunes:email></itunes:owner>
    $itunes_owner = $add_text_element($channel, "itunes:owner", "");
    $add_text_element($itunes_owner, "itunes:email", $owner);

    //<itunes:explicit>no</itunes:explicit>
    $add_text_element($channel, "itunes:explicit", "no");

    //<itunes:image href="http://.../cover.jpg"/>
    $itunes_image = $add_text_element($channel, "itunes:image", "");
    $itunes_image->setAttribute("href", (string) $channelimgurl);

    //<itunes:keywords>tag1,tag2</itunes:keywords>
    $add_text_element($channel, "itunes:keywords", $tags_string);

    //<itunes:subtitle></itunes:subtitle> (intentionally left empty)
    $add_text_element($channel, "itunes:subtitle", "");

    //<itunes:summary>...</itunes:summary>
    $add_text_element($channel, "itunes:summary", $chsummary);

    //<itunes:category text="Music"/>
    $itunes_category = $add_text_element($channel, "itunes:category", "");
    $itunes_category->setAttribute("text", "Music");

    //</channel>
    $doc->appendChild($channel);

    return $doc;
}
/**
 * Builds one episode <item>, wrapped in a temporary <channel>, inside $doc.
 *
 * Resulting structure (in this order):
 *   <channel>
 *     <item>
 *       <pubDate>, <title>, <link>, <description>, <author>,
 *       <enclosure url="$link" length="0" type="audio/mpeg"/>,
 *       <itunes:explicit>no</itunes:explicit>,
 *       <itunes:subtitle> (empty), <itunes:author>,
 *       <itunes:summary> (empty), <itunes:keywords>
 *     </item>
 *   </channel>
 *
 * Values are inserted as text nodes / attribute values so that special
 * characters (e.g. "&" in an mp3 URL's query string) are escaped correctly
 * instead of being parsed as entity references.
 *
 * @param DOMDocument $doc         Document to write into.
 * @param string      $pubDate     Broadcast date as scraped. When strtotime() understands it, it is
 *                                 rewritten in the RFC 822 form RSS expects (using PHP's default
 *                                 timezone); otherwise the trimmed original text is used as-is.
 * @param string      $title       Episode title.
 * @param string      $link        URL of the episode's audio file; also used for the enclosure.
 * @param string      $description Episode description (may contain HTML markup; it is escaped).
 * @param string      $owner       E-mail address used for <author> and <itunes:author>.
 * @param string      $tags_string Comma-separated iTunes keywords.
 *
 * @return DOMDocument The same $doc, now containing <channel><item>...</item></channel>.
 */
function new_item($doc, $pubDate, $title, $link, $description, $owner, $tags_string)
{
    // Appends <$name>$text</$name> to $parent and returns the new element.
    // An empty $text yields an empty element.
    $add_text_element = function ($parent, $name, $text) use ($doc) {
        $element = $doc->createElement($name);
        $text = (string) $text;
        if ($text !== '') {
            $element->appendChild($doc->createTextNode($text));
        }
        $parent->appendChild($element);
        return $element;
    };

    //<channel><item>
    $t_channel = $doc->createElement("channel");
    $t_item = $doc->createElement("item");
    $t_channel->appendChild($t_item);

    //<pubDate>Sat, 07 Jun 2014 10:00:00 +0000</pubDate>
    $raw_date = trim((string) $pubDate);
    $timestamp = ($raw_date === '') ? false : strtotime($raw_date);
    $add_text_element($t_item, "pubDate", ($timestamp === false) ? $raw_date : date(DATE_RSS, $timestamp));

    //<title>, <link>, <description>
    $add_text_element($t_item, "title", $title);
    $add_text_element($t_item, "link", $link);
    $add_text_element($t_item, "description", $description);

    //<author>owner@example.com</author>
    $add_text_element($t_item, "author", $owner);

    //<enclosure url="$link" length="0" type="audio/mpeg" />
    // RSS requires a length attribute; the real file size is not known here,
    // and 0 is the conventional value for "unknown".
    $t_enclosure = $add_text_element($t_item, "enclosure", "");
    $t_enclosure->setAttribute("url", (string) $link);
    $t_enclosure->setAttribute("length", "0");
    $t_enclosure->setAttribute("type", "audio/mpeg");

    //<itunes:explicit>no</itunes:explicit>
    $add_text_element($t_item, "itunes:explicit", "no");

    //<itunes:subtitle></itunes:subtitle> (intentionally left empty)
    $add_text_element($t_item, "itunes:subtitle", "");

    //<itunes:author>owner@example.com</itunes:author>
    $add_text_element($t_item, "itunes:author", $owner);

    //<itunes:summary></itunes:summary> (intentionally left empty)
    $add_text_element($t_item, "itunes:summary", "");

    //<itunes:keywords>tag1,tag2</itunes:keywords>
    $add_text_element($t_item, "itunes:keywords", $tags_string);

    //</item></channel>
    $doc->appendChild($t_channel);

    return $doc;
}
/**
 * Adds the feed's root element to $doc:
 *   <rss xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" version="2.0">
 *
 * @param DOMDocument $doc Document to write into.
 *
 * @return DOMDocument The same $doc, now containing the <rss> root element.
 */
function new_rss($doc)
{
    $root = $doc->createElement("rss");

    // Attributes of the root element, in the order they are attached.
    $root_attributes = array(
        "xmlns:itunes" => "http://www.itunes.com/dtds/podcast-1.0.dtd",
        "version"      => '2.0',
    );

    foreach ($root_attributes as $attribute_name => $attribute_value) {
        $attribute = $doc->createAttribute($attribute_name);
        $attribute->nodeValue = $attribute_value;
        $root->appendChild($attribute);
    }

    $doc->appendChild($root);

    return $doc;
}
/**
 * Copies every <$innertagname> element of $doc2 into the first <$tagname>
 * element of $doc1 (e.g. every <item> of an episode document into the
 * <channel> of the feed).
 *
 * The elements are deep-copied; $doc2 itself is left untouched.
 *
 * See:
 *   http://stackoverflow.com/questions/10163675/merge-xml-files-in-php
 *   http://stackoverflow.com/questions/3091287/copy-domnodes-from-one-domdocument-to-another
 *
 * @param DOMDocument $doc1         Destination document (modified in place).
 * @param DOMDocument $doc2         Source document.
 * @param string      $tagname      Tag in $doc1 that receives the copies; its first occurrence is used.
 * @param string      $innertagname Tag in $doc2 whose elements are copied.
 *
 * @return DOMDocument $doc1 with the copied elements appended.
 *
 * @throws InvalidArgumentException When $doc1 contains no <$tagname> element.
 */
function doc_combine($doc1, $doc2, $tagname, $innertagname)
{
    // Find the receiving element, e.g. "<channel>", in doc 1.
    $target = $doc1->getElementsByTagName($tagname)->item(0);
    if ($target === null) {
        throw new InvalidArgumentException(
            'doc_combine(): destination document has no <' . $tagname . '> element'
        );
    }

    // Take a snapshot of the matches before appending anything: node lists
    // are live, so appending while iterating could never terminate if both
    // arguments happen to be the same document.
    $sources = array();
    foreach ($doc2->getElementsByTagName($innertagname) as $source_node) {
        $sources[] = $source_node;
    }

    foreach ($sources as $source_node) {
        // A node belongs to the document that created it, so it has to be
        // imported (deep copy) into doc 1 before it can be attached there.
        $target->appendChild($doc1->importNode($source_node, true));
    }

    return $doc1;
}
- // d8888aa,_ a8888888a __a88888888b
- // d8P `Y88ba. a8P'~~~~Y88a888P""~~~~Y88b
- // d8P ~"Y88a____aaaaa_____a8P 888 Y88
- // d8P ~Y88"8~~~~~~~88888P 88g 88
- // d8P 88 ____ _88y__ 88b
- // 88 a88 _a88~8888"8M88a_____888
- // 88 88P 88 a8"' `888888888b_
- // a8P 88 88 a88 88b Y8,
- // 8b 88 8888P 388 88b
- // a88a Y8b 88L 8888. 88P
- // a8P Y8_ _888 _a8P 88 a88
- // _8P ~Y88a888~888g_ a888yg8' a88'
- // 88 ~~~~ ~""8888 a88P
- // d8' Y8, 888L
- // 8E 88a___a8"888
- // d8P ~Y888" 88L
- // 88 ~~ 88
- // 88 88
- // 88 88b
- // ____88a_. a8a __881
- // 88""P~888 888b __ 8888888888
- // 888 888P d88b 88
- // _888ba ~ aaaa. 8888 d8P
- // a888~"Y88 888888 "8P 8aa888_
- // Y8b Y888P" 88""888a
- // _88g8 ~~~ a88 ~~
- // __a8"888_ a_ a88
- // 88"' "88g "888g_
- // ~ `88a_ _a88'"Y88gg,
- // "888aa_. _a88"' ~88
- // ~~""8888aaa______ ____a888P'
- // ~~""""""888888888888888888""~~~
- // ~~~~~~~~~~~~
- //mmmyow
- ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement