Advertisement
Guest User

XRAY.FM-to-Podcast-er

a guest
Aug 23rd, 2014
296
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 16.75 KB | None | 0 0
  1. <?php
  2. //*** This script takes input from a radio show website, scrapes the links and info of its episodes,
  3. //       then outputs an iTunes-compatible podcast .xml file (RSS feed)!                    /l、
  4. //                                                                               (miau?)  ゙(゚、 。 7
  5. //    Notes:                                                                               l、゙ ~ヽ
  6. //    ----------                                                                           じしf_, )ノ
  7. //    Whenever this script is accessed, it will save the .xml file to the server.  Thus, to update
  8. //       the feed, a link to this script must be queried (or run by a server cron).
  9. //    This script is optimized to scrape XRAY.FM radio show web-pages.  Some tailoring of this code
  10. //       may be necessary for your chosen site:
  11. //    XRAY.FM sites consist of a parent page and broadcast/episode sub-pages.  We will grab over-
  12. //       arching info from the parent page, including links to broadcasts sub-pages, then grab data
  13. //       from each sub-page as well.
  14. //    If a file is not found, iTunes will ignore the rss data for that entry/item.
  15.  
  16. //Debugger
  17. // require('php_error.php');
  18. // \php_error\reportErrors();
  19.  
  20.  
  21. //Variables
  22. $scrape_this_url = 'http://xray.fm/shows/gks'; //site to make into iTunes podcast feed
  23. $baseurl = 'http://xray.fm'; //this is redundant, but needed to make the script work right
  24. $owner = 'pops@sharklasers.com'; //email of the feed's owner
  25. $tags_string = 'greasykidstuff,radio'; //iTunes search tags separated by commas
  26. $chsummary = "This is a fan-made / unofficial podcast link to one of my favorite radio shows.";
  27. // The user must also define the XPATH query strings and $currenturl below.
  28. //      Hint: Google Chrome has a built-in XPATH query tool in the developer
  29. //      tools section / inspect element section.
  30. //      Google Chrome XPATH query syntax: $x("XPATHQUERYHERE")
  31.  
  32.  
  33. //XPATH Queries - Parent page
  34. $homepagecontents = file_get_contents($scrape_this_url); // Get main page of the Greasy Kid Stuff radio show
  35. $maintitle = xpath_on_url($homepagecontents, '//div[@class="content-center clearfix"]/h1/text()'); //search that page for content specified by XPATH
  36. $maindescription = xpath_on_url($homepagecontents, '//div[@class="full-description clearfix"]');
  37. $mainimglinktmp = xpath_on_url($homepagecontents, '//div[@class="info-container clearfix"]/a/img/@src');
  38.     $mainimglinktmp = str_replace(' src="','',$mainimglinktmp);
  39.     $mainimglink = str_replace('"','',$mainimglinktmp);
  40.  
  41. $broadcastlinks = xpath_on_url($homepagecontents, '//div[@class="title"]/a[contains(@href, "broadcasts")]/@href'); //Get links to broadcasts/sub-pages
  42.         // Note: Output strings include " href=" " attribute in the string too :[, we will
  43.         // fix each link later and paste it into $currenturl).  Then get the data contained
  44.         // in each broadcast's sub-page.
  45.  
  46.         // Note: There is no need to grep/scrape any sub-pages found on "next page" of parent page
  47.         // since streams are only kept up on XRAY.FM for two weeks anyway.
  48.  
  49.  
  50. //XPATH Queries - Sub-pages (broadcast pages)
  51.         // WARNING: xpath_on_url function returns arrays.  This loop creates arrays of arrays,
  52.         // with useful data contained in $var[$i][0].
  53.     for ($i = 0; $i < count($broadcastlinks); $i ++){ //process each episode into feed
  54.             $trunclink = str_replace(' href="','',$broadcastlinks[$i]); //fix each sub-page link before scraping it w/ XPATH
  55.             $trunclink = str_replace('"','',$trunclink);
  56.         $currenturl = $baseurl.$trunclink;
  57.  
  58.         $urlcontents = file_get_contents($currenturl); //grep all HTML data from sub-page link
  59.         $title[] = xpath_on_url($urlcontents, '//div[@class="content-center clearfix"]/h1/text()'); // Parameters for "xpath_on_url" = url, xpath_string
  60.         $pubDate[] = xpath_on_url($urlcontents, '//div[@class="date"]/text()'); //get broadcast publication date from sub-page
  61.             $linktmp = xpath_on_url($urlcontents, '//ul/li/a/@href'); //get broadcast mp3 from sub-page
  62.             $linktmp = str_replace(' href="','',$linktmp); //fix mp3 link url
  63.         $link[] = str_replace('"','',$linktmp);
  64.         $description[] = xpath_on_url($urlcontents, '//div[@class="tracks-container"]');
  65.         }
  66.  
  67.  
  68. //Create Feed
  69. $doc_rss = new DOMDocument(); //create RSS tags
  70. $doc_rss = new_rss($doc_rss);
  71.  
  72. $feed = new DOMDocument(); //create channel with info
  73. $feed = new_channel($feed, $maintitle[0], $maindescription[0], $scrape_this_url, $mainimglink[0], $owner, $tags_string, $chsummary);
  74.  
  75. $feed = doc_combine($doc_rss, $feed, 'rss', 'channel'); //wrap channel inside of RSS tags
  76.  
  77.     for ($i = 0; $i < count($title); $i ++){ //process each episode into feed
  78.         $doc_item = new DOMDocument(); //make an <item>...</item> XML for each episode
  79.         $doc_item = new_item($doc_item, $pubDate[$i][0], $title[$i][0], $link[$i][0], $description[$i][0], $owner, $tags_string);
  80.         $feed = doc_combine($feed, $doc_item, 'channel', 'item'); //insert that <item></> into <channel></>
  81.     }
  82.  
  83.  
  84. //Export Feed to XML Document
  85. $feed->formatOutput = true;
  86. echo "<pre>";
  87. echo "feed.xml UPDATED!<br>";
  88. echo "-----------------<br><br>";
  89. echo $feed->saveXML(); //this saveXML() is not needed, but is kludge to get echo to work
  90. $feed->save("feed.xml");
  91.  
  92.  
  93. // ========================================================================
  94.  
  95. //Reference material
  96.     // http://xray.fm/shows/gks
  97.     // http://192.168.1.129:8080/print.php
  98.     // https://www.youtube.com/watch?v=SIPGkrlM3R8
  99.  
  100.  
  101. //Functions
  102. function xpath_on_url($urlcontents, $query){ // Loads an XML file string, runs an XPATH query on it, then returns the results as an array.
  103.     // Load url/xml contents (string) into a DOMdocument for parsing with PHP's DOMXpath
  104.     $dom = new DOMDocument();
  105.     @$dom->loadHTML($urlcontents);
  106.  
  107.     // XPATH scrape page's classes/tags/objects/etc
  108.     $xpath = new DOMXpath($dom);
  109.     $dom_node_list = $xpath->query($query);
  110.  
  111.     // Convert DOMDocument-class "node list" into a string array
  112.     $temp_dom = new DOMDocument();  //A temp DOMdoc to store DOMXpath results
  113.     foreach($dom_node_list as $n) { //n = The number of results from our original XPATH query
  114.         $current_node = $temp_dom->importNode($n,true); //allow temp to import a node.  Get that node from dom_node_list.
  115.         $array[] = $temp_dom->saveHTML($current_node); //allow temp to export that node (as XML) into an array position.
  116.         }
  117.     return $array;
  118.     }
  119. function new_channel($doc, $channeltitle, $channeldescription, $channelurl, $channelimgurl, $owner, $tags_string, $chsummary) { //input a DOMDocument to write into
  120.     //Name the HTML/DOMdoc tags/"elements"
  121.     //------------------------------
  122.  
  123.     $channel = $doc->createElement("channel");
  124.     $title = $doc->createElement("title");
  125.     $link = $doc->createElement("link");
  126.     $description = $doc->createElement("description");
  127.     $itunes_owner = $doc->createElement("itunes:owner");
  128.     $itunes_email = $doc->createElement("itunes:email");
  129.     $itunes_explicit = $doc->createElement("itunes:explicit");
  130.     $itunes_image = $doc->createElement("itunes:image");
  131.     $itunes_keywords = $doc->createElement("itunes:keywords");
  132.     $itunes_subtitle = $doc->createElement("itunes:subtitle");
  133.     $itunes_summary = $doc->createElement("itunes:summary");
  134.     $itunes_category = $doc->createElement("itunes:category");
  135.         $textAttribute = $doc->createAttribute("text");
  136.         $hrefAttribute = $doc->createAttribute("href");
  137.  
  138.        
  139.     //Construct and fill HTML/DOMdoc tags/"elements"
  140.     //-------------------------------
  141.  
  142.       //<channel>
  143.         //<title>"TitleHere"
  144.             //</title>
  145.         $title->nodeValue = $channeltitle;
  146.             $channel->appendChild($title);
  147.  
  148.         //<link>"http://...com/"
  149.             //</link>
  150.         $link->nodeValue = $channelurl;
  151.             $channel->appendChild($link);
  152.  
  153.         //<description>DescriptionHere
  154.             //</description>
  155.         $description->nodeValue = $channeldescription;
  156.             $channel->appendChild($description);
  157.  
  158.         //<itunes:owner><itunes:email>poops@sharklasers.com
  159.             //</itunes:email></itunes:owner>
  160.         $itunes_owner->nodeValue = "";
  161.         $itunes_email->nodeValue = $owner;
  162.             $itunes_owner->appendChild($itunes_email);
  163.             $channel->appendChild($itunes_owner);
  164.  
  165.         //<itunes:explicit>no
  166.             //</itunes:explicit>
  167.         $itunes_explicit->nodeValue = "no";
  168.             $channel->appendChild($itunes_explicit);
  169.  
  170.         //<itunes:image href="https://fbcdn-sphotos-b-a.akamaihd.net/hphotos-ak-xfa1/t1.0-9/995471_10151845076678274_55288227_n.jpg">
  171.             //</itunes:image>
  172.         $hrefAttribute->nodeValue = $channelimgurl;
  173.             $itunes_image->appendChild($hrefAttribute);
  174.         $itunes_image->nodeValue = "";
  175.             $channel->appendChild($itunes_image);
  176.  
  177.         //<itunes:keywords>passport,approved
  178.             //</itunes:keywords>
  179.         $itunes_keywords->nodeValue = $tags_string;
  180.             $channel->appendChild($itunes_keywords);
  181.  
  182.         //<itunes:subtitle>Internationally syndicated radio -- Acting local, but thinking global!
  183.             //</itunes:subtitle>
  184.         $itunes_subtitle->nodeValue = "";
  185.             $channel->appendChild($itunes_subtitle);
  186.  
  187.         //<itunes:summary>This is a fan-made / unofficial podcast link to one of my favorite radio shows.
  188.             //</itunes:summary>
  189.         $itunes_summary->nodeValue = $chsummary;
  190.             $channel->appendChild($itunes_summary);
  191.  
  192.         //<itunes:category text="Music">
  193.             //</itunes:category>
  194.         $textAttribute->nodeValue = "Music";
  195.             $itunes_category->appendChild($textAttribute);
  196.         $itunes_category->nodeValue = "";
  197.             $channel->appendChild($itunes_category);
  198.  
  199.       //</channel>
  200.       $doc->appendChild($channel);
  201.     //</rss>
  202.  
  203.  
  204.     $doc->saveXML();
  205.     return $doc;
  206.     }
  207. function new_item($doc, $pubDate, $title, $link, $description, $owner, $tags_string){//input a DOMDocument to write into + vars
  208.  
  209.     //Name the HTML/DOMdoc tags/"elements"
  210.     //------------------------------
  211.     $t_channel = $doc->createElement("channel");
  212.     $t_item = $doc->createElement("item");
  213.     $t_pubDate = $doc->createElement("pubDate");
  214.     $t_title = $doc->createElement("title");
  215.     $t_link = $doc->createElement("link");
  216.     $t_description = $doc->createElement("description");
  217.     $t_author = $doc->createElement("author");
  218.     $t_enclosure = $doc->createElement("enclosure");
  219.     $itunes_explicit = $doc->createElement("itunes:explicit");
  220.     $itunes_subtitle = $doc->createElement("itunes:subtitle");
  221.     $itunes_author = $doc->createElement("itunes:author");
  222.     $itunes_summary = $doc->createElement("itunes:summary");
  223.     $itunes_keywords = $doc->createElement("itunes:keywords");
  224.         $urlAttribute = $doc->createAttribute("url");
  225.         $typeAttribute = $doc->createAttribute("type");
  226.  
  227.     //Construct and fill HTML/DOMdoc tags/"elements"
  228.     //-------------------------------
  229.     //<channel>
  230.         // ...
  231.         // <item>
  232.         //  <pubDate>$pubDate</pubDate>
  233.         //  <title>$title</title>
  234.         //  <link>$link</link>
  235.         //  <description>$description</description>
  236.         //  <author>poops@sharklasers.com</author>
  237.         //  <enclosure url="$link" length="173267078" type="audio/mpeg" />
  238.         //  <itunes:explicit>no</itunes:explicit>
  239.         //  <itunes:subtitle></itunes:subtitle>
  240.         //  <itunes:author>poops@sharklasers.com</itunes:author>
  241.         //  <itunes:summary></itunes:summary>
  242.         //  <itunes:keywords>greasykidstuff,radio</itunes:keywords>
  243.         // </item>
  244.     //<channel>
  245.     // $pubDate, $title, $link, $description
  246.  
  247.         //<item></item>
  248.         $t_item->nodeValue = '';
  249.             $t_channel->appendChild($t_item);
  250.        
  251.                 //<pubDate>$pubDate
  252.                     //</pubDate>
  253.                 $t_pubDate->nodeValue = '';
  254.                     $t_item->appendChild($t_pubDate);
  255.                
  256.                 //<title>$title
  257.                     //</title>
  258.                 $t_title->nodeValue = $title;
  259.                     $t_item->appendChild($t_title);
  260.  
  261.                 //<link>$link
  262.                     //</link>
  263.                 $t_link->nodeValue = $link;
  264.                     $t_item->appendChild($t_link);
  265.  
  266.                 //  <description>$description
  267.                     //</description>
  268.                 $t_description->nodeValue = $description;
  269.                     $t_item->appendChild($t_description);
  270.  
  271.                 //  <author>poops@sharklasers.com
  272.                     //</author>
  273.                 $t_author->nodeValue = $owner;
  274.                     $t_item->appendChild($t_author);
  275.  
  276.                 //  <enclosure url="$link" length="173267078" type="audio/mpeg" />
  277.                 $urlAttribute->nodeValue = $link;
  278.                     $t_enclosure->appendChild($urlAttribute);
  279.                 $typeAttribute->nodeValue = 'audio/mpeg';
  280.                     $t_enclosure->appendChild($typeAttribute);
  281.                 $t_enclosure->nodeValue = '';
  282.                     $t_item->appendChild($t_enclosure);
  283.  
  284.                 //  <itunes:explicit>no
  285.                     //</itunes:explicit>
  286.                 $itunes_explicit->nodeValue = 'no';
  287.                     $t_item->appendChild($itunes_explicit);
  288.  
  289.                 //  <itunes:subtitle>
  290.                     //</itunes:subtitle>
  291.                 $itunes_subtitle->nodeValue = '';
  292.                     $t_item->appendChild($itunes_subtitle);
  293.  
  294.                 //  <itunes:author>poops@sharklasers.com
  295.                     //</itunes:author>
  296.                 $itunes_author->nodeValue = $owner;
  297.                     $t_item->appendChild($itunes_author);
  298.  
  299.                 //  <itunes:summary>
  300.                     //</itunes:summary>
  301.                 $itunes_summary->nodeValue = '';
  302.                     $t_item->appendChild($itunes_summary);
  303.  
  304.                 //  <itunes:keywords>greasykidstuff,radio
  305.                     //</itunes:keywords>
  306.                 $itunes_keywords->nodeValue = $tags_string;
  307.                     $t_item->appendChild($itunes_keywords);
  308.         //</channel>
  309.     $doc->appendChild($t_channel);
  310.  
  311.     $doc->saveXML();
  312.     return $doc;
  313.     }
  314. function new_rss($doc){
  315.     $rss = $doc->createElement("rss");
  316.         $xmlns_itunesAttribute = $doc->createAttribute("xmlns:itunes");
  317.         $versionAttribute = $doc->createAttribute("version");
  318.         //<rss xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" version="2.0">
  319.     $xmlns_itunesAttribute->nodeValue = "http://www.itunes.com/dtds/podcast-1.0.dtd";
  320.     $versionAttribute->nodeValue = '2.0';
  321.         $rss->appendChild($xmlns_itunesAttribute);
  322.         $rss->appendChild($versionAttribute);
  323.     $doc->appendChild($rss);
  324.     return $doc;
  325.     }
  326. function doc_combine($doc1, $doc2, $tagname, $innertagname){
  327.     //http://stackoverflow.com/questions/10163675/merge-xml-files-in-php
  328.     //http://stackoverflow.com/questions/3091287/copy-domnodes-from-one-domdocument-to-another
  329.  
  330.     $tagof1 = $doc1->getElementsByTagName($tagname)->item(0); //Find "<channel>" in doc 1 (store "<channel>" tag into a temp DOMNodeList).
  331.    
  332.     $numberoftags2 = $doc2->getElementsByTagName($innertagname); //Find "<item>" in doc 2 (store results into a DOMNodeList)
  333.     for ($i = 0; $i < $numberoftags2->length; $i ++) { //i=0, i<1: (i < that # count of "<channel>"s)
  334.         $tag2 = $numberoftags2->item($i); //for the i'th encountered DOMNode in DOMNodeList (each tag found inside <item> and including <item>) in doc2, ...
  335.         $item1 = $doc1->importNode($tag2, true);// import that i'th DOMNode into doc 1
  336.             $tagof1->appendChild($item1);       // and (working inside of doc1?) append that i'th DOMNode with doc 1's "tagname"
  337.         }
  338.     $doc1->saveXML();
  339.     return $doc1;
  340.     }
  341.  
  342.  
  343.                // d8888aa,_                    a8888888a   __a88888888b
  344.               // d8P   `Y88ba.                a8P'~~~~Y88a888P""~~~~Y88b
  345.              // d8P      ~"Y88a____aaaaa_____a8P        888          Y88
  346.             // d8P          ~Y88"8~~~~~~~88888P          88g          88
  347.            // d8P                           88      ____ _88y__       88b
  348.            // 88                           a88    _a88~8888"8M88a_____888
  349.            // 88                           88P    88  a8"'     `888888888b_
  350.           // a8P                           88     88 a88         88b     Y8,
  351.            // 8b                           88      8888P         388      88b
  352.           // a88a                          Y8b       88L         8888.    88P
  353.          // a8P                             Y8_     _888       _a8P 88   a88
  354.         // _8P                               ~Y88a888~888g_   a888yg8'  a88'
  355.         // 88                                   ~~~~    ~""8888        a88P
  356.        // d8'                                                Y8,      888L
  357.        // 8E                                                  88a___a8"888
  358.       // d8P                                                   ~Y888"   88L
  359.       // 88                                                      ~~      88
  360.       // 88                                                              88
  361.       // 88                                                              88b
  362.   // ____88a_.      a8a                                                __881
  363. // 88""P~888        888b                                 __          8888888888
  364.       // 888        888P                                d88b             88
  365.      // _888ba       ~            aaaa.                 8888            d8P
  366.  // a888~"Y88                    888888                 "8P          8aa888_
  367.         // Y8b                   Y888P"                                88""888a
  368.         // _88g8                  ~~~                                 a88    ~~
  369.     // __a8"888_                                                  a_ a88
  370.    // 88"'    "88g                                                 "888g_
  371.    // ~         `88a_                                            _a88'"Y88gg,
  372.                 // "888aa_.                                   _a88"'      ~88
  373.                    // ~~""8888aaa______                ____a888P'
  374.                            // ~~""""""888888888888888888""~~~
  375.                                       // ~~~~~~~~~~~~
  376.         //mmmyow
  377.  
  378. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement