Advertisement
pwtenny

Vbulletin to ATOM converter (cleaner)

Jan 23rd, 2014
143
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 5.13 KB | None | 0 0
  1. # Convert a single Vbulletin forum index into a W3C valid ATOM feed.
  2.  
  3. package vbulletin2atom;
  4.  
  5. use base 'Exporter';
  6. use POSIX qw(strftime);
  7. use String::CRC32;
  8. use strict;
  9.  
  10. our $VERSION = '1.10';
  11. our @EXPORT = qw(pver);
  12.  
  13. my $atomFileData = '';
  14. my $atomFilename = '/home/pwtenny/devel.mediapundit.net/thephun.xml';
  15. my $forumUrl = 'http://forum.phun.org/forumdisplay.php?f=';
  16. my $forumId = 15;
  17. my $htmlFilename = 'phunindex.html';
  18.  
  19. sub vb2a_run
  20. {
  21.   # check time to see if allowed to run
  22.   vb2a_check_run();
  23.   vb2a_retrieve_html();
  24.   vb2a_parse_html();
  25.   vb2a_cleanup_html();
  26.   vb2a_save_atom();
  27. }
  28.  
  29. sub vb2a_retrieve_html
  30. {
  31.   system qq(wget -q "$forumUrl$forumId" -O $htmlFilename);
  32. }
  33.  
  34. # Parse a Vbulletin forum index for thread-by-thread data.
  35. sub vb2a_parse_html
  36. {
  37.   my $forumHTML;
  38.   my $threadBlock;
  39.  
  40.   open HTMLFD, $htmlFilename or die "Couldn't find the forum HTML file to parse: $htmlFilename\n";
  41.   $forumHTML = join "", <HTMLFD>;
  42.   close HTMLFD;
  43.  
  44.   # All threads on the page
  45.   ($threadBlock) = $forumHTML =~ /<ol id=.threads. class=.threads.>(.*?)<\/ol>/igs;
  46.  
  47.   # One thread per loop.
  48.   while($threadBlock =~ /<li class=\".*?\" id=\"thread_\d+\">(.*?)<\/li>/igs)
  49.   {
  50.     my $thread = $1;
  51.     my $threadUrl;
  52.     my $threadTopic;
  53.     my $threadDate;
  54.    
  55.     # Get the post URL and topic.
  56.     ($threadUrl,$threadTopic) = $thread =~ /<a class=\"title\" href=\"(.*?)\" id=\"thread_title_\d+\">(.*?)<\/a>/igs;
  57.     ($threadUrl) = $1 =~ /\?t=(\d+)/;
  58.     $threadUrl = "http://forum.phun.org/showthread.php?t=".$threadUrl;
  59.  
  60.     # Get the post date.
  61.     ($threadDate) = $thread =~ /<a href=\"member.php\?u=[A-Za-z0-9?=&;].*\" class=\".*?\" title=\"Started by [A-Za-z0-9].* on (.*?)\">/igs;
  62.    
  63.     # Convert three different kind of dates into one format.
  64.     $threadDate = vb2a_fixdate($threadDate);
  65.  
  66.     # Each entry requires a unique ID, so make a CRC from entry data that won't change.
  67.     # Try using the actual thread ID to stop getting different CRCs and dupe msgs because of that.
  68.     my $CRC32 = crc32($threadUrl);
  69.     (my $threadId) = $threadUrl =~ /\?t=(\d+)$/;
  70.  
  71.     # Construct a valid ATOM entry and store it.
  72.     $atomFileData .= << "ATOMENTRY";
  73.   <entry>
  74.     <title>$threadTopic</title>
  75.     <link href="$threadUrl"/>
  76.     <id>tag:forum.phun.org,2014://1.$threadId</id>
  77.     <updated>$threadDate</updated>
  78.     <summary type="html" xml:lang="en" xml:base="http://www.mediapundit.net/"><![CDATA[<a href="$threadUrl">$threadTopic</a>]]></summary>
  79.   </entry>
  80. ATOMENTRY
  81.   }
  82. }
  83.  
  84. # Delete the old index that we gather data from before exiting.
  85. sub vb2a_cleanup_html
  86. {
  87.   unlink $htmlFilename;
  88. }
  89.  
  90. # Write out a valid ATOM feed.
  91. sub vb2a_save_atom
  92. {
  93.   open AtomFile, ">$atomFilename";
  94.   print AtomFile qq(<?xml version="1.0" encoding="utf-8"?>\n);
  95.   print AtomFile << "FULLATOMFEED";
  96. <feed xmlns="http://www.w3.org/2005/Atom">
  97.   <title>The Phun</title>
  98.   <link rel="alternate" type="text/html" href="$forumUrl$forumId" />
  99.   <link rel="self" type="application/atom+xml" href="http://devel.mediapundit.net/thephun.xml" />
  100.   <updated>2014-12-13T18:30:02Z</updated>
  101.   <author>
  102.     <name>vbulletin2atom.pl</name>
  103.   </author>
  104.   <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
  105. $atomFileData
  106. </feed>
  107. FULLATOMFEED
  108.  
  109.   close AtomFile;
  110. }
  111.  
  112. # Convert odd forum dates into real dates accepted by the ATOM spec.
  113. sub vb2a_fixdate
  114. {
  115.   my $oldDate = shift;
  116.   my $newDate;
  117.   my $hour;
  118.   my $minute;
  119.   my $ampm;
  120.  
  121.   # Convert "Yesterday 3:15 PM" into a valid "<updated></updated>" ATOM date.
  122.   if($oldDate =~ /Yesterday (\d+):(\d+) (AM|PM)/)
  123.   {
  124.     $hour = int($1);
  125.     $minute = $2;
  126.     $ampm = $3;
  127.  
  128.     $hour += 12 if($ampm =~ /PM/ && $hour != 12);              # Convert 12hr to 24hr for ATOM, but not 12 PM which would become 24.
  129.     $hour = "0" . int($hour) if($hour <= 9);                   # Zero pad a single digit hour, also for the ATOM spec. *TODO* Do this with sprintf?
  130.     $newDate = strftime "%Y-%m-%d", localtime(time() - 86400); # year-month-day, minus 24 hours for "yesterday"
  131.     $newDate .= "T"."$hour:$minute:00"."Z";                    # Date + time in ATOM date/time format.
  132.  
  133.     return $newDate;
  134.   }
  135.  
  136.   # Convert "Today 3:15 PM" into a valid "<updated></updated>" ATOM date.
  137.   if($oldDate =~ /Today (\d+):(\d+) (AM|PM)/)
  138.   {
  139.     $hour = int($1);
  140.     $minute = $2;
  141.     $ampm = $3;
  142.  
  143.     $hour += 12 if($ampm =~ /PM/ && $hour != 12);
  144.     $hour = "0" . int($hour) if($hour <= 9);
  145.     $newDate = strftime "%Y-%m-%d", localtime;
  146.     $newDate .= "T"."$hour:$minute:00"."Z";
  147.  
  148.     return $newDate;
  149.   }
  150.  
  151.   # Convert stupid 12-31-2014 into 2014-12-31
  152.   my $year;
  153.   my $month;
  154.   my $day;
  155.   my $ampm;
  156.  
  157.   ($month,$day,$year,$hour,$minute,$ampm) = $oldDate =~ /(\d+)-(\d+)-(\d+) (\d+):(\d+) (AM|PM)/;
  158.   $hour += 12 if($ampm =~ /PM/ && $hour != 12);
  159.   $hour = "0" . int($hour) if($hour <= 9);
  160.   $newDate = "$year-$month-$day"."T"."$hour:$minute:00"."Z";
  161.  
  162.   return $newDate;
  163. }
  164.  
  165. # Only run once per hour, at the top of the hour.
  166. sub vb2a_check_run
  167. {
  168.   my $localtm = localtime()."";
  169.   my ($minute) = $localtm =~ /^... ... \d+ \d+:(\d+):\d+/;
  170.  
  171.   return 1 if($minute != 00);
  172. }
  173.  
  174. 1;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement