Advertisement
lihlii

mobile.twitter_msg_cleanup.pl

Jun 11th, 2012
149
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 2.63 KB | None | 0 0
  # clean up mobile.twitter messages
# v120612
# 1. Save twitter message web page as complete html, eg. 01.htm
# NOTE: Chrome browser can't save twitter message in HTML only mode.
#
# 2. Install perl, eg. ActivePerl http://www.activestate.com/activeperl
# 3. Save this script as mobile.twitter_msg_cleanup.pl, in the same folder as in step 1.
# 4. Open a command window, cd to the folder where the files are stored.
# 5. Input command: mobile.twitter_msg_cleanup.pl -h 01.htm > 01o.htm
# 6. Open 01o.htm in browser to check the result.
#
# Options:
#   -?  print the usage message and exit
#   -h  write HTML instead of plain text
# When no input file is named, "index.htm" is read.
#
# For every <td class="user-info"> cell found, the script prints one header
# line (full name, user name, timestamp, link) followed by the text of the
# next <div class="tweet-text">.  Output goes to STDOUT as UTF-8.

use strict;
use warnings;

use HTML::TokeParser;
use HTML::Entities qw(decode_entities encode_entities);

my $html_mode = 0;

# Guard against an empty @ARGV so the option tests never compare undef.
my $first_arg = defined $ARGV[0] ? $ARGV[0] : "";

if ($first_arg eq "-?") {
    print "Usage: $0 [-h] <input html file> > <output file>\n-h: Output HTML format, otherwise text format.\n";
    exit;
}

if ($first_arg eq "-h") {
    $html_mode = 1;
    shift @ARGV;
}

# An absent or empty file argument falls back to index.htm.
my $input_file = shift @ARGV;
$input_file = "index.htm" unless defined $input_file && length $input_file;

open(my $fh, "<:utf8", $input_file) || die "Can't open file: $!";

# The input is decoded to characters, so STDOUT needs an encoding layer too.
# Without it, strings limited to U+0080..U+00FF are written as Latin-1 bytes
# while strings with wider characters are written as UTF-8, which contradicts
# the charset declared in the HTML header below.
binmode(STDOUT, ":encoding(UTF-8)") || die "Can't set output encoding: $!";

my $parser = HTML::TokeParser->new($fh);

my $head = <<EOF;
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
</head>
<body>
EOF

print $head if $html_mode;

# Return the class attribute of a start tag returned by get_tag(), or the
# empty string when get_tag() returned nothing (end of input) or the tag has
# no class attribute.
sub tag_class {
    my ($tag) = @_;
    return "" unless $tag;
    my $class = $tag->[1]{class};
    return defined $class ? $class : "";
}

# Escape the characters that are significant in HTML text and in
# double-quoted attribute values.  Undefined input yields the empty string.
sub html_escape {
    my ($text) = @_;
    return "" unless defined $text;
    # Assign the result: encode_entities() edits its argument in place only
    # when called in void context.
    my $escaped = encode_entities($text, '<>&"');
    return $escaped;
}

TWEET:
while (my $cell = $parser->get_tag("td")) {
    # Each step below expects the next matching tag to carry a specific
    # class; on a mismatch the search restarts at the following <td>.
    next TWEET if tag_class($cell) ne "user-info";

    next TWEET if tag_class($parser->get_tag("strong")) ne "fullname";
    my $fullname = $parser->get_text("/strong");

    next TWEET if tag_class($parser->get_tag("span")) ne "username";
    # Move past the first </span>, then read the text up to the following
    # </span>.  Presumably the username span wraps an inner span; verify
    # against a saved page.
    $parser->get_tag("/span");
    my $username = $parser->get_trimmed_text("/span");

    next TWEET if tag_class($parser->get_tag("td")) ne "timestamp";
    my $anchor = $parser->get_tag("a");
    my $url    = $anchor ? $anchor->[1]{href} : undef;
    $url = "" unless defined $url;
    my $time = $parser->get_text("/a");

    if ($html_mode) {
        # get_text() and attribute values are already entity-decoded, so
        # they must be re-escaped before being placed into markup.
        print '<a href="', html_escape($url), '">',
            html_escape("$fullname $username $time"), "</a><br />\n";
    } else {
        print "$fullname $username $time $url\n";
    }

    next TWEET if tag_class($parser->get_tag("div")) ne "tweet-text";

    # Copy the tweet body up to the first </div>.
    TOKEN:
    while (my $token = $parser->get_token) {
        my $type = $token->[0];

        if ($type eq "E" && $token->[1] eq "div") {
            print $html_mode ? "<br />\n<br />\n" : "\n\n";
            last TOKEN;
        }

        if ($type eq "T") {
            my $text = $token->[1];
            decode_entities($text);
            # Text mode prints the decoded characters; HTML mode escapes
            # them again so decoded "<" or "&" cannot turn into markup.
            print $html_mode ? html_escape($text) : $text;
        }

        if ($type eq "S" && $token->[1] eq "a"
            && tag_class([ "a", $token->[2] ]) eq "twitter_external_link") {
            my $link = $token->[2]{href};
            $link = "" unless defined $link;
            # get_text() consumes the link's text tokens, so the "T" branch
            # above does not print the link text a second time.
            my $link_text = $parser->get_text("/a");
            if ($html_mode) {
                print '<a href="', html_escape($link), '">',
                    html_escape($link_text), "</a>";
            } else {
                print " $link = $link_text ";
            }
        }
    }
}

print "</body>\n</html>\n" if $html_mode;

close($fh);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement