Advertisement
lihlii

twitter_msg_cleanup.pl

Jun 11th, 2012
121
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 2.96 KB | None | 0 0
  # clean up twitter messages
  # v120612
  # 1. Save twitter message web page as complete html, eg. 01.htm
  # NOTE: Chrome browser can't save twitter message in HTML only mode.
  #
  # 2. Install perl, eg. ActivePerl http://www.activestate.com/activeperl
  # 3. Save this script as twitter_msg_cleanup.pl, in the same folder as in step 1.
  # 4. Open a command window, cd to the folder where the files are stored.
  # 5. Input command: twitter_msg_cleanup.pl -h 01.htm > 01o.htm
  # 6. Open 01o.htm in browser to check the result.
  #
  # Usage: twitter_msg_cleanup.pl [-h] [<input html file>] > <output file>
  #   -h  write HTML; without it plain text is written.
  #   -?  print the usage line and exit.
  # The input file defaults to index.htm and is read as UTF-8; output is
  # written as UTF-8 as well.

  use strict;
  use warnings;

  use HTML::TokeParser;
  use HTML::Entities qw(decode_entities encode_entities);

  my $html_mode = 0;
  my $infile    = "index.htm";

  # Return the value of attribute $name from an attribute hash ref, or the
  # empty string when the hash or the attribute is missing, so callers can
  # compare and interpolate the result without undef warnings.
  sub attr_of {
      my ($attrs, $name) = @_;
      return '' unless $attrs && defined $attrs->{$name};
      return $attrs->{$name};
  }

  # Prepare already-decoded text for output: in HTML mode the markup-significant
  # characters are escaped again, in text mode the text is returned unchanged.
  sub out {
      my ($text) = @_;
      return $html_mode ? encode_entities($text, '<>&"') : $text;
  }

  # Format a tweet link as "short = expanded [= ultimate ]".
  # Each URL becomes an anchor in HTML mode and a bare URL in text mode.
  # The expanded URL is skipped when absent; the ultimate URL is shown only
  # when present and different from the expanded one.
  sub format_links {
      my ($link, $expanded, $ultimate) = @_;
      my $show_ultimate = $ultimate ne '' && $ultimate ne $expanded;

      my @urls = ($link);
      push @urls, $expanded if $expanded ne '';
      push @urls, $ultimate if $show_ultimate;

      my @shown;
      for my $url (@urls) {
          my $safe = out($url);
          push @shown, $html_mode ? "<a href=\"$safe\">$safe</a>" : $safe;
      }
      # The trailing blank after the ultimate URL separates it from following text.
      return join(" = ", @shown) . ($show_ultimate ? " " : "");
  }

  if (@ARGV && $ARGV[0] eq "-?") {
      print "Usage: $0 [-h] <input html file> > <output file>\n-h: Output HTML format, otherwise text format.\n";
      exit;
  }

  if (@ARGV && $ARGV[0] eq "-h") {
      $html_mode = 1;
      shift @ARGV;
  }

  my $path = @ARGV ? shift @ARGV : $infile;
  open(my $fh, "<:encoding(UTF-8)", $path) or die "Can't open file: $!";

  # The input is decoded to characters, so STDOUT must encode them again;
  # the HTML header below declares UTF-8.
  binmode(STDOUT, ":encoding(UTF-8)");

  my $p = HTML::TokeParser->new($fh);

  my $head = join "", map { "$_\n" } (
      '<html>',
      '<head>',
      '<meta http-equiv="content-type" content="text/html; charset=UTF-8">',
      '</head>',
      '<body>',
  );

  print $head if $html_mode;

  TWEET:
  while (my $token = $p->get_tag("div")) {
      # Each tweet starts with a header div carrying exactly this class.
      next TWEET if attr_of($token->[1], "class") ne "stream-item-header";

      # First link in the header: permalink (href) and timestamp (title).
      $token = $p->get_tag("a") or last TWEET;
      my $url  = attr_of($token->[1], "href");
      my $time = attr_of($token->[1], "title");

      $token = $p->get_tag("strong") or last TWEET;
      next TWEET if attr_of($token->[1], "class") !~ /^fullname/;
      print out($p->get_text("/strong"));

      # Scan forward to the username span, then finish the header line.
      while ($token = $p->get_tag("span")) {
          next if attr_of($token->[1], "class") !~ /^username/;
          my $username = out($p->get_text("/span"));
          if ($html_mode) {
              print " $username <a href=\"", out($url), "\">", out($time), "</a><br />\n";
          }
          else {
              print " $username $time $url\n";
          }
          last;
      }

      $token = $p->get_tag("p") or last TWEET;
      next TWEET if attr_of($token->[1], "class") ne "js-tweet-text";

      # Walk the tweet body token by token until its closing </p>.
      while ($token = $p->get_token) {
          my $type = $token->[0];

          if ($type eq "E" && $token->[1] eq "p") {
              print $html_mode ? "<br />\n<br />\n" : "\n\n";
              last;
          }
          elsif ($type eq "T") {
              # Text tokens arrive undecoded; decode, then re-escape for HTML.
              my $text = $token->[1];
              decode_entities($text);
              print out($text);
          }
          elsif ($type eq "S" && $token->[1] eq "a"
              && attr_of($token->[2], "class") eq "twitter-timeline-link")
          {
              my $attrs = $token->[2];
              print format_links(
                  attr_of($attrs, "href"),
                  attr_of($attrs, "data-expanded-url"),
                  attr_of($attrs, "data-ultimate-url"),
              );
              # Skip the link's own (shortened) text; the URLs were printed above.
              $p->get_tag("/a");
          }
      }
  }

  print "</body>\n</html>\n" if $html_mode;

  close($fh);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement