Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl
# twitter_msg_cleanup.pl -- clean up saved Twitter message pages
# v120612
#
# Usage:
#   1. Save the Twitter message web page as "complete HTML", e.g. 01.htm.
#      NOTE: the Chrome browser can't save Twitter messages in HTML-only mode.
#   2. Install Perl, e.g. ActivePerl http://www.activestate.com/activeperl
#   3. Save this script as twitter_msg_cleanup.pl in the same folder as the
#      page saved in step 1.
#   4. Open a command window and cd to the folder where the files are stored.
#   5. Run: twitter_msg_cleanup.pl -h 01.htm > 01o.htm
#   6. Open 01o.htm in a browser to check the result.
#
# Options:
#   -?   print the usage line and exit
#   -h   write HTML output; without it plain text is written
#
# The input file name defaults to index.htm when none is given.

use strict;
use warnings;

use HTML::TokeParser;
use HTML::Entities qw(decode_entities encode_entities);

# Return the named attribute from a tag's attribute hash, or '' when the hash
# or the attribute is missing, so comparisons never see undef.
sub attr_value {
    my ($attrs, $name) = @_;
    return '' unless $attrs && defined $attrs->{$name};
    return $attrs->{$name};
}

# Escape the characters that are special in HTML text and in double-quoted
# attribute values. Non-ASCII characters are left alone because the output
# is written as UTF-8.
sub escape_html {
    my ($text) = @_;
    return '' unless defined $text;
    return encode_entities($text, '<>&"');
}

my $html_mode = 0;
my $infile    = "index.htm";

# Guard on @ARGV so running without arguments does not compare against undef.
if (@ARGV && $ARGV[0] eq "-?") {
    print "Usage: $0 [-h] <input html file> > <output file>\n-h: Output HTML format, otherwise text format.\n";
    exit;
}
if (@ARGV && $ARGV[0] eq "-h") {
    $html_mode = 1;
    shift @ARGV;
}

my $path = shift(@ARGV) || $infile;
open(my $fh, "<:encoding(UTF-8)", $path) or die "Can't open file '$path': $!";

# The input is decoded to characters, so the output must be encoded again;
# the HTML header below declares UTF-8.
binmode(STDOUT, ":encoding(UTF-8)");

my $p = HTML::TokeParser->new($fh) or die "Can't parse file '$path': $!";

my $head = <<EOF;
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
</head>
<body>
EOF
print $head if $html_mode;

TWEET:
while (my $div = $p->get_tag("div")) {
    next TWEET if attr_value($div->[1], "class") ne "stream-item-header";

    # The first link in the header carries the tweet URL and its timestamp.
    my $anchor = $p->get_tag("a") or last TWEET;
    my $url    = attr_value($anchor->[1], "href");
    my $time   = attr_value($anchor->[1], "title");

    my $strong = $p->get_tag("strong") or last TWEET;
    next TWEET if attr_value($strong->[1], "class") !~ /^fullname/;

    # get_text returns entity-decoded text, so re-escape it for HTML output.
    my $fullname = $p->get_text("/strong");
    print $html_mode ? escape_html($fullname) : $fullname;

    # Skip forward to the span holding the @username and finish the header line.
    while (my $span = $p->get_tag("span")) {
        next if attr_value($span->[1], "class") !~ /^username/;
        my $username = $p->get_text("/span");
        if ($html_mode) {
            print " ", escape_html($username),
                " <a href=\"", escape_html($url), "\">", escape_html($time),
                "</a><br />\n";
        }
        else {
            print " $username $time $url\n";
        }
        last;
    }

    my $para = $p->get_tag("p") or last TWEET;
    next TWEET if attr_value($para->[1], "class") ne "js-tweet-text";

    # Walk the tweet body token by token until its closing </p>.
    while (my $token = $p->get_token) {
        my $type = $token->[0];

        if ($type eq "E" && $token->[1] eq "p") {
            print $html_mode ? "<br />\n<br />\n" : "\n\n";
            last;
        }

        if ($type eq "T") {
            # Text tokens arrive undecoded; decode them, then re-escape for
            # HTML output so characters such as '<' and '&' stay literal.
            my $text = $token->[1];
            decode_entities($text);
            print $html_mode ? escape_html($text) : $text;
        }

        if (   $type eq "S"
            && $token->[1] eq "a"
            && attr_value($token->[2], "class") eq "twitter-timeline-link")
        {
            my $link          = attr_value($token->[2], "href");
            my $link_expanded = attr_value($token->[2], "data-expanded-url");
            my $link_ultimate = attr_value($token->[2], "data-ultimate-url");

            # Show the short link followed by whatever it expands to; the
            # ultimate target is listed only when it differs from the
            # expanded one.
            my $show_ultimate = $link_ultimate ne ''
                && $link_ultimate ne $link_expanded;
            my @targets = ($link);
            push @targets, $link_expanded if $link_expanded ne '';
            push @targets, $link_ultimate if $show_ultimate;

            if ($html_mode) {
                @targets = map {
                    my $escaped = escape_html($_);
                    "<a href=\"$escaped\">$escaped</a>";
                } @targets;
            }
            print join(" = ", @targets), ($show_ultimate ? " " : "");

            # Skip the link's own display text; the URLs were printed above.
            $p->get_tag("/a");

            # TODO: collect image links (disabled in the original script):
            # $link = $link_ultimate ? $link_ultimate : $link_expanded;
            # $img[$img_c++] = $link if $link =~ /\.(jpg|gif|png)$/i;
            # $img[$img_c++] = $link if $link =~ m{^https?://img.ly/}i;
        }
    }
}

print "</body>\n</html>\n" if $html_mode;
close($fh);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement