Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl
# Downloads the catalog of the 2ch.hk /wr/ board through a local SOCKS proxy
# and, for every thread with at least $min_posts posts, strips the markup
# from all post comments, prints the lower-cased text to STDOUT and saves the
# text (original case) to $output_dir/<thread number>.
use utf8;
use strict;
use warnings;
use LWP::UserAgent;
use LWP::Debug qw(+);
use JSON qw( decode_json );
use HTTP::Cookies;
use Encode;

my $board_uri        = "https://2ch.hk/wr/catalog.json";
my $threads_base_uri = "https://2ch.hk/wr/res/";
my $min_posts        = 100;            # threads with fewer posts are skipped
my $output_dir       = "/tmp/data";    # one file per thread is written here

# decode_json() returns character strings, so every output handle needs an
# explicit encoding layer; without one Perl emits "Wide character" warnings
# and writes strings that happen to fit in Latin-1 in the wrong encoding.
binmode STDOUT, ':encoding(UTF-8)';

my $ua = LWP::UserAgent->new;
$ua->proxy([qw(http https)] => "socks://127.0.0.1:9050");
$ua->requests_redirectable(undef);     # do not follow redirects for any method
$ua->agent('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0');
my $cookie_jar = HTTP::Cookies->new;
#$cookie_jar -> set_cookie('','cf_clearance','replace',"/","2ch.hk");
#$cookie_jar -> set_cookie('','__cfduid','replace',"/","2ch.hk");
$ua->cookie_jar($cookie_jar);

# Without the catalog there is nothing to do, so this failure is fatal
# (fetch_json has already warned with the details).
my $board = fetch_json($ua, $board_uri)
    or die "Cannot load board catalog from $board_uri\n";

if (!-d $output_dir) {
    mkdir $output_dir or die "Cannot create $output_dir: $!\n";
}

THREAD:
for my $entry (@{ $board->{'threads'} // [] }) {
    next THREAD if ($entry->{'posts_count'} // 0) < $min_posts;

    # Reduce the thread number to digits BEFORE it is used in either the
    # request URL or the output file name.
    my $number = $entry->{'num'} // '';
    $number =~ s/[^0-9]//g;
    next THREAD if $number eq '';

    # A thread that cannot be fetched or parsed is skipped, not fatal.
    my $thread = fetch_json($ua, $threads_base_uri . $number . '.json')
        or next THREAD;

    my $posts = eval { $thread->{'threads'}[0]{'posts'} };
    next THREAD if ref $posts ne 'ARRAY';

    my $all = join '', map { clean_post($_->{'comment'}) . ". " } @{$posts};
    $all = normalize_text($all);

    print "$number\n";
    # $all is already a character string: passing it through decode_utf8()
    # again croaks ("Cannot decode string with wide characters") on current
    # Encode releases, so it is lower-cased and printed directly.
    print lc($all);

    my $path = "$output_dir/$number";
    open(my $out, '>:encoding(UTF-8)', $path)
        or die "Cannot open $path for writing: $!\n";
    print {$out} $all;
    # Buffered write errors only surface at close.
    close($out) or die "Cannot finish writing $path: $!\n";
}

# fetch_json($agent, $uri)
# GETs $uri with the given LWP::UserAgent and returns the decoded JSON
# object. Returns nothing (after a warning) when the request fails, the body
# is not a JSON object (e.g. an HTML error page), or the JSON is malformed.
sub fetch_json {
    my ($agent, $uri) = @_;

    my $response = $agent->get($uri);
    if (!$response->is_success) {
        warn "GET $uri failed: " . $response->status_line . "\n";
        return;
    }

    my $body = $response->decoded_content;
    if (!defined $body || $body !~ m/\A\{/) {
        warn "GET $uri did not return a JSON object\n";
        return;
    }

    # decoded_content() yields characters while decode_json() expects UTF-8
    # octets, hence the re-encode.
    my $data = eval { decode_json(encode('UTF-8', $body)) };
    if (!defined $data) {
        warn "Invalid JSON from $uri: $@";
        return;
    }
    return $data;
}

# clean_post($html)
# Returns the post comment with links, tags, post references and character
# entities replaced by spaces and line breaks turned into sentence breaks.
# An undefined comment yields the empty string.
sub clean_post {
    my ($html) = @_;
    return '' if !defined $html;

    $html =~ s/<a [^>]+>[^<]+<[^>]+>/ /g;           # whole links, including their text
    $html =~ s{<br\s*/?>}{. }gi;                    # line breaks end a sentence
    $html =~ s/<[^>]+>/ /g;                         # any remaining tag
    $html =~ s/(?:>>|&gt;&gt;)\d+/ /g;              # post references, raw or entity-escaped
    $html =~ s/&#x?[0-9a-fA-F]+;|&[^;]{1,6};/ /g;   # numeric and named entities
    return $html;
}

# normalize_text($text)
# Collapses runs of spaces and runs of sentence punctuation into a single
# ". " separator and returns the result.
sub normalize_text {
    my ($text) = @_;

    $text =~ s/ +/ /g;
    $text =~ s/[.?!][.?! ]+/. /g;
    $text =~ s/[.?! ][.?!]/. /g;
    return $text;
}
Advertisement
Add Comment
Please, Sign In to add comment