Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- use LWP::Simple;
- #use utf8;
- use Data::Dumper;
- use Encode qw(decode encode);
######################### GLOBALS
# Parser registries, keyed by the short parser name given on the command line.
# Every value is a two-element array ref: [ code ref, description ].
#
# Parsers in %parsers1 are invoked with the URL itself.
my %parsers1 = (
    "al" => [ \&al_parser, "animelyrics.com" ],
);

# Parsers in %parsers2 are invoked with the already-downloaded page content.
my %parsers2 = (
    "js" => [ \&js_parser, "jisho.org sentences" ],
);
######################### SUBS

# list_parsers()
#
# Prints a "parsers:" heading followed by one "name - description" line for
# every entry in %parsers1 and %parsers2, using the currently selected
# output handle.
#
# Names are printed in sorted order so the listing is the same on every run;
# Perl does not guarantee any particular hash key order.
#
# Takes no arguments and returns nothing meaningful.
sub list_parsers {
    # Merge both registries; if a name occurs in both, %parsers2 wins.
    my %parsers = (%parsers1, %parsers2);

    print "parsers:\n";
    foreach my $name (sort keys %parsers) {
        print $name . " - " . $parsers{$name}[1] . "\n";
    }
    return;
}
# js_parser($content)
#
# Pulls the text of every half-width table cell out of an HTML page and
# returns the cells joined with newlines, with any markup inside the cells
# removed. The result always ends with a newline; if nothing matches (or
# $content is undefined) the result is a single "\n".
#
#   $content - HTML source of the page as a string
sub js_parser {
    my ($content) = @_;
    $content = '' unless defined $content;

    # The character class is a loose match for the cell class names
    # "japanese" and "english" (it accepts any mix of those letters).
    my @sentences = $content =~ m#<td style="width: 50%" class="[japanesnglih]*">(.*?)</td>#g;

    # Lexical on purpose: a package global here would leak between calls.
    my $parsed = join("\n", @sentences) . "\n";
    $parsed =~ s/<.*?>//g;    # strip tags left inside the captured cells
    return $parsed;
}
# al_parser_helper($div)
#
# Normalises one fragment of page HTML into simple paragraph markup:
#   * removes the "Lyrics from Animelyrics.com" credit text,
#   * turns &nbsp; entities into plain spaces,
#   * converts <br> into newlines and strips every other tag,
#   * then re-encodes blank lines as paragraph breaks (</p><p>) and the
#     remaining newlines as "<br>\n".
#
# The result does not include the outermost <p> ... </p>; the caller wraps
# it. Returns the empty string when $div is undefined.
sub al_parser_helper {
    my ($div) = @_;
    return '' unless defined $div;

    # The dot is escaped so only the literal site name is removed.
    $div =~ s/Lyrics from Animelyrics\.com//g;

    # NOTE(review): in the pasted source this line replaced a space with a
    # space, i.e. did nothing - almost certainly an HTML-decoded "&nbsp;".
    $div =~ s/&nbsp;/ /g;

    $div =~ s/<br>/\n/g;
    $div =~ s/<.*?>//gs;            # /s: a tag may span several lines

    # Order matters: blank lines become paragraph breaks before the single
    # newlines that remain are turned into line breaks.
    $div =~ s#\n\n#</p><p>#g;
    $div =~ s#\n#<br>\n#g;
    $div =~ s#</p>#</p>\n#g;
    $div =~ s#<p><br>\n?#<p>#g;     # no line break right after a <p>
    return $div;
}
# al_parser($url)
#
# Builds a small standalone HTML page from two variants of the same page:
# the one whose URL ends in "jis" and the one whose URL ends in "htm".
# $url may end in either suffix; it is stripped and both variants are
# downloaded with get().
#
# The returned page has four sections separated by <hr>:
#   1. the "crumbs" list and
#   2. the "kanji" block, both taken from the "jis" page,
#   3. all "romaji" cells and
#   4. all "translation" cells, both taken from the "htm" page.
#
# If a download fails, a warning is printed to STDERR and that page is
# treated as empty, so its sections are blank. Returns the HTML as a string.
sub al_parser {
    my ($url) = @_;

    # Drop the trailing "htm"/"jis" so both variants can be derived.
    $url =~ s/(?:htm|jis)$//;

    my $jp = get($url . "jis");
    if (!defined $jp) {
        warn "al_parser: could not fetch ${url}jis\n";
        $jp = '';
    }
    $jp =~ s/\r//g;

    # Without /g the match yields the single capture, or nothing at all.
    my ($kanji) = $jp =~ m#<div id=kanji>(.*?)</div>#s;
    my $parsed_jp = defined $kanji ? $kanji : '';
    $parsed_jp =~ s/\n//g;    # only <br> tags are left to mark line ends here

    my ($crumbs) = $jp =~ m#<ul id="crumbs">(.*?)</ul>#s;
    my $parsed_cr = (defined $crumbs ? $crumbs : '') . "\n";

    my $roen = get($url . "htm");
    if (!defined $roen) {
        warn "al_parser: could not fetch ${url}htm\n";
        $roen = '';
    }
    $roen =~ s/\r//g;

    my $parsed_ro = join("\n", $roen =~ m#<td class=romaji NOWRAP>(.*?)</td>#gs);
    my $parsed_en = join("\n", $roen =~ m#<td class=translation NOWRAP>(.*?)</td>#gs);

    my $page_head = "<html>\n<head>\n"
        . '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        . "\n</head>\n<body>\n<p>";
    my $section_break = "</p>\n<hr>\n<p>";
    my $page_tail = "</p>\n</body>\n</html>";

    my $parsed = $page_head
        . join($section_break,
               map { al_parser_helper($_) }
                   $parsed_cr, $parsed_jp, $parsed_ro, $parsed_en)
        . $page_tail;

    # Tidy the seams: no <br> directly after <p>, and no empty paragraphs.
    $parsed =~ s#<p><br>\n?#<p>#gs;
    $parsed =~ s#<p></p>\n?##gs;
    return $parsed;
}
# put_content($file, $content)
#
# Writes $content to $file, replacing any existing file, and encodes the
# text as UTF-8 on the way out. An undefined $content produces an empty
# file.
#
# Dies with a message naming the file if it cannot be opened, written or
# closed. Returns nothing.
sub put_content {
    my ($file, $content) = @_;
    $content = '' unless defined $content;

    # Lexical handle and checked calls: an unwritable path must not be
    # silently ignored, or the parsed output is simply lost.
    open(my $out, ">:utf8", $file)
        or die "cannot open '$file' for writing: $!\n";
    print {$out} $content
        or die "cannot write to '$file': $!\n";
    # Buffered write errors only surface when the handle is closed.
    close($out)
        or die "cannot close '$file': $!\n";
    return;
}
######################### MAIN
# Usage: http_parse.pl parser url file
#
# With fewer than three arguments the usage text and the parser list are
# printed. That counts as success (exit 0) only when help was asked for,
# i.e. no arguments at all or a lone -h / --help; otherwise it is an error.
if (scalar(@ARGV) < 3) {
    print "usage: http_parse.pl parser url file\n";
    list_parsers();
    my $wants_help = scalar(@ARGV) == 0
        || (scalar(@ARGV) == 1 && ($ARGV[0] eq "-h" || $ARGV[0] eq "--help"));
    exit($wants_help ? 0 : -1);
}

my ($parser, $url, $file) = @ARGV;
if (exists $parsers1{$parser}) {
    # These parsers take the URL itself.
    my $parsed = $parsers1{$parser}[0]->($url);
    put_content($file, $parsed);
} elsif (exists $parsers2{$parser}) {
    # These parsers work on page content, so download it first.
    my $content = get($url);
    if (!defined $content) {
        # Without this check a failed download was written out as a
        # near-empty file with no hint that anything went wrong.
        print STDERR "could not fetch " . $url . "\n";
        exit -1;
    }
    my $parsed = $parsers2{$parser}[0]->($content);
    put_content($file, $parsed);
} else {
    print "parser " . $parser . " is not defined\n";
    list_parsers();
    exit -1;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement