Advertisement
overloop

http_parse.pl

Jan 27th, 2014
154
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 2.81 KB | None | 0 0
  1. #!/usr/bin/perl
  2.  
  3. use LWP::Simple;
  4. #use utf8;
  5. use Data::Dumper;
  6. use Encode qw(decode encode);
  7.  
  8. ######################### GLOBALS
  9.  
  # Parsers that download their own pages: the handler is called with the URL.
  # Each entry maps a short parser name to [ handler, description ].
  my %parsers1 = (
      "al" => [ \&al_parser, "animelyrics.com" ],
  );

  # Parsers that work on an already-downloaded page: the handler is called
  # with the page content. Same [ handler, description ] layout.
  my %parsers2 = (
      "js" => [ \&js_parser, "jisho.org sentences" ],
  );
  12.  
  13. ######################### SUBS
  14.  
  # Print the short name and description of every known parser to STDOUT,
  # one "name - description" line each, under a "parsers:" heading.
  #
  # Names are printed in sorted order so the listing is identical on every
  # run; plain hash key order is not stable between runs.
  sub list_parsers {
      my %parsers = (%parsers1, %parsers2);
      print "parsers:\n";
      foreach my $parser (sort keys %parsers) {
          print $parser . " - " . $parsers{$parser}[1] . "\n";
      }
      return;
  }
  22.  
  # Extract the sentence cells from a downloaded page (registered as the
  # "jisho.org sentences" parser).
  #
  # Argument: the page HTML as a string.
  # Every cell of the form
  #     <td style="width: 50%" class="...">...</td>
  # contributes one output line, with any markup inside the cell removed.
  # A cell must open and close on the same input line to be picked up.
  #
  # Returns the lines joined by "\n" plus a trailing "\n"; a page without
  # matching cells (or an undefined page) therefore yields just "\n".
  sub js_parser {
      my ($content) = @_;

      # A failed download hands us undef; treat it as an empty page.
      $content = '' unless defined $content;

      # The class attribute is matched with a character class made of the
      # letters occurring in "japanese" and "english".
      my @sentences = $content =~ m#<td style="width: 50%" class="[japanesnglih]*">(.*?)</td>#g;

      # Lexical on purpose: the result must not leak into a package global.
      my $parsed = join("\n", @sentences) . "\n";
      $parsed =~ s/<.*?>//g;
      return $parsed;
  }
  30.  
  # Normalise one HTML fragment taken from an animelyrics.com page into
  # simple paragraph markup.
  #
  # Argument: the fragment as a string (undef is treated as empty).
  # Returns the rewritten fragment. The caller is expected to wrap the
  # result in <p>...</p>; this sub only emits the inner "</p><p>" breaks.
  sub al_parser_helper {
      my ($div) = @_;
      $div = '' unless defined $div;

      # Drop the site's watermark text. The dot is escaped so that only the
      # literal "Animelyrics.com" is removed.
      $div =~ s/Lyrics from Animelyrics\.com//g;

      # Turn the source markup into plain text with newlines ...
      $div =~ s/&nbsp;/ /gs;
      $div =~ s/<br>/\n/gs;
      $div =~ s/<.*?>//gs;          # strip every remaining tag

      # ... then rebuild markup: a blank line becomes a paragraph break,
      # a single newline becomes a line break.
      $div =~ s#\n\n#</p><p>#gs;
      $div =~ s#\n#<br>\n#gs;
      $div =~ s#</p>#</p>\n#gs;

      # A paragraph must not start with a line break.
      $div =~ s#<p><br>\n?#<p>#gs;
      return $div;
  }
  43.  
  # Download one page for al_parser and strip carriage returns from it.
  # On a failed download a warning is printed to STDERR and an empty string
  # is returned, so the caller still produces a (mostly empty) document.
  sub _al_fetch_page {
      my ($page_url) = @_;
      my $page = get($page_url);
      if (!defined $page) {
          warn "could not fetch " . $page_url . "\n";
          return '';
      }
      $page =~ s/\r//g;
      return $page;
  }

  # Build a standalone UTF-8 HTML document from an animelyrics.com song page.
  #
  # Argument: the URL of the song page, ending in either "htm" or "jis".
  # The extension is removed and both variants are downloaded:
  #   <base>jis  - supplies the <div id=kanji> block and the "crumbs" list
  #   <base>htm  - supplies the romaji and translation table cells
  #
  # Returns the document as a string with up to four sections separated by
  # <hr>: crumbs, kanji, romaji, translation. Empty sections are dropped.
  sub al_parser {
      my ($url) = @_;
      $url =~ s/(htm|jis)$//;

      my $jp = _al_fetch_page($url . "jis");
      my @kanji = $jp =~ m#<div id=kanji>(.*?)</div>#s;
      my $parsed_jp = join("\n", @kanji) . "\n";
      # Newlines inside the kanji block are dropped entirely; only its <br>
      # tags decide where lines break.
      $parsed_jp =~ s/\n//g;

      my @crumbs = $jp =~ m#<ul id="crumbs">(.*?)</ul>#s;
      my $parsed_cr = join("\n", @crumbs) . "\n";

      my $roen = _al_fetch_page($url . "htm");
      my @romaji = $roen =~ m#<td class=romaji NOWRAP>(.*?)</td>#gs;
      my $parsed_ro = join("\n", @romaji);

      my @english = $roen =~ m#<td class=translation NOWRAP>(.*?)</td>#gs;
      my $parsed_en = join("\n", @english);

      my $head = "<html>\n<head>\n" . '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' . "\n</head>\n<body>\n<p>";
      my $separator = "</p>\n<hr>\n<p>";
      my $tail = "</p>\n</body>\n</html>";

      my $parsed = $head
          . join($separator,
                 al_parser_helper($parsed_cr),
                 al_parser_helper($parsed_jp),
                 al_parser_helper($parsed_ro),
                 al_parser_helper($parsed_en))
          . $tail;

      # Tidy up: no paragraph may start with a line break, and paragraphs
      # that ended up empty are removed.
      $parsed =~ s#<p><br>\n?#<p>#gs;
      $parsed =~ s#<p></p>\n?##gs;

      return $parsed;
  }
  78.  
  # Write $content to $file as UTF-8, replacing any existing file.
  #
  # Arguments: the output path and the text to write.
  # Returns 1 on success. Dies with a message naming the file if it cannot
  # be opened, written or closed, so a failed write is never silent.
  sub put_content {
      my ($file, $content) = @_;

      open(my $out, '>:encoding(UTF-8)', $file)
          or die "cannot open " . $file . " for writing: $!\n";
      print {$out} $content
          or die "cannot write to " . $file . ": $!\n";
      # Buffered write errors only surface when the handle is closed.
      close($out)
          or die "cannot close " . $file . ": $!\n";

      return 1;
  }
  85.  
  86. ######################### MAIN
  87.  
  # Usage: http_parse.pl parser url file
  #
  # With fewer than three arguments the usage text and parser list are
  # printed. That counts as success only when help was asked for explicitly
  # (-h / --help) or no arguments were given at all.
  if (scalar(@ARGV) < 3) {
      print "usage: http_parse.pl parser url file\n";
      list_parsers();
      my $help_requested = scalar(@ARGV) == 0
          || (scalar(@ARGV) == 1 && ($ARGV[0] eq "-h" || $ARGV[0] eq "--help"));
      exit($help_requested ? 0 : -1);
  }

  my ($parser, $url, $file) = @ARGV;

  if (exists $parsers1{$parser}) {
      # These parsers download their pages themselves, so they get the URL.
      my $parsed = $parsers1{$parser}[0]->($url);
      put_content($file, $parsed);
  } elsif (exists $parsers2{$parser}) {
      # These parsers work on page content, so download it here first.
      my $content = get($url);
      if (!defined $content) {
          # get() returns undef on failure; do not write an empty result.
          print STDERR "could not fetch " . $url . "\n";
          exit -1;
      }
      my $parsed = $parsers2{$parser}[0]->($content);
      put_content($file, $parsed);
  } else {
      print "parser " . $parser . " is not defined\n";
      list_parsers();
      exit -1;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement