m4ly

DS_PARALLEL_DOWNLOADER.pl

Apr 27th, 2015
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 2.07 KB | None | 0 0
  1. #!/usr/bin/perl
  2.  
  3.  
  4. # Author: Dawid Mocek
  5. # PP Projekt
  6. # For educational purpose only
  7. # All rights reserved
  8.  
  9. # DS parallel recipes downloader
  10. use strict;
  11. use warnings;
  12.  
  13.  
  14. use LWP::Simple;
  15. use LWP::Parallel;
  16. use Data::Dumper;
  17. use HTML::TreeBuilder::XPath;
  18. use XML::Writer;
  19. use DBI;
  20. use Config::IniFiles;
  21.  
  # load_ini($inifile)
  #
  # Load an INI configuration file and return it as a Config::IniFiles object.
  #
  # Parameters:
  #   $inifile - path to the INI file to read.
  #
  # Returns:
  #   A Config::IniFiles object; parameters that appear before any section
  #   header are placed in the "General" section (-fallback).
  #
  # Dies:
  #   - when no file path is given or the file does not exist;
  #   - when Config::IniFiles cannot parse the file (its constructor returns
  #     undef in that case and records the reasons in @Config::IniFiles::errors).
  #
  # Failures are raised with die so the script ends with a non-zero exit status
  # and the message goes to STDERR, instead of a silent "successful" exit.
  sub load_ini {
      my ($inifile) = @_;

      die "load_ini: no configuration file given\n"
          unless defined $inifile && length $inifile;

      unless (-e $inifile) {
          die "File: $inifile does not exists\n";
      }

      my $cfg = Config::IniFiles->new(-file => $inifile, -fallback => "General");

      # A syntactically broken file yields undef here; fail now with the
      # parser's own diagnostics rather than later on the first ->val call.
      unless (defined $cfg) {
          my $reasons = join "\n", @Config::IniFiles::errors;
          die "Cannot parse $inifile:\n$reasons\n";
      }

      return $cfg;
  }
  34.  
  35.  
  # ---------------------------------------------------------------------------
  # Main script: walk the paginated recipe listing (pages 1..1000), extract the
  # title/href of every recipe link on each page and insert them into MySQL.
  # All connection settings come from ./config.ini.
  # ---------------------------------------------------------------------------
  my $ini_file = './config.ini';
  my $cfg = load_ini($ini_file);

  # Config values are read into scalars on purpose: Config::IniFiles::val
  # returns an empty list for a missing key when called in list context, which
  # would silently shift the remaining arguments of a call such as
  # DBI->connect(...).
  my $base_url = $cfg->val('ds', 'url');
  die "Missing 'url' in section [ds] of $ini_file\n" unless defined $base_url;

  my $url = $base_url . "/przepisy/sortowanie/2/strona";

  my $grids_xpath = '/html/body/div[@id="Wrapper"]/div[@id="Container"]/div[@id="Content"]/div[@class="main-wrapper"]/div[@class="content-wrapper with-sidebar"]/section[@class="content"]/div[@class="grid-wrapper"]/div[@class="grid-item"]/div[@class="grid-title"]//a';

  ### LWP ###
  my $ua = LWP::UserAgent->new;
  my $ua_timeout = $cfg->val('ua', 'timeout');
  my $ua_agent   = $cfg->val('ua', 'agent');
  # Only override the LWP defaults when the setting is actually configured.
  $ua->timeout($ua_timeout) if defined $ua_timeout;
  $ua->agent($ua_agent)     if defined $ua_agent;

  ### Database ###
  my $db_name = $cfg->val('db', 'name');
  my $db_host = $cfg->val('db', 'host');
  my $db_user = $cfg->val('db', 'user');
  my $db_pass = $cfg->val('db', 'pass');
  for my $required ([ name => $db_name ], [ host => $db_host ]) {
      die "Missing '$required->[0]' in section [db] of $ini_file\n"
          unless defined $required->[1];
  }

  my $dbh = DBI->connect(
      'DBI:mysql:database=' . $db_name . ';host=' . $db_host,
      $db_user,
      $db_pass,
      { mysql_auto_reconnect => 1, mysql_enable_utf8 => 1 },
  ) or die "Cannot connect to database: $DBI::errstr\n";

  my $stmt_href = $dbh->prepare('INSERT INTO `doradcasmaku_recipies_links`(title, href) VALUES(?, ?)')
      or die "Cannot prepare insert statement: " . $dbh->errstr . "\n";

  binmode STDOUT, ':encoding(UTF-8)';

  foreach my $page (1 .. 1000) {

      my $absolute_url = $url . "/" . $page;

      print "Fetching: " . $absolute_url;
      my $response = $ua->get($absolute_url);
      print " " . $response->code . "\n";

      next unless $response->is_success;

      # decoded_content yields the body only, as character data; as_string
      # would also hand the status line and headers to the HTML parser and
      # leave the text undecoded.
      my $html = $response->decoded_content;
      next unless defined $html;    # undef means the body could not be decoded

      # A fresh tree per page: reusing one parser object keeps appending to the
      # same document, so links from earlier pages would be found (and
      # inserted) again on every later page.
      my $tree = HTML::TreeBuilder::XPath->new(ignore_unknown => 0);
      $tree->parse($html);
      $tree->eof;

      foreach my $grid ($tree->findnodes($grids_xpath)) {
          # Insert failures are reported by DBI (PrintError) and do not stop
          # the crawl.
          $stmt_href->execute($grid->attr('title'), $grid->attr('href'));
      }

      # Release the element tree explicitly so memory does not grow per page.
      $tree->delete;
  }

  $dbh->disconnect();
Add Comment
Please, Sign In to add comment