Advertisement
m4ly

DS_HTML_ING_PARSER.pl

May 4th, 2015
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 10.34 KB | None | 0 0
  1. #!/usr/bin/perl
  2.  
  3. # Author: Dawid Mocek
  4. # PP Projekt
  5. # For educational purpose only
  6. # All rights reserved
  7.  
  8. # HTML ingredients parser - gets information about recipe ingredients
  9. use strict;
  10. use warnings;
  11.  
  12. use feature 'say';
  13.  
  14. use POSIX qw(strftime);
  15. use Data::Dumper;
  16. use HTML::TreeBuilder::XPath;
  17. use HTML::Element;
  18. use DBI;
  19. use Config::IniFiles;
  20.  
  21. use constant DBG => 1;
  22.  
  # Returns a copy of the given string with leading and trailing
  # whitespace removed. The caller's variable is not modified.
  sub trim {
      my ($text) = @_;
      (my $stripped = $text) =~ s/^\s+|\s+$//g;
      return $stripped;
  }
  28.  
  # Returns the current local time formatted as "YYYY-MM-DD HH:MM:SS";
  # used as the prefix of every log line.
  sub _now {
      my @local_time = localtime;
      return strftime("%Y-%m-%d %T", @local_time);
  }
  32.  
  # Inserts a new ingredient group for a recipe.
  #   $sth        - prepared INSERT statement taking (name, recipe_id)
  #   $group_name - name of the group
  #   $recipe_id  - id of the recipe the group belongs to
  # Returns the statement handle's mysql_insertid after the insert.
  sub group_add {
      my ($sth, $group_name, $recipe_id) = @_;

      $sth->execute($group_name, $recipe_id);

      if (DBG) {
          say _now() . ' group_add() Added group: '. $group_name . ' for recipe_id: ' . $recipe_id .' group_id: ' . $sth->{mysql_insertid};
      }
      return $sth->{mysql_insertid};
  }
  41.  
  # Looks up an ingredient group by name within one recipe.
  #   $sth        - prepared SELECT statement taking (name, recipe_id)
  #   $group_name - name of the group
  #   $recipe_id  - id of the recipe
  # Returns the 'id' column of the first row found, or 0 when there is no row.
  sub group_exists {
      my ($sth, $group_name, $recipe_id) = @_;

      $sth->execute($group_name, $recipe_id);
      my $found = $sth->fetchrow_hashref;

      return defined $found ? $found->{id} : 0;
  }
  54.  
  # Inserts a new ingredient.
  #   $sth      - prepared INSERT statement taking (name)
  #   $ing_name - name of the ingredient
  # Returns the statement handle's mysql_insertid after the insert.
  sub ing_add {
      my ($sth, $ing_name) = @_;

      $sth->execute($ing_name);

      if (DBG) {
          say _now() . ' ing_add() Added ingredient: '. $ing_name . ' ing_id: ' . $sth->{mysql_insertid};
      }
      return $sth->{mysql_insertid};
  }
  62.  
  # Looks up an ingredient by name.
  #   $sth      - prepared SELECT statement taking (name)
  #   $ing_name - name of the ingredient
  # Returns the 'id' column of the first row found, or 0 when there is no row.
  sub ing_exists {
      my ($sth, $ing_name) = @_;

      $sth->execute($ing_name);
      my $found = $sth->fetchrow_hashref;

      return 0 unless defined $found;
      return $found->{id};
  }
  74.  
  # Inserts one "group has ingredient" relation row.
  #   $sth        - prepared INSERT statement taking
  #                 (ingredientGroup_id, ingredient_id, ingredientAmount)
  #   $group_id   - id of the ingredient group
  #   $ing_id     - id of the ingredient
  #   $ing_amount - amount text, or undef when the recipe gives no amount
  # Returns the statement handle's mysql_insertid after the insert.
  sub has_add {
      my ($sth, $group_id, $ing_id, $ing_amount) = @_;

      # Exactly one execute per call. DBI binds an undef placeholder value as
      # SQL NULL, so no separate branch is needed for a missing amount (the
      # earlier special case fell through and inserted the row a second time).
      $sth->execute($group_id, $ing_id, $ing_amount);

      my $has_id = $sth->{mysql_insertid};
      say _now() . ' has_add() Added relation for group_id: '. $group_id . ' ing_id: ' . $ing_id . ' has_id: ' . $has_id if DBG;
      return $has_id;
  }
  87.  
  # Formats collected ingredients as one log line.
  #   Arguments: a flattened hash  group name => { ingredient name => amount }
  #   Returns:   'ings_printer() G1: a => 1, b => NULL | G2: c => 2'
  # An undefined amount is printed as NULL. Groups and ingredients are listed
  # in sorted order so the same data always yields the same line.
  sub ings_printer {
      my %contents = @_;

      my @group_parts;
      for my $group_name (sort keys %contents) {
          my $ings = $contents{$group_name};

          # Explicit %$ings dereference: calling keys/each directly on a hash
          # reference was an experimental feature that newer perls reject.
          my @ing_parts = map { $_ . ' => ' . ($ings->{$_} // 'NULL') }
                          sort keys %$ings;

          push @group_parts, $group_name . ': ' . join(', ', @ing_parts);
      }

      return 'ings_printer() ' . join(' | ', @group_parts);
  }
  108.  
  # Loads the INI configuration file.
  #   $inifile - path to the configuration file
  # Returns the Config::IniFiles object (sectionless parameters fall back to
  # the "General" section).
  # Dies when no path is given, the file does not exist, or the constructor
  # returns nothing, so a broken configuration ends the script with an error
  # message on STDERR and a non-zero exit status.
  sub load_ini {
      my ($inifile) = @_;

      die "No ini file name given\n" unless defined $inifile;
      die "File: $inifile does not exists\n" unless -e $inifile;

      my $cfg = Config::IniFiles->new(-file => $inifile, -fallback => "General");

      # NOTE(review): Config::IniFiles is expected to return undef and fill
      # @Config::IniFiles::errors when the file cannot be parsed - confirm
      # against the installed module version.
      unless (defined $cfg) {
          no warnings 'once';
          die "File: $inifile could not be parsed: "
              . join('; ', @Config::IniFiles::errors) . "\n";
      }

      return $cfg;
  }
  121.  
  122.  
  ### Configuration ###
  # All connection settings and table names come from ./config.ini.
  my $ini_file = './config.ini';
  my $cfg = load_ini($ini_file);

  ### HTML Parser ###
  my $tree;
  my $ing_list_wrapper_xpath = '/html/body//div[@class="ingredients-list-wrapper"]';

  ### Database ###
  # Fetch every setting into a scalar first: inside an argument list val() is
  # called in list context, where a multi-valued entry would change the number
  # of arguments passed to connect().
  my $db_name = $cfg->val('db', 'name');
  my $db_host = $cfg->val('db', 'host');
  my $db_user = $cfg->val('db', 'user');
  my $db_pass = $cfg->val('db', 'pass');

  # RaiseError is switched on at connect time so that a failed connect or
  # prepare stops the script right here instead of leaving an undefined handle
  # behind for later code to trip over.
  my $dbh = DBI->connect(
      'DBI:mysql:database=' . $db_name . ';host=' . $db_host,
      $db_user,
      $db_pass,
      { mysql_auto_reconnect => 1, mysql_enable_utf8 => 1, RaiseError => 1 },
  );

  my $tb_recipe    = $cfg->val('db', 'tb_recipe');
  my $tb_ing       = $cfg->val('db', 'tb_ingredient');
  my $tb_ing_group = $cfg->val('db', 'tb_ingredient_group');
  my $tb_has       = $cfg->val('db', 'tb_has');

  # Recipes to process, ordered by ds_id.
  my $sth_recipe = $dbh->prepare(qq{SELECT id, ds_id FROM $tb_recipe  ORDER BY ds_id ASC });

  # Ingredient insert / lookup by name.
  my $sth_ing_add    = $dbh->prepare('INSERT INTO `' . $tb_ing .'`(`name`) VALUES(?)');
  my $sth_ing_exists = $dbh->prepare(qq{SELECT id FROM $tb_ing WHERE name = ? });

  # Ingredient group insert / lookup by (name, recipe_id).
  my $sth_group_add    = $dbh->prepare('INSERT INTO `'. $tb_ing_group . '`(`name`, recipe_id) VALUES(?,?)');
  my $sth_group_exists = $dbh->prepare('SELECT id FROM `'.$tb_ing_group .'` WHERE `name` = ? AND recipe_id = ?');

  # Group <-> ingredient relation insert.
  my $sth_has_add = $dbh->prepare('INSERT INTO `'. $tb_has . '`(ingredientGroup_id, ingredient_id, ingredientAmount) VALUES(?,?,?)');

  ### Defaults ###
  # Group name used for recipes whose ingredient list has no group titles.
  my $default_group_name = $cfg->val('ingredients', 'default_group_name');

  my $cnt = 0;            # number of recipes processed
  $sth_recipe->execute();
  my $code = '0x00';      # code of the last import case taken, used in log lines

  # From here on inserts are grouped into explicit transactions.
  $dbh->{AutoCommit} = 0;
  161.  
  # Imports one ingredient group inside its own transaction: looks up or
  # inserts the group, then stores every ingredient found in the given <li>
  # elements and commits. Any error raised on the way rolls the transaction
  # back. Uses the statement handles and $dbh declared at file level.
  # Named arguments:
  #   group_name      - name of the ingredient group
  #   recipe_id       - id of the recipe being imported
  #   ds_id           - ds_id of the recipe (log output only)
  #   code            - case code (log output only)
  #   lis             - array ref of <li> elements to scan
  #   amount_criteria - array ref of look_down() criteria locating the amount
  #   contents        - hash ref, group name => { ingredient => amount },
  #                     filled in for the debug log line
  # Returns 1 when the transaction was committed, 0 when it was rolled back.
  my $import_group = sub {
      my %arg = @_;
      my $group_name = $arg{group_name};
      my $contents   = $arg{contents};

      my $committed = eval {
          my $group_id = group_exists($sth_group_exists, $group_name, $arg{recipe_id})
              || group_add($sth_group_add, $group_name, $arg{recipe_id});

          LI:
          for my $li (@{ $arg{lis} }) {
              my $div_to_l = $li->look_down(_tag => 'div', class => 'component-wrapper to-l');
              next LI unless defined $div_to_l;

              # Scalar assignment: only the first matching element is wanted.
              my $name_node   = $div_to_l->look_down(_tag => 'strong');
              my $amount_node = $div_to_l->look_down(@{ $arg{amount_criteria} });

              # An entry without a name element cannot be stored.
              next LI unless defined $name_node;

              my $ing_name = $name_node->as_trimmed_text;
              my $ing_amount;    # stays undef when the entry has no amount
              if (defined $amount_node) {
                  $ing_amount = $amount_node->as_trimmed_text;
                  $ing_amount =~ s/\xa0/\x20/g;    # 0xA0 (nbsp) -> plain space
              }

              my $ing_id = ing_exists($sth_ing_exists, $ing_name)
                  || ing_add($sth_ing_add, $ing_name);
              has_add($sth_has_add, $group_id, $ing_id, $ing_amount);
              $contents->{$group_name}{$ing_name} = $ing_amount;
          }

          say _now() . " $arg{code} " . $arg{ds_id} . ' '. ings_printer(%$contents) if DBG;
          $dbh->commit();
          1;
      };
      return 1 if $committed;

      # Test the eval result / $@ (not the hash %@, which is always empty) so
      # that a failed group really is rolled back.
      my $error = $@ || 'unknown error';
      warn "Transaction rollback becaouse $error";
      eval { $dbh->rollback() };
      return 0;
  };

  # Main loop: one HTML file per recipe row.
  while (my $row = $sth_recipe->fetchrow_hashref()) {
      my $ds_id     = $row->{ds_id};
      my $recipe_id = $row->{id};
      my $filename  = '/share/przepisy/'.$ds_id .'/'. $ds_id .'.html';

      say _now() . ' Parsing: ' . $filename  if DBG;

      ### HTML Parser ###
      $tree = HTML::TreeBuilder::XPath->new(ignore_unknown => 0);
      $tree->utf8_mode(1);

      ### Stage 1 - searching, storing values ###
      my $ing_list_wrapper_node;
      if (defined $tree->parse_file($filename)) {
          $ing_list_wrapper_node = $tree->findnodes($ing_list_wrapper_xpath)->[0];
      }
      else {
          # The file could not be opened; report why. The recipe is then
          # logged as 0x00 below like any other page without a wrapper.
          warn _now() . " Cannot parse $filename: $!\n";
      }

      if (defined $ing_list_wrapper_node) {
          my @uls     = $ing_list_wrapper_node->look_down(_tag => 'ul', class => 'ingredients-list');
          my @strongs = $ing_list_wrapper_node->look_down(_tag => 'strong', class => 'ingredients-group-title');

          my $uls_size     = scalar @uls;
          my $strongs_size = scalar @strongs;
          my $sizes = $ds_id . ' uls size: ' . $uls_size . ' strongs size: '.  $strongs_size;

          #####
          # 1st case - most convenient:
          #    n <strong> tags
          #    n <ul> tags
          # (n > 0; the 0/0 combination is the 4th case below)
          #
          if ($uls_size > 0 and $uls_size == $strongs_size) {
              $code = '0x01';
              say _now() . " $code " . $sizes;

              # Shared by all groups of this recipe, so each debug line shows
              # everything collected for the recipe so far.
              my %contents = ();
              for my $i (0 .. $#uls) {
                  my @lis = $uls[$i]->look_down(_tag => 'li', class => 'group');
                  my $group_name = $strongs[$i]->as_trimmed_text;
                  next unless defined $group_name;

                  $import_group->(
                      group_name      => $group_name,
                      recipe_id       => $recipe_id,
                      ds_id           => $ds_id,
                      code            => $code,
                      lis             => \@lis,
                      amount_criteria => [ _tag => 'span' ],
                      contents        => \%contents,
                  );
              }
          }
          #####
          # 2nd case:
          #    0 <strong> tags
          #    n <ul>     tags
          # Only the first <ul> is imported, under the default group name.
          #
          elsif ($uls_size > 0 and $strongs_size == 0) {
              $code = '0x02';
              say _now() . " $code " . $sizes;

              my %contents = ();
              if (defined $default_group_name) {
                  my @lis = $uls[0]->look_down(_tag => 'li', class => 'group');
                  $import_group->(
                      group_name      => $default_group_name,
                      recipe_id       => $recipe_id,
                      ds_id           => $ds_id,
                      code            => $code,
                      lis             => \@lis,
                      amount_criteria => [ _tag => 'span', class => '' ],
                      contents        => \%contents,
                  );
              }
          }
          #####
          # 3rd case:
          #    0 <ul> tags
          #    n <strong> tags
          #
          elsif ($uls_size == 0 and $strongs_size > 0) {
              say _now() . ' 0x03 ' . $sizes;
          }
          #####
          # 4th case:
          #    0 <strong> tags
          #    0 <ul> tags
          #
          elsif ($uls_size == 0 and $strongs_size == 0) {
              say _now() . ' 0x04 ' . $sizes;
          }
          #####
          # 5th case - the worst:
          #    m <strong> tags
          #    n <ul> tags
          #
          elsif ($uls_size != $strongs_size) {
              say _now() . ' 0x05 ' . $sizes;
          }
          #####
          # 6th case - unrecognized (defensive default):
          #
          else {
              say _now() . ' 0x06 ' . $sizes;
          }
      }
      else {
          say _now() . ' 0x00 ' . $ds_id . ' ingredients list wrapper not found';
      }

      # Release the parsed tree explicitly before moving to the next file.
      $tree->delete;
      $cnt++;
  }


  $sth_recipe->finish();
  $dbh->disconnect();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement