Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- # Author: Dawid Mocek
- # PP Projekt
- # For educational purposes only
- # All rights reserved
- # HTML ingredients parser - gets information about recipe ingredients
- use strict;
- use warnings;
- use feature 'say';
- use POSIX qw(strftime);
- use Data::Dumper;
- use HTML::TreeBuilder::XPath;
- use HTML::Element;
- use DBI;
- use Config::IniFiles;
- use constant DBG => 1;
# Strip leading and trailing whitespace from a string.
#
# Takes the string to clean as its only argument and returns the stripped
# copy; the caller's variable is left untouched because we work on a copy.
sub trim {
    my ($text) = @_;

    $text =~ s/^\s+//;    # leading whitespace
    $text =~ s/\s+$//;    # trailing whitespace

    return $text;
}
# Return the current local time as a "%Y-%m-%d %T" formatted string.
# Used as the timestamp prefix of every log line in this script.
sub _now {
    my @time_parts = localtime;
    return strftime("%Y-%m-%d %T", @time_parts);
}
# Insert an ingredient group for a recipe.
#
# Arguments:
#   $sth        - prepared INSERT statement handle taking (name, recipe_id)
#   $group_name - name of the group to insert
#   $recipe_id  - id of the recipe the group belongs to
#
# Returns the handle's mysql_insertid attribute after the insert.
# When DBG is true a log line describing the insert is printed.
sub group_add {
    my ($sth, $group_name, $recipe_id) = @_;

    $sth->execute($group_name, $recipe_id);
    my $group_id = $sth->{mysql_insertid};

    if (DBG) {
        say _now() . ' group_add() Added group: ' . $group_name
            . ' for recipe_id: ' . $recipe_id
            . ' group_id: ' . $group_id;
    }

    return $group_id;
}
# Look up an ingredient group by name within one recipe.
#
# Arguments:
#   $sth        - prepared SELECT statement handle taking (name, recipe_id)
#   $group_name - group name to search for
#   $recipe_id  - recipe the group must belong to
#
# Returns the 'id' column of the first fetched row, or 0 when no row
# was returned.
sub group_exists {
    my ($sth, $group_name, $recipe_id) = @_;

    $sth->execute($group_name, $recipe_id);
    my $found = $sth->fetchrow_hashref;

    return defined($found) ? $found->{'id'} : 0;
}
# Insert a new ingredient.
#
# Arguments:
#   $sth      - prepared INSERT statement handle taking (name)
#   $ing_name - name of the ingredient to insert
#
# Returns the handle's mysql_insertid attribute after the insert.
# When DBG is true a log line describing the insert is printed.
sub ing_add {
    my ($sth, $ing_name) = @_;

    $sth->execute($ing_name);
    my $ing_id = $sth->{mysql_insertid};

    if (DBG) {
        say _now() . ' ing_add() Added ingredient: ' . $ing_name . ' ing_id: ' . $ing_id;
    }

    return $ing_id;
}
# Look up an ingredient by name.
#
# Arguments:
#   $sth      - prepared SELECT statement handle taking (name)
#   $ing_name - ingredient name to search for
#
# Returns the 'id' column of the first fetched row, or 0 when the query
# produced no row.
sub ing_exists {
    my ($sth, $ing_name) = @_;

    $sth->execute($ing_name);
    my $match = $sth->fetchrow_hashref;

    # No row at all means the ingredient is not stored yet.
    return 0 unless defined $match;

    return $match->{'id'};
}
# Link an ingredient to an ingredient group, optionally with an amount.
#
# Arguments:
#   $sth        - prepared INSERT statement handle taking
#                 (group id, ingredient id, amount)
#   $group_id   - id of the ingredient group
#   $ing_id     - id of the ingredient
#   $ing_amount - amount text, or undef when the recipe gives none
#
# Returns the handle's mysql_insertid attribute after the insert.
# When DBG is true a log line describing the insert is printed.
sub has_add {
    my ($sth, $group_id, $ing_id, $ing_amount) = @_;

    # Execute exactly once. DBI binds an undef value as SQL NULL, so a
    # missing amount needs no special case. The previous version executed
    # the statement a second time whenever the amount was undef, which
    # inserted the relation twice.
    $sth->execute($group_id, $ing_id, $ing_amount);
    my $has_id = $sth->{mysql_insertid};

    if (DBG) {
        say _now() . ' has_add() Added relation for group_id: ' . $group_id
            . ' ing_id: ' . $ing_id
            . ' has_id: ' . $has_id;
    }

    return $has_id;
}
# Render the collected ingredients as a single log-friendly string.
#
# Arguments: a flat hash (list of key/value pairs) mapping a group name to
# a hash reference of { ingredient name => amount }. An undef amount is
# rendered as the literal text NULL.
#
# Returns a string of the form
#   ings_printer() group: ing => amount, ing => amount | group: ...
#
# Groups and ingredients are emitted in sorted key order so that the same
# data always yields the same line.
sub ings_printer {
    my %contents = @_;

    my @groups;
    for my $group_name (sort keys %contents) {
        my $ings = $contents{$group_name};

        # Dereference explicitly: calling keys/each directly on a hash
        # reference (the old experimental "autoderef") is a compile error
        # since Perl 5.24.
        my @pairs = map { $_ . ' => ' . ($ings->{$_} // 'NULL') }
                    sort keys %{$ings};

        push @groups, $group_name . ': ' . join(', ', @pairs);
    }

    return 'ings_printer() ' . join(' | ', @groups);
}
# Load the script configuration from an INI file.
#
# Arguments:
#   $inifile - path of the INI file to read
#
# Returns the Config::IniFiles object for the file; parameters outside any
# section fall back to the "General" section.
#
# If the file does not exist or cannot be parsed, a message is written to
# STDERR and the script exits with status 1 (previously it exited with
# status 0 on a missing file and returned undef on a parse failure).
sub load_ini {
    my $inifile = $_[0];

    unless (-e $inifile) {
        print STDERR "File: $inifile does not exists\n";
        exit 1;
    }

    my $cfg = Config::IniFiles->new(-file => $inifile, -fallback => "General");

    # new() returns undef when the file cannot be parsed; the reasons are
    # collected in @Config::IniFiles::errors.
    unless (defined $cfg) {
        print STDERR "File: $inifile could not be parsed: "
            . join('; ', @Config::IniFiles::errors) . "\n";
        exit 1;
    }

    return $cfg;
}
### Configuration ###
my $ini_file = './config.ini';
my $cfg      = load_ini($ini_file);

### HTML Parser ###
# XPath of the <div> that wraps all ingredient lists of a recipe page.
my $ing_list_wrapper_xpath = '/html/body//div[@class="ingredients-list-wrapper"]';

### Database ###
my $dbh = DBI->connect(
    'DBI:mysql:database=' . $cfg->val('db', 'name') . ';host=' . $cfg->val('db', 'host'),
    $cfg->val('db', 'user'),
    $cfg->val('db', 'pass'),
    { mysql_auto_reconnect => 1, mysql_enable_utf8 => 1 },
) or die 'Cannot connect to database: ' . $DBI::errstr . "\n";

# Make every later DBI failure fatal, including prepare() and the initial
# recipe query, so errors are caught by the eval blocks or stop the script
# instead of being silently ignored.
$dbh->{RaiseError} = 1;

my $tb_recipe    = $cfg->val('db', 'tb_recipe');
my $tb_ing       = $cfg->val('db', 'tb_ingredient');
my $tb_ing_group = $cfg->val('db', 'tb_ingredient_group');
my $tb_has       = $cfg->val('db', 'tb_has');

my $sth_recipe       = $dbh->prepare(qq{SELECT id, ds_id FROM $tb_recipe ORDER BY ds_id ASC });
my $sth_ing_add      = $dbh->prepare('INSERT INTO `' . $tb_ing .'`(`name`) VALUES(?)');
my $sth_ing_exists   = $dbh->prepare(qq{SELECT id FROM $tb_ing WHERE name = ? });
my $sth_group_add    = $dbh->prepare('INSERT INTO `'. $tb_ing_group . '`(`name`, recipe_id) VALUES(?,?)');
my $sth_group_exists = $dbh->prepare('SELECT id FROM `'.$tb_ing_group .'` WHERE `name` = ? AND recipe_id = ?');
my $sth_has_add      = $dbh->prepare('INSERT INTO `'. $tb_has . '`(ingredientGroup_id, ingredient_id, ingredientAmount) VALUES(?,?,?)');

### Defaults ###
my $default_group_name = $cfg->val('ingredients', 'default_group_name');

# Number of recipes visited so far.
my $cnt = 0;

# Store one ingredient group and all of its ingredients in one transaction.
#
# Named arguments:
#   code          - case code used as log prefix (e.g. '0x01')
#   ds_id         - recipe ds_id, used for logging only
#   recipe_id     - database id of the recipe
#   group_name    - name of the ingredient group; nothing is done if undef
#   items         - array ref of <li> elements holding the ingredients
#   span_criteria - array ref of extra look_down() criteria used to find
#                   the amount <span>
#   contents      - hash ref { group => { ingredient => amount } } that is
#                   filled in for logging
#
# The group (and any missing ingredient) is created on demand. On success
# the transaction is committed; on any error it is rolled back and a
# warning is issued.
my $store_group = sub {
    my (%arg) = @_;

    my $group_name = $arg{group_name};
    return if !defined $group_name;

    eval {
        my $group_id = group_exists($sth_group_exists, $group_name, $arg{recipe_id});
        if ($group_id == 0) {
            $group_id = group_add($sth_group_add, $group_name, $arg{recipe_id});
        }

        for my $li (@{ $arg{items} }) {
            my $div_to_l = $li->look_down(_tag => 'div', class => 'component-wrapper to-l');
            next if !defined $div_to_l;

            # An entry without a <strong> name is skipped entirely.
            my $name_node = $div_to_l->look_down(_tag => 'strong');
            next if !defined $name_node;
            my $ing_name = $name_node->as_trimmed_text;

            # The amount is optional; it stays undef (stored as NULL) when
            # no matching <span> exists.
            my $amount_node = $div_to_l->look_down(_tag => 'span', @{ $arg{span_criteria} });
            my $ing_amount;
            if (defined $amount_node) {
                $ing_amount = $amount_node->as_trimmed_text;
                # NOTE(review): with utf8_mode(1) the text is presumably raw
                # UTF-8 bytes, in which case this rewrites only the second
                # byte of a non-breaking space - confirm against real data.
                $ing_amount =~ s/\xa0/\x20/g;
            }

            my $ing_id = ing_exists($sth_ing_exists, $ing_name);
            if ($ing_id == 0) {
                $ing_id = ing_add($sth_ing_add, $ing_name);
            }

            has_add($sth_has_add, $group_id, $ing_id, $ing_amount);
            $arg{contents}{$group_name}{$ing_name} = $ing_amount;
        }

        say _now() . " $arg{code} " . $arg{ds_id} . ' ' . ings_printer(%{ $arg{contents} }) if DBG;
        $dbh->commit();
        1;
    } or do {
        # Capture $@ right away. The original tested the hash %@ here, which
        # is always empty, so a failed transaction was never rolled back.
        my $error = $@ || 'unknown error';
        warn "Transaction rollback becaouse $error";
        eval { $dbh->rollback() };
    };

    return;
};

$sth_recipe->execute();
$dbh->{AutoCommit} = 0;

while (my $row = $sth_recipe->fetchrow_hashref()) {
    my $ds_id     = $row->{'ds_id'};
    my $recipe_id = $row->{'id'};
    my $filename  = '/share/przepisy/' . $ds_id . '/' . $ds_id . '.html';
    say _now() . ' Parsing: ' . $filename if DBG;

    ### HTML Parser ###
    my $tree = HTML::TreeBuilder::XPath->new(ignore_unknown => 0);
    $tree->utf8_mode(1);

    # parse_file() returns undef when the file cannot be opened; report it
    # and move on instead of analysing an empty tree.
    if (!defined $tree->parse_file($filename)) {
        warn _now() . " Cannot read $filename: $!\n";
        $tree->delete;
        $cnt++;
        next;
    }

    ### Stage 1 - searching, storing values ###
    my $ing_list_wrapper_nodes = $tree->findnodes($ing_list_wrapper_xpath)->[0];

    if (defined $ing_list_wrapper_nodes) {
        my @uls     = $ing_list_wrapper_nodes->look_down(_tag => 'ul', class => 'ingredients-list');
        my @strongs = $ing_list_wrapper_nodes->look_down(_tag => 'strong', class => 'ingredients-group-title');
        my $uls_size     = scalar @uls;
        my $strongs_size = scalar @strongs;

        # Common tail of every per-recipe status line.
        my $summary = $ds_id . ' uls size: ' . $uls_size . ' strongs size: '. $strongs_size;

        #####
        # 1st case - most convenient:
        # n <strong> tags
        # n <ul> tags
        # (n > 0; the 0/0 combination is the 4th case below)
        #
        if ($uls_size > 0 and $uls_size == $strongs_size) {
            say _now() . ' 0x01 ' . $summary;
            my %contents = ();

            # The i-th <strong> names the group stored in the i-th <ul>.
            for my $idx (0 .. $#uls) {
                my @lis = $uls[$idx]->look_down(_tag => 'li', class => 'group');
                $store_group->(
                    code          => '0x01',
                    ds_id         => $ds_id,
                    recipe_id     => $recipe_id,
                    group_name    => $strongs[$idx]->as_trimmed_text,
                    items         => \@lis,
                    span_criteria => [],
                    contents      => \%contents,
                );
            }
        }
        #####
        # 2nd case:
        # 0 <strong> tags
        # n <ul> tags
        #
        elsif ($uls_size > 0 and $strongs_size == 0) {
            say _now() . ' 0x02 ' . $summary;
            my %contents = ();

            # Only the first <ul> is processed; it is stored under the
            # default group name from the configuration.
            my @lis = $uls[0]->look_down(_tag => 'li', class => 'group');
            $store_group->(
                code          => '0x02',
                ds_id         => $ds_id,
                recipe_id     => $recipe_id,
                group_name    => $default_group_name,
                items         => \@lis,
                span_criteria => [ class => '' ],
                contents      => \%contents,
            );
        }
        ####
        # 3rd case:
        # 0 <ul> tags
        # n <strong> tags
        #
        elsif ($uls_size == 0 and $strongs_size > 0) {
            say _now() . ' 0x03 ' . $summary;
        }
        #####
        # 4th case:
        # 0 <strong> tags
        # 0 <ul> tags
        #
        elsif ($uls_size == 0 and $strongs_size == 0) {
            say _now() . ' 0x04 ' . $summary;
        }
        #####
        # 5th case - the worst:
        # m <strong> tags
        # n <ul> tags
        #
        elsif ($uls_size != $strongs_size) {
            say _now() . ' 0x05 ' . $summary;
        }
        #####
        # 6th case - unrecognized (should be unreachable, kept as a guard):
        #
        else {
            say _now() . ' 0x06 ' . $summary;
        }
    }
    else {
        say _now() . ' 0x00 ' . $ds_id . ' ingredients list wrapper not found';
    }

    # Release the parse tree explicitly before the next recipe.
    $tree->delete;
    $cnt++;
}

$sth_recipe->finish();
$dbh->disconnect();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement