Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- # Author: Dawid Mocek
- # PP Projekt
- # For educational purpose only
- # All right reserved
- # HTML Parser - gets information about recipes
- use strict;
- use warnings;
- use feature 'say';
- use POSIX qw(strftime);
- use Data::Dumper;
- use HTML::TreeBuilder::XPath;
- use DBI;
- use Config::IniFiles;
- use constant DBG => 1;
- sub rand_str {
- my $max = shift;
- my @chars = ("A".."Z", "a".."z");
- my $s;
- $s .= $chars[rand @chars] for 1..$max;
- return $s;
- }
- sub trim {
- my $s = shift;
- $s =~ s/^\s+|\s+$//g;
- return $s;
- }
- sub _now {
- return strftime "%Y-%m-%d %T", localtime;
- }
- sub get_ascii_author_name_from_href {
- if ($_[0] =~ /\.pl\/profil\/(\d+)\/(?<ascii_author_name>.*)\.html$/m) {
- return $+{ascii_author_name};
- }
- return rand_str(8);
- }
- sub generate_email {
- return $_[0] . '@' . lc(rand_str(10)) . '.pl';
- }
- sub user_exists {
- my $sth = $_[0];
- my $login = $_[1];
- my $row;
- $sth->execute($login);
- $row = $sth->fetchrow_hashref;
- if(defined $row) {
- return $$row{'id'};
- }
- return 0;
- }
- sub user_add {
- my $sth = $_[0];
- my $screen_name = $_[1];
- my $login = $_[2];
- my $group_id = $_[3];
- my $hash = $_[4];
- $sth->execute($login, $screen_name, generate_email($login), $group_id, $hash);
- say _now . ' Added user: ' . $login . '. id: ' . $sth->{mysql_insertid} if DBG;
- return $sth->{mysql_insertid};
- }
- sub category_exists {
- my $sth = $_[0];
- my $category_name = $_[1];
- my $row;
- $sth->execute($category_name);
- $row = $sth->fetchrow_hashref;
- if(defined $row) {
- return $$row{'id'};
- }
- return 0;
- }
- sub category_add {
- my $sth = $_[0];
- my $category_name = $_[1];
- $sth->execute($category_name);
- say _now . ' Added category: ' . $category_name . '. id: ' . $sth->{mysql_insertid} if DBG;
- return $sth->{mysql_insertid};
- }
- sub cuisine_exists {
- my $sth = $_[0];
- my $recipe_cuisine_name = $_[1];
- my $row;
- $sth->execute($recipe_cuisine_name);
- $row = $sth->fetchrow_hashref;
- if(defined $row) {
- return $$row{'id'};
- }
- return 0;
- }
- sub cuisine_add {
- my $sth = $_[0];
- my $cuisine_name = $_[1];
- $sth->execute($cuisine_name);
- say _now . ' Added cuisine: ' . $cuisine_name . '. id: ' . $sth->{mysql_insertid} if DBG;
- return $sth->{mysql_insertid};
- }
- sub method_exists {
- my $sth = $_[0];
- my $cooking_method_name = $_[1];
- my $row;
- $sth->execute($cooking_method_name);
- $row = $sth->fetchrow_hashref;
- if(defined $row) {
- return $$row{'id'};
- }
- return 0;
- }
- sub yield_min {
- if($_[0] =~ /^(?<min>\d+)/m) {
- return $+{min};
- }
- return 1;
- }
- sub yield_add {
- my $sth = $_[0];
- my $yield_name = $_[1];
- $sth->execute($yield_name, yield_min($yield_name));
- say _now . ' Added yield: ' . $yield_name . '. id: ' . $sth->{mysql_insertid} if DBG;
- return $sth->{mysql_insertid};
- }
- sub main_ingredient_add {
- my $sth = $_[0];
- my $main_ingredient_name = $_[1];
- $sth->execute($main_ingredient_name);
- say _now . ' Added main ingredient: ' . $main_ingredient_name . '. id: ' . $sth->{mysql_insertid} if DBG;
- return $sth->{mysql_insertid};
- }
- sub main_ingredient_exists {
- my $sth = $_[0];
- my $main_ingredient_name = $_[1];
- my $row;
- $sth->execute($main_ingredient_name);
- $row = $sth->fetchrow_hashref;
- if(defined $row) {
- return $$row{'id'};
- }
- return 0;
- }
- sub yield_exists {
- my $sth = $_[0];
- my $yield_name = $_[1];
- my $row;
- $sth->execute($yield_name);
- $row = $sth->fetchrow_hashref;
- if(defined $row) {
- return $$row{'id'};
- }
- return 0;
- }
- sub method_add {
- my $sth = $_[0];
- my $cooking_method_name = $_[1];
- $sth->execute($cooking_method_name);
- say _now . ' Added cooking method: ' . $cooking_method_name . '. id: ' . $sth->{mysql_insertid} if DBG;
- return $sth->{mysql_insertid};
- }
- sub calorycity_exists {
- my $sth = $_[0];
- my $calorcity_name = $_[1];
- my $row;
- $sth->execute($calorcity_name);
- $row = $sth->fetchrow_hashref;
- if(defined $row) {
- return $$row{'id'};
- }
- return 0;
- }
- sub calorycity_add {
- my $sth = $_[0];
- my $calorcity_name = $_[1];
- $sth->execute($calorcity_name);
- say _now . ' Added calorcity: ' . $calorcity_name . '. id: ' . $sth->{mysql_insertid} if DBG;
- return $sth->{mysql_insertid};
- }
- sub recipe_add {
- my($sth, $ds_id, $author_id, $date_created, $date_published,
- $date_modified, $main_ingredient, $recipeInstructions, $yield, $total_time, $title,
- $category, $cuisine, $method, $calorycity, $vegan, $barbecue) = @_;
- $sth->execute($ds_id, $author_id, $date_created, $date_published, $date_modified, $main_ingredient, $recipeInstructions, $yield, $total_time, $title, $category, $cuisine, $method, $calorycity, $vegan, $barbecue);
- say _now . ' Added recipe: ' . $title . '. id: ' . $sth->{mysql_insertid} if DBG;
- return $sth->{mysql_insertid};
- }
- sub is_2int {
- my $str = shift;
- if($str =~ /Tak/m) {
- return 1;
- }
- return 0;
- }
- sub load_ini {
- my $inifile = $_[0];
- unless(-e $inifile) {
- print("File: $inifile does not exists\n");
- exit;
- }
- my $cfg = Config::IniFiles->new(-file => $inifile, -fallback => "General");
- return $cfg;
- }
- my $ini_file = './config.ini';
- my $cfg = load_ini($ini_file);
- ### HTML Parser ###
- my $tree;
- my $date_published_xpath = '/html/body//div[@class="date-publication-wrapper date-wrapper"]/time/@datetime';
- my $total_time_xpath = '//meta[@itemprop="totalTime"]/@content';
- my $recipe_cuisine_xpath = '/html/body//li[@class="ico-cook-type"]/span[@itemprop="recipeCuisine"]/text()';
- my $cooking_method_xpath = '/html/body//li[@class="ico-prepere"]/span[@itemprop="cookingMethod"]/text()';
- my $recipe_yield_xpath = '/html/body//li[@class="ico-people"]/span[@itemprop="recipeYield"]/text()';
- my $recipe_category_xpath = '/html/body/div[@id="Wrapper"]/div[@id="Container"]/div[@id="Content"]/div[@class="main-wrapper"]'.
- '/div[@class="content-wrapper with-sidebar"]/aside[@class="content-sidebar"]/ul[@class="recipe-details-list"]/li[@class="ico-dish"]/span[@class="recipe-details-value"]/text()';
- my $is_vegan_xpath = '/html/body//li[@class="ico-for-vegetarian"]/span[@class="recipe-details-value"]/text()';
- my $is_barbecue_xpath = '/html/body//li[@class="ico-for-grill"]/span[@class="recipe-details-value"]/text()';
- my $main_ingredient_xpath = '/html/body//li[@class="ico-component"]/span[@class="recipe-details-value"]/text()';
- my $author_xpath = '/html/body/div[@id="Wrapper"]/div[@id="Container"]/div[@id="Content"]/div[@class="main-wrapper"]/div[@class="content-wrapper with-sidebar"]'.
- '/header[@class="content-heading with-description decrease-padding"]/div[@class="recipe-gallery-wrapper"]/div[@class="recipe-gallery-content-wrapper"]'.
- '/div[@class="recipe-gallery-left-wrapper"]/div[@class="recipe-gallery-left"]/div[@class="recipe-gallery-top-nav"]/div/span[@class="recipe-gallery-author-wrapper"]/a[@itemprop="author"]';
- my $recipe_title_xpath = '//meta[@name="og:title"]/@content';
- my $ds_id_xpath = '/html/body/div[@id="Wrapper"]/div[@id="Container"]/div[@id="Content"]/div[@class="main-wrapper"]/div[@class="content-wrapper with-sidebar"]/@data-recipe';
- my $instruction_xpath = '/html/body//div[@class="recipe-content-description" and @itemprop="recipeInstructions"]';
- my $instruction_xpath1 = '/html/body//div[@class="recipe-content-description" and @itemprop="recipeInstructions"]/text()';
- my $calorycity_xpath = '/html/body//li[@class="ico-calorycity"]/span[@class="recipe-details-value"]/text()';
- ### Database ###
- my $dbh = DBI->connect('DBI:mysql:database=' . $cfg->val('db', 'name') . ';host=' .
- $cfg->val('db', 'host'),
- $cfg->val('db', 'user'),
- $cfg->val('db', 'pass'),
- {mysql_auto_reconnect => 1, mysql_enable_utf8 => 1});
- my $tb_recipe = $cfg->val('db', 'tb_recipe');
- my $tb_user = $cfg->val('db', 'tb_user');
- my $tb_recipe_category = $cfg->val('db', 'tb_recipe_category');
- my $tb_recipe_cuisine = $cfg->val('db', 'tb_recipe_cuisine');
- my $tb_cooking_method = $cfg->val('db', 'tb_cooking_method');
- my $tb_href = $cfg->val('db', 'tb_href');
- my $tb_calorycity = $cfg->val('db', 'tb_calorycity');
- my $tb_yield = $cfg->val('db', 'tb_yield');
- my $tb_main_ingredient = $cfg->val('db', 'tb_main_ingredient');
- my $hash = $cfg->val('db', 'default_hash');
- my $sth_user_exists = $dbh->prepare(qq{SELECT id FROM $tb_user WHERE login = ? LIMIT 1});
- my $sth_user_add = $dbh->prepare(qq{INSERT INTO $tb_user(login, screenName, email, group_id, password) VALUES(?, ?, ?, ?, ?)});
- my $sth_category_exists = $dbh->prepare(qq{SELECT id FROM $tb_recipe_category WHERE categoryName = ? LIMIT 1});
- my $sth_category_add = $dbh->prepare(qq{INSERT INTO $tb_recipe_category(categoryName) VALUES(?)});
- my $sth_cuisine_exists = $dbh->prepare(qq{SELECT id FROM $tb_recipe_cuisine WHERE cuisineName = ? LIMIT 1});
- my $sth_cuisine_add = $dbh->prepare(qq{INSERT INTO $tb_recipe_cuisine(cuisineName) VALUES(?)});
- my $sth_method_exists = $dbh->prepare(qq{SELECT id FROM $tb_cooking_method WHERE methodName = ? LIMIT 1});
- my $sth_method_add = $dbh->prepare(qq{INSERT INTO $tb_cooking_method(methodName) VALUES(?)});
- my $sth_calorycity_exists = $dbh->prepare(qq{SELECT id FROM $tb_calorycity WHERE calorycityName = ? LIMIT 1});
- my $sth_calorycity_add = $dbh->prepare(qq{INSERT INTO $tb_calorycity(calorycityName) VALUES(?)});
- my $sth_yield_exists = $dbh->prepare(qq{SELECT id FROM $tb_yield WHERE yieldName = ? LIMIT 1});
- my $sth_yield_add = $dbh->prepare(qq{INSERT INTO $tb_yield(yieldName, yieldMin) VALUES(?, ?)});
- my $sth_main_ingredient_exists = $dbh->prepare(qq{SELECT id FROM $tb_main_ingredient WHERE ingredientName = ? LIMIT 1});
- my $sth_main_ingredient_add = $dbh->prepare(qq{INSERT INTO $tb_main_ingredient(ingredientName) VALUES(?)});
- my $sth_one_step = $dbh->prepare(qq{SELECT ds_id FROM $tb_href WHERE steps = 1});
- my $sth_recipe_add = $dbh->prepare(qq{INSERT INTO $tb_recipe(
- ds_id,
- author_id,
- dateCreated,
- datePublished,
- dateModified,
- ingredientMain_id,
- recipeInstructions,
- recipeYield_id,
- totalTime,
- recipeTitle,
- recipeCategory_id,
- recipeCuisine_id,
- cookingMethod_id,
- calorycity_id,
- isVegan,
- isBarbecue) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)});
- my $cnt = 0;
- $sth_one_step->execute();
- while(my $row = $sth_one_step->fetchrow_hashref()) {
- my $ds_id = $$row{'ds_id'};
- my $filename = '/share/przepisy/'.$ds_id .'/'. $ds_id .'.html';
- say _now . ' Parsing: ' . $filename if DBG;
- ### HTML Parser ###
- $tree = HTML::TreeBuilder::XPath->new(ignore_unknown => 0);
- $tree->utf8_mode(1);
- $tree->parse_file($filename);
- ### Stage 1 - searching, storing values ###
- my $recipe_title = trim($tree->findvalue($recipe_title_xpath));
- my $date_published = trim($tree->findvalue($date_published_xpath));
- my $total_time = trim($tree->findvalue($total_time_xpath));
- my $cuisine = trim($tree->findvalue($recipe_cuisine_xpath));
- my $method = trim($tree->findvalue($cooking_method_xpath));
- my $yield = trim($tree->findvalue($recipe_yield_xpath));
- my $category = trim($tree->findvalue($recipe_category_xpath));
- my $main_ingredient = trim($tree->findvalue($main_ingredient_xpath));
- my $calorycity = trim($tree->findvalue($calorycity_xpath));
- my $is_vegan = $tree->findvalue($is_vegan_xpath);
- $is_vegan = is_2int($is_vegan);
- my $is_barbecue = $tree->findvalue($is_barbecue_xpath);
- $is_barbecue = is_2int($is_barbecue);
- my $author_node = $tree->findnodes($author_xpath)->[0];
- my $author_url = $author_node->attr('href');
- my $author_screen_name = $author_node->as_trimmed_text;
- my $author_login = get_ascii_author_name_from_href($author_url);
- my $instruction = '';
- my $instruction_html1 = $tree->findvalue($instruction_xpath1);
- $instruction = $instruction_html1;
- # my $author_tips_content = $instruction_html->look_down(_tag => 'p', class => 'author-tips-content');
- # if(defined $author_tips_content) {
- # my $tips = $author_tips_content->as_trimmed_text;
- # $instruction = $author_tips_content->look_up(_tag => 'div', class => 'recipe-content-description')->[0]->as_trimmed_text;
- # say '-------------';
- # say $instruction;
- # say '--------';
- # }
- # else {
- # $instruction = $instruction_html->as_trimmed_text;
- # }
- # my $instruction_html = $tree->findnodes($instruction_xpath)->[0];
- # my $instruction = $instruction_html;
- # if($instruction_html =~ /<div\s+class=\"recipe-content-description\"\s+itemprop=\"recipeInstructions\">(?<html>.*)<\/div>/m) {
- # $instruction = $+{html};
- # $instruction = trim($instruction);
- # } else {
- # $instruction = $instruction->as_trimmed_text;
- # }
- $dbh->{AutoCommit} = 0;
- $dbh->{RaiseError} = 1;
- eval {
- ### Stage 2 - cheking, adding contexes ###
- my $user_id = user_exists($sth_user_exists, $author_login);
- if($user_id == 0) {
- $user_id = user_add($sth_user_add, $author_screen_name, $author_login, 8, $hash);
- }
- my $category_id = category_exists($sth_category_exists, $category);
- if($category_id == 0) {
- $category_id = category_add($sth_category_add, $category);
- }
- my $cuisine_id = cuisine_exists($sth_cuisine_exists, $cuisine);
- if($cuisine_id == 0) {
- $cuisine = cuisine_add($sth_cuisine_add, $cuisine);
- }
- my $method_id = method_exists($sth_method_exists, $method);
- if($method_id == 0) {
- $method_id = method_add($sth_method_add, $method);
- }
- my $calorycity_id = calorycity_exists($sth_calorycity_exists, $calorycity);
- if($calorycity_id == 0) {
- $calorycity_id = calorycity_add($sth_calorycity_add, $calorycity);
- }
- my $yield_id = yield_exists($sth_yield_exists, $yield);
- if($yield_id == 0) {
- $yield_id = yield_add($sth_yield_add, $yield);
- }
- my $main_ingredient_id = main_ingredient_exists($sth_main_ingredient_exists, $main_ingredient);
- if($main_ingredient_id == 0) {
- $main_ingredient_id = main_ingredient_add($sth_main_ingredient_add, $main_ingredient);
- }
- ### Stage 3 - adding recipe ###
- #say _now . " " .$recipe_title . "|" . $instruction. "|". $author_login ."|". $date_published ."|". $total_time ."|". $cuisine ."|". $method ."|". $yield ."|". $category . "|". $is_vegan . "|" .$main_ingredient . "|" . $is_barbecue . "|" . $calorycity;
- recipe_add($sth_recipe_add, $ds_id, $user_id, $date_published, $date_published, $date_published, $main_ingredient_id, $instruction, $yield_id, $total_time, $recipe_title, $category_id, $cuisine_id, $method_id, $calorycity_id, $is_vegan, $is_barbecue);
- # say $author_email;
- # say $author_name;
- # say $date_published;
- # say $total_time;
- # say $recipe_cuisine;
- # say $cooking_method;
- # say $recipe_yield;
- # say $recipe_category;
- $dbh->commit();
- };
- if ($@) {
- warn "Transaction aborted because $@";
- eval { $dbh->rollback };
- }
- $tree->delete;
- $cnt++;
- }
- $sth_one_step->finish();
- $sth_user_exists->finish();
- $sth_user_add->finish();
- $sth_category_exists->finish();
- $sth_category_add->finish();
- $sth_cuisine_exists->finish();
- $sth_cuisine_add->finish();
- $sth_method_exists->finish();
- $sth_method_add->finish();
- $sth_recipe_add->finish();
- $sth_calorycity_add->finish();
- $sth_calorycity_exists->finish();
- $sth_yield_add->finish();
- $sth_yield_exists->finish();
- $sth_main_ingredient_add->finish();
- $sth_main_ingredient_exists->finish();
- $dbh->disconnect();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement