Advertisement
m4ly

DS_HTML_INF_PARSER.pl

May 1st, 2015
98
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 15.90 KB | None | 0 0
  1. #!/usr/bin/perl
  2.  
  3. # Author: Dawid Mocek
  4. # PP Projekt
  5. # For educational purpose only
  6. # All right reserved
  7.  
  8. # HTML Parser - gets information about recipes
  9.  
  10. use strict;
  11. use warnings;
  12.  
  13. use feature 'say';
  14.  
  15. use POSIX qw(strftime);
  16. use Data::Dumper;
  17. use HTML::TreeBuilder::XPath;
  18. use DBI;
  19. use Config::IniFiles;
  20.  
  21. use constant DBG => 1;
  22.  
  23. sub rand_str {
  24.     my $max = shift;
  25.     my @chars = ("A".."Z", "a".."z");
  26.     my $s;
  27.     $s .= $chars[rand @chars] for 1..$max;
  28.     return $s;
  29. }
  30.  
  31. sub trim {
  32.     my $s = shift;
  33.     $s =~ s/^\s+|\s+$//g;
  34.     return $s;
  35. }
  36.  
  37. sub _now {
  38.     return strftime "%Y-%m-%d %T", localtime;
  39. }
  40.  
  41. sub get_ascii_author_name_from_href {
  42.     if ($_[0] =~ /\.pl\/profil\/(\d+)\/(?<ascii_author_name>.*)\.html$/m) {
  43.          return  $+{ascii_author_name};
  44.     }
  45.     return rand_str(8);
  46. }
  47.  
  48. sub generate_email {
  49.     return  $_[0] . '@' . lc(rand_str(10)) . '.pl';
  50. }
  51.  
  52. sub user_exists {
  53.     my $sth = $_[0];
  54.     my $login = $_[1];
  55.     my $row;
  56.     $sth->execute($login);
  57.     $row = $sth->fetchrow_hashref;
  58.     if(defined $row) {
  59.     return $$row{'id'};
  60.     }
  61.     return 0;
  62. }
  63.  
  64. sub user_add {
  65.     my $sth = $_[0];
  66.     my $screen_name = $_[1];
  67.     my $login = $_[2];
  68.     my $group_id = $_[3];
  69.     my $hash = $_[4];
  70.     $sth->execute($login, $screen_name, generate_email($login), $group_id, $hash);
  71.     say _now . ' Added user: ' . $login . '. id: ' . $sth->{mysql_insertid} if DBG;
  72.     return $sth->{mysql_insertid};
  73. }
  74.  
  75. sub category_exists {
  76.     my $sth = $_[0];
  77.     my $category_name = $_[1];
  78.     my $row;
  79.     $sth->execute($category_name);
  80.     $row = $sth->fetchrow_hashref;
  81.     if(defined $row) {
  82.     return $$row{'id'};
  83.     }
  84.     return 0;
  85. }
  86.  
  87. sub category_add {
  88.     my $sth = $_[0];
  89.     my $category_name = $_[1];
  90.     $sth->execute($category_name);
  91.     say _now . ' Added category: ' . $category_name . '. id: ' . $sth->{mysql_insertid} if DBG;
  92.     return $sth->{mysql_insertid};
  93. }
  94.  
  95. sub cuisine_exists {
  96.     my $sth = $_[0];
  97.     my $recipe_cuisine_name = $_[1];
  98.     my $row;
  99.     $sth->execute($recipe_cuisine_name);
  100.     $row = $sth->fetchrow_hashref;
  101.     if(defined $row) {
  102.     return $$row{'id'};
  103.     }
  104.     return 0;
  105. }
  106.  
  107. sub cuisine_add {
  108.     my $sth = $_[0];
  109.     my $cuisine_name = $_[1];
  110.     $sth->execute($cuisine_name);
  111.     say _now . ' Added cuisine: ' . $cuisine_name . '. id: ' . $sth->{mysql_insertid} if DBG;
  112.     return $sth->{mysql_insertid};
  113. }
  114.  
  115. sub method_exists {
  116.     my $sth = $_[0];
  117.     my $cooking_method_name = $_[1];
  118.     my $row;
  119.     $sth->execute($cooking_method_name);
  120.     $row = $sth->fetchrow_hashref;
  121.     if(defined $row) {
  122.     return $$row{'id'};
  123.     }
  124.     return 0;
  125. }
  126.  
  127. sub yield_min {
  128.     if($_[0] =~ /^(?<min>\d+)/m) {
  129.     return $+{min};
  130.     }
  131.     return 1;
  132. }
  133.  
  134. sub yield_add {
  135.     my $sth = $_[0];
  136.     my $yield_name = $_[1];
  137.     $sth->execute($yield_name, yield_min($yield_name));
  138.     say _now . ' Added yield: ' . $yield_name . '. id: ' . $sth->{mysql_insertid} if DBG;
  139.     return $sth->{mysql_insertid};
  140. }
  141.  
  142. sub main_ingredient_add {
  143.     my $sth = $_[0];
  144.     my $main_ingredient_name = $_[1];
  145.     $sth->execute($main_ingredient_name);
  146.     say _now . ' Added main ingredient: ' . $main_ingredient_name . '. id: ' . $sth->{mysql_insertid} if DBG;
  147.     return $sth->{mysql_insertid};
  148. }
  149.  
  150. sub main_ingredient_exists {
  151.     my $sth = $_[0];
  152.     my $main_ingredient_name = $_[1];
  153.     my $row;
  154.     $sth->execute($main_ingredient_name);
  155.     $row = $sth->fetchrow_hashref;
  156.     if(defined $row) {
  157.     return $$row{'id'};
  158.     }
  159.     return 0;
  160. }
  161.  
  162. sub yield_exists {
  163.     my $sth = $_[0];
  164.     my $yield_name = $_[1];
  165.     my $row;
  166.     $sth->execute($yield_name);
  167.     $row = $sth->fetchrow_hashref;
  168.     if(defined $row) {
  169.     return $$row{'id'};
  170.     }
  171.     return 0;
  172. }
  173.  
  174. sub method_add {
  175.     my $sth = $_[0];
  176.     my $cooking_method_name = $_[1];
  177.     $sth->execute($cooking_method_name);
  178.     say _now . ' Added cooking method: ' . $cooking_method_name . '. id: ' . $sth->{mysql_insertid} if DBG;
  179.     return $sth->{mysql_insertid};
  180. }
  181.  
  182. sub calorycity_exists {
  183.     my $sth = $_[0];
  184.     my $calorcity_name = $_[1];
  185.     my $row;
  186.     $sth->execute($calorcity_name);
  187.     $row = $sth->fetchrow_hashref;
  188.     if(defined $row) {
  189.     return $$row{'id'};
  190.     }
  191.     return 0;
  192. }
  193.  
  194. sub calorycity_add {
  195.     my $sth = $_[0];
  196.     my $calorcity_name = $_[1];
  197.     $sth->execute($calorcity_name);
  198.     say _now . ' Added calorcity: ' . $calorcity_name . '. id: ' . $sth->{mysql_insertid} if DBG;
  199.     return $sth->{mysql_insertid};
  200. }
  201.  
  202. sub recipe_add {
  203.     my($sth, $ds_id, $author_id, $date_created, $date_published,
  204.        $date_modified,  $main_ingredient, $recipeInstructions, $yield, $total_time, $title,
  205.        $category, $cuisine, $method, $calorycity, $vegan, $barbecue) = @_;
  206.  
  207.     $sth->execute($ds_id, $author_id, $date_created, $date_published, $date_modified, $main_ingredient, $recipeInstructions, $yield, $total_time, $title, $category, $cuisine, $method, $calorycity, $vegan, $barbecue);
  208.     say _now . ' Added recipe: ' . $title . '. id: ' . $sth->{mysql_insertid} if DBG;
  209.     return $sth->{mysql_insertid};
  210. }
  211.  
  212. sub is_2int {
  213.     my $str = shift;
  214.     if($str =~ /Tak/m) {
  215.     return 1;
  216.     }
  217.     return 0;
  218. }
  219.  
  220.  
  221. sub load_ini {
  222.  
  223.     my $inifile = $_[0];
  224.  
  225.     unless(-e $inifile) {
  226.          print("File: $inifile does not exists\n");
  227.          exit;
  228.          }
  229.  
  230.     my $cfg =  Config::IniFiles->new(-file => $inifile, -fallback => "General");
  231.     return $cfg;
  232. }
  233.  
  234.  
  235. my $ini_file = './config.ini';
  236. my $cfg = load_ini($ini_file);
  237.  
  238. ### HTML Parser ###
  239. my $tree;
  240.  
  241. my $date_published_xpath    = '/html/body//div[@class="date-publication-wrapper date-wrapper"]/time/@datetime';
  242. my $total_time_xpath        = '//meta[@itemprop="totalTime"]/@content';
  243. my $recipe_cuisine_xpath    = '/html/body//li[@class="ico-cook-type"]/span[@itemprop="recipeCuisine"]/text()';
  244. my $cooking_method_xpath    = '/html/body//li[@class="ico-prepere"]/span[@itemprop="cookingMethod"]/text()';
  245. my $recipe_yield_xpath      = '/html/body//li[@class="ico-people"]/span[@itemprop="recipeYield"]/text()';
  246.  
  247. my $recipe_category_xpath   = '/html/body/div[@id="Wrapper"]/div[@id="Container"]/div[@id="Content"]/div[@class="main-wrapper"]'.
  248.                     '/div[@class="content-wrapper with-sidebar"]/aside[@class="content-sidebar"]/ul[@class="recipe-details-list"]/li[@class="ico-dish"]/span[@class="recipe-details-value"]/text()';
  249.                    
  250. my $is_vegan_xpath      = '/html/body//li[@class="ico-for-vegetarian"]/span[@class="recipe-details-value"]/text()';
  251. my $is_barbecue_xpath       = '/html/body//li[@class="ico-for-grill"]/span[@class="recipe-details-value"]/text()';
  252. my $main_ingredient_xpath   = '/html/body//li[@class="ico-component"]/span[@class="recipe-details-value"]/text()';
  253.  
  254. my $author_xpath        = '/html/body/div[@id="Wrapper"]/div[@id="Container"]/div[@id="Content"]/div[@class="main-wrapper"]/div[@class="content-wrapper with-sidebar"]'.
  255.                     '/header[@class="content-heading with-description decrease-padding"]/div[@class="recipe-gallery-wrapper"]/div[@class="recipe-gallery-content-wrapper"]'.
  256.                     '/div[@class="recipe-gallery-left-wrapper"]/div[@class="recipe-gallery-left"]/div[@class="recipe-gallery-top-nav"]/div/span[@class="recipe-gallery-author-wrapper"]/a[@itemprop="author"]';
  257.                    
  258. my $recipe_title_xpath      = '//meta[@name="og:title"]/@content';
  259. my $ds_id_xpath         = '/html/body/div[@id="Wrapper"]/div[@id="Container"]/div[@id="Content"]/div[@class="main-wrapper"]/div[@class="content-wrapper with-sidebar"]/@data-recipe';
  260. my $instruction_xpath       = '/html/body//div[@class="recipe-content-description" and @itemprop="recipeInstructions"]';
  261. my $instruction_xpath1      = '/html/body//div[@class="recipe-content-description" and @itemprop="recipeInstructions"]/text()';
  262. my $calorycity_xpath        = '/html/body//li[@class="ico-calorycity"]/span[@class="recipe-details-value"]/text()';
  263.  
  264. ### Database ###
  265. my $dbh = DBI->connect('DBI:mysql:database=' .  $cfg->val('db', 'name') . ';host=' .
  266.                         $cfg->val('db', 'host'),
  267.                         $cfg->val('db', 'user'),
  268.                         $cfg->val('db', 'pass'),
  269.                         {mysql_auto_reconnect => 1, mysql_enable_utf8 => 1});
  270.  
  271. my $tb_recipe       = $cfg->val('db', 'tb_recipe');
  272. my $tb_user     = $cfg->val('db', 'tb_user');
  273. my $tb_recipe_category  = $cfg->val('db', 'tb_recipe_category');
  274. my $tb_recipe_cuisine   = $cfg->val('db', 'tb_recipe_cuisine');
  275. my $tb_cooking_method   = $cfg->val('db', 'tb_cooking_method');
  276. my $tb_href     = $cfg->val('db', 'tb_href');
  277. my $tb_calorycity   = $cfg->val('db', 'tb_calorycity');
  278. my $tb_yield        = $cfg->val('db', 'tb_yield');
  279. my $tb_main_ingredient  = $cfg->val('db', 'tb_main_ingredient');
  280. my $hash        = $cfg->val('db', 'default_hash');
  281.  
  282. my $sth_user_exists     = $dbh->prepare(qq{SELECT id FROM $tb_user WHERE login = ? LIMIT 1});
  283. my $sth_user_add        = $dbh->prepare(qq{INSERT INTO $tb_user(login, screenName, email, group_id, password) VALUES(?, ?, ?, ?, ?)});
  284.  
  285. my $sth_category_exists     = $dbh->prepare(qq{SELECT id FROM $tb_recipe_category WHERE categoryName = ? LIMIT 1});
  286. my $sth_category_add        = $dbh->prepare(qq{INSERT INTO $tb_recipe_category(categoryName) VALUES(?)});
  287.  
  288. my $sth_cuisine_exists      = $dbh->prepare(qq{SELECT id FROM $tb_recipe_cuisine WHERE cuisineName = ? LIMIT 1});
  289. my $sth_cuisine_add     = $dbh->prepare(qq{INSERT INTO $tb_recipe_cuisine(cuisineName) VALUES(?)});
  290.  
  291. my $sth_method_exists       = $dbh->prepare(qq{SELECT id FROM $tb_cooking_method WHERE methodName = ? LIMIT 1});
  292. my $sth_method_add      = $dbh->prepare(qq{INSERT INTO $tb_cooking_method(methodName) VALUES(?)});
  293.  
  294. my $sth_calorycity_exists   = $dbh->prepare(qq{SELECT id FROM $tb_calorycity WHERE calorycityName = ? LIMIT 1});
  295. my $sth_calorycity_add      = $dbh->prepare(qq{INSERT INTO $tb_calorycity(calorycityName) VALUES(?)});
  296.  
  297. my $sth_yield_exists        = $dbh->prepare(qq{SELECT id FROM $tb_yield WHERE yieldName = ? LIMIT 1});
  298. my $sth_yield_add       = $dbh->prepare(qq{INSERT INTO $tb_yield(yieldName, yieldMin) VALUES(?, ?)});
  299.  
  300. my $sth_main_ingredient_exists  = $dbh->prepare(qq{SELECT id FROM $tb_main_ingredient WHERE ingredientName = ? LIMIT 1});
  301. my $sth_main_ingredient_add = $dbh->prepare(qq{INSERT INTO $tb_main_ingredient(ingredientName) VALUES(?)});
  302.  
  303. my $sth_one_step        = $dbh->prepare(qq{SELECT ds_id FROM $tb_href WHERE steps = 1});
  304.  
  305. my $sth_recipe_add      = $dbh->prepare(qq{INSERT INTO $tb_recipe(
  306.                                 ds_id,
  307.                                 author_id,
  308.                                 dateCreated,
  309.                                 datePublished,
  310.                                 dateModified,
  311.                                 ingredientMain_id,
  312.                                 recipeInstructions,
  313.                                 recipeYield_id,
  314.                                 totalTime,
  315.                                 recipeTitle,
  316.                                 recipeCategory_id,
  317.                                 recipeCuisine_id,
  318.                                 cookingMethod_id,
  319.                                 calorycity_id,
  320.                                 isVegan,
  321.                                 isBarbecue) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)});
  322.  
  323. my $cnt = 0;
  324. $sth_one_step->execute();
  325.  
  326. while(my $row = $sth_one_step->fetchrow_hashref()) {
  327.     my $ds_id = $$row{'ds_id'};
  328.     my $filename = '/share/przepisy/'.$ds_id .'/'. $ds_id .'.html';
  329.  
  330.     say _now . ' Parsing: ' . $filename if DBG;
  331.  
  332.     ### HTML Parser ###
  333.     $tree = HTML::TreeBuilder::XPath->new(ignore_unknown => 0);
  334.     $tree->utf8_mode(1);
  335.     $tree->parse_file($filename);
  336.  
  337.     ### Stage 1 - searching, storing values ###
  338.     my $recipe_title    = trim($tree->findvalue($recipe_title_xpath));
  339.     my $date_published  = trim($tree->findvalue($date_published_xpath));
  340.     my $total_time  = trim($tree->findvalue($total_time_xpath));
  341.     my $cuisine     = trim($tree->findvalue($recipe_cuisine_xpath));
  342.     my $method      = trim($tree->findvalue($cooking_method_xpath));
  343.     my $yield       = trim($tree->findvalue($recipe_yield_xpath));
  344.     my $category    = trim($tree->findvalue($recipe_category_xpath));
  345.     my $main_ingredient = trim($tree->findvalue($main_ingredient_xpath));
  346.     my $calorycity  = trim($tree->findvalue($calorycity_xpath));
  347.    
  348.     my $is_vegan    = $tree->findvalue($is_vegan_xpath);
  349.     $is_vegan       = is_2int($is_vegan);
  350.  
  351.     my $is_barbecue = $tree->findvalue($is_barbecue_xpath);
  352.     $is_barbecue    = is_2int($is_barbecue);
  353.  
  354.     my $author_node     = $tree->findnodes($author_xpath)->[0];
  355.     my $author_url      = $author_node->attr('href');
  356.  
  357.     my $author_screen_name = $author_node->as_trimmed_text;
  358.     my $author_login    = get_ascii_author_name_from_href($author_url);
  359.  
  360.     my $instruction = '';
  361.     my $instruction_html1 = $tree->findvalue($instruction_xpath1);
  362.     $instruction = $instruction_html1;
  363.  
  364. #    my $author_tips_content = $instruction_html->look_down(_tag => 'p', class => 'author-tips-content');
  365. #    if(defined $author_tips_content) {
  366. #        my $tips = $author_tips_content->as_trimmed_text;
  367. #   $instruction  = $author_tips_content->look_up(_tag => 'div', class => 'recipe-content-description')->[0]->as_trimmed_text;
  368. #   say '-------------';
  369. #   say $instruction;
  370. #   say '--------';
  371. #    }
  372. #    else {
  373. #   $instruction = $instruction_html->as_trimmed_text;
  374. #    }
  375.  
  376. #    my $instruction_html = $tree->findnodes($instruction_xpath)->[0];
  377. #    my $instruction   = $instruction_html;
  378. #    if($instruction_html =~ /<div\s+class=\"recipe-content-description\"\s+itemprop=\"recipeInstructions\">(?<html>.*)<\/div>/m) {
  379. #   $instruction = $+{html};
  380. #   $instruction = trim($instruction);
  381. #    } else {
  382. #   $instruction = $instruction->as_trimmed_text;
  383. #    }
  384.  
  385.  
  386.    
  387.     $dbh->{AutoCommit} = 0;
  388.     $dbh->{RaiseError} = 1;
  389.    
  390.     eval {
  391.     ### Stage 2 - cheking, adding contexes ###
  392.         my $user_id = user_exists($sth_user_exists, $author_login);
  393.     if($user_id == 0) {
  394.         $user_id = user_add($sth_user_add, $author_screen_name, $author_login, 8, $hash);
  395.         }
  396.  
  397.         my $category_id = category_exists($sth_category_exists, $category);
  398.         if($category_id == 0) {
  399.         $category_id = category_add($sth_category_add, $category);
  400.     }
  401.  
  402.         my $cuisine_id = cuisine_exists($sth_cuisine_exists, $cuisine);
  403.     if($cuisine_id == 0) {
  404.         $cuisine = cuisine_add($sth_cuisine_add, $cuisine);
  405.         }
  406.  
  407.     my $method_id = method_exists($sth_method_exists, $method);
  408.         if($method_id == 0) {
  409.         $method_id = method_add($sth_method_add, $method);
  410.     }
  411.  
  412.         my $calorycity_id = calorycity_exists($sth_calorycity_exists, $calorycity);
  413.     if($calorycity_id == 0) {
  414.         $calorycity_id = calorycity_add($sth_calorycity_add, $calorycity);
  415.         }
  416.        
  417.         my $yield_id = yield_exists($sth_yield_exists, $yield);
  418.     if($yield_id == 0) {
  419.         $yield_id = yield_add($sth_yield_add, $yield);
  420.         }
  421.    
  422.     my $main_ingredient_id = main_ingredient_exists($sth_main_ingredient_exists, $main_ingredient);
  423.     if($main_ingredient_id == 0) {
  424.         $main_ingredient_id = main_ingredient_add($sth_main_ingredient_add, $main_ingredient);
  425.         }
  426.  
  427.    
  428.         ### Stage 3 - adding recipe ###
  429.     #say _now . " " .$recipe_title . "|" . $instruction. "|". $author_login ."|". $date_published ."|". $total_time ."|". $cuisine ."|". $method ."|". $yield ."|". $category . "|". $is_vegan . "|" .$main_ingredient . "|" . $is_barbecue . "|" . $calorycity;
  430.         recipe_add($sth_recipe_add, $ds_id, $user_id, $date_published,  $date_published, $date_published, $main_ingredient_id,  $instruction, $yield_id, $total_time, $recipe_title, $category_id, $cuisine_id, $method_id, $calorycity_id, $is_vegan, $is_barbecue);
  431.     #    say $author_email;
  432.     #    say $author_name;
  433.     #    say $date_published;
  434.     #    say $total_time;
  435.     #    say $recipe_cuisine;
  436.     #    say $cooking_method;
  437.     #    say $recipe_yield;
  438.     #    say $recipe_category;
  439.     $dbh->commit();
  440.     };
  441.     if ($@) {
  442.         warn "Transaction aborted because $@";
  443.     eval { $dbh->rollback };
  444.     }
  445.     $tree->delete;
  446.     $cnt++;
  447.    
  448. }
  449.  
  450.  
  451.  
  452. $sth_one_step->finish();
  453. $sth_user_exists->finish();
  454. $sth_user_add->finish();
  455. $sth_category_exists->finish();
  456. $sth_category_add->finish();
  457. $sth_cuisine_exists->finish();
  458. $sth_cuisine_add->finish();
  459. $sth_method_exists->finish();
  460. $sth_method_add->finish();
  461. $sth_recipe_add->finish();
  462. $sth_calorycity_add->finish();
  463. $sth_calorycity_exists->finish();
  464. $sth_yield_add->finish();
  465. $sth_yield_exists->finish();
  466. $sth_main_ingredient_add->finish();
  467. $sth_main_ingredient_exists->finish();
  468. $dbh->disconnect();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement