Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- use strict;
- use warnings;
- use HTML::Tree;
- use LWP::UserAgent ();
- use Data::Dumper;
- use open ':std', ':encoding(UTF-8)';
- use utf8;
- use HTML::TreeBuilder 5 -weak;
- use LWP::UserAgent;
- use File::Slurp;
- use LWP::Simple;
# Ad-hoc smoke test. Downloads the dnes.bg front page, dumps the raw HTML to
# STDOUT, then prints (one per line) the href of every link whose text is
# exactly one of the three category names listed below.
# Takes no arguments; the return value is not meaningful.
sub test
{
    my $url  = "https://www.dnes.bg";
    my $html = get $url;

    # LWP::Simple::get returns undef on any failure; stop here instead of
    # printing undef and parsing an empty document.
    if (!defined $html)
    {
        warn "test: could not download $url\n";
        return;
    }
    print $html;

    # Lookup set replaces three copy-pasted "if" blocks.
    my %wanted = map { $_ => 1 } ("България", "Спорт", "Бизнес");

    my $tree = HTML::TreeBuilder->new_from_content($html);
    foreach my $aelement ($tree->look_down(_tag => 'a'))
    {
        next unless $wanted{ $aelement->as_text };

        # An anchor without an href attribute has nothing worth printing.
        my $href = $aelement->attr('href');
        next unless defined $href;

        print $href;
        print "\n";
    }
    return;
}
# Downloads a page and returns its content.
#   $_[0] - URL to fetch
# Returns the page content as a string, or undef when the URL is missing or
# the download fails (a warning is emitted in that case).
sub GetMainPage
{
    my ($url) = @_;

    # The fetch goes through LWP::Simple's get(); the LWP::UserAgent object
    # that used to be created here was never used and has been dropped.
    my $html = defined $url ? get($url) : undef;

    if (!defined $html)
    {
        my $shown = defined $url ? $url : '(undefined URL)';
        warn "GetMainPage: could not download $shown\n";
    }
    return $html;
}
# Finds the first link whose visible text is exactly the given category name.
#   $_[0] - HTML text to search
#   $_[1] - category name to match against each link's text
# Prints the link's href followed by a newline to STDOUT (when it has one)
# and returns that href. Returns undef (empty list in list context) when the
# input is missing or no link matches.
sub ExtractCategory
{
    my ($content, $category) = @_;
    return unless defined $content && defined $category;

    my $tree = HTML::TreeBuilder->new_from_content($content);

    # In scalar context look_down stops at the first element satisfying all
    # criteria, so there is no need to collect every anchor first.
    my $link = $tree->look_down(
        _tag => 'a',
        sub { $_[0]->as_text eq $category },
    );
    return unless defined $link;

    my $href = $link->attr('href');
    if (defined $href)
    {
        print $href;
        print "\n";
    }
    return $href;
}
# Looks up the link target for each requested category name.
#   $_[0] - HTML text handed to ExtractCategory for every lookup
#   $_[1] - reference to an array of category names
# Each name is echoed to STDOUT (with no separator) right before its lookup.
# Returns a reference to a hash mapping every category name to whatever
# ExtractCategory returned for it.
sub ExtractCategories
{
    my ($html, $names_ref) = @_;
    my @names = @{$names_ref};

    my %url_for;
    for my $name (@names)
    {
        print $name;
        $url_for{$name} = ExtractCategory($html, $name);
    }

    return \%url_for;
}
# Downloads one category page and prints the text of its first
# <div class="more"> element to STDOUT.
#   $_[0] - URL of the category page (a plain string)
# NOTE(review): the URL is fetched as given; if the site uses relative hrefs
# they must be made absolute by the caller - confirm against real pages.
# The return value is not meaningful.
sub ProcessCategory
{
    # The argument is a plain URL string; dereferencing it as a hash (as the
    # previous version did) is fatal under "use strict".
    my ($url) = @_;
    if (!defined $url)
    {
        warn "ProcessCategory: no URL given\n";
        return;
    }

    my $html = GetMainPage($url);
    return unless defined $html;

    my $tree    = HTML::TreeBuilder->new_from_content($html);
    my $element = $tree->look_down(_tag => 'div', class => "more");

    # look_down yields undef when nothing matches; calling a method on that
    # would die.
    if (!defined $element)
    {
        warn "ProcessCategory: no <div class=\"more\"> found in $url\n";
        return;
    }

    print $element->as_text;
    return;
}
# Runs ProcessCategory on the URL of every known category.
#   Accepts either a single hash reference or a flattened key/value list,
#   in both cases mapping category name => category URL.
# Categories whose URL is undefined are skipped. Categories are visited in
# sorted name order so the output is reproducible between runs.
# The return value is not meaningful.
sub ProcessCategories
{
    # Tolerate both calling conventions: ProcessCategories(\%h) and
    # ProcessCategories(%h).
    my %cats = (@_ == 1 && ref $_[0] eq 'HASH') ? %{$_[0]} : @_;

    # Debug echo kept from the original; guarded so a missing entry does not
    # trigger an "uninitialized value" warning.
    print $cats{'Бизнес'} if defined $cats{'Бизнес'};

    foreach my $key (sort keys %cats)
    {
        # The hash value is the URL; the key is only the display name.
        my $url = $cats{$key};
        next unless defined $url;
        ProcessCategory($url);
    }
    return;
}
# Script entry point: downloads the dnes.bg front page, resolves the link of
# each category of interest and processes every category page.
# Dies when the front page cannot be downloaded.
sub main
{
    # TODO: add a page number here so that a loop can cycle through any
    # number of desired news pages; only the first page is handled for now.
    my $content = GetMainPage("https://www.dnes.bg/");
    die "main: could not download the front page\n" unless defined $content;

    my @categories = ("България", "Бизнес", "Спорт");

    # ExtractCategories returns a hash reference and ProcessCategories
    # dereferences its first argument, so hand the reference over as is
    # rather than flattening it into a list.
    my $catpairs = ExtractCategories($content, \@categories);
    ProcessCategories($catpairs);
    return;
}
main();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement