Advertisement
Guest User

Untitled

a guest
Jul 20th, 2018
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.62 KB | None | 0 0
  1. use strict;
  2. use warnings;
  3. use HTML::Tree;
  4. use LWP::UserAgent ();
  5. use Data::Dumper;
  6. use open ':std', ':encoding(UTF-8)';
  7. use utf8;
  8. use HTML::TreeBuilder 5 -weak;
  9. use LWP::UserAgent;
  10. use File::Slurp;
  11. use LWP::Simple;
  12.  
  13.  
  # Debug helper: downloads the dnes.bg front page, dumps the raw HTML to
  # STDOUT, then prints (one per line, in document order) the href of every
  # <a> element whose text is exactly one of the section names
  # "България", "Спорт" or "Бизнес".
  #
  # Takes no arguments. The return value is not meaningful.
  sub test {
      my $url  = "https://www.dnes.bg";
      my $html = get($url);

      # LWP::Simple::get returns undef on failure; do not print or parse
      # an undefined value.
      if (!defined $html) {
          warn "test: could not fetch $url\n";
          return;
      }
      print $html;

      my $tree = HTML::TreeBuilder->new_from_content($html);

      # One lookup set replaces three copy-pasted "if" blocks. An anchor's
      # text can equal at most one of the names, so the output is the same.
      my %is_wanted = map { $_ => 1 } ("България", "Спорт", "Бизнес");

      for my $anchor ($tree->look_down(_tag => 'a')) {
          next unless $is_wanted{ $anchor->as_text };

          my $href = $anchor->attr('href');
          next unless defined $href;    # <a> without href: nothing to print

          print $href;
          print "\n";
      }

      return;
  }
  42.  
  43.  
  # Downloads the page at the given URL and returns its decoded body.
  #
  # Parameters:
  #   $_[0] - URL to fetch.
  #
  # Returns the decoded content as a string, or undef when the URL is
  # missing or the request fails (a warning with the HTTP status line is
  # emitted in that case).
  sub GetMainPage {
      my ($url) = @_;

      my $html;    # stays undef unless the request succeeds

      if (!defined $url || $url eq '') {
          warn "GetMainPage: no URL given\n";
          return $html;
      }

      # The original built a user agent and then ignored it in favour of
      # LWP::Simple::get, which cannot report why a request failed. Use the
      # agent directly; env_proxy keeps honouring *_proxy variables the way
      # LWP::Simple does.
      my $ua       = LWP::UserAgent->new(env_proxy => 1);
      my $response = $ua->get($url);

      if ($response->is_success) {
          $html = $response->decoded_content;
      }
      else {
          warn "GetMainPage: GET $url failed: " . $response->status_line . "\n";
      }

      return $html;
  }
  51.  
  # Finds the first link in an HTML document whose text equals a category
  # name, prints its href followed by a newline, and returns that href.
  #
  # Parameters:
  #   $_[0] - HTML source to search.
  #   $_[1] - category name; compared with "eq" against the anchor text.
  #
  # Returns the href string of the first matching <a> that has an href
  # attribute. Returns nothing (undef in scalar context) when either
  # argument is undefined or no such link exists.
  sub ExtractCategory {
      my ($content, $category) = @_;

      return unless defined $content && defined $category;

      my $tree = HTML::TreeBuilder->new_from_content($content);

      # In scalar context look_down stops at the first node satisfying all
      # criteria. Requiring an href skips anchors that are not links, so an
      # undefined value is never printed or returned as a URL.
      my $anchor = $tree->look_down(
          _tag => 'a',
          sub {
              my ($node) = @_;
              return defined $node->attr('href')
                  && $node->as_text eq $category;
          },
      );

      # The original fell off the end of the loop here, returning whatever
      # the foreach happened to evaluate to; make "not found" explicit.
      return unless defined $anchor;

      my $href = $anchor->attr('href');
      print $href;
      print "\n";
      return $href;
  }
  68.  
  # Looks up the link for each requested category in one HTML document.
  #
  # Parameters:
  #   $_[0] - HTML source, passed unchanged to ExtractCategory.
  #   $_[1] - reference to an array of category names.
  #
  # Each name is printed (without a newline) before its lookup. Returns a
  # reference to a hash mapping every name to whatever ExtractCategory
  # returned for it in scalar context.
  sub ExtractCategories {
      my ($html, $wanted) = @_;

      # Copy the list up front, as the original did, so a bad reference
      # fails here rather than inside the loop.
      my @names = @{$wanted};

      my %url_for;
      for my $name (@names) {
          print $name;
          $url_for{$name} = ExtractCategory($html, $name);
      }

      return \%url_for;
  }
  86.  
  # Downloads one category page and prints the text of its first
  # <div class="more"> element.
  #
  # Parameters:
  #   $_[0] - URL of the category page, as a plain string.
  #           NOTE(review): hrefs taken from the front page may be relative;
  #           GetMainPage receives the value unchanged - confirm the site
  #           emits absolute links.
  #
  # The return value is not meaningful. Problems are reported with warn and
  # the sub returns early instead of dying.
  sub ProcessCategory {
      # The original read the argument as "%{$_[0]}" in scalar context,
      # which dies under "strict refs" for a string argument and never
      # yields a usable URL. Take the argument as the scalar it is.
      my ($url) = @_;

      if (!defined $url || ref $url || $url eq '') {
          warn "ProcessCategory: expected a URL string\n";
          return;
      }

      my $html = GetMainPage($url);
      if (!defined $html) {
          warn "ProcessCategory: no content for $url\n";
          return;
      }

      my $tree    = HTML::TreeBuilder->new_from_content($html);
      my $element = $tree->look_down(_tag => 'div', class => "more");

      # look_down returns undef when nothing matches; calling as_text on
      # that would be a fatal "can't call method on undefined value".
      if (!defined $element) {
          warn "ProcessCategory: no <div class=\"more\"> in $url\n";
          return;
      }

      print $element->as_text;
      return;
  }
  95.  
  96.  
  # Runs ProcessCategory on the URL of every category in a name => URL map.
  #
  # Parameters:
  #   either a single hash reference, or a flattened list of name => URL
  #   pairs. Both are accepted because callers have used both forms.
  #
  # Categories are visited in sorted name order so output is reproducible.
  # Entries whose URL is undefined are reported with warn and skipped.
  # The return value is not meaningful.
  sub ProcessCategories {
      # The original definition lacked the "sub" keyword and always
      # dereferenced $_[0], which dies under "strict refs" when the caller
      # passes a flattened hash.
      my %url_for = (@_ == 1 && ref $_[0] eq 'HASH') ? %{ $_[0] } : @_;

      for my $name (sort keys %url_for) {
          my $url = $url_for{$name};

          if (!defined $url) {
              warn "ProcessCategories: no URL for category '$name'\n";
              next;
          }

          # Pass the URL (hash value), not the category name (hash key):
          # ProcessCategory fetches whatever it is given.
          ProcessCategory($url);
      }

      return;
  }
  106.  
  # Script entry point: downloads the dnes.bg front page, resolves the links
  # of the "България", "Бизнес" and "Спорт" sections, and processes each
  # section page.
  #
  # Dies when the front page cannot be downloaded, since nothing else can
  # be done without it.
  #
  # TODO: support downloading a chosen page number, or looping over several
  # pages; the original kept an unused "$page = 1" placeholder for this.
  sub main {
      my $content = GetMainPage("https://www.dnes.bg/");
      die "main: could not download the front page\n"
          unless defined $content;

      my @categories = ("България", "Бизнес", "Спорт");

      # Keep the hash reference as returned and hand it on unchanged.
      # ProcessCategories reads its first argument as a hash reference, so
      # the flattened hash passed by the original failed under "strict refs".
      my $url_for = ExtractCategories($content, \@categories);
      ProcessCategories($url_for);

      return;
  }

  main();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement