Advertisement
Guest User

Untitled

a guest
Oct 12th, 2014
201
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.71 KB | None | 0 0
  1. <?php namespace Lib\Services\Scraping;
  2.  
  3. use App;
  4. use Lib\Services\Db\Writer;
  5. use Lib\Services\Scraping\Scraper;
  6. use Symfony\Component\DomCrawler\Crawler;
  7.  
  /**
   * Scrapes movie news listings and single articles from the configured
   * news provider and stores the listings through the injected db writer.
   *
   * NOTE(review): the provider key "firstshowing" and the *FirstShowing
   * method names are historical; the listing code behind them fetches
   * kinopoisk.ru and tags items with the "KinoPoisk" source.
   */
  class NewsScraper extends Curl
  {
      /**
       * News items compiled by the most recent listing scrape.
       *
       * @var array
       */
      private $news = array();

      /**
       * Writer instance.
       *
       * @var Writer
       */
      private $dbWriter;

      /**
       * @param Writer $dbWriter writer used by save() to persist the news
       */
      public function __construct(Writer $dbWriter)
      {
          $this->dbWriter = $dbWriter;
      }

      /**
       * Scrapes the news listing of the currently configured provider
       * and saves it to db.
       *
       * @return void
       */
      public function all()
      {
          if ($this->usesFirstShowing()) {
              $this->getFromFirstShowing()->save();
          } else {
              $this->getFromScreenRant()->save();
          }
      }

      /**
       * Tells whether the configured news provider is "firstshowing".
       * Any other value falls back to ScreenRant in the callers.
       *
       * @return bool
       */
      private function usesFirstShowing()
      {
          return App::make('Options')->getNewsProvider() === 'firstshowing';
      }

      /**
       * Scrapes and compiles news for saving from the kinopoisk.ru
       * news listing (the "firstshowing" provider option).
       *
       * @return self
       */
      private function getFromFirstShowing()
      {
          $compiledNews = array();

          // curl() is inherited from Curl; presumably returns the response body.
          $crawler = new Crawler($this->curl('http://www.kinopoisk.ru/news/'));

          // Every ".item" inside ".newsList" is one news entry.
          foreach ($crawler->filter('.newsList .item') as $node) {
              $item = new Crawler($node);

              $compiledNews[] = array(
                  'title'         => head($item->filter('.title > a')->extract(array('_text'))),
                  'image'         => head($item->filter('.pic > a > img')->extract(array('src'))),
                  'body'          => head($item->filter('.descr')->extract(array('_text'))),
                  // The "more" link is prefixed with the host to build the full url.
                  'full_url'      => 'http://kinopoisk.ru' . head($item->filter('.more > a')->extract(array('href'))),
                  'source'        => 'KinoPoisk',
                  'fully_scraped' => 0,
              );
          }

          $this->news = $compiledNews;

          return $this;
      }

      /**
       * Get full info about single news item from current
       * active news provider.
       *
       * @param  string $url
       * @return string html of the article body
       */
      public function getSingle($url)
      {
          if ($this->usesFirstShowing()) {
              return $this->getSingleFromFirstShowing($url);
          }

          return $this->getSingleFromScreenRant($url);
      }

      /**
       * Scrapes and compiles news for saving from
       * screenrant.com website.
       *
       * @return self
       */
      private function getFromScreenRant()
      {
          $compiledNews = array();

          $crawler = new Crawler($this->curl('http://screenrant.com/movie-news/'));

          // Every list item under #content is one news entry.
          foreach ($crawler->filter('#content ul li') as $k => $node) {
              $item = new Crawler($node);

              $compiledNews[$k] = array(
                  'title'         => head($item->filter('div > h2 > a')->extract(array('_text'))),
                  'image'         => head($item->filter('div > a > img')->extract(array('src'))),
                  'body'          => head($item->filter('div > p')->extract(array('_text'))),
                  'full_url'      => head($item->filter('div > h2 > a')->extract(array('href'))),
                  'source'        => 'ScreenRant',
                  'fully_scraped' => 0,
              );
          }

          $this->news = $compiledNews;

          return $this;
      }

      /**
       * Saves scraped news to the database.
       *
       * @return void
       */
      private function save()
      {
          // Nothing scraped (failed request, changed markup): skip the write
          // instead of handing an empty batch to the writer.
          if (empty($this->news)) {
              return;
          }

          $this->dbWriter->compileBatchInsert('news', $this->news)
                         ->save();
      }

      /**
       * Scrapes single news item for the "firstshowing" provider option:
       * every ".brand_words" element of the page becomes one paragraph
       * with its links unwrapped.
       *
       * @param  string $url
       * @return string
       */
      public function getSingleFromFirstShowing($url)
      {
          $html = '';

          foreach ($this->extractHtmlFragments($url, '.brand_words') as $fragment) {
              $html .= '<p>' . $this->stripLinks($fragment) . '</p>';
          }

          return trim($html);
      }

      /**
       * Scrapes single news item from screenrant: paragraphs of the
       * article body, minus navigation/form/ad fragments, with images
       * reduced to a bare responsive <img> tag and links unwrapped.
       *
       * @param  string $url
       * @return string
       */
      public function getSingleFromScreenRant($url)
      {
          $html = '';

          foreach ($this->extractHtmlFragments($url, 'div[itemprop="articleBody"] p') as $fragment) {
              $html .= $this->formatScreenRantFragment($fragment);
          }

          return trim($html);
      }

      /**
       * Fetches a page and returns the trimmed inner html of every
       * element matching the given css selector.
       *
       * @param  string $url
       * @param  string $selector
       * @return array
       */
      private function extractHtmlFragments($url, $selector)
      {
          $crawler = new Crawler($this->curl($url));

          return $crawler->filter($selector)->each(function (Crawler $node, $i) {
              return trim($node->html());
          });
      }

      /**
       * Converts one screenrant paragraph fragment to output html.
       * Returns an empty string for fragments that should be dropped.
       *
       * @param  string $fragment
       * @return string
       */
      private function formatScreenRantFragment($fragment)
      {
          // Filter out unneeded html. strpos() returns 0 for a match at the
          // very start of the fragment, so compare against false explicitly.
          foreach (array('contentjumplink', 'type="button"', 'type="hidden"', 'AD BLOCK') as $marker) {
              if (strpos($fragment, $marker) !== false) {
                  return '';
              }
          }

          // A paragraph holding an image is reduced to the image alone;
          // src does not have to be the first attribute of the tag.
          if (preg_match('/<img\b[^>]*?\ssrc="(.*?)"/i', $fragment, $m)) {
              return "<img src='{$m[1]}' class='img-responsive'/>";
          }

          return '<p>' . $this->stripLinks($fragment) . '</p>';
      }

      /**
       * Replaces every <a ...>text</a> with just its inner content.
       *
       * The \b keeps tags such as <abbr> or <article> from being treated
       * as anchors, and the "s" modifier lets an anchor span several lines.
       *
       * @param  string $html
       * @return string
       */
      private function stripLinks($html)
      {
          return preg_replace('/<a\b[^>]*>(.*?)<\/a>/is', '$1', $html);
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement