Advertisement
Guest User

Untitled

a guest
Oct 11th, 2014
246
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.87 KB | None | 0 0
  1. <?php namespace Lib\Services\Scraping;
  2.  
  3. use App;
  4. use Lib\Services\Db\Writer;
  5. use Lib\Services\Scraping\Scraper;
  6. use Symfony\Component\DomCrawler\Crawler;
  7.  
  8. class NewsScraper extends Curl
  9. {
  10. /**
  11. * Array of scraped news.
  12. *
  13. * @var array
  14. */
  15. private $news;
  16.  
  17. /**
  18. * Writer instance.
  19. *
  20. * @var Lib\Services\Db\Writer
  21. */
  22. private $dbWriter;
  23.  
  24. public function __construct(Writer $dbWriter)
  25. {
  26. $this->dbWriter = $dbWriter;
  27. }
  28.  
  29. /**
  30. * Scrapes news from all sources and saves to db.
  31. * @return void
  32. */
  33. public function all()
  34. {
  35. $provider = App::make('Options')->getNewsProvider();
  36.  
  37. if ($provider == 'firstshowing') {
  38. $this->getFromFirstShowing()->save();
  39. } else {
  40. $this->getFromScreenRant()->save();
  41. }
  42. }
  43.  
  44. /**
  45. * Scrapes and compiles news for saving from
  46. * FirstShowing website.
  47. *
  48. * @return self
  49. */
  50. private function getFromFirstShowing()
  51. {
  52. $compiledNews = array();
  53.  
  54. $news = $this->curl('http://www.firstshowing.net/category/movie-news/');
  55.  
  56. $crawler = new Crawler($news);
  57.  
  58. //first we'll grab every news item on the page
  59. foreach ($crawler->filter('#content > .article') as $k => $node)
  60. {
  61.  
  62. $cr = new Crawler($node);
  63.  
  64. //then we will compile array out of every new items
  65. $compiledNews[$k] = array(
  66. 'title' => head($cr->filter('h2 > a')->extract(array('_text'))),
  67. 'image' => head($cr->filter('div.image > a > img')->extract(array('src'))),
  68. 'body' => head($cr->filter('p')->extract(array('_text'))),
  69. 'full_url' => head($cr->filter('.continue > a')->extract(array('href'))),
  70. 'source' => 'FirstShowing',
  71. 'fully_scraped' => 0,
  72. );
  73. }
  74.  
  75. $this->news = $compiledNews;
  76.  
  77. return $this;
  78. }
  79.  
  80. /**
  81. * Get full info about single news item from current
  82. * active news provider.
  83. *
  84. * @param string $url
  85. * @return array
  86. */
  87. public function getSingle($url)
  88. {
  89. $provider = App::make('Options')->getNewsProvider();
  90.  
  91. if ($provider == 'firstshowing')
  92. {
  93. return $this->getSingleFromFirstShowing($url);
  94. }
  95.  
  96. return $this->getSingleFromScreenRant($url);
  97. }
  98.  
  99. /**
  100. * Scrapes and compiles news for saving from
  101. * screenrant.com website.
  102. *
  103. * @return self
  104. */
  105. private function getFromScreenRant()
  106. {
  107. $compiledNews = array();
  108.  
  109. $news = $this->curl('http://screenrant.com/movie-news/');
  110.  
  111. $crawler = new Crawler($news);
  112.  
  113. //first we'll grab every news item on the page
  114. foreach ($crawler->filter('#content ul li') as $k => $node)
  115. {
  116.  
  117. $cr = new Crawler($node);
  118.  
  119. //then we will compile array out of every new items
  120. $compiledNews[$k] = array(
  121. 'title' => head($cr->filter('div > h2 > a')->extract(array('_text'))),
  122. 'image' => head($cr->filter('div > a > img')->extract(array('src'))),
  123. 'body' => head($cr->filter('div > p')->extract(array('_text'))),
  124. 'full_url' => head($cr->filter('div > h2 > a')->extract(array('href'))),
  125. 'source' => 'ScreenRant',
  126. 'fully_scraped' => 0,
  127. );
  128. }
  129.  
  130. $this->news = $compiledNews;
  131.  
  132. return $this;
  133. }
  134.  
  135. /**
  136. * Saves scraped news to the database.
  137. *
  138. * @return void
  139. */
  140. private function save()
  141. {
  142. $this->dbWriter->compileBatchInsert('news', $this->news)
  143. ->save();
  144. }
  145.  
  146. /**
  147. * Scrapes single news item from screenrant
  148. *
  149. * @param string $url
  150. * @return string
  151. */
  152. public function getSingleFromFirstShowing($url)
  153. {
  154. $item = $this->curl($url);
  155.  
  156. $crawler = new Crawler($item);
  157.  
  158. $html = $crawler->filter('.review p')->each(function (Crawler $node, $i)
  159. {
  160. $ht = trim($node->html());
  161.  
  162. //filter out unneeded html
  163. if (strpos($ht, 'class="technotags"')) return false;
  164. if (strpos($ht, 'title="Posts by')) return false;
  165.  
  166. return '<p>' . preg_replace('/<a.*?>(.*?)<\/a>/', '$1', $ht) . '</p>';
  167. });
  168.  
  169. return trim(implode('', $html));
  170. }
  171.  
  172. /**
  173. * Scrapes single news item from screenrant
  174. *
  175. * @param string $url
  176. * @return string
  177. */
  178. public function getSingleFromScreenRant($url)
  179. {
  180.  
  181. $text = '';
  182. $item = $this->curl($url);
  183.  
  184. $crawler = new Crawler($item);
  185.  
  186. $html = $crawler->filter('div[itemprop="articleBody"] p')->each(function (Crawler $node, $i)
  187. {
  188. $ht = trim($node->html());
  189.  
  190. //filter out unneeded html
  191. if (strpos($ht, 'contentjumplink')) return false;
  192. if (strpos($ht, 'type="button"')) return false;
  193. if (strpos($ht, 'type="hidden"')) return false;
  194. if (strpos($ht, 'AD BLOCK')) return false;
  195.  
  196.  
  197. if (strpos($ht, 'src='))
  198. {
  199. preg_match('/.*?<img src="(.*?)"/', $ht, $m);
  200.  
  201. if (isset($m[1]))
  202. {
  203. return "<img src='{$m[1]}' class='img-responsive'/>";
  204. }
  205. }
  206.  
  207. return '<p>' . preg_replace('/<a.*?>(.*?)<\/a>/', '$1', $ht) . '</p>';
  208. });
  209.  
  210. return trim(implode('', $html));
  211. }
  212. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement