Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
- namespace common\models;
- use Yii;
- use yii\base\Model;
- use keltstr\simplehtmldom\SimpleHTMLDom;
- use yii\helpers\Json;
/**
 * Fetches a real-estate listing page through a randomly chosen proxy and
 * converts it into a Realty record.
 *
 * Typical usage: set $url, call loadHtml(), then parseHtml().
 * All HTTP traffic goes through cURL using proxies listed in PROXY_FILE.
 */
class Parser extends Model
{
    /** @var string URL of the listing page to download and parse. */
    public $url;

    /** @var mixed DOM returned by SimpleHTMLDom::str_get_html(); filled by loadHtml(). */
    public $html;

    /** @var mixed Not read by any active code in this class; kept for callers that still set it. */
    public $old_watermark;

    /** Number of retry rounds in multiRequest(); request() makes twice as many attempts. */
    const MAX_TRIES = 4;

    /** NOTE(review): not referenced anywhere in this class - confirm whether an image cap was intended. */
    const MAX_IMAGES = 7;

    /** NOTE(review): proxy credentials are hard-coded in source; consider moving them to configuration. */
    const PROXY_USER = "proxyuser:qwer4123";

    /** Text file with one proxy address per line. */
    const PROXY_FILE = "/home/pdftour/public_html/proxy.txt";

    /**
     * Reads the proxy list from PROXY_FILE.
     *
     * Lines are trimmed and blank lines are dropped, so a trailing newline or
     * Windows line endings can never yield an empty "proxy" (which cURL would
     * treat as "connect directly").
     *
     * @param bool $all when true return the whole list, otherwise one random entry
     * @return array|string|false list of proxies, a single proxy, or false when
     *                            the file is missing, unreadable or has no entries
     */
    public static function getProxy($all = false)
    {
        $path = static::PROXY_FILE;
        if (!is_readable($path)) {
            return false;
        }

        $contents = file_get_contents($path);
        if ($contents === false) {
            return false;
        }

        $proxies = array_values(array_filter(array_map('trim', explode("\n", $contents)), 'strlen'));
        if (empty($proxies)) {
            return false;
        }

        if ($all) {
            return $proxies;
        }

        return $proxies[array_rand($proxies)];
    }

    /**
     * Checks which of the given proxies can fetch $url.
     *
     * A proxy counts as good when the response body is longer than 100000 bytes.
     *
     * @param array  $proxies proxy addresses to test
     * @param string $url     page used as the probe
     * @return array the proxies that passed, in input order
     */
    public static function testProxy($proxies, $url = "https://www.cian.ru/")
    {
        set_time_limit(0);

        $goodProxies = [];
        if (!is_array($proxies)) {
            return $goodProxies;
        }

        foreach ($proxies as $proxy) {
            $ch = self::createHandle($url, 10, $proxy);
            $data = curl_exec($ch);
            curl_close($ch);

            if (strlen((string) $data) > 100000) {
                $goodProxies[] = $proxy;
            }
        }

        return $goodProxies;
    }

    /**
     * Downloads $url, retrying with a freshly picked proxy on every attempt.
     *
     * Up to MAX_TRIES * 2 attempts are made; a response is accepted when its
     * body is longer than 1000 bytes.
     *
     * @param string $url
     * @return string|false response body, or false when every attempt failed
     */
    public static function request($url)
    {
        $attempts = static::MAX_TRIES * 2;

        for ($i = 0; $i < $attempts; $i++) {
            $ch = self::createHandle($url, 9, static::getProxy());
            $data = curl_exec($ch);
            curl_close($ch);

            if (strlen((string) $data) > 1000) {
                return $data;
            }
        }

        return false;
    }

    /**
     * Downloads several URLs in parallel, each through its own random proxy.
     *
     * URLs whose response is not longer than 1000 bytes are retried in the
     * next round, for at most MAX_TRIES rounds.
     *
     * @param array $urls URLs to fetch
     * @return array accepted response bodies; the result is a plain list and is
     *               NOT keyed by the input indices
     */
    public static function multiRequest($urls)
    {
        $responses = [];

        for ($round = 0; $round < static::MAX_TRIES && !empty($urls); $round++) {
            $multi = curl_multi_init();
            $channels = [];

            foreach ($urls as $index => $url) {
                $ch = self::createHandle($url, 20, static::getProxy());
                curl_multi_add_handle($multi, $ch);
                $channels[$index] = $ch;
            }

            $running = null;
            do {
                do {
                    $status = curl_multi_exec($multi, $running);
                } while ($status === CURLM_CALL_MULTI_PERFORM);

                if ($status !== CURLM_OK) {
                    break;
                }

                // Block until there is socket activity instead of spinning the CPU.
                // A return of -1 means select could not wait, so sleep briefly.
                if ($running > 0 && curl_multi_select($multi, 1.0) === -1) {
                    usleep(1000);
                }
            } while ($running > 0);

            foreach ($channels as $index => $channel) {
                $response = (string) curl_multi_getcontent($channel);
                if (strlen($response) > 1000) {
                    $responses[] = $response;
                    unset($urls[$index]);
                }
                curl_multi_remove_handle($multi, $channel);
                curl_close($channel);
            }

            curl_multi_close($multi);
        }

        return $responses;
    }

    /**
     * Downloads $this->url and parses it into $this->html.
     *
     * @return bool true when a page was downloaded and a DOM was produced
     */
    public function loadHtml()
    {
        $response = static::request($this->url);
        if (!$response) {
            return false;
        }

        $this->html = SimpleHTMLDom::str_get_html($response);

        return $this->html ? true : false;
    }

    /**
     * Extracts listing data from $this->html and saves it as a Realty record.
     *
     * On success a UserActivity entry of type TYPE_PARSING is saved as well.
     * On validation failure the model errors are dumped to the "loglog" file
     * under frontend/runtime.
     *
     * @return bool true when the Realty record was saved
     */
    public function parseHtml()
    {
        $html = $this->html;
        if (empty($html)) {
            return false;
        }

        $model = new Realty();

        $price = '';
        if ($priceBlock = $html->find('[itemprop="price"]', 0)) {
            $price = static::whitespace(str_replace('₽', 'руб', $priceBlock->plaintext));
        }

        $address = '';
        if ($addressBlock = $html->find("address", 0)) {
            $address = static::whitespace(str_replace('На карте', '', $addressBlock->plaintext));
        }

        // The deal type is derived from the URL path.
        $title = strpos($this->url, '/rent/') === false ? "Продажа " : "Аренда ";
        if ($titleBlock = $html->find('[class*="header-information--"] [class*="title--"]', 0)) {
            $title .= static::whitespace($titleBlock->plaintext) . ', ';
        }
        $title .= $address;
        $title = str_replace(" на длительный срок (от года)", "", $title);

        $imageUrls = [];
        if (preg_match_all('/"thumbnailUrl":"([^"]+)"/', (string) $html, $matches)) {
            foreach ($matches[1] as $thumbnailUrl) {
                // Unescape JSON slashes and switch the "1.jpg" suffix to "2.jpg".
                $imageUrls[] = str_replace(['\u002F', '1.jpg'], ['/', '2.jpg'], $thumbnailUrl);
            }
        }
        $images = self::storeImages($imageUrls);

        $description = '';
        if ($descriptionBlock = $html->find('p[class*="description-text--"]', 0)) {
            $description = $descriptionBlock->plaintext;
        }

        $fields = [];
        $fields['Адрес'] = $address;
        if ($metroBlock = $html->find('[class*="underground_link--"]', 0)) {
            $fields['Метро'] = static::whitespace($metroBlock->plaintext);
        }
        if ($metroTimeBlock = $html->find('[class*="underground_time--"]', 0)) {
            $fields['До метро'] = static::whitespace(str_replace(',', '', $metroTimeBlock->plaintext));
        }

        foreach ($html->find('[class*="info-block--"] [class*="info--"]') as $infoBlock) {
            $nameBlock = $infoBlock->find('[class*="info-title--"]', 0);
            $valueBlock = $infoBlock->find('[class*="info-text"]', 0);
            if (!$nameBlock || !$valueBlock) {
                continue;
            }

            $name = static::whitespace($nameBlock->plaintext);
            $value = static::whitespace($valueBlock->plaintext);
            if (empty($name) || empty($value)) {
                continue;
            }

            $fields[str_replace(":", "", $name)] = $value;
        }

        $model->attributes = [
            'url' => $this->url,
            'image_urls' => Json::encode($images),
            'title' => $title,
            'coordinates' => self::extractCoordinates($html->outertext),
            'price' => $price,
            'user_id' => Yii::$app->user->id,
            'description' => $description,
            'fields' => Json::encode($fields),
        ];

        if ($model->save()) {
            $activity = new UserActivity();
            $activity->attributes = [
                'user_id' => Yii::$app->user->id,
                'type' => UserActivity::TYPE_PARSING,
                'url' => $this->url,
            ];
            $activity->save();

            return true;
        }

        file_put_contents(__DIR__ . "/../../frontend/runtime/loglog", print_r($model->errors, true));

        return false;
    }

    /**
     * Normalizes text taken from the page: non-breaking spaces (both the
     * "&nbsp;" entity and the raw UTF-8 character) become ordinary spaces,
     * whitespace runs collapse to one space, the result is trimmed and the
     * remaining HTML entities are decoded.
     *
     * @param string $string
     * @return string
     */
    public static function whitespace($string)
    {
        $spaced = str_replace(['&nbsp;', "\xC2\xA0"], ' ', (string) $string);
        $collapsed = preg_replace('/\s+/', ' ', $spaced);

        return html_entity_decode(trim($collapsed));
    }

    /**
     * Builds a cURL handle with the browser-like headers shared by every request.
     *
     * @param string       $url
     * @param int          $timeout total timeout in seconds
     * @param string|false $proxy   proxy address; when empty the proxy options are not set
     * @return resource cURL handle (the caller is responsible for closing it)
     */
    private static function createHandle($url, $timeout, $proxy)
    {
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_HTTPHEADER, [
            'Pragma: no-cache',
            'Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4,uk;q=0.2',
            'Upgrade-Insecure-Requests: 1',
            'User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Cache-Control: no-cache',
            'Connection: keep-alive',
        ]);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

        if (is_string($proxy) && $proxy !== '') {
            curl_setopt($ch, CURLOPT_PROXY, $proxy);
            curl_setopt($ch, CURLOPT_PROXYUSERPWD, static::PROXY_USER);
        }

        return $ch;
    }

    /**
     * Downloads the given images into the temporary image directory and keeps
     * only the files that are readable JPEGs.
     *
     * @param array $imageUrls
     * @return array generated file names (without directory) of the stored images
     */
    private static function storeImages(array $imageUrls)
    {
        $stored = [];
        $directory = Yii::getAlias('@frontend/web/images/tmp/');

        foreach (static::multiRequest($imageUrls) as $body) {
            $filename = Yii::$app->security->generateRandomString() . ".jpg";
            $fullName = $directory . $filename;

            try {
                if (file_put_contents($fullName, $body) !== false && self::isJpegFile($fullName)) {
                    $stored[] = $filename;
                    continue;
                }
            } catch (\Exception $e) {
                // A registered error handler may surface PHP warnings as
                // exceptions; either way the download is treated as unusable.
            }

            self::removeFile($fullName);
        }

        return $stored;
    }

    /**
     * Tells whether the file is a JPEG that GD can actually decode.
     *
     * @param string $path
     * @return bool
     */
    private static function isJpegFile($path)
    {
        if (exif_imagetype($path) !== IMAGETYPE_JPEG) {
            return false;
        }

        $image = imagecreatefromjpeg($path);
        if ($image === false) {
            return false;
        }
        imagedestroy($image);

        return true;
    }

    /**
     * Best-effort removal of a temporary file; failures are deliberately ignored.
     *
     * @param string $path
     */
    private static function removeFile($path)
    {
        if (!is_file($path)) {
            return;
        }

        try {
            unlink($path);
        } catch (\Exception $e) {
            // Nothing useful can be done if cleanup fails.
        }
    }

    /**
     * Finds the first {"lng":..,"lat":..} pair in the page markup.
     *
     * @param string $markup
     * @return string "lng,lat", or an empty string when no pair is present
     */
    private static function extractCoordinates($markup)
    {
        if (preg_match('/\{"lng":(-?\d+\.\d+),"lat":(-?\d+\.\d+)\}/', (string) $markup, $found)) {
            return $found[1] . "," . $found[2];
        }

        return '';
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement