Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Fetches the page at $address (expected to be defined by the surrounding code),
// runs it through HTMLPurifier with a small tag whitelist, strips the remaining
// markup down to text and echoes the result.

// --- 1. Download -----------------------------------------------------------
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $address);
// NOTE(review): with CURLOPT_HEADER enabled curl_exec() returns the response
// headers in front of the body, so they are passed to the purifier together
// with the HTML. The content type is read via curl_getinfo() below, so the
// headers do not look necessary here -- confirm before removing.
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt(
    $ch,
    CURLOPT_USERAGENT,
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'
);

$dirty_html = curl_exec($ch);
if ($dirty_html === false) {
    // Surface the transfer failure instead of silently purifying `false`;
    // fall back to an empty document so the script still produces (empty) output.
    trigger_error('curl_exec() failed: ' . curl_error($ch), E_USER_WARNING);
    $dirty_html = '';
}
$response = (object)curl_getinfo($ch);
curl_close($ch);

// --- 2. Normalise encoding -------------------------------------------------
// content_type may be missing when the server sends no Content-Type header.
$content_type = isset($response->content_type) ? (string)$response->content_type : '';
// stripos() returns 0 for a match at the very start, so compare against false
// explicitly instead of relying on truthiness.
if (stripos($content_type, 'windows-1251') !== false) {
    $dirty_html = mb_convert_encoding($dirty_html, 'utf-8', 'cp1251');
}

// --- 3. Purify -------------------------------------------------------------
// NOTE(review): the stock HTMLPurifier class exposes purify(); process()
// suggests a wrapper/helper class is in use here -- confirm.
$prf = new HTMLPurifier();
$clean_html = $prf->process($dirty_html, [
    'AutoFormat.AutoParagraph' => true,
    'AutoFormat.RemoveEmpty' => true,
    'AutoFormat.RemoveEmpty.RemoveNbsp' => true,
    // Only these elements survive purification ...
    'HTML.AllowedElements' => ['p', 'a', 'br', 'table', 'tbody', 'tr', 'th', 'td', 'h1', 'span', 'big'],
    // ... and these attributes are dropped from them, so the tags come out in
    // the bare form that the literal replacements below look for.
    'HTML.ForbiddenAttributes' => [
        'a@href',
        'a@style',
        'a@rel',
        'a@class',
        'a@title',
        'p@class',
        'p@style',
        'table@style',
        'table@border',
        'table@width',
        'td@class',
        'th@class',
        'td@valign',
        'td@style',
        'td@width',
        'td@colspan',
        'td@align',
        'td@rowspan',
        'span@class',
    ],
]);

// --- 4. Reduce the purified HTML to text -----------------------------------
// Bare tag (or character) => replacement. Order matters: str_replace() applies
// the pairs one after another over the whole string.
// NOTE(review): the '&' entry removes every ampersand, including the one that
// starts an entity such as "&amp;" -- confirm this is the intended needle.
$tag_replacements = [
    '<p>'      => '',
    '</p>'     => "\r\n",
    '&'        => '',
    '<a>'      => '',
    '</a>'     => "\r\n",
    '<table>'  => '',
    '<tbody>'  => '',
    '</tbody>' => '',
    '</table>' => '',
    '<tr>'     => '',
    '</tr>'    => "\r\n",
    '<th>'     => '',
    '</th>'    => ' ',
    '<td>'     => '',
    '</td>'    => '',
    '<h1>'     => '',
    '</h1>'    => "\r\n",
    '<span>'   => '',
    '</span>'  => ' ',
    '<big>'    => '',
    '</big>'   => ' ',
];

// Drop paragraphs that contain nothing but whitespace.
$clean_html = preg_replace("#(?:<p>\s+</p>)#", '', $clean_html);
// Remove the source document's own line breaks and tabs so that the only line
// breaks left are the ones inserted for closing tags below.
// NOTE(review): as written the last entry is a single space character, so the
// ordinary spaces between words are removed too -- confirm (it may originally
// have been a non-breaking space or a double space mangled by copy/paste).
$clean_html = str_replace(["\r","\n","\r\n","\t",' '], '', $clean_html);
$clean_html = str_replace(array_keys($tag_replacements), array_values($tag_replacements), $clean_html);
// Collapse runs of consecutive line breaks into a single CRLF.
$clean_html = preg_replace("#(?:\r\n?|\n){2,}#", "\r\n", $clean_html);
$clean_html = str_replace(['<br>', '<br />'], " ", $clean_html);
// NOTE(review): this pattern matches the literal four-character sequence "'+;
// (optionally repeated), not any one of those characters; a character class
// would be needed for the latter -- confirm the intent.
$clean_html = preg_replace("#(?:\"\'\+;)+#", '', $clean_html);

echo $clean_html;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement