Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
/**
 * Decode numeric HTML character references in a string to UTF-8.
 *
 * Helper invoked from the regex callbacks that decode numeric entities.
 *
 * @param string $entity Text containing numeric entities, e.g. "&#8217;".
 * @return string The text with every entity in the supported range decoded;
 *                anything else is returned unchanged.
 */
function utf8_entity_decode($entity)
{
    // Conversion map layout: first code point, last code point, offset, mask.
    // The range spans all of Unicode (U+0000..U+10FFFF) so that characters
    // outside the Basic Multilingual Plane (emoji, rare CJK, ...) are decoded
    // as well instead of being left behind as raw entities.
    $convmap = array(0x0, 0x10FFFF, 0, 0x1FFFFF);

    return mb_decode_numericentity($entity, $convmap, 'UTF-8');
}
// Read the entire XML document from standard input.
$xmlDoc = file_get_contents("php://stdin");
if ($xmlDoc === false) {
    // file_get_contents() returns false on failure; stop here rather than
    // silently running the clean-up on an empty document.
    file_put_contents('php://stderr', "Unable to read the XML document from stdin\n");
    exit(1);
}
/**
 * Replace named entities and typographic punctuation with plain ASCII.
 *
 * NOTE(review): in the reviewed copy of this script the search strings of the
 * first four replacements had been rendered as the characters themselves
 * (making them no-ops and, for the double quote, a parse error). They are
 * restored here as the named entities &quot;, &amp;, &apos; and &lt;. The
 * typographic characters are matched both as literal UTF-8 bytes and as
 * their named entities. Confirm both against the original script.
 *
 * @param string $text Document text to clean.
 * @return string The text with all replacements applied.
 */
function replace_named_entities($text)
{
    // Applied strictly in this order, like a chain of str_replace() calls.
    // "&amp;" is handled early, so a double-escaped "&amp;apos;" ends up as "'".
    // UTF-8 byte escapes are used so the list survives any re-encoding of
    // this source file.
    $replacements = array(
        '&quot;'       => '"',
        '&amp;'        => '&',
        '&apos;'       => "'",
        '&lt;'         => '<',
        // Horizontal ellipsis (U+2026).
        '&hellip;'     => '...',
        "\xE2\x80\xA6" => '...',
        // Curly single quotes (U+2018, U+2019).
        '&lsquo;'      => "'",
        "\xE2\x80\x98" => "'",
        '&rsquo;'      => "'",
        "\xE2\x80\x99" => "'",
        // Curly double quotes (U+201C, U+201D) become a straight SINGLE
        // quote, exactly as in the original replacement list.
        '&ldquo;'      => "'",
        "\xE2\x80\x9C" => "'",
        '&rdquo;'      => "'",
        "\xE2\x80\x9D" => "'",
        // Bullet (U+2022) and em dash (U+2014) become a hyphen.
        '&bull;'       => '-',
        "\xE2\x80\xA2" => '-',
        '&mdash;'      => '-',
        "\xE2\x80\x94" => '-',
    );

    return str_replace(array_keys($replacements), array_values($replacements), $text);
}

$xmlDoc = replace_named_entities($xmlDoc);
// No-break space (&#160;), NOT SIGN (&#172;) and soft hyphen (&#173;) must not
// survive as real characters: the first becomes a plain space, the other two
// are removed.
$layoutEntities = array("&#160;", "&#172;", "&#173;");
$layoutReplacements = array(" ", "", "");

// Run these replacements BEFORE the generic decoders. "&#160;" also matches
// the decimal pattern below, so decoding first would turn it into U+00A0 and
// leave nothing for the replacement to find.
$xmlDoc = str_replace($layoutEntities, $layoutReplacements, $xmlDoc);

// Decode decimal HTML entities added by the web browser. Up to seven digits
// are accepted so every code point up to U+10FFFF (1114111) can match.
$decodedDoc = preg_replace_callback(
    '/&#\d{2,7};/u',
    function ($match) {
        return utf8_entity_decode($match[0]);
    },
    $xmlDoc
);
if ($decodedDoc === null) {
    // null signals a PCRE failure (for example invalid UTF-8 with the /u
    // modifier); keep the text as it was instead of discarding the document.
    trigger_error('Decimal entity decoding failed (PCRE error ' . preg_last_error() . '); text left unchanged', E_USER_WARNING);
} else {
    $xmlDoc = $decodedDoc;
}

// Decode hexadecimal HTML entities added by the web browser. The character
// class covers all hex digits 0-9; a class stopping at 7 would skip
// entities such as &#x2019; or &#x2018;.
$decodedDoc = preg_replace_callback(
    '/&#x([0-9a-fA-F]{2,8});/u',
    function ($match) {
        // Rewrite the hex reference as a decimal one and reuse the shared decoder.
        return utf8_entity_decode('&#' . hexdec($match[1]) . ';');
    },
    $xmlDoc
);
if ($decodedDoc === null) {
    trigger_error('Hexadecimal entity decoding failed (PCRE error ' . preg_last_error() . '); text left unchanged', E_USER_WARNING);
} else {
    $xmlDoc = $decodedDoc;
}

// Second pass in the original position: catches references that only appear
// once the decoders have run (e.g. "&#38;#160;" decodes to "&#160;").
$xmlDoc = str_replace($layoutEntities, $layoutReplacements, $xmlDoc);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement