/**
 * Sanitise a string against common cross-site-scripting (XSS) vectors.
 *
 * The filter works in stages:
 *   1. normalise the input (null bytes, entities, URL encoding, tabs);
 *   2. replace tokens that are never allowed with "[removed]";
 *   3. escape PHP tag delimiters and compact "exploded" keywords;
 *   4. drop links/images carrying script, and escape tags that contain
 *      event handlers or disallowed element names;
 *   5. escape the parentheses of dangerous function calls;
 *   6. re-apply the stage 2 blacklists as a final safety net.
 *
 * This is a best-effort blacklist filter. It does not replace escaping the
 * value for the context in which it is eventually output.
 *
 * @param string $str Untrusted input.
 * @return string The filtered string. The PCRE functions used here return
 *                NULL on an engine error, in which case NULL propagates.
 */
function xss_clean($str) {
    /*
     * Remove null characters
     *
     * This prevents sandwiching null characters between ASCII
     * characters, like Java\0script. Both real NUL bytes and the
     * literal two-character sequence "\0" are removed.
     */
    $str = preg_replace('/\0+/', '', $str);
    $str = preg_replace('/(\\\\0)+/', '', $str);

    /*
     * Validate standard character entities
     *
     * Add a semicolon if missing, so that the entities can be
     * converted to ASCII later.
     */
    $str = preg_replace('#(&\#?[0-9a-z]+)[\x00-\x20]*;?#i', "\\1;", $str);

    /*
     * Validate UTF16 two byte encoding (x00)
     *
     * Just as above, adds a semicolon if missing.
     */
    $str = preg_replace('#(&\#x?)([0-9A-F]+);?#i', "\\1\\2;", $str);

    /*
     * URL decode
     *
     * Just in case stuff like this is submitted:
     * <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
     * rawurldecode() is used so that plus signs are left alone.
     */
    $str = rawurldecode($str);

    /*
     * Convert character entities to ASCII
     *
     * This permits the tests below to work reliably. Only quoted
     * attribute values and tags are handed to the callbacks, since
     * these are the places that pose security problems. Callbacks are
     * used instead of preg_match_all() + str_replace() because that
     * was found to be more memory-efficient. Both callbacks are
     * defined elsewhere in the enclosing class.
     */
    $str = preg_replace_callback("/[a-z]+=([\'\"]).*?\\1/si", array($this, '_attribute_conversion'), $str);
    $str = preg_replace_callback("/<([\w]+)[^>]*>/si", array($this, '_html_entity_decode_callback'), $str);

    /*
     * Convert all tabs to spaces
     *
     * This prevents strings like "ja<TAB>vascript". Spaces between
     * characters are dealt with later. str_replace() is used because
     * preg_replace() was found to be very slow here on large inputs.
     */
    $str = str_replace("\t", " ", $str);

    /*
     * Not allowed under any conditions
     *
     * Defined once and applied twice: here, and again in the final
     * clean up at the end of the function.
     */
    $bad_literals = array(
        'document.cookie' => '[removed]',
        'document.write'  => '[removed]',
        '.parentNode'     => '[removed]',
        '.innerHTML'      => '[removed]',
        'window.location' => '[removed]',
        '-moz-binding'    => '[removed]',
        '<!--'            => '&lt;!--',
        '-->'             => '--&gt;',
        '<!CDATA['        => '&lt;![CDATA[',
    );
    $bad_patterns = array(
        '#javascript\s*:#i'  => '[removed]',
        '#expression\s*\(#i' => '[removed]', // CSS and IE
        '#Redirect\s+302#i'  => '[removed]',
    );

    // With array arguments both functions apply the pairs one after another,
    // in order, exactly like a foreach over the list would.
    $str = str_replace(array_keys($bad_literals), array_values($bad_literals), $str);
    $str = preg_replace(array_keys($bad_patterns), array_values($bad_patterns), $str);

    /*
     * Make PHP tags safe
     *
     * Note: XML declarations are inadvertently escaped too, but that
     * does not seem to pose a problem. The closing delimiter is
     * written as two concatenated pieces on purpose.
     */
    $str = str_replace(
        array('<?php', '<?PHP', '<?', '?' . '>'),
        array('&lt;?php', '&lt;?PHP', '&lt;?', '?&gt;'),
        $str
    );

    /*
     * Compact any exploded words
     *
     * This corrects words like: j a v a s c r i p t
     * These words are compacted back to their correct state.
     */
    $words = array('javascript', 'expression', 'vbscript', 'script', 'applet', 'alert', 'document', 'write', 'cookie', 'window');

    // Strips the whitespace from the matched word and re-appends the
    // non-word character that followed it. A callback is used because the
    // /e modifier is no longer available in PHP 7+.
    $compact_word = function ($match) {
        return preg_replace('/\s+/s', '', $match[1]) . $match[2];
    };

    foreach ($words as $word) {
        // "script" becomes s\s*c\s*r\s*i\s*p\s*t
        $spaced = implode('\s*', str_split($word));

        // Only compact when the word is followed by a non-word character,
        // so that valid text like "dealer to" does not become "dealerto".
        $str = preg_replace_callback('#(' . $spaced . ')(\W)#is', $compact_word, $str);
    }

    /*
     * Remove disallowed Javascript in links or img tags
     *
     * NOTE(review): in the reviewed source this loop had no body, only
     * unbalanced braces. The body below is reconstructed from the
     * section title: an anchor whose href, or an image whose src,
     * carries a script-like token is dropped entirely. Confirm against
     * the upstream implementation.
     *
     * The loop repeats until a pass changes nothing, so that a removal
     * cannot splice together a new offending tag.
     */
    $js_tokens = '(alert\(|alert&\#40;|javascript:|window\.|document\.|\.cookie|<script|<xss)';

    $drop_js_link = function ($match) use ($js_tokens) {
        return preg_replace('#<a.+?href=.*?' . $js_tokens . '.*?>.*?</a>#si', '', $match[0]);
    };
    $drop_js_img = function ($match) use ($js_tokens) {
        return preg_replace('#<img.+?src=.*?' . $js_tokens . '.*?>#si', '', $match[0]);
    };

    do {
        $original = $str;

        // The cheap substring checks avoid running the regex engine
        // when there is nothing to inspect.
        if (stripos($str, '</a>') !== false) {
            $str = preg_replace_callback('#<a.*?</a>#si', $drop_js_link, $str);
        }

        if (stripos($str, '<img') !== false) {
            $str = preg_replace_callback('#<img.*?>#si', $drop_js_img, $str);
        }
    } while ($original !== $str);

    /*
     * Neutralise JavaScript event handlers
     *
     * Note: this is a little blunt. Any tag that contains one of the
     * names below anywhere before its closing bracket has its angle
     * brackets converted to entities, so the whole tag is rendered as
     * text. It is unlikely to be a problem.
     */
    $event_handlers = array('onblur', 'onchange', 'onclick', 'onfocus', 'onload', 'onmouseover', 'onmouseup', 'onmousedown', 'onselect', 'onsubmit', 'onunload', 'onkeypress', 'onkeydown', 'onkeyup', 'onresize', 'xmlns');
    $str = preg_replace("#<([^>]+)(" . implode('|', $event_handlers) . ")([^>]*)>#iU", "&lt;\\1\\2\\3&gt;", $str);

    /*
     * Sanitize naughty HTML elements
     *
     * If a tag starting with any of the words in the list below is
     * found, the tag gets converted to entities.
     *
     * So this: <blink>
     * Becomes: &lt;blink&gt;
     */
    $str = preg_replace('#<(/*\s*)(alert|applet|basefont|base|behavior|bgsound|blink|body|embed|expression|form|frameset|frame|head|html|ilayer|iframe|input|layer|link|meta|object|plaintext|style|script|textarea|title|xml|xss)([^>]*)>#is', "&lt;\\1\\2\\3&gt;", $str);

    /*
     * Sanitize naughty scripting elements
     *
     * Similar to above, only instead of looking for tags it looks for
     * PHP and JavaScript commands that are disallowed. Rather than
     * removing the code, it converts the parentheses to entities,
     * rendering the code un-executable.
     *
     * For example: eval('some code')
     * Becomes:     eval&#40;'some code'&#41;
     */
    $str = preg_replace('#(alert|cmd|passthru|eval|exec|expression|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si', "\\1\\2&#40;\\3&#41;", $str);

    /*
     * Final clean up
     *
     * This adds a bit of extra precaution in case something got
     * through the above filters: the blacklists from the start of the
     * function are applied a second time.
     */
    $str = str_replace(array_keys($bad_literals), array_values($bad_literals), $str);
    $str = preg_replace(array_keys($bad_patterns), array_values($bad_patterns), $str);

    log_message('debug', "XSS Filtering completed");

    return $str;
}