# This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License # http://creativecommons.org/licenses/by-sa/2.5/ # # Thanks to Jang Kim for adding support for single quoted attributes # $filter = new lib_filter(); class lib_filter { var $tag_counts = array(); # # tags and attributes that are allowed # var $allowed = array( 'a' => array('href', 'target'), 'b' => array(), 'img' => array('src', 'width', 'height', 'alt'), ); # # tags which should always be self-closing (e.g. "") # var $no_close = array( 'img', ); # # tags which must always have seperate opening and closing tags (e.g. "") # var $always_close = array( 'a', 'b', ); # # attributes which should be checked for valid protocols # var $protocol_attributes = array( 'src', 'href', ); # # protocols which are allowed # var $allowed_protocols = array( 'http', 'ftp', 'mailto', ); # # tags which should be removed if they contain no content (e.g. "" or "") # var $remove_blanks = array( 'a', 'b', ); # # should we remove comments? # var $strip_comments = 1; # # should we try and make a b tag out of "b>" # var $always_make_tags = 1; # # entity control options # var $allow_numbered_entities = 1; var $allowed_entities = array( 'amp', 'gt', 'lt', 'quot', ); ############################################################### function go($data){ $this->tag_counts = array(); $data = $this->escape_comments($data); $data = $this->balance_html($data); $data = $this->check_tags($data); $data = $this->process_remove_blanks($data); $data = $this->validate_entities($data); return $data; } ############################################################### function escape_comments($data){ $data = preg_replace("//se", "''", $data); return $data; } ############################################################### function balance_html($data){ if ($this->always_make_tags){ # # try and form html # $data = preg_replace("/>>+/", ">", $data); $data = preg_replace("/<<+/", "<", $data); $data = preg_replace("/^>/", "", $data); $data = preg_replace("/<([^>]*?)(?=<|$)/", "<$1>", $data); $data = preg_replace("/(^|>)([^<]*?)(?=>)/", "$1<$2", $data); }else{ # # escape stray brackets # $data = preg_replace("/<([^>]*?)(?=<|$)/", "<$1", $data); $data = preg_replace("/(^|>)([^<]*?)(?=>)/", "$1$2><", $data); # # the last regexp causes '<>' entities to appear # (we need to do a lookahead assertion so that the last bracket can # be used in the next pass of the regexp) # $data = str_replace('<>', '', $data); } #echo "::".HtmlSpecialChars($data)."
\n"; return $data; } ############################################################### function check_tags($data){ $data = preg_replace("/<(.*?)>/se", "\$this->process_tag(\$this->StripSingle('\\1'))", $data); foreach(array_keys($this->tag_counts) as $tag){ for($i=0; $i<$this->tag_counts[$tag]; $i++){ $data .= ""; } } return $data; } ############################################################### function process_tag($data){ # ending tags if (preg_match("/^\/([a-z0-9]+)/si", $data, $matches)){ $name = StrToLower($matches[1]); if (in_array($name, array_keys($this->allowed))){ if (!in_array($name, $this->no_close)){ if ($this->tag_counts[$name]){ $this->tag_counts[$name]--; return ''; } } }else{ return ''; } } # starting tags if (preg_match("/^([a-z0-9]+)(.*?)(\/?)$/si", $data, $matches)){ $name = StrToLower($matches[1]); $body = $matches[2]; $ending = $matches[3]; if (in_array($name, array_keys($this->allowed))){ $params = ""; preg_match_all("/([a-z0-9]+)=([\"'])(.*?)\\2/si", $body, $matches_2, PREG_SET_ORDER); # preg_match_all("/([a-z0-9]+)(=)([^\"\s']+)/si", $body, $matches_1, PREG_SET_ORDER); # preg_match_all("/([a-z0-9]+)=([\"'])([^\"']*?)\s*$/si", $body, $matches_3, PREG_SET_ORDER); # no_close)){ $ending = ' /'; } if (in_array($name, $this->always_close)){ $ending = ''; } if (!$ending){ if (isset($this->tag_counts[$name])){ $this->tag_counts[$name]++; }else{ $this->tag_counts[$name] = 1; } } if ($ending){ $ending = ' /'; } return '<'.$name.$params.$ending.'>'; }else{ return ''; } } # comments if (preg_match("/^!--(.*)--$/si", $data)){ if ($this->strip_comments){ return ''; }else{ return '<'.$data.'>'; } } # garbage, ignore it return ''; } ############################################################### function process_param_protocol($data){ $data = $this->decode_entities($data); if (preg_match("/^([^:]+)\:/si", $data, $matches)){ if (!in_array($matches[1], $this->allowed_protocols)){ $data = '#'.substr($data, strlen($matches[1])+1); } } return $data; } ############################################################### function process_remove_blanks($data){ foreach($this->remove_blanks as $tag){ $data = preg_replace("/<{$tag}(\s[^>]*)?><\\/{$tag}>/", '', $data); $data = preg_replace("/<{$tag}(\s[^>]*)?\\/>/", '', $data); } return $data; } ############################################################### function fix_case($data){ $data_notags = Strip_Tags($data); $data_notags = preg_replace('/[^a-zA-Z]/', '', $data_notags); if (strlen($data_notags)<5){ return $data; } if (preg_match('/[a-z]/', $data_notags)){ return $data; } return preg_replace( "/(>|^)([^<]+?)(<|$)/se", "\$this->StripSingle('\\1').". "\$this->fix_case_inner(\$this->StripSingle('\\2')).". "\$this->StripSingle('\\3')", $data ); } function fix_case_inner($data){ $data = StrToLower($data); $data = preg_replace( '/(^|[^\w\s\';,\\-])(\s*)([a-z])/e', "\$this->StripSingle('\\1\\2').StrToUpper(\$this->StripSingle('\\3'))", $data ); return $data; } ############################################################### function validate_entities($data){ # validate entities throughout the string $data = preg_replace( '!&([^&;]*)(?=(;|&|$))!e', "\$this->check_entity(\$this->StripSingle('\\1'), \$this->StripSingle('\\2'))", $data ); # validate quotes outside of tags $data = preg_replace( "/(>|^)([^<]+?)(<|$)/se", "\$this->StripSingle('\\1').". "str_replace('\"', '"', \$this->StripSingle('\\2')).". "\$this->StripSingle('\\3')", $data ); return $data; } function check_entity($preamble, $term){ if ($term != ';'){ return '&'.$preamble; } if ($this->is_valid_entity($preamble)){ return '&'.$preamble; } return '&'.$preamble; } function is_valid_entity($entity){ if (preg_match('!^#([0-9]+)$!i', $entity, $m)){ if ($m[1] > 127){ return 1; } return $this->allow_numbered_entities; } if (in_array($entity, $this->allowed_entities)){ return 1; } return 0; } ############################################################### # # within attributes, we want to convert all hex/dec/url escape sequences into # their raw characters so that we can check we don't get stray quotes/brackets # inside strings # function decode_entities($data){ $data = preg_replace_callback('!(&)#(\d+);?!', array($this, 'decode_dec_entity'), $data); $data = preg_replace_callback('!(&)#x([0-9a-f]+);?!i', array($this, 'decode_hex_entity'), $data); $data = preg_replace_callback('!(%)([0-9a-f]{2});?!i', array($this, 'decode_hex_entity'), $data); $data = $this->validate_entities($data); return $data; } function decode_hex_entity($m){ return $this->decode_num_entity($m[1], hexdec($m[2])); } function decode_dec_entity($m){ return $this->decode_num_entity($m[1], intval($m[2])); } function decode_num_entity($orig_type, $d){ if ($d < 0){ $d = 32; } # space # don't mess with huigh chars if ($d > 127){ if ($orig_type == '%'){ return '%'.dechex($d); } if ($orig_type == '&'){ return "&#$d;"; } } return HtmlSpecialChars(chr($d)); } ############################################################### function StripSingle($data){ return str_replace(array('\\"', "\\0"), array('"', chr(0)), $data); } ############################################################### } ?>