get('Core.LexerImpl');
        }

        $needs_tracking =
            $config->get('Core.MaintainLineNumbers') ||
            $config->get('Core.CollectErrors');

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {
            if (is_null($lexer)) {
                do {
                    // auto-detection algorithm
                    if ($needs_tracking) {
                        $lexer = 'DirectLex';
                        break;
                    }

                    if (class_exists('DOMDocument') &&
                        method_exists('DOMDocument', 'loadHTML') &&
                        !extension_loaded('domxml')
                    ) {
                        // check for DOM support, because while it's part of the
                        // core, it can be disabled compile time. Also, the PECL
                        // domxml extension overrides the default DOM, and is evil
                        // and nasty and we shan't bother to support it
                        $lexer = 'DOMLex';
                    } else {
                        $lexer = 'DirectLex';
                    }
                } while (0);
            } // do..while so we can break

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;
    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Creates the entity parser and stores it in $this->_entity_parser,
     * which parseData() falls back to for entities that are not covered
     * by the special-entity lookup table.
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
* Keys are the escaped forms, values the literal characters they stand
     * for. The apostrophe is listed in its three numeric spellings
     * (decimal, zero-padded decimal and hexadecimal).
     *
     * @type array
     */
    protected $_special_entity2str = array(
        '&quot;' => '"',
        '&amp;' => '&',
        '&lt;' => '<',
        '&gt;' => '>',
        '&#39;' => "'",
        '&#039;' => "'",
        '&#x27;' => "'"
    );

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
     * The common entities are handled with a single strtr() pass over the
     * lookup table; the entity parser is only consulted when ampersands
     * remain that the table cannot account for.
     *
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     *
     * @param string $string
     *            String character data to be parsed.
     * @return string Parsed character data.
     */
    public function parseData($string)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped
        // (an ampersand followed by a space, or one that ends the string)
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string;
        } // abort if no entities

        // Every "&amp;" turns into a literal "&" in the strtr() pass below,
        // so remember how many there were before translating.
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        // If the remaining candidates can all be explained by decoded
        // "&amp;" sequences, there is nothing left to translate.
        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation does no lexing: it only raises an
     * E_USER_ERROR with the message 'Call to abstract class'.
     * NOTE(review): presumably every concrete lexer overrides this method;
     * confirm against the subclasses.
     *
     * @param $string String
     *            HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[] array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     *
     * @param string $string
     *            HTML string to process.
     * @return string HTML with CDATA sections escaped.
*/
    protected static function escapeCDATA($string)
    {
        // Match each <![CDATA[ ... ]]> section. The "s" modifier lets "."
        // span newlines and the lazy quantifier stops at the first "]]>",
        // so adjacent sections are handled one at a time.
        // NOTE(review): group 1 (the section body) is assumed to be what
        // CDATACallback reads - confirm against that method.
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for