get ( 'Core.LexerImpl' );
}
$needs_tracking = $config->get ( 'Core.MaintainLineNumbers' ) || $config->get ( 'Core.CollectErrors' );
$inst = null;
if (is_object ( $lexer )) {
$inst = $lexer;
} else {
if (is_null ( $lexer )) {
do {
// auto-detection algorithm
if ($needs_tracking) {
$lexer = 'DirectLex';
break;
}
if (class_exists ( 'DOMDocument' ) && method_exists ( 'DOMDocument', 'loadHTML' ) && ! extension_loaded ( 'domxml' )) {
// check for DOM support, because while it's part of the
// core, it can be disabled compile time. Also, the PECL
// domxml extension overrides the default DOM, and is evil
// and nasty and we shan't bother to support it
$lexer = 'DOMLex';
} else {
$lexer = 'DirectLex';
}
} while ( 0 );
} // do..while so we can break
// instantiate recognized string names
switch ($lexer) {
case 'DOMLex' :
$inst = new HTMLPurifier_Lexer_DOMLex ();
break;
case 'DirectLex' :
$inst = new HTMLPurifier_Lexer_DirectLex ();
break;
case 'PH5P' :
$inst = new HTMLPurifier_Lexer_PH5P ();
break;
default :
throw new HTMLPurifier_Exception ( "Cannot instantiate unrecognized Lexer type " . htmlspecialchars ( $lexer ) );
}
}
if (! $inst) {
throw new HTMLPurifier_Exception ( 'No lexer was instantiated' );
}
// once PHP DOM implements native line numbers, or we
// hack out something using XSLT, remove this stipulation
if ($needs_tracking && ! $inst->tracksLineNumbers) {
throw new HTMLPurifier_Exception ( 'Cannot use lexer that does not support line numbers with ' . 'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)' );
}
return $inst;
}
// -- CONVENIENCE MEMBERS ---------------------------------------------
/**
 * Creates the entity parser instance and stores it on the lexer.
 *
 * parseData() calls substituteSpecialEntities() on this object when the
 * quick strtr() replacement is not enough.
 */
public function __construct() {
$this->_entity_parser = new HTMLPurifier_EntityParser ();
}
/**
 * Most common entity to raw value conversion table for special entities.
 *
 * Keys are the escaped entity spellings and values are the literal
 * characters they stand for. parseData() passes this table to strtr(), so
 * the keys must be the entity strings themselves, not the characters they
 * decode to. The three apostrophe spellings (decimal, zero-padded decimal
 * and hexadecimal) all map to the same character.
 *
 * @type array
 */
protected $_special_entity2str = array(
    '&quot;' => '"',
    '&amp;' => '&',
    '&lt;' => '<',
    '&gt;' => '>',
    '&#39;' => "'",
    '&#039;' => "'",
    '&#x27;' => "'"
);
/**
 * Parses special entities into the proper characters.
 *
 * This string will translate escaped versions of the special characters
 * into the correct ones.
 *
 * The work is done in up to three stages, each one skipped when the cheaper
 * stage before it already proves nothing is left to do:
 *  1. count the ampersands that could start an entity; return early if none;
 *  2. replace the common entities with strtr() and the lookup table;
 *  3. hand the string to the entity parser for anything still unresolved.
 *
 * @warning
 * You should be able to treat the output of this function as
 * completely parsed, but that's only because all other entities should
 * have been handled previously in substituteNonSpecialEntities()
 *
 * @param string $string
 *            String character data to be parsed.
 * @return string Parsed character data.
 */
public function parseData($string) {
    // following functions require at least one character
    if ($string === '') {
        return '';
    }

    // subtracts amps that cannot possibly be escaped: an ampersand followed
    // by a space, or one sitting at the very end of the string
    $num_amp = substr_count($string, '&') - substr_count($string, '& ')
        - ($string[strlen($string) - 1] === '&' ? 1 : 0);
    if (! $num_amp) {
        // abort if no entities
        return $string;
    }

    // Every '&amp;' becomes a bare '&' in the strtr() pass below, so count
    // them first: that many ampersands are expected to survive the pass
    // without being the start of an unresolved entity.
    $num_esc_amp = substr_count($string, '&amp;');
    $string = strtr($string, $this->_special_entity2str);

    // code duplication for sake of optimization, see above
    $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ')
        - ($string[strlen($string) - 1] === '&' ? 1 : 0);
    if ($num_amp_2 <= $num_esc_amp) {
        // all remaining ampersands are accounted for by decoded '&amp;'s
        return $string;
    }

    // hmm... now we have some uncommon entities. Use the callback.
    return $this->_entity_parser->substituteSpecialEntities($string);
}
/**
 * Lexes an HTML string into tokens.
 *
 * This implementation performs no lexing: it only raises an E_USER_ERROR
 * with the message 'Call to abstract class' and has no return statement.
 * NOTE(review): concrete lexers are presumably expected to override this
 * method - confirm against the subclasses.
 *
 * @param string $string
 *            HTML.
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return HTMLPurifier_Token[] array representation of HTML.
 */
public function tokenizeHTML($string, $config, $context) {
// none of the arguments are used here; the call below only reports the error
trigger_error ( 'Call to abstract class', E_USER_ERROR );
}
/**
 * Translates CDATA sections into regular sections (through escaping).
 *
 * Every CDATA section (opening marker, contents, closing marker) is
 * replaced by whatever HTMLPurifier_Lexer::CDATACallback() returns for it.
 * The match is non-greedy, so two sections in one string are handled
 * separately, and the `s` modifier lets a section span several lines.
 *
 * NOTE(review): capture group 1 holds the section contents without the
 * markers; CDATACallback() is presumably written against that group -
 * confirm against its definition.
 *
 * @param string $string
 *            HTML string to process.
 * @return string HTML with CDATA sections escaped.
 */
protected static function escapeCDATA($string) {
    return preg_replace_callback(
        '/<!\[CDATA\[(.+?)\]\]>/s',
        array(
            'HTMLPurifier_Lexer',
            'CDATACallback'
        ),
        $string
    );
}
/**
* Special CDATA case that is especially convoluted for