factory = new HTMLPurifier_TokenFactory ();
}
/**
 * Tokenizes an HTML fragment by letting DOMDocument parse it and then
 * walking the resulting tree.
 *
 * The fragment is normalized, optionally has stray "<" characters armored,
 * is wrapped into a full document by wrapHTML(), and is then parsed with
 * libxml. The tokens are read from the first <div> inside <body>.
 *
 * @param string $html
 *            HTML fragment to tokenize.
 * @param HTMLPurifier_Config $config
 *            Configuration; 'Core.AggressivelyFixLt' is consulted here.
 * @param HTMLPurifier_Context $context
 *            Context passed through to normalize() and wrapHTML().
 * @return HTMLPurifier_Token[] Tokens in document order.
 */
public function tokenizeHTML($html, $config, $context)
{
    $html = $this->normalize($html, $config, $context);

    // attempt to armor stray angled brackets that cannot possibly
    // form tags and thus are probably being used as emoticons
    if ($config->get('Core.AggressivelyFixLt')) {
        // Any character that cannot legally follow "<" to start a tag,
        // a closing tag, or a declaration/comment.
        $char = '[^a-z!\/]';
        // A whole comment, or an unterminated one running to end of input.
        // Group 1 is the comment body, group 2 the terminator (possibly empty).
        $comment = '/<!--(.*?)(-->|\z)/is';

        // Hide comment contents so the "<" armoring below leaves them alone.
        $html = preg_replace_callback(
            $comment,
            array($this, 'callbackArmorCommentEntities'),
            $html
        );

        // Repeat until stable: consecutive brackets such as "<<" need
        // more than one pass because matches cannot overlap.
        do {
            $old = $html;
            $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
        } while ($html !== $old);

        // fix comments
        $html = preg_replace_callback(
            $comment,
            array($this, 'callbackUndoCommentSubst'),
            $html
        );
    }

    // preprocess html, essential for UTF-8
    $html = $this->wrapHTML($html, $config, $context);

    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

    // libxml reports malformed markup as PHP warnings; silence them
    // for the duration of the parse only.
    set_error_handler(array($this, 'muteErrorHandler'));
    $doc->loadHTML($html);
    restore_error_handler();

    $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
                  getElementsByTagName('body')->item(0);  // <body>
    $div = $body->getElementsByTagName('div')->item(0);   // <div>

    $tokens = array();
    $this->tokenizeDOM($div, $tokens);

    // If the div has a sibling, the input contained a premature </div>
    // that closed the wrapper early. Drop the div we already tokenized
    // and tokenize what is left of <body>; tokenizing the siblings
    // directly would lose their own tags, because tokenizeDOM() does not
    // emit tokens for the root node it is given.
    if ($div->nextSibling) {
        $body->removeChild($div);
        $this->tokenizeDOM($body, $tokens);
    }

    return $tokens;
}
/**
 * Walks a DOM subtree without recursion, appending the tokens it produces
 * to an accumulator.
 * To iterate is human, to recurse divine - L. Peter Deutsch
 *
 * One FIFO queue of pending nodes is kept per depth. Whenever a node has
 * children, a new queue is opened one level deeper and processed before
 * the remaining siblings. When a level is exhausted, the end tokens still
 * owed at the level above are emitted. The root node (depth zero) is
 * passed to createStartNode() with collection disabled, and end tokens
 * are never emitted for depth zero.
 *
 * @param DOMNode $node
 *            Root of the subtree to tokenize.
 * @param HTMLPurifier_Token[] $tokens
 *            Accumulator, taken by reference; tokens are appended in place.
 * @return void
 */
protected function tokenizeDOM($node, &$tokens)
{
    $depth = 0;
    // Pending nodes, one queue per depth.
    $pending = array(
        $depth => new HTMLPurifier_Queue(array($node)),
    );
    // Per depth, the nodes that still need an end token.
    $unclosed = array();

    do {
        while (!$pending[$depth]->isEmpty()) {
            $current = $pending[$depth]->shift(); // FIFO
            $emit = ($depth > 0);

            if ($this->createStartNode($current, $tokens, $emit)) {
                $unclosed[$depth][] = $current;
            }

            if ($current->childNodes && $current->childNodes->length) {
                // Descend: the children are handled before the
                // remaining siblings of $current.
                $depth++;
                $pending[$depth] = new HTMLPurifier_Queue();
                foreach ($current->childNodes as $child) {
                    $pending[$depth]->push($child);
                }
            }
        }

        // This level is drained; step back up and close whatever is
        // still open there.
        $depth--;
        if ($depth !== 0 && isset($unclosed[$depth])) {
            while ($open = array_pop($unclosed[$depth])) {
                $this->createEndNode($open, $tokens);
            }
        }
    } while ($depth > 0);
}
/**
*
* @param DOMNode $node
* DOMNode to be tokenized.
* @param HTMLPurifier_Token[] $tokens
* Array-list of already tokenized tokens.
* @param bool $collect
* Says whether or not start and end tokens are collected; set to
* false for the root node because it's the implicit DIV
* tag you're dealing with.
* @return bool if the token needs an endtoken
* @todo data and tagName properties don't seem to exist in DOMNode?
*/
protected function createStartNode($node, &$tokens, $collect) {
// intercept non element nodes. WE MUST catch all of them,
// but we're not getting the character reference nodes because
// those should have been preprocessed
if ($node->nodeType === XML_TEXT_NODE) {
$tokens [] = $this->factory->createText ( $node->data );
return false;
} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
// undo libxml's special treatment of