factory = new HTMLPurifier_TokenFactory (); }

    /**
     * Tokenizes an HTML string by parsing it with DOMDocument and walking
     * the resulting tree.
     *
     * Steps, in order: normalize the input, optionally armor stray "<"
     * characters (when Core.AggressivelyFixLt is enabled), wrap the markup
     * via wrapHTML(), parse it with DOMDocument::loadHTML() while parser
     * diagnostics are routed to muteErrorHandler(), and finally tokenize
     * the first html > body > div element of the parsed document.
     *
     * @param string $html
     *            Markup to tokenize.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            // any character that cannot begin a tag name, a closing tag
            // or a markup declaration
            $char = '[^a-z!\/]';
            // an HTML comment, either properly terminated or running to
            // the end of the input
            $comment = "/<!--(.*?)(-->|\z)/is";
            // NOTE(review): presumably shields comment bodies from the
            // armoring pass below; the callback is defined elsewhere.
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // Repeat until stable: in a run such as "<<" the second "<" is
            // consumed as the captured character of the first match, so a
            // single pass would leave it unescaped.
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // route libxml's parse warnings to muteErrorHandler() for the
        // duration of the parse only
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $tokens = array();
        $this->tokenizeDOM(
            $doc->getElementsByTagName('html')->item(0)-> // <html>
            getElementsByTagName('body')->item(0)-> //   <body>
            getElementsByTagName('div')->item(0), //     <div>
            $tokens
        );
        return $tokens;
    }

    /**
     * Iterative function that tokenizes a node, putting it into an accumulator.
     * To iterate is human, to recurse divine - L. Peter Deutsch
     *
     * The tree is walked with one queue of pending nodes per depth level.
     * Nodes for which createStartNode() reports that an end token is needed
     * are remembered per level and closed, most recently opened first, once
     * the level below them has been fully drained. Level 0 (the node passed
     * in) never gets start/end tokens collected.
     *
     * @param DOMNode $node
     *            DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens
     *            Array-list of already tokenized tokens; new tokens are
     *            appended to it by reference.
     * @return void
     */
    protected function tokenizeDOM($node, &$tokens)
    {
        $level = 0;
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
        $closingNodes = array();
        do {
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift(); // FIFO
                // the root node (level 0) is not collected
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    // descend: the loop condition now reads the child queue
                    $level++;
                    $nodes[$level] = new HTMLPurifier_Queue();
                    foreach ($node->childNodes as $childNode) {
                        $nodes[$level]->push($childNode);
                    }
                }
            }
            // current level exhausted: step back up and close what was
            // opened on the parent level
            $level--;
            if ($level && isset($closingNodes[$level])) {
                while ($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /** * * @param DOMNode $node * DOMNode to be tokenized. * @param HTMLPurifier_Token[] $tokens * Array-list of already tokenized tokens. * @param bool $collect * Says whether or start and close are collected, set to * false at first recursion because it's the implicit DIV * tag you're dealing with. * @return bool if the token needs an endtoken * @todo data and tagName properties don't seem to exist in DOMNode? */ protected function createStartNode($node, &$tokens, $collect) { // intercept non element nodes. 
WE MUST catch all of them, // but we're not getting the character reference nodes because // those should have been preprocessed if ($node->nodeType === XML_TEXT_NODE) { $tokens [] = $this->factory->createText ( $node->data ); return false; } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) { // undo libxml's special treatment of