config->set ( 'Core.MaintainLineNumbers', true ); $lexer = HTMLPurifier_Lexer::create ( $this->config ); $this->assertIsA ( $lexer, 'HTMLPurifier_Lexer_DirectLex' ); }

    /**
     * Core.LexerImpl is set to a lexer object instead of a name; the lexer
     * returned by create() must be a DirectLex instance.
     */
    public function test_create_objectLexerImpl()
    {
        $this->config->set('Core.LexerImpl', new HTMLPurifier_Lexer_DirectLex());
        $lexer = HTMLPurifier_Lexer::create($this->config);
        $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
    }

    /**
     * An unrecognized Core.LexerImpl name must make create() throw.
     */
    public function test_create_unknownLexer()
    {
        $this->config->set('Core.LexerImpl', 'AsdfAsdf');
        // The expectation has to be registered before the call that throws.
        $this->expectException(
            new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf')
        );
        HTMLPurifier_Lexer::create($this->config);
    }

    /**
     * Requesting DOMLex together with Core.MaintainLineNumbers must make
     * create() throw.
     */
    public function test_create_incompatibleLexer()
    {
        $this->config->set('Core.LexerImpl', 'DOMLex');
        $this->config->set('Core.MaintainLineNumbers', true);
        // The expectation has to be registered before the call that throws.
        $this->expectException(
            new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)')
        );
        HTMLPurifier_Lexer::create($this->config);
    }

    // HTMLPurifier_Lexer->parseData() -----------------------------------------

    /**
     * Asserts that parseData() on a fresh HTMLPurifier_Lexer turns $input
     * into $expect.
     *
     * @param string      $input  Character data handed to parseData().
     * @param string|bool $expect Expected result; true (the default) means
     *                            the input must come back unchanged.
     */
    public function assertParseData($input, $expect = true)
    {
        if ($expect === true) {
            $expect = $input;
        }
        $lexer = new HTMLPurifier_Lexer();
        $this->assertIdentical($expect, $lexer->parseData($input));
    }

    // NOTE(review): the entity-encoded inputs below were restored from the
    // test names; the previous literals were already decoded (input equal to
    // the expectation, or an unbalanced ''' quote sequence). Confirm the exact
    // spellings against the upstream test file.

    public function test_parseData_plainText()
    {
        $this->assertParseData('asdf');
    }

    public function test_parseData_ampersandEntity()
    {
        $this->assertParseData('&amp;', '&');
    }

    public function test_parseData_quotEntity()
    {
        $this->assertParseData('&quot;', '"');
    }

    public function test_parseData_aposNumericEntity()
    {
        // Zero-padded numeric reference for the apostrophe.
        $this->assertParseData('&#039;', "'");
    }

    public function test_parseData_aposCompactNumericEntity()
    {
        // Unpadded numeric reference for the apostrophe.
        $this->assertParseData('&#39;', "'");
    }

    public function test_parseData_adjacentAmpersandEntities()
    {
        $this->assertParseData('&amp;&amp;&amp;', '&&&');
    }

    public function test_parseData_trailingUnescapedAmpersand()
    {
        // One escaped ampersand followed by a bare one.
        $this->assertParseData('&amp;&', '&&');
    }
// ---------------------------------------------------------------------------
// Remaining parseData() cases, followed by the extractBody() and
// tokenizeHTML() suites.
//
// assertExtractBody($text, $extract = true)
//   Runs extractBody() on a fresh HTMLPurifier_Lexer and asserts the result is
//   identical to $extract; passing true (the default) means the text is
//   expected back unchanged.
//
// assertTokenization($input, $expect, $alt_expect = array())
//   Tokenizes $input with DirectLex and, when the DOMDocument class exists,
//   also with DOMLex and PH5P, using $this->config and $this->context.
//   $alt_expect is keyed by lexer name: an array replaces $expect for that
//   lexer, and false skips the assertion for that lexer altogether. When the
//   tokens loosely differ from the expectation they are dumped via
//   printTokens().
//
// NOTE(review): the line structure of this section looks collapsed: every
// "//" comment now runs to the end of its physical line and hides the code
// after it, and a number of input literals appear to have lost their markup
// or entity encoding (for example assertTokenization('', ...) paired with a
// non-empty token list). Confirm against the upstream test file before
// trusting or editing individual cases.
// ---------------------------------------------------------------------------
public function test_parseData_internalUnescapedAmpersand() { $this->assertParseData ( 'Procter & Gamble' ); } public function test_parseData_improperEntityFaultToleranceTest() { $this->assertParseData ( '-' ); } // HTMLPurifier_Lexer->extractBody() --------------------------------------- public function assertExtractBody($text, $extract = true) { $lexer = new HTMLPurifier_Lexer (); $result = $lexer->extractBody ( $text ); if ($extract === true) $extract = $text; $this->assertIdentical ( $extract, $result ); } public function test_extractBody_noBodyTags() { $this->assertExtractBody ( 'Bold' ); } public function test_extractBody_lowercaseBodyTags() { $this->assertExtractBody ( 'Bold', 'Bold' ); } public function test_extractBody_uppercaseBodyTags() { $this->assertExtractBody ( 'Bold', 'Bold' ); } public function test_extractBody_realisticUseCase() { $this->assertExtractBody ( ' xyz
', '
' ); } public function test_extractBody_bodyWithAttributes() { $this->assertExtractBody ( 'Bold', 'Bold' ); } public function test_extractBody_preserveUnclosedBody() { $this->assertExtractBody ( 'asdf' ); // not closed, don't accept } public function test_extractBody_useLastBody() { $this->assertExtractBody ( 'foobar', 'foobar' ); } // HTMLPurifier_Lexer->tokenizeHTML() -------------------------------------- public function assertTokenization($input, $expect, $alt_expect = array()) { $lexers = array (); $lexers ['DirectLex'] = new HTMLPurifier_Lexer_DirectLex (); if (class_exists ( 'DOMDocument' )) { $lexers ['DOMLex'] = new HTMLPurifier_Lexer_DOMLex (); $lexers ['PH5P'] = new HTMLPurifier_Lexer_PH5P (); } foreach ( $lexers as $name => $lexer ) { $result = $lexer->tokenizeHTML ( $input, $this->config, $this->context ); if (isset ( $alt_expect [$name] )) { if ($alt_expect [$name] === false) continue; $t_expect = $alt_expect [$name]; $this->assertIdentical ( $result, $alt_expect [$name], "$name: %s" ); } else { $t_expect = $expect; $this->assertIdentical ( $result, $expect, "$name: %s" ); } if ($t_expect != $result) { printTokens ( $result ); } } } public function test_tokenizeHTML_emptyInput() { $this->assertTokenization ( '', array () ); } public function test_tokenizeHTML_plainText() { $this->assertTokenization ( 'This is regular text.', array ( new HTMLPurifier_Token_Text ( 'This is regular text.' ) ) ); } public function test_tokenizeHTML_textAndTags() { $this->assertTokenization ( 'This is bold text', array ( new HTMLPurifier_Token_Text ( 'This is ' ), new HTMLPurifier_Token_Start ( 'b', array () ), new HTMLPurifier_Token_Text ( 'bold' ), new HTMLPurifier_Token_End ( 'b' ), new HTMLPurifier_Token_Text ( ' text' ) ) ); } public function test_tokenizeHTML_normalizeCase() { $this->assertTokenization ( '
Totally rad dude. asdf
', array ( new HTMLPurifier_Token_Start ( 'DIV', array () ), new HTMLPurifier_Token_Text ( 'Totally rad dude. ' ), new HTMLPurifier_Token_Start ( 'b', array () ), new HTMLPurifier_Token_Text ( 'asdf' ), new HTMLPurifier_Token_End ( 'b' ), new HTMLPurifier_Token_End ( 'div' ) ) ); } public function test_tokenizeHTML_notWellFormed() { $this->assertTokenization ( '', array ( new HTMLPurifier_Token_Start ( 'asdf' ), new HTMLPurifier_Token_End ( 'asdf' ), new HTMLPurifier_Token_Start ( 'd' ), new HTMLPurifier_Token_End ( 'd' ), new HTMLPurifier_Token_Start ( 'poOloka' ), new HTMLPurifier_Token_Start ( 'poolasdf' ), new HTMLPurifier_Token_Start ( 'ds' ), new HTMLPurifier_Token_End ( 'asdf' ), new HTMLPurifier_Token_End ( 'ASDF' ) ), array ( 'DOMLex' => $alt = array ( new HTMLPurifier_Token_Empty ( 'asdf' ), new HTMLPurifier_Token_Empty ( 'd' ), new HTMLPurifier_Token_Start ( 'pooloka' ), new HTMLPurifier_Token_Start ( 'poolasdf' ), new HTMLPurifier_Token_Empty ( 'ds' ), new HTMLPurifier_Token_End ( 'poolasdf' ), new HTMLPurifier_Token_End ( 'pooloka' ) ), 'PH5P' => $alt ) ); } public function test_tokenizeHTML_whitespaceInTag() { $this->assertTokenization ( 'Link to foobar', array ( new HTMLPurifier_Token_Start ( 'a', array ( 'href' => 'foobar.php', 'title' => 'foo!' ) ), new HTMLPurifier_Token_Text ( 'Link to ' ), new HTMLPurifier_Token_Start ( 'b', array ( 'id' => 'asdf' ) ), new HTMLPurifier_Token_Text ( 'foobar' ), new HTMLPurifier_Token_End ( 'b' ), new HTMLPurifier_Token_End ( 'a' ) ) ); } public function test_tokenizeHTML_singleAttribute() { $this->assertTokenization ( '
', array ( new HTMLPurifier_Token_Empty ( 'br', array ( 'style' => '&' ) ) ) ); } public function test_tokenizeHTML_emptyTag() { $this->assertTokenization ( '
', array ( new HTMLPurifier_Token_Empty ( 'br' ) ) ); } public function test_tokenizeHTML_comment() { $this->assertTokenization ( '', array ( new HTMLPurifier_Token_Comment ( ' Comment ' ) ) ); } public function test_tokenizeHTML_malformedComment() { $this->assertTokenization ( '', array ( new HTMLPurifier_Token_Comment ( ' not so well formed -' ) ) ); } public function test_tokenizeHTML_unterminatedTag() { $this->assertTokenization ( ' array ( new HTMLPurifier_Token_Empty ( 'a', array ( 'href' => '' ) ) ), 'PH5P' => false ) // total barfing, grabs scaffolding too ); } public function test_tokenizeHTML_specialEntities() { $this->assertTokenization ( '<b>', array ( new HTMLPurifier_Token_Text ( '' ) ), array ( // some parsers will separate entities out 'PH5P' => array ( new HTMLPurifier_Token_Text ( '<' ), new HTMLPurifier_Token_Text ( 'b' ), new HTMLPurifier_Token_Text ( '>' ) ) ) ); } public function test_tokenizeHTML_earlyQuote() { $this->assertTokenization ( '', array ( new HTMLPurifier_Token_Empty ( 'a' ) ), array ( // we barf on this input 'DirectLex' => array ( new HTMLPurifier_Token_Start ( 'a', array ( '"' => '' ) ) ), 'PH5P' => false ) // behavior varies; handle this personally ); } public function test_tokenizeHTML_earlyQuote_PH5P() { if (! 
class_exists ( 'DOMDocument' )) return; $lexer = new HTMLPurifier_Lexer_PH5P (); $result = $lexer->tokenizeHTML ( '', $this->config, $this->context ); if ($this->context->get ( 'PH5PError', true )) { $this->assertIdentical ( array ( new HTMLPurifier_Token_Start ( 'a', array ( '"' => '' ) ) ), $result ); } else { $this->assertIdentical ( array ( new HTMLPurifier_Token_Empty ( 'a', array ( '"' => '' ) ) ), $result ); } } public function test_tokenizeHTML_unescapedQuote() { $this->assertTokenization ( '"', array ( new HTMLPurifier_Token_Text ( '"' ) ) ); } public function test_tokenizeHTML_escapedQuote() { $this->assertTokenization ( '"', array ( new HTMLPurifier_Token_Text ( '"' ) ) ); } public function test_tokenizeHTML_cdata() { $this->assertTokenization ( 'can't get me!]]>', array ( new HTMLPurifier_Token_Text ( 'You can't get me!' ) ), array ( 'PH5P' => array ( new HTMLPurifier_Token_Text ( 'You ' ), new HTMLPurifier_Token_Text ( '<' ), new HTMLPurifier_Token_Text ( 'b' ), new HTMLPurifier_Token_Text ( '>' ), new HTMLPurifier_Token_Text ( 'can' ), new HTMLPurifier_Token_Text ( '&' ), new HTMLPurifier_Token_Text ( '#39;t' ), new HTMLPurifier_Token_Text ( '<' ), new HTMLPurifier_Token_Text ( '/b' ), new HTMLPurifier_Token_Text ( '>' ), new HTMLPurifier_Token_Text ( ' get me!' 
) ) ) ); } public function test_tokenizeHTML_characterEntity() { $this->assertTokenization ( 'θ', array ( new HTMLPurifier_Token_Text ( "\xCE\xB8" ) ) ); } public function test_tokenizeHTML_characterEntityInCDATA() { $this->assertTokenization ( '', array ( new HTMLPurifier_Token_Text ( "→" ) ), array ( 'PH5P' => array ( new HTMLPurifier_Token_Text ( '&' ), new HTMLPurifier_Token_Text ( 'rarr;' ) ) ) ); } public function test_tokenizeHTML_entityInAttribute() { $this->assertTokenization ( 'Link', array ( new HTMLPurifier_Token_Start ( 'a', array ( 'href' => 'index.php?title=foo&id=bar' ) ), new HTMLPurifier_Token_Text ( 'Link' ), new HTMLPurifier_Token_End ( 'a' ) ) ); } public function test_tokenizeHTML_preserveUTF8() { $this->assertTokenization ( "\xCE\xB8", array ( new HTMLPurifier_Token_Text ( "\xCE\xB8" ) ) ); } public function test_tokenizeHTML_specialEntityInAttribute() { $this->assertTokenization ( '
', array ( new HTMLPurifier_Token_Empty ( 'br', array ( 'test' => 'x < 6' ) ) ) ); } public function test_tokenizeHTML_emoticonProtection() { $this->assertTokenization ( 'Whoa! <3 That\'s not good >.>', array ( new HTMLPurifier_Token_Start ( 'b' ), new HTMLPurifier_Token_Text ( 'Whoa! ' ), new HTMLPurifier_Token_Text ( '<' ), new HTMLPurifier_Token_Text ( '3 That\'s not good >.>' ), new HTMLPurifier_Token_End ( 'b' ) ), array ( // text is absorbed together 'DOMLex' => array ( new HTMLPurifier_Token_Start ( 'b' ), new HTMLPurifier_Token_Text ( 'Whoa! <3 That\'s not good >.>' ), new HTMLPurifier_Token_End ( 'b' ) ), 'PH5P' => array ( // interesting grouping new HTMLPurifier_Token_Start ( 'b' ), new HTMLPurifier_Token_Text ( 'Whoa! ' ), new HTMLPurifier_Token_Text ( '<' ), new HTMLPurifier_Token_Text ( '3 That\'s not good >.>' ), new HTMLPurifier_Token_End ( 'b' ) ) ) ); } public function test_tokenizeHTML_commentWithFunkyChars() { $this->assertTokenization ( '
', array ( new HTMLPurifier_Token_Comment ( ' This >< comment ' ), new HTMLPurifier_Token_Empty ( 'br' ) ) ); } public function test_tokenizeHTML_unterminatedComment() { $this->assertTokenization ( '', array ( new HTMLPurifier_Token_Comment ( ' This comment < < & ' ) ) ); } public function test_tokenizeHTML_attributeWithSpecialCharacters() { $this->assertTokenization ( '', array ( new HTMLPurifier_Token_Empty ( 'a', array ( 'href' => '><>' ) ) ), array ( 'DirectLex' => array ( new HTMLPurifier_Token_Start ( 'a', array ( 'href' => '' ) ), new HTMLPurifier_Token_Text ( '<' ), new HTMLPurifier_Token_Text ( '">' ) ) ) ); } public function test_tokenizeHTML_emptyTagWithSlashInAttribute() { $this->assertTokenization ( '', array ( new HTMLPurifier_Token_Empty ( 'param', array ( 'name' => 'src', 'value' => 'http://example.com/video.wmv' ) ) ) ); } public function test_tokenizeHTML_style() { $extra = array ( // PH5P doesn't seem to like style tags 'PH5P' => false, // DirectLex defers to RemoveForeignElements for textification 'DirectLex' => array ( new HTMLPurifier_Token_Start ( 'style', array ( 'type' => 'text/css' ) ), new HTMLPurifier_Token_Comment ( "\ndiv {}\n" ), new HTMLPurifier_Token_End ( 'style' ) ) ); if (! defined ( 'LIBXML_VERSION' )) { // LIBXML_VERSION is missing in early versions of PHP // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise, // this translates to 5.0.x. In such cases, punt the test entirely. 
return; } elseif (LIBXML_VERSION < 20628) { // libxml's behavior is wrong prior to this version, so make // appropriate accommodations $extra ['DOMLex'] = $extra ['DirectLex']; } $this->assertTokenization ( '', array ( new HTMLPurifier_Token_Start ( 'style', array ( 'type' => 'text/css' ) ), new HTMLPurifier_Token_Text ( "\ndiv {}\n" ), new HTMLPurifier_Token_End ( 'style' ) ), $extra ); } public function test_tokenizeHTML_tagWithAtSignAndExtraGt() { $alt_expect = array ( // Technically this is invalid, but it won't be a // problem with invalid element removal; also, this // mimics Mozilla's parsing of the tag. new HTMLPurifier_Token_Start ( 'a@' ), new HTMLPurifier_Token_Text ( '>' ) ); $this->assertTokenization ( '>', array ( new HTMLPurifier_Token_Start ( 'a' ), new HTMLPurifier_Token_Text ( '>' ), new HTMLPurifier_Token_End ( 'a' ) ), array ( 'DirectLex' => $alt_expect ) ); } public function test_tokenizeHTML_emoticonHeart() { $this->assertTokenization ( '
<3
', array ( new HTMLPurifier_Token_Empty ( 'br' ), new HTMLPurifier_Token_Text ( '<' ), new HTMLPurifier_Token_Text ( '3' ), new HTMLPurifier_Token_Empty ( 'br' ) ), array ( 'DOMLex' => array ( new HTMLPurifier_Token_Empty ( 'br' ), new HTMLPurifier_Token_Text ( '<3' ), new HTMLPurifier_Token_Empty ( 'br' ) ) ) ); } public function test_tokenizeHTML_emoticonShiftyEyes() { $this->assertTokenization ( '<<', array ( new HTMLPurifier_Token_Start ( 'b' ), new HTMLPurifier_Token_Text ( '<' ), new HTMLPurifier_Token_Text ( '<' ), new HTMLPurifier_Token_End ( 'b' ) ), array ( 'DOMLex' => array ( new HTMLPurifier_Token_Start ( 'b' ), new HTMLPurifier_Token_Text ( '<<' ), new HTMLPurifier_Token_End ( 'b' ) ) ) ); } public function test_tokenizeHTML_eon1996() { $this->assertTokenization ( '< test', array ( new HTMLPurifier_Token_Text ( '<' ), new HTMLPurifier_Token_Text ( ' ' ), new HTMLPurifier_Token_Start ( 'b' ), new HTMLPurifier_Token_Text ( 'test' ), new HTMLPurifier_Token_End ( 'b' ) ), array ( 'DOMLex' => array ( new HTMLPurifier_Token_Text ( '< ' ), new HTMLPurifier_Token_Start ( 'b' ), new HTMLPurifier_Token_Text ( 'test' ), new HTMLPurifier_Token_End ( 'b' ) ) ) ); } public function test_tokenizeHTML_bodyInCDATA() { $alt_tokens = array ( new HTMLPurifier_Token_Text ( '<' ), new HTMLPurifier_Token_Text ( 'body' ), new HTMLPurifier_Token_Text ( '>' ), new HTMLPurifier_Token_Text ( 'Foo' ), new HTMLPurifier_Token_Text ( '<' ), new HTMLPurifier_Token_Text ( '/body' ), new HTMLPurifier_Token_Text ( '>' ) ); $this->assertTokenization ( 'Foo]]>', array ( new HTMLPurifier_Token_Text ( 'Foo' ) ), array ( 'PH5P' => $alt_tokens ) ); } public function test_tokenizeHTML_() { $this->assertTokenization ( '
', array ( new HTMLPurifier_Token_Start ( 'a' ), new HTMLPurifier_Token_Empty ( 'img' ), new HTMLPurifier_Token_End ( 'a' ) ) ); } public function test_tokenizeHTML_ignoreIECondComment() { $this->assertTokenization ( '', array () ); } public function test_tokenizeHTML_removeProcessingInstruction() { $this->config->set ( 'Core.RemoveProcessingInstructions', true ); $this->assertTokenization ( '', array () ); } public function test_tokenizeHTML_removeNewline() { $this->config->set ( 'Core.NormalizeNewlines', true ); $this->assertTokenization ( "plain\rtext\r\n", array ( new HTMLPurifier_Token_Text ( "plain\ntext\n" ) ) ); } public function test_tokenizeHTML_noRemoveNewline() { $this->config->set ( 'Core.NormalizeNewlines', false ); $this->assertTokenization ( "plain\rtext\r\n", array ( new HTMLPurifier_Token_Text ( "plain\rtext\r\n" ) ) ); } public function test_tokenizeHTML_conditionalCommentUngreedy() { $this->assertTokenization ( 'b', array ( new HTMLPurifier_Token_Text ( "b" ) ) ); } public function test_tokenizeHTML_imgTag() { $start = array ( new HTMLPurifier_Token_Start ( 'img', array ( 'src' => 'img_11775.jpg', 'alt' => '[Img #11775]', 'id' => 'EMBEDDED_IMG_11775' ) ) ); $this->assertTokenization ( '[Img #11775]', array ( new HTMLPurifier_Token_Empty ( 'img', array ( 'src' => 'img_11775.jpg', 'alt' => '[Img #11775]', 'id' => 'EMBEDDED_IMG_11775' ) ) ), array ( 'DirectLex' => $start ) ); } /* * * public function test_tokenizeHTML_() * { * $this->assertTokenization( * , * array( * * ) * ); * } */ } // vim: et sw=4 sts=4