def = new HTMLPurifier_AttrDef_Lang (); // basic good uses $this->assertDef ( 'en' ); $this->assertDef ( 'en-us' ); $this->assertDef ( ' en ', 'en' ); // trim $this->assertDef ( 'EN', 'en' ); // case insensitivity // (thanks Eugen Pankratz for noticing the typos!) $this->assertDef ( 'En-Us-Edison', 'en-us-edison' ); // complex ci $this->assertDef ( 'fr en', false ); // multiple languages $this->assertDef ( '%', false ); // bad character // test overlong language according to syntax $this->assertDef ( 'thisistoolongsoitgetscut', false ); // primary subtag rules // I'm somewhat hesitant to allow x and i as primary language codes, // because they usually are never used in real life. However, // theoretically speaking, having them alone is permissible, so // I'll be lenient. No XML parser is going to complain anyway. $this->assertDef ( 'x' ); $this->assertDef ( 'i' ); // real world use-cases $this->assertDef ( 'x-klingon' ); $this->assertDef ( 'i-mingo' ); // because the RFC only defines two and three letter primary codes, // anything with a length of four or greater is invalid, despite // the syntax stipulation of 1 to 8 characters. Because the RFC // specifically states that this reservation is in order to allow // for future versions to expand, the adoption of a new RFC will // require these test cases to be rewritten, even if backwards- // compatibility is largely retained (i.e. this is not forwards // compatible) $this->assertDef ( 'four', false ); // for similar reasons, disallow any other one character language $this->assertDef ( 'f', false ); // second subtag rules // one letter subtags prohibited until revision. This is, however, // less volatile than the restrictions on the primary subtags. // Also note that this test-case tests fix-behavior: chop // off subtags until you get a valid language code. 
$this->assertDef ( 'en-a', 'en' ); // however, x is a reserved single-letter subtag that is allowed $this->assertDef ( 'en-x', 'en-x' ); // 2-8 chars are permitted, but have special meaning that cannot // be checked without maintaining country code lookup tables (for // two characters) or special registration tables (for all above). $this->assertDef ( 'en-uk', true ); // further subtag rules: only syntactic constraints $this->assertDef ( 'en-us-edison' ); $this->assertDef ( 'en-us-toolonghaha', 'en-us' ); $this->assertDef ( 'en-us-a-silly-long-one' ); // rfc 3066 stipulates that if a three letter and a two letter code // are available, the two letter one MUST be used. Without a language // code lookup table, we cannot implement this functionality. // although the HTML protocol, technically speaking, allows you to // omit language tags, this implicitly means that the parent element's // language is the one applicable, which, in some cases, is incorrect. // Thus, we allow und, only slightly defying the RFC's SHOULD NOT // designation. $this->assertDef ( 'und' ); // because attributes only allow one language, mul is allowed, complying // with the RFC's SHOULD NOT designation. $this->assertDef ( 'mul' ); } } // vim: et sw=4 sts=4