Java/XML/XMLEncoder — различия между версиями
Admin (обсуждение | вклад) м (1 версия) |
|
(нет различий)
|
Текущая версия на 07:11, 1 июня 2010
Содержание
- 1 Determining validity of characters outside basic 7-bit range of Unicode, for XML 1.0
- 2 Encode Xml Attribute
- 3 Escape / unescape special chars according XML specifications
- 4 Provides HTML and XML entity utilities.
- 5 Returns true if the argument, a UCS-4 character code, is valid in XML documents.
- 6 Returns true if the character is a non-initial character in names according to the XML recommendation
- 7 Returns true if the character is an XML "letter"
- 8 Verify whether the specified character conforms to the XML 1.0 definition of whitespace
- 9 XML character properties
- 10 XMLEncoder a bean
- 11 Xml Encoding Sniffer
- 12 XML-related tasks and java.io Readers: IETF standard encoding names, automatic detection of most XML encodings
Determining validity of characters outside basic 7-bit range of Unicode, for XML 1.0
// Revised from ctc wstx
/**
* Simple utility class that encapsulates logic of determining validity
* of characters outside basic 7-bit range of Unicode, for XML 1.0
*/
/**
 * Static lookup helpers that decide whether a non-ASCII character may appear
 * in an XML name, for both XML 1.0 and XML 1.1.
 *
 * <p>Only characters outside the basic 7-bit range are classified here: every
 * method returns {@code false} for ASCII input, so callers are expected to
 * handle ASCII letters, digits and punctuation themselves.
 *
 * <p>The XML 1.0 checks are table driven (one bit per character, 32 characters
 * per {@code int}) for everything up to 0x312C and use range comparisons
 * above that. The XML 1.1 checks are pure range comparisons.
 *
 * <p>Surrogates get only minimal treatment: a high surrogate is accepted as a
 * name start character and either surrogate as a name character; correct
 * pairing is not verified.
 */
public final class XmlChars
{
    /* We don't need full 64k bits... (0x80 - 0x312C) / 32. But to
     * simplify things, let's just include first 0x80 entries in there etc
     */
    final static int SIZE = (0x3140 >> 5); // 32 bits per int

    /** Bit set of XML 1.0 name start characters (BaseChar and Ideographic) up to 0x312C. */
    final static int[] sXml10StartChars = new int[SIZE];
    static {
        SETBITS(sXml10StartChars, 0xC0, 0xD6);
        SETBITS(sXml10StartChars, 0xD8, 0xF6);
        SETBITS(sXml10StartChars, 0xF8, 0xFF);
        SETBITS(sXml10StartChars, 0x100, 0x131);
        SETBITS(sXml10StartChars, 0x134, 0x13e);
        SETBITS(sXml10StartChars, 0x141, 0x148);
        SETBITS(sXml10StartChars, 0x14a, 0x17e);
        SETBITS(sXml10StartChars, 0x180, 0x1c3);
        SETBITS(sXml10StartChars, 0x1cd, 0x1f0);
        SETBITS(sXml10StartChars, 0x1f4, 0x1f5);
        SETBITS(sXml10StartChars, 0x1fa, 0x217);
        SETBITS(sXml10StartChars, 0x250, 0x2a8);
        SETBITS(sXml10StartChars, 0x2bb, 0x2c1);
        SETBITS(sXml10StartChars, 0x386);
        SETBITS(sXml10StartChars, 0x388, 0x38a);
        SETBITS(sXml10StartChars, 0x38c);
        SETBITS(sXml10StartChars, 0x38e, 0x3a1);
        SETBITS(sXml10StartChars, 0x3a3, 0x3ce);
        SETBITS(sXml10StartChars, 0x3d0, 0x3d6);
        SETBITS(sXml10StartChars, 0x3da);
        SETBITS(sXml10StartChars, 0x3dc);
        SETBITS(sXml10StartChars, 0x3de);
        SETBITS(sXml10StartChars, 0x3e0);
        SETBITS(sXml10StartChars, 0x3e2, 0x3f3);
        SETBITS(sXml10StartChars, 0x401, 0x40c);
        SETBITS(sXml10StartChars, 0x40e, 0x44f);
        SETBITS(sXml10StartChars, 0x451, 0x45c);
        SETBITS(sXml10StartChars, 0x45e, 0x481);
        SETBITS(sXml10StartChars, 0x490, 0x4c4);
        SETBITS(sXml10StartChars, 0x4c7, 0x4c8);
        SETBITS(sXml10StartChars, 0x4cb, 0x4cc);
        SETBITS(sXml10StartChars, 0x4d0, 0x4eb);
        SETBITS(sXml10StartChars, 0x4ee, 0x4f5);
        SETBITS(sXml10StartChars, 0x4f8, 0x4f9);
        SETBITS(sXml10StartChars, 0x531, 0x556);
        SETBITS(sXml10StartChars, 0x559);
        SETBITS(sXml10StartChars, 0x561, 0x586);
        SETBITS(sXml10StartChars, 0x5d0, 0x5ea);
        SETBITS(sXml10StartChars, 0x5f0, 0x5f2);
        SETBITS(sXml10StartChars, 0x621, 0x63a);
        SETBITS(sXml10StartChars, 0x641, 0x64a);
        SETBITS(sXml10StartChars, 0x671, 0x6b7);
        SETBITS(sXml10StartChars, 0x6ba, 0x6be);
        SETBITS(sXml10StartChars, 0x6c0, 0x6ce);
        SETBITS(sXml10StartChars, 0x6d0, 0x6d3);
        SETBITS(sXml10StartChars, 0x6d5);
        SETBITS(sXml10StartChars, 0x6e5, 0x6e6);
        SETBITS(sXml10StartChars, 0x905, 0x939);
        SETBITS(sXml10StartChars, 0x93d);
        SETBITS(sXml10StartChars, 0x958, 0x961);
        SETBITS(sXml10StartChars, 0x985, 0x98c);
        SETBITS(sXml10StartChars, 0x98f, 0x990);
        SETBITS(sXml10StartChars, 0x993, 0x9a8);
        SETBITS(sXml10StartChars, 0x9aa, 0x9b0);
        SETBITS(sXml10StartChars, 0x9b2);
        SETBITS(sXml10StartChars, 0x9b6, 0x9b9);
        SETBITS(sXml10StartChars, 0x9dc);
        SETBITS(sXml10StartChars, 0x9dd);
        SETBITS(sXml10StartChars, 0x9df, 0x9e1);
        SETBITS(sXml10StartChars, 0x9f0); SETBITS(sXml10StartChars, 0x9f1);
        SETBITS(sXml10StartChars, 0xA05, 0xA0A);
        SETBITS(sXml10StartChars, 0xA0F); SETBITS(sXml10StartChars, 0xA10);
        SETBITS(sXml10StartChars, 0xA13, 0xA28);
        SETBITS(sXml10StartChars, 0xA2A, 0xA30);
        SETBITS(sXml10StartChars, 0xA32); SETBITS(sXml10StartChars, 0xA33);
        SETBITS(sXml10StartChars, 0xA35); SETBITS(sXml10StartChars, 0xA36);
        SETBITS(sXml10StartChars, 0xA38); SETBITS(sXml10StartChars, 0xA39);
        SETBITS(sXml10StartChars, 0xA59, 0xA5C);
        SETBITS(sXml10StartChars, 0xA5E);
        SETBITS(sXml10StartChars, 0xA72, 0xA74);
        SETBITS(sXml10StartChars, 0xA85, 0xA8B);
        SETBITS(sXml10StartChars, 0xA8D);
        SETBITS(sXml10StartChars, 0xA8F, 0xA91);
        SETBITS(sXml10StartChars, 0xA93, 0xAA8);
        SETBITS(sXml10StartChars, 0xAAA, 0xAB0);
        SETBITS(sXml10StartChars, 0xAB2, 0xAB3);
        SETBITS(sXml10StartChars, 0xAB5, 0xAB9);
        SETBITS(sXml10StartChars, 0xABD);
        SETBITS(sXml10StartChars, 0xAE0);
        SETBITS(sXml10StartChars, 0xB05, 0xB0C);
        SETBITS(sXml10StartChars, 0xB0F); SETBITS(sXml10StartChars, 0xB10);
        SETBITS(sXml10StartChars, 0xB13, 0xB28);
        SETBITS(sXml10StartChars, 0xB2A, 0xB30);
        SETBITS(sXml10StartChars, 0xB32); SETBITS(sXml10StartChars, 0xB33);
        SETBITS(sXml10StartChars, 0xB36, 0xB39);
        SETBITS(sXml10StartChars, 0xB3D);
        SETBITS(sXml10StartChars, 0xB5C); SETBITS(sXml10StartChars, 0xB5D);
        SETBITS(sXml10StartChars, 0xB5F, 0xB61);
        SETBITS(sXml10StartChars, 0xB85, 0xB8A);
        SETBITS(sXml10StartChars, 0xB8E, 0xB90);
        SETBITS(sXml10StartChars, 0xB92, 0xB95);
        SETBITS(sXml10StartChars, 0xB99, 0xB9A);
        SETBITS(sXml10StartChars, 0xB9C);
        SETBITS(sXml10StartChars, 0xB9E); SETBITS(sXml10StartChars, 0xB9F);
        SETBITS(sXml10StartChars, 0xBA3); SETBITS(sXml10StartChars, 0xBA4);
        SETBITS(sXml10StartChars, 0xBA8, 0xBAA);
        SETBITS(sXml10StartChars, 0xBAE, 0xBB5);
        SETBITS(sXml10StartChars, 0xBB7, 0xBB9);
        SETBITS(sXml10StartChars, 0xC05, 0xC0C);
        SETBITS(sXml10StartChars, 0xC0E, 0xC10);
        SETBITS(sXml10StartChars, 0xC12, 0xC28);
        SETBITS(sXml10StartChars, 0xC2A, 0xC33);
        SETBITS(sXml10StartChars, 0xC35, 0xC39);
        SETBITS(sXml10StartChars, 0xC60); SETBITS(sXml10StartChars, 0xC61);
        SETBITS(sXml10StartChars, 0xC85, 0xC8C);
        SETBITS(sXml10StartChars, 0xC8E, 0xC90);
        SETBITS(sXml10StartChars, 0xC92, 0xCA8);
        SETBITS(sXml10StartChars, 0xCAA, 0xCB3);
        SETBITS(sXml10StartChars, 0xCB5, 0xCB9);
        SETBITS(sXml10StartChars, 0xCDE);
        SETBITS(sXml10StartChars, 0xCE0); SETBITS(sXml10StartChars, 0xCE1);
        SETBITS(sXml10StartChars, 0xD05, 0xD0C);
        SETBITS(sXml10StartChars, 0xD0E, 0xD10);
        SETBITS(sXml10StartChars, 0xD12, 0xD28);
        SETBITS(sXml10StartChars, 0xD2A, 0xD39);
        SETBITS(sXml10StartChars, 0xD60); SETBITS(sXml10StartChars, 0xD61);
        SETBITS(sXml10StartChars, 0xE01, 0xE2E);
        SETBITS(sXml10StartChars, 0xE30);
        SETBITS(sXml10StartChars, 0xE32); SETBITS(sXml10StartChars, 0xE33);
        SETBITS(sXml10StartChars, 0xE40, 0xE45);
        SETBITS(sXml10StartChars, 0xE81); SETBITS(sXml10StartChars, 0xE82);
        SETBITS(sXml10StartChars, 0xE84);
        SETBITS(sXml10StartChars, 0xE87); SETBITS(sXml10StartChars, 0xE88);
        SETBITS(sXml10StartChars, 0xE8A); SETBITS(sXml10StartChars, 0xE8D);
        SETBITS(sXml10StartChars, 0xE94, 0xE97);
        SETBITS(sXml10StartChars, 0xE99, 0xE9F);
        SETBITS(sXml10StartChars, 0xEA1, 0xEA3);
        SETBITS(sXml10StartChars, 0xEA5); SETBITS(sXml10StartChars, 0xEA7);
        SETBITS(sXml10StartChars, 0xEAA); SETBITS(sXml10StartChars, 0xEAB);
        SETBITS(sXml10StartChars, 0xEAD); SETBITS(sXml10StartChars, 0xEAE);
        SETBITS(sXml10StartChars, 0xEB0);
        SETBITS(sXml10StartChars, 0xEB2); SETBITS(sXml10StartChars, 0xEB3);
        SETBITS(sXml10StartChars, 0xEBD);
        SETBITS(sXml10StartChars, 0xEC0, 0xEC4);
        SETBITS(sXml10StartChars, 0xF40, 0xF47);
        SETBITS(sXml10StartChars, 0xF49, 0xF69);
        SETBITS(sXml10StartChars, 0x10a0, 0x10c5);
        SETBITS(sXml10StartChars, 0x10d0, 0x10f6);
        SETBITS(sXml10StartChars, 0x1100);
        SETBITS(sXml10StartChars, 0x1102, 0x1103);
        SETBITS(sXml10StartChars, 0x1105, 0x1107);
        SETBITS(sXml10StartChars, 0x1109);
        SETBITS(sXml10StartChars, 0x110b, 0x110c);
        SETBITS(sXml10StartChars, 0x110e, 0x1112);
        SETBITS(sXml10StartChars, 0x113c);
        SETBITS(sXml10StartChars, 0x113e);
        SETBITS(sXml10StartChars, 0x1140);
        SETBITS(sXml10StartChars, 0x114c);
        SETBITS(sXml10StartChars, 0x114e);
        SETBITS(sXml10StartChars, 0x1150);
        SETBITS(sXml10StartChars, 0x1154, 0x1155);
        SETBITS(sXml10StartChars, 0x1159);
        SETBITS(sXml10StartChars, 0x115f, 0x1161);
        SETBITS(sXml10StartChars, 0x1163);
        SETBITS(sXml10StartChars, 0x1165);
        SETBITS(sXml10StartChars, 0x1167);
        SETBITS(sXml10StartChars, 0x1169);
        SETBITS(sXml10StartChars, 0x116d, 0x116e);
        SETBITS(sXml10StartChars, 0x1172, 0x1173);
        SETBITS(sXml10StartChars, 0x1175);
        SETBITS(sXml10StartChars, 0x119e);
        SETBITS(sXml10StartChars, 0x11a8);
        SETBITS(sXml10StartChars, 0x11ab);
        SETBITS(sXml10StartChars, 0x11ae, 0x11af);
        SETBITS(sXml10StartChars, 0x11b7, 0x11b8);
        SETBITS(sXml10StartChars, 0x11ba);
        SETBITS(sXml10StartChars, 0x11bc, 0x11c2);
        SETBITS(sXml10StartChars, 0x11eb);
        SETBITS(sXml10StartChars, 0x11f0);
        SETBITS(sXml10StartChars, 0x11f9);
        SETBITS(sXml10StartChars, 0x1e00, 0x1e9b);
        SETBITS(sXml10StartChars, 0x1ea0, 0x1ef9);
        SETBITS(sXml10StartChars, 0x1f00, 0x1f15);
        SETBITS(sXml10StartChars, 0x1f18, 0x1f1d);
        SETBITS(sXml10StartChars, 0x1f20, 0x1f45);
        SETBITS(sXml10StartChars, 0x1f48, 0x1f4d);
        SETBITS(sXml10StartChars, 0x1f50, 0x1f57);
        SETBITS(sXml10StartChars, 0x1f59);
        SETBITS(sXml10StartChars, 0x1f5b);
        SETBITS(sXml10StartChars, 0x1f5d);
        SETBITS(sXml10StartChars, 0x1f5f, 0x1f7d);
        SETBITS(sXml10StartChars, 0x1f80, 0x1fb4);
        SETBITS(sXml10StartChars, 0x1fb6, 0x1fbc);
        SETBITS(sXml10StartChars, 0x1fbe);
        SETBITS(sXml10StartChars, 0x1fc2, 0x1fc4);
        SETBITS(sXml10StartChars, 0x1fc6, 0x1fcc);
        SETBITS(sXml10StartChars, 0x1fd0, 0x1fd3);
        SETBITS(sXml10StartChars, 0x1fd6, 0x1fdb);
        SETBITS(sXml10StartChars, 0x1fe0, 0x1fec);
        SETBITS(sXml10StartChars, 0x1ff2, 0x1ff4);
        SETBITS(sXml10StartChars, 0x1ff6, 0x1ffc);
        SETBITS(sXml10StartChars, 0x2126);
        SETBITS(sXml10StartChars, 0x212a, 0x212b);
        SETBITS(sXml10StartChars, 0x212e);
        SETBITS(sXml10StartChars, 0x2180, 0x2182);
        SETBITS(sXml10StartChars, 0x3041, 0x3094);
        SETBITS(sXml10StartChars, 0x30a1, 0x30fa);
        SETBITS(sXml10StartChars, 0x3105, 0x312c);
        // note: AC00 - D7A3 handled separately
        // [86] Ideographic (but note: > 0x312c handled separately)
        SETBITS(sXml10StartChars, 0x3007);
        SETBITS(sXml10StartChars, 0x3021, 0x3029);
    }

    /**
     * Bit set of XML 1.0 name characters up to 0x312C: all name start
     * characters plus CombiningChar, Digit and Extender.
     */
    final static int[] sXml10Chars = new int[SIZE];
    static {
        // Let's start with all valid start chars:
        System.arraycopy(sXml10StartChars, 0, sXml10Chars, 0, SIZE);
        // [87] CombiningChar ::=
        SETBITS(sXml10Chars, 0x300, 0x345);
        SETBITS(sXml10Chars, 0x360, 0x361);
        SETBITS(sXml10Chars, 0x483, 0x486);
        SETBITS(sXml10Chars, 0x591, 0x5a1);
        SETBITS(sXml10Chars, 0x5a3, 0x5b9);
        SETBITS(sXml10Chars, 0x5bb, 0x5bd);
        SETBITS(sXml10Chars, 0x5bf);
        SETBITS(sXml10Chars, 0x5c1, 0x5c2);
        SETBITS(sXml10Chars, 0x5c4);
        SETBITS(sXml10Chars, 0x64b, 0x652);
        SETBITS(sXml10Chars, 0x670);
        SETBITS(sXml10Chars, 0x6d6, 0x6dc);
        SETBITS(sXml10Chars, 0x6dd, 0x6df);
        SETBITS(sXml10Chars, 0x6e0, 0x6e4);
        SETBITS(sXml10Chars, 0x6e7, 0x6e8);
        SETBITS(sXml10Chars, 0x6ea, 0x6ed);
        SETBITS(sXml10Chars, 0x901, 0x903);
        SETBITS(sXml10Chars, 0x93c);
        SETBITS(sXml10Chars, 0x93e, 0x94c);
        SETBITS(sXml10Chars, 0x94d);
        SETBITS(sXml10Chars, 0x951, 0x954);
        SETBITS(sXml10Chars, 0x962); SETBITS(sXml10Chars, 0x963);
        SETBITS(sXml10Chars, 0x981, 0x983);
        SETBITS(sXml10Chars, 0x9bc);
        SETBITS(sXml10Chars, 0x9be); SETBITS(sXml10Chars, 0x9bf);
        SETBITS(sXml10Chars, 0x9c0, 0x9c4);
        SETBITS(sXml10Chars, 0x9c7); SETBITS(sXml10Chars, 0x9c8);
        SETBITS(sXml10Chars, 0x9cb, 0x9cd);
        SETBITS(sXml10Chars, 0x9d7);
        SETBITS(sXml10Chars, 0x9e2); SETBITS(sXml10Chars, 0x9e3);
        SETBITS(sXml10Chars, 0xA02);
        SETBITS(sXml10Chars, 0xA3C);
        SETBITS(sXml10Chars, 0xA3E); SETBITS(sXml10Chars, 0xA3F);
        SETBITS(sXml10Chars, 0xA40, 0xA42);
        SETBITS(sXml10Chars, 0xA47); SETBITS(sXml10Chars, 0xA48);
        SETBITS(sXml10Chars, 0xA4B, 0xA4D);
        SETBITS(sXml10Chars, 0xA70); SETBITS(sXml10Chars, 0xA71);
        SETBITS(sXml10Chars, 0xA81, 0xA83);
        SETBITS(sXml10Chars, 0xABC);
        SETBITS(sXml10Chars, 0xABE, 0xAC5);
        SETBITS(sXml10Chars, 0xAC7, 0xAC9);
        SETBITS(sXml10Chars, 0xACB, 0xACD);
        SETBITS(sXml10Chars, 0xB01, 0xB03);
        SETBITS(sXml10Chars, 0xB3C);
        SETBITS(sXml10Chars, 0xB3E, 0xB43);
        SETBITS(sXml10Chars, 0xB47); SETBITS(sXml10Chars, 0xB48);
        SETBITS(sXml10Chars, 0xB4B, 0xB4D);
        SETBITS(sXml10Chars, 0xB56); SETBITS(sXml10Chars, 0xB57);
        SETBITS(sXml10Chars, 0xB82); SETBITS(sXml10Chars, 0xB83);
        SETBITS(sXml10Chars, 0xBBE, 0xBC2);
        SETBITS(sXml10Chars, 0xBC6, 0xBC8);
        SETBITS(sXml10Chars, 0xBCA, 0xBCD);
        SETBITS(sXml10Chars, 0xBD7);
        SETBITS(sXml10Chars, 0xC01, 0xC03);
        SETBITS(sXml10Chars, 0xC3E, 0xC44);
        SETBITS(sXml10Chars, 0xC46, 0xC48);
        SETBITS(sXml10Chars, 0xC4A, 0xC4D);
        SETBITS(sXml10Chars, 0xC55, 0xC56);
        SETBITS(sXml10Chars, 0xC82, 0xC83);
        SETBITS(sXml10Chars, 0xCBE, 0xCC4);
        SETBITS(sXml10Chars, 0xCC6, 0xCC8);
        SETBITS(sXml10Chars, 0xCCA, 0xCCD);
        SETBITS(sXml10Chars, 0xCD5, 0xCD6);
        SETBITS(sXml10Chars, 0xD02, 0xD03);
        SETBITS(sXml10Chars, 0xD3E, 0xD43);
        SETBITS(sXml10Chars, 0xD46, 0xD48);
        SETBITS(sXml10Chars, 0xD4A, 0xD4D);
        SETBITS(sXml10Chars, 0xD57);
        SETBITS(sXml10Chars, 0xE31);
        SETBITS(sXml10Chars, 0xE34, 0xE3A);
        SETBITS(sXml10Chars, 0xE47, 0xE4E);
        SETBITS(sXml10Chars, 0xEB1);
        SETBITS(sXml10Chars, 0xEB4, 0xEB9);
        SETBITS(sXml10Chars, 0xEBB, 0xEBC);
        SETBITS(sXml10Chars, 0xEC8, 0xECD);
        SETBITS(sXml10Chars, 0xF18, 0xF19);
        SETBITS(sXml10Chars, 0xF35); SETBITS(sXml10Chars, 0xF37);
        SETBITS(sXml10Chars, 0xF39);
        SETBITS(sXml10Chars, 0xF3E); SETBITS(sXml10Chars, 0xF3F);
        SETBITS(sXml10Chars, 0xF71, 0xF84);
        SETBITS(sXml10Chars, 0xF86, 0xF8B);
        SETBITS(sXml10Chars, 0xF90, 0xF95);
        SETBITS(sXml10Chars, 0xF97);
        SETBITS(sXml10Chars, 0xF99, 0xFAD);
        SETBITS(sXml10Chars, 0xFB1, 0xFB7);
        SETBITS(sXml10Chars, 0xFB9);
        SETBITS(sXml10Chars, 0x20D0, 0x20DC);
        SETBITS(sXml10Chars, 0x20E1);
        SETBITS(sXml10Chars, 0x302A, 0x302F);
        SETBITS(sXml10Chars, 0x3099); SETBITS(sXml10Chars, 0x309A);
        // [88] Digit:
        SETBITS(sXml10Chars, 0x660, 0x669);
        SETBITS(sXml10Chars, 0x6f0, 0x6f9);
        SETBITS(sXml10Chars, 0x966, 0x96f);
        SETBITS(sXml10Chars, 0x9e6, 0x9ef);
        SETBITS(sXml10Chars, 0xa66, 0xa6f);
        SETBITS(sXml10Chars, 0xae6, 0xaef);
        SETBITS(sXml10Chars, 0xb66, 0xb6f);
        SETBITS(sXml10Chars, 0xbe7, 0xbef);
        SETBITS(sXml10Chars, 0xc66, 0xc6f);
        SETBITS(sXml10Chars, 0xce6, 0xcef);
        SETBITS(sXml10Chars, 0xd66, 0xd6f);
        SETBITS(sXml10Chars, 0xe50, 0xe59);
        SETBITS(sXml10Chars, 0xed0, 0xed9);
        SETBITS(sXml10Chars, 0xf20, 0xf29);
        // [89] Extender:
        SETBITS(sXml10Chars, 0xb7);
        SETBITS(sXml10Chars, 0x2d0);
        SETBITS(sXml10Chars, 0x2d1);
        SETBITS(sXml10Chars, 0x387);
        SETBITS(sXml10Chars, 0x640);
        SETBITS(sXml10Chars, 0xE46);
        SETBITS(sXml10Chars, 0xEC6);
        SETBITS(sXml10Chars, 0x3005);
        SETBITS(sXml10Chars, 0x3031, 0x3035);
        SETBITS(sXml10Chars, 0x309d, 0x309e);
        SETBITS(sXml10Chars, 0x30fc, 0x30fe);
    }

    /** Not instantiable: static helpers only. */
    private XmlChars() { }

    /**
     * Tells whether a non-ASCII character may start an XML 1.0 name.
     *
     * @param c character to classify
     * @return {@code true} if {@code c} is flagged in the start-character
     *         table, is an ideograph in 0x4E00-0x9FA5, lies in 0xAC00-0xD7A3,
     *         or is a high surrogate
     */
    public final static boolean is10NameStartChar(char c)
    {
        // First, let's deal with outliers
        if (c > 0x312C) { // Most valid chars are below this..
            if (c < 0xAC00) {
                return (c >= 0x4E00 && c <= 0x9FA5); // valid ideograms
            }
            if (c <= 0xD7A3) { // 0xAC00 - 0xD7A3, valid base chars
                return true;
            }
            /* As to surrogate pairs... let's do the bare minimum;
             * 0xD800 - 0xDBFF (high surrogate) are ok; low surrogates
             * can only follow high one
             */
            return (c <= 0xDBFF && c >= 0xD800);
        }
        // but then we'll just need to use the table...
        int ix = (int) c;
        return (sXml10StartChars[ix >> 5] & (1 << (ix & 31))) != 0;
    }

    /**
     * Tells whether a non-ASCII character may appear after the first position
     * of an XML 1.0 name.
     *
     * @param c character to classify
     * @return {@code true} if {@code c} is flagged in the name-character
     *         table, is an ideograph in 0x4E00-0x9FA5, lies in 0xAC00-0xD7A3,
     *         or is any surrogate (pairing is not checked)
     */
    public final static boolean is10NameChar(char c)
    {
        // First, let's deal with outliers
        if (c > 0x312C) { // Most valid chars are below this..
            if (c < 0xAC00) {
                return (c >= 0x4E00 && c <= 0x9FA5); // valid ideograms
            }
            if (c <= 0xD7A3) { // 0xAC00 - 0xD7A3, valid base chars
                return true;
            }
            /* As to surrogate pairs... let's do the bare minimum;
             * 0xD800 - 0xDFFF (high, low surrogate) are ok (need to
             * check pairing in future)
             */
            return (c >= 0xD800 && c <= 0xDFFF);
        }
        // but then we'll just need to use the table...
        int ix = (int) c;
        return (sXml10Chars[ix >> 5] & (1 << (ix & 31))) != 0;
    }

    /**
     * Tells whether a non-ASCII character may start an XML 1.1 name.
     *
     * @param c character to classify
     * @return {@code true} if {@code c} falls in one of the accepted ranges;
     *         high surrogates are accepted, low surrogates are not
     */
    public final static boolean is11NameStartChar(char c)
    {
        // Others are checked block-by-block:
        if (c <= 0x2FEF) {
            if (c < 0x300) {
                if (c < 0x00C0) { // 8-bit ctrl chars
                    return false;
                }
                // most of the rest are fine...
                return (c != 0xD7 && c != 0xF7);
            }
            if (c >= 0x2C00) {
                // 0x2C00 - 0x2FEF are ok
                return true;
            }
            if (c < 0x370 || c > 0x218F) {
                // 0x300 - 0x36F, 0x2190 - 0x2BFF invalid
                return false;
            }
            if (c < 0x2000) {
                // 0x370 - 0x37D, 0x37F - 0x1FFF are ok
                return (c != 0x37E);
            }
            if (c >= 0x2070) {
                // 0x2070 - 0x218F are ok
                return (c <= 0x218F);
            }
            // And finally, 0x200C - 0x200D
            return (c == 0x200C || c == 0x200D);
        }
        // 0x3000 and above:
        if (c >= 0x3001) {
            /* Hmmh, let's allow high surrogates here, without checking
             * that they are properly followed... crude basic support,
             * I know, but allows valid combinations, just doesn't catch
             * invalid ones
             */
            if (c <= 0xDBFF) { // 0x3001 - 0xD7FF (chars),
                // 0xD800 - 0xDBFF (high surrogate) are ok (unlike DC00-DFFF)
                return true;
            }
            if (c >= 0xF900 && c <= 0xFFFD) {
                /* Check above removes low surrogate (since one can not
                 * START an identifier), and byte-order markers..
                 */
                return (c <= 0xFDCF || c >= 0xFDF0);
            }
        }
        return false;
    }

    /**
     * Tells whether a non-ASCII character may appear after the first position
     * of an XML 1.1 name.
     *
     * <p>U+00D7 (multiplication sign) and U+00F7 (division sign) are rejected:
     * the XML 1.1 NameStartChar ranges skip them and NameChar does not add
     * them back, matching what {@link #is11NameStartChar(char)} does.
     *
     * @param c character to classify
     * @return {@code true} if {@code c} falls in one of the accepted ranges;
     *         both high and low surrogates are accepted (pairing is not checked)
     */
    public final static boolean is11NameChar(char c)
    {
        // Others are checked block-by-block:
        if (c <= 0x2FEF) {
            if (c < 0x2000) {
                if (c < 0x00C0) {
                    // Below 0xC0 only the middle dot (0xB7) is a name char
                    return (c == 0xB7);
                }
                // 0xC0 - 0x1FFF are ok, except for 0xD7, 0xF7 and 0x37E
                return (c != 0xD7 && c != 0xF7 && c != 0x37E);
            }
            if (c >= 0x2C00) {
                // 0x2C00 - 0x2FEF are ok
                return true;
            }
            if (c < 0x200C || c > 0x218F) {
                // 0x2000 - 0x200B, 0x2190 - 0x2BFF invalid
                return false;
            }
            if (c >= 0x2070) {
                // 0x2070 - 0x218F are ok
                return true;
            }
            // And finally, 0x200C - 0x200D, 0x203F - 0x2040 are ok
            return (c == 0x200C || c == 0x200D
                    || c == 0x203F || c == 0x2040);
        }
        // 0x3000 and above:
        if (c >= 0x3001) {
            /* Hmmh, let's allow surrogates here, without checking that
             * they have proper ordering. For non-first name chars, both are
             * ok, for valid names. Crude basic support,
             * I know, but allows valid combinations, just doesn't catch
             * invalid ones
             */
            if (c <= 0xDFFF) { // 0x3001 - 0xD7FF (chars),
                // 0xD800 - 0xDFFF (high, low surrogate) are ok:
                return true;
            }
            if (c >= 0xF900 && c <= 0xFFFD) {
                /* Check above removes other invalid chars (below valid
                 * range), and byte-order markers (0xFFFE, 0xFFFF).
                 */
                return (c <= 0xFDCF || c >= 0xFDF0);
            }
        }
        return false;
    }

    /**
     * Sets the bits for every character in the inclusive range
     * {@code start..end}.
     *
     * @param array bit set to update, 32 characters per element
     * @param start first character code to flag
     * @param end last character code to flag (inclusive)
     */
    private static void SETBITS(int[] array, int start, int end)
    {
        int bit1 = (start & 31);
        int bit2 = (end & 31);
        start >>= 5;
        end >>= 5;
        /* Ok; this is not perfectly optimal, but should be good enough...
         * we'll only do one-by-one at the ends.
         */
        if (start == end) {
            for (; bit1 <= bit2; ++bit1) {
                array[start] |= (1 << bit1);
            }
        } else {
            for (int bit = bit1; bit <= 31; ++bit) {
                array[start] |= (1 << bit);
            }
            // Words strictly between the two ends are covered entirely
            while (++start < end) {
                array[start] = -1;
            }
            for (int bit = 0; bit <= bit2; ++bit) {
                array[end] |= (1 << bit);
            }
        }
    }

    /**
     * Sets the bit for a single character.
     *
     * @param array bit set to update, 32 characters per element
     * @param point character code to flag
     */
    private static void SETBITS(int[] array, int point) {
        int ix = (point >> 5);
        int bit = (point & 31);
        array[ix] |= (1 << bit);
    }
}
Encode Xml Attribute
/**
*
* The ObjectStyle Group Software License, version 1.1
* ObjectStyle Group - http://objectstyle.org/
*
* Copyright (c) 2002-2005, Andrei (Andrus) Adamchik and individual authors
* of the software. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if any,
* must include the following acknowlegement:
* "This product includes software developed by independent contributors
* and hosted on ObjectStyle Group web site (http://objectstyle.org/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The names "ObjectStyle Group" and "Cayenne" must not be used to endorse
* or promote products derived from this software without prior written
* permission. For written permission, email
* "andrus at objectstyle dot org".
*
* 5. Products derived from this software may not be called "ObjectStyle"
* or "Cayenne", nor may "ObjectStyle" or "Cayenne" appear in their
* names without prior written permission.
*
* THIS SOFTWARE IS PROVIDED ``AS IS"" AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE OBJECTSTYLE GROUP OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* This software consists of voluntary contributions made by many
* individuals and hosted on ObjectStyle Group web site. For more
* information on the ObjectStyle Group, please see
* <http://objectstyle.org/>.
*/
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.lang.reflect.Member;
import java.lang.reflect.Modifier;
import java.net.URL;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
* Contains various unorganized static utility methods used across Cayenne.
*
* @author Andrei Adamchik
*/
public class Util {

    /**
     * Encodes a string so that it can be used as an attribute value in an XML document.
     * The five characters with predefined XML entities are replaced by their entity
     * references: {@code <} by {@code &lt;}, {@code "} by {@code &quot;}, {@code >} by
     * {@code &gt;}, {@code '} by {@code &apos;} and {@code &} by {@code &amp;}. Every
     * other character is copied through unchanged.
     *
     * @param str the raw attribute value, may be {@code null}
     * @return the encoded value; {@code null} if {@code str} is {@code null}, and
     *         {@code str} itself if it is empty
     */
    public static String encodeXmlAttribute(String str) {
        if (str == null) {
            return null;
        }

        int len = str.length();
        if (len == 0) {
            return str;
        }

        StringBuffer encoded = new StringBuffer();
        for (int i = 0; i < len; i++) {
            char c = str.charAt(i);
            switch (c) {
                case '<':
                    encoded.append("&lt;");
                    break;
                case '"':
                    encoded.append("&quot;");
                    break;
                case '>':
                    encoded.append("&gt;");
                    break;
                case '\'':
                    encoded.append("&apos;");
                    break;
                case '&':
                    encoded.append("&amp;");
                    break;
                default:
                    encoded.append(c);
                    break;
            }
        }
        return encoded.toString();
    }
}
Escape / unescape special chars according XML specifications
/*
* Funambol is a mobile platform developed by Funambol, Inc.
* Copyright (C) 2003 - 2007 Funambol, Inc.
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU Affero General Public License version 3 as published by
* the Free Software Foundation with the addition of the following permission
* added to Section 15 as permitted in Section 7(a): FOR ANY PART OF THE COVERED
* WORK IN WHICH THE COPYRIGHT IS OWNED BY FUNAMBOL, FUNAMBOL DISCLAIMS THE
* WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program; if not, see http://www.gnu.org/licenses or write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA.
*
* You can contact Funambol, Inc. headquarters at 643 Bair Island Road, Suite
* 305, Redwood City, CA 94063, USA, or at email address info@funambol.com.
*
* The interactive user interfaces in modified source and object code versions
* of this program must display Appropriate Legal Notices, as required under
* Section 5 of the GNU Affero General Public License version 3.
*
* In accordance with Section 7(b) of the GNU Affero General Public License
* version 3, these Appropriate Legal Notices must retain the display of the
* "Powered by Funambol" logo. If the display of the logo is not reasonably
* feasible for technical reasons, the Appropriate Legal Notices must display
* the words "Powered by Funambol".
*/
import java.util.Hashtable;
/**
* This class supplies some methods
* to escape / unescape special chars according XML specifications
*
*/
class Entities {

    /** Name / character-code pairs of the five entities predefined by XML. */
    private static final String[][] BASIC_ARRAY = {
        {"quot", "34"}, // " - double-quote
        {"amp", "38"},  // & - ampersand
        {"lt", "60"},   // < - less-than
        {"gt", "62"},   // > - greater-than
        {"apos", "39"}, // ' - XML apostrophe
    };

    /**
     * <p>The set of entities supported by standard XML.</p>
     */
    public static final Entities XML;

    static {
        XML = new Entities();
        XML.addEntities(BASIC_ARRAY);
    }

    /** Two-way mapping between entity names and character codes. */
    static interface EntityMap {
        /** Registers {@code name} for the character code {@code value}. */
        void add(String name, int value);

        /** Returns the entity name registered for {@code value}, or {@code null} if none. */
        String name(int value);

        /** Returns the character code registered for {@code name}, or -1 if none. */
        int value(String name);
    }

    /** {@link EntityMap} backed by a pair of hash tables, one per direction. */
    static class PrimitiveEntityMap implements EntityMap {
        private final Hashtable<String, Integer> mapNameToValue = new Hashtable<String, Integer>();
        private final Hashtable<Integer, String> mapValueToName = new Hashtable<Integer, String>();

        public void add(String name, int value) {
            Integer boxed = Integer.valueOf(value);
            mapNameToValue.put(name, boxed);
            mapValueToName.put(boxed, name);
        }

        public String name(int value) {
            return mapValueToName.get(Integer.valueOf(value));
        }

        public int value(String name) {
            Integer value = mapNameToValue.get(name);
            return (value == null) ? -1 : value.intValue();
        }
    }

    /**
     * {@link PrimitiveEntityMap} that answers name lookups for the first 256
     * character codes from a lazily built array instead of the hash table.
     */
    static class LookupEntityMap extends PrimitiveEntityMap {
        private static final int LOOKUP_TABLE_SIZE = 256;

        /** Cached names for codes 0..255; {@code null} until first needed or after an add. */
        private String[] lookupTable;

        public void add(String name, int value) {
            super.add(name, value);
            // Drop the cache so an entity registered after the first lookup is not missed.
            lookupTable = null;
        }

        public String name(int value) {
            // The lower bound keeps a negative code from indexing outside the array.
            if (value >= 0 && value < LOOKUP_TABLE_SIZE) {
                return lookupTable()[value];
            }
            return super.name(value);
        }

        private String[] lookupTable() {
            if (lookupTable == null) {
                createLookupTable();
            }
            return lookupTable;
        }

        private void createLookupTable() {
            String[] table = new String[LOOKUP_TABLE_SIZE];
            for (int i = 0; i < LOOKUP_TABLE_SIZE; ++i) {
                table[i] = super.name(i);
            }
            lookupTable = table;
        }
    }

    EntityMap map = new Entities.LookupEntityMap();

    /**
     * Registers every entry of {@code entityArray}.
     *
     * @param entityArray rows of {@code {name, decimal character code}}
     * @throws NumberFormatException if a code is not a decimal integer
     */
    public void addEntities(String[][] entityArray) {
        for (int i = 0; i < entityArray.length; ++i) {
            addEntity(entityArray[i][0], Integer.parseInt(entityArray[i][1]));
        }
    }

    /** Registers the entity {@code name} for the character code {@code value}. */
    public void addEntity(String name, int value) {
        map.add(name, value);
    }

    /** Returns the entity name registered for {@code value}, or {@code null} if none. */
    public String entityName(int value) {
        return map.name(value);
    }

    /** Returns the character code registered for {@code name}, or -1 if none. */
    public int entityValue(String name) {
        return map.value(name);
    }

    /**
     * <p>Escapes special characters in a <code>String</code>.</p>
     *
     * <p>A character with a registered entity becomes <code>&amp;name;</code>.
     * Any other character above 0x7F becomes a decimal character reference
     * (<code>&amp;#NNN;</code>), one per UTF-16 unit. Everything else is
     * copied unchanged.</p>
     *
     * @param str The <code>String</code> to escape, may be <code>null</code>.
     * @return A escaped <code>String</code>, or <code>null</code> for
     *         <code>null</code> input.
     */
    public String escape(String str) {
        if (str == null) {
            return null;
        }
        int len = str.length();
        StringBuffer buf = new StringBuffer(len * 2);
        for (int i = 0; i < len; ++i) {
            char ch = str.charAt(i);
            String entityName = this.entityName(ch);
            if (entityName != null) {
                buf.append('&').append(entityName).append(';');
            } else if (ch > 0x7F) {
                buf.append("&#").append((int) ch).append(';');
            } else {
                buf.append(ch);
            }
        }
        return buf.toString();
    }

    /**
     * <p>Unescapes special characters in a <code>String</code>.</p>
     *
     * <p>Handles registered entity names as well as decimal
     * (<code>&amp;#65;</code>) and hexadecimal (<code>&amp;#x41;</code>)
     * character references. A reference that cannot be resolved (unknown
     * name, empty or malformed number, value outside the Unicode range) is
     * copied to the output verbatim, as is an <code>&amp;</code> with no
     * terminating <code>;</code>.</p>
     *
     * @param str The <code>String</code> to unescape, may be <code>null</code>.
     * @return A un-escaped <code>String</code>, or <code>null</code> for
     *         <code>null</code> input.
     */
    public String unescape(String str) {
        if (str == null) {
            return null;
        }
        int len = str.length();
        StringBuffer buf = new StringBuffer(len);
        for (int i = 0; i < len; ++i) {
            char ch = str.charAt(i);
            if (ch != '&') {
                buf.append(ch);
                continue;
            }
            int semi = str.indexOf(';', i + 1);
            if (semi == -1) {
                // Unterminated reference: keep the ampersand as plain text.
                buf.append(ch);
                continue;
            }
            String entityName = str.substring(i + 1, semi);
            int entityValue = resolveReference(entityName);
            if (entityValue < 0 || entityValue > Character.MAX_CODE_POINT) {
                buf.append('&').append(entityName).append(';');
            } else {
                // appendCodePoint emits a surrogate pair for values above 0xFFFF
                // instead of truncating them to a single char.
                buf.appendCodePoint(entityValue);
            }
            i = semi;
        }
        return buf.toString();
    }

    /**
     * Resolves the text found between <code>&amp;</code> and <code>;</code>.
     *
     * @param reference entity name, or <code>#NNN</code> / <code>#xHHH</code>
     * @return the character code, or -1 when the reference is empty, unknown
     *         or not a valid number
     */
    private int resolveReference(String reference) {
        if (reference.length() == 0) {
            return -1;
        }
        if (reference.charAt(0) != '#') {
            return this.entityValue(reference);
        }
        if (reference.length() < 2) {
            return -1;
        }
        char radixMarker = reference.charAt(1);
        try {
            if (radixMarker == 'x' || radixMarker == 'X') {
                return Integer.parseInt(reference.substring(2), 16);
            }
            return Integer.parseInt(reference.substring(1));
        } catch (NumberFormatException e) {
            // Malformed number: report as unresolved so the caller keeps the text as is.
            return -1;
        }
    }
}
Provides HTML and XML entity utilities.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Revised from apache commons lang
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
/**
* <p>
* Provides HTML and XML entity utilities.
* </p>
*
* @see
* @since 2.0
* @version $Id: Entities.java 636641 2008-03-13 06:11:30Z bayard $
*/
class Entities {
private static final String[][] BASIC_ARRAY = {{"quot", "34"}, // " - double-quote
{"amp", "38"}, // & - ampersand
{"lt", "60"}, // < - less-than
{"gt", "62"}, // > - greater-than
};
private static final String[][] APOS_ARRAY = {{"apos", "39"}, // XML apostrophe
};
// Entity name / decimal code point pairs for code points 160-255 (the ISO 8859-1 upper half).
// package scoped for testing
static final String[][] ISO8859_1_ARRAY = {{"nbsp", "160"}, // non-breaking space
{"iexcl", "161"}, // inverted exclamation mark
{"cent", "162"}, // cent sign
{"pound", "163"}, // pound sign
{"curren", "164"}, // currency sign
{"yen", "165"}, // yen sign = yuan sign
{"brvbar", "166"}, // broken bar = broken vertical bar
{"sect", "167"}, // section sign
{"uml", "168"}, // diaeresis = spacing diaeresis
{"copy", "169"}, // © - copyright sign
{"ordf", "170"}, // feminine ordinal indicator
{"laquo", "171"}, // left-pointing double angle quotation mark = left pointing guillemet
{"not", "172"}, // not sign
{"shy", "173"}, // soft hyphen = discretionary hyphen
{"reg", "174"}, // ® - registered trademark sign
{"macr", "175"}, // macron = spacing macron = overline = APL overbar
{"deg", "176"}, // degree sign
{"plusmn", "177"}, // plus-minus sign = plus-or-minus sign
{"sup2", "178"}, // superscript two = superscript digit two = squared
{"sup3", "179"}, // superscript three = superscript digit three = cubed
{"acute", "180"}, // acute accent = spacing acute
{"micro", "181"}, // micro sign
{"para", "182"}, // pilcrow sign = paragraph sign
{"middot", "183"}, // middle dot = Georgian comma = Greek middle dot
{"cedil", "184"}, // cedilla = spacing cedilla
{"sup1", "185"}, // superscript one = superscript digit one
{"ordm", "186"}, // masculine ordinal indicator
{"raquo", "187"}, // right-pointing double angle quotation mark = right pointing guillemet
{"frac14", "188"}, // vulgar fraction one quarter = fraction one quarter
{"frac12", "189"}, // vulgar fraction one half = fraction one half
{"frac34", "190"}, // vulgar fraction three quarters = fraction three quarters
{"iquest", "191"}, // inverted question mark = turned question mark
{"Agrave", "192"}, // À - uppercase A, grave accent
{"Aacute", "193"}, // Á - uppercase A, acute accent
{"Acirc", "194"}, // Â - uppercase A, circumflex accent
{"Atilde", "195"}, // Ã - uppercase A, tilde
{"Auml", "196"}, // Ä - uppercase A, umlaut
{"Aring", "197"}, // Å - uppercase A, ring
{"AElig", "198"}, // Æ - uppercase AE
{"Ccedil", "199"}, // Ç - uppercase C, cedilla
{"Egrave", "200"}, // È - uppercase E, grave accent
{"Eacute", "201"}, // É - uppercase E, acute accent
{"Ecirc", "202"}, // Ê - uppercase E, circumflex accent
{"Euml", "203"}, // Ë - uppercase E, umlaut
{"Igrave", "204"}, // Ì - uppercase I, grave accent
{"Iacute", "205"}, // Í - uppercase I, acute accent
{"Icirc", "206"}, // Î - uppercase I, circumflex accent
{"Iuml", "207"}, // Ï - uppercase I, umlaut
{"ETH", "208"}, // Ð - uppercase Eth, Icelandic
{"Ntilde", "209"}, // Ñ - uppercase N, tilde
{"Ograve", "210"}, // Ò - uppercase O, grave accent
{"Oacute", "211"}, // Ó - uppercase O, acute accent
{"Ocirc", "212"}, // Ô - uppercase O, circumflex accent
{"Otilde", "213"}, // Õ - uppercase O, tilde
{"Ouml", "214"}, // Ö - uppercase O, umlaut
{"times", "215"}, // multiplication sign
{"Oslash", "216"}, // Ø - uppercase O, slash
{"Ugrave", "217"}, // Ù - uppercase U, grave accent
{"Uacute", "218"}, // Ú - uppercase U, acute accent
{"Ucirc", "219"}, // Û - uppercase U, circumflex accent
{"Uuml", "220"}, // Ü - uppercase U, umlaut
{"Yacute", "221"}, // Ý - uppercase Y, acute accent
{"THORN", "222"}, // Þ - uppercase THORN, Icelandic
{"szlig", "223"}, // ß - lowercase sharps, German
{"agrave", "224"}, // à - lowercase a, grave accent
{"aacute", "225"}, // á - lowercase a, acute accent
{"acirc", "226"}, // â - lowercase a, circumflex accent
{"atilde", "227"}, // ã - lowercase a, tilde
{"auml", "228"}, // ä - lowercase a, umlaut
{"aring", "229"}, // å - lowercase a, ring
{"aelig", "230"}, // æ - lowercase ae
{"ccedil", "231"}, // ç - lowercase c, cedilla
{"egrave", "232"}, // è - lowercase e, grave accent
{"eacute", "233"}, // é - lowercase e, acute accent
{"ecirc", "234"}, // ê - lowercase e, circumflex accent
{"euml", "235"}, // ë - lowercase e, umlaut
{"igrave", "236"}, // ì - lowercase i, grave accent
{"iacute", "237"}, // í - lowercase i, acute accent
{"icirc", "238"}, // î - lowercase i, circumflex accent
{"iuml", "239"}, // ï - lowercase i, umlaut
{"eth", "240"}, // ð - lowercase eth, Icelandic
{"ntilde", "241"}, // ñ - lowercase n, tilde
{"ograve", "242"}, // ò - lowercase o, grave accent
{"oacute", "243"}, // ó - lowercase o, acute accent
{"ocirc", "244"}, // ô - lowercase o, circumflex accent
{"otilde", "245"}, // õ - lowercase o, tilde
{"ouml", "246"}, // ö - lowercase o, umlaut
{"divide", "247"}, // division sign
{"oslash", "248"}, // ø - lowercase o, slash
{"ugrave", "249"}, // ù - lowercase u, grave accent
{"uacute", "250"}, // ú - lowercase u, acute accent
{"ucirc", "251"}, // û - lowercase u, circumflex accent
{"uuml", "252"}, // ü - lowercase u, umlaut
{"yacute", "253"}, // ý - lowercase y, acute accent
{"thorn", "254"}, // þ - lowercase thorn, Icelandic
{"yuml", "255"}, // ÿ - lowercase y, umlaut
};
// Entity name / decimal code point pairs that fillWithHtml40Entities registers in addition to
// BASIC_ARRAY and ISO8859_1_ARRAY. The trailing comments are carried over from the W3C list at
// http://www.w3.org/TR/REC-html40/sgml/entities.html
// The "nsub" entry used to be swallowed by a comment and was therefore never registered.
// package scoped for testing
static final String[][] HTML40_ARRAY = {
    // <!-- Latin Extended-B -->
    {"fnof", "402"}, // latin small f with hook = function= florin, U+0192 ISOtech -->
    // <!-- Greek -->
    {"Alpha", "913"}, // greek capital letter alpha, U+0391 -->
    {"Beta", "914"}, // greek capital letter beta, U+0392 -->
    {"Gamma", "915"}, // greek capital letter gamma,U+0393 ISOgrk3 -->
    {"Delta", "916"}, // greek capital letter delta,U+0394 ISOgrk3 -->
    {"Epsilon", "917"}, // greek capital letter epsilon, U+0395 -->
    {"Zeta", "918"}, // greek capital letter zeta, U+0396 -->
    {"Eta", "919"}, // greek capital letter eta, U+0397 -->
    {"Theta", "920"}, // greek capital letter theta,U+0398 ISOgrk3 -->
    {"Iota", "921"}, // greek capital letter iota, U+0399 -->
    {"Kappa", "922"}, // greek capital letter kappa, U+039A -->
    {"Lambda", "923"}, // greek capital letter lambda,U+039B ISOgrk3 -->
    {"Mu", "924"}, // greek capital letter mu, U+039C -->
    {"Nu", "925"}, // greek capital letter nu, U+039D -->
    {"Xi", "926"}, // greek capital letter xi, U+039E ISOgrk3 -->
    {"Omicron", "927"}, // greek capital letter omicron, U+039F -->
    {"Pi", "928"}, // greek capital letter pi, U+03A0 ISOgrk3 -->
    {"Rho", "929"}, // greek capital letter rho, U+03A1 -->
    // <!-- there is no Sigmaf, and no U+03A2 character either -->
    {"Sigma", "931"}, // greek capital letter sigma,U+03A3 ISOgrk3 -->
    {"Tau", "932"}, // greek capital letter tau, U+03A4 -->
    {"Upsilon", "933"}, // greek capital letter upsilon,U+03A5 ISOgrk3 -->
    {"Phi", "934"}, // greek capital letter phi,U+03A6 ISOgrk3 -->
    {"Chi", "935"}, // greek capital letter chi, U+03A7 -->
    {"Psi", "936"}, // greek capital letter psi,U+03A8 ISOgrk3 -->
    {"Omega", "937"}, // greek capital letter omega,U+03A9 ISOgrk3 -->
    {"alpha", "945"}, // greek small letter alpha,U+03B1 ISOgrk3 -->
    {"beta", "946"}, // greek small letter beta, U+03B2 ISOgrk3 -->
    {"gamma", "947"}, // greek small letter gamma,U+03B3 ISOgrk3 -->
    {"delta", "948"}, // greek small letter delta,U+03B4 ISOgrk3 -->
    {"epsilon", "949"}, // greek small letter epsilon,U+03B5 ISOgrk3 -->
    {"zeta", "950"}, // greek small letter zeta, U+03B6 ISOgrk3 -->
    {"eta", "951"}, // greek small letter eta, U+03B7 ISOgrk3 -->
    {"theta", "952"}, // greek small letter theta,U+03B8 ISOgrk3 -->
    {"iota", "953"}, // greek small letter iota, U+03B9 ISOgrk3 -->
    {"kappa", "954"}, // greek small letter kappa,U+03BA ISOgrk3 -->
    {"lambda", "955"}, // greek small letter lambda,U+03BB ISOgrk3 -->
    {"mu", "956"}, // greek small letter mu, U+03BC ISOgrk3 -->
    {"nu", "957"}, // greek small letter nu, U+03BD ISOgrk3 -->
    {"xi", "958"}, // greek small letter xi, U+03BE ISOgrk3 -->
    {"omicron", "959"}, // greek small letter omicron, U+03BF NEW -->
    {"pi", "960"}, // greek small letter pi, U+03C0 ISOgrk3 -->
    {"rho", "961"}, // greek small letter rho, U+03C1 ISOgrk3 -->
    {"sigmaf", "962"}, // greek small letter final sigma,U+03C2 ISOgrk3 -->
    {"sigma", "963"}, // greek small letter sigma,U+03C3 ISOgrk3 -->
    {"tau", "964"}, // greek small letter tau, U+03C4 ISOgrk3 -->
    {"upsilon", "965"}, // greek small letter upsilon,U+03C5 ISOgrk3 -->
    {"phi", "966"}, // greek small letter phi, U+03C6 ISOgrk3 -->
    {"chi", "967"}, // greek small letter chi, U+03C7 ISOgrk3 -->
    {"psi", "968"}, // greek small letter psi, U+03C8 ISOgrk3 -->
    {"omega", "969"}, // greek small letter omega,U+03C9 ISOgrk3 -->
    {"thetasym", "977"}, // greek small letter theta symbol,U+03D1 NEW -->
    {"upsih", "978"}, // greek upsilon with hook symbol,U+03D2 NEW -->
    {"piv", "982"}, // greek pi symbol, U+03D6 ISOgrk3 -->
    // <!-- General Punctuation -->
    {"bull", "8226"}, // bullet = black small circle,U+2022 ISOpub -->
    // <!-- bullet is NOT the same as bullet operator, U+2219 -->
    {"hellip", "8230"}, // horizontal ellipsis = three dot leader,U+2026 ISOpub -->
    {"prime", "8242"}, // prime = minutes = feet, U+2032 ISOtech -->
    {"Prime", "8243"}, // double prime = seconds = inches,U+2033 ISOtech -->
    {"oline", "8254"}, // overline = spacing overscore,U+203E NEW -->
    {"frasl", "8260"}, // fraction slash, U+2044 NEW -->
    // <!-- Letterlike Symbols -->
    {"weierp", "8472"}, // script capital P = power set= Weierstrass p, U+2118 ISOamso -->
    {"image", "8465"}, // blackletter capital I = imaginary part,U+2111 ISOamso -->
    {"real", "8476"}, // blackletter capital R = real part symbol,U+211C ISOamso -->
    {"trade", "8482"}, // trade mark sign, U+2122 ISOnum -->
    {"alefsym", "8501"}, // alef symbol = first transfinite cardinal,U+2135 NEW -->
    // <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the
    // same glyph could be used to depict both characters -->
    // <!-- Arrows -->
    {"larr", "8592"}, // leftwards arrow, U+2190 ISOnum -->
    {"uarr", "8593"}, // upwards arrow, U+2191 ISOnum-->
    {"rarr", "8594"}, // rightwards arrow, U+2192 ISOnum -->
    {"darr", "8595"}, // downwards arrow, U+2193 ISOnum -->
    {"harr", "8596"}, // left right arrow, U+2194 ISOamsa -->
    {"crarr", "8629"}, // downwards arrow with corner leftwards= carriage return, U+21B5 NEW -->
    {"lArr", "8656"}, // leftwards double arrow, U+21D0 ISOtech -->
    // <!-- ISO 10646 does not say that lArr is the same as the "is implied by"
    // arrow but also does not have any other character for that function.
    // So ? lArr can be used for "is implied by" as ISOtech suggests -->
    {"uArr", "8657"}, // upwards double arrow, U+21D1 ISOamsa -->
    {"rArr", "8658"}, // rightwards double arrow,U+21D2 ISOtech -->
    // <!-- ISO 10646 does not say this is the "implies" character but does not
    // have another character with this function so ?rArr can be used for
    // "implies" as ISOtech suggests -->
    {"dArr", "8659"}, // downwards double arrow, U+21D3 ISOamsa -->
    {"hArr", "8660"}, // left right double arrow,U+21D4 ISOamsa -->
    // <!-- Mathematical Operators -->
    {"forall", "8704"}, // for all, U+2200 ISOtech -->
    {"part", "8706"}, // partial differential, U+2202 ISOtech -->
    {"exist", "8707"}, // there exists, U+2203 ISOtech -->
    {"empty", "8709"}, // empty set = null set = diameter,U+2205 ISOamso -->
    {"nabla", "8711"}, // nabla = backward difference,U+2207 ISOtech -->
    {"isin", "8712"}, // element of, U+2208 ISOtech -->
    {"notin", "8713"}, // not an element of, U+2209 ISOtech -->
    {"ni", "8715"}, // contains as member, U+220B ISOtech -->
    // <!-- should there be a more memorable name than "ni"? -->
    {"prod", "8719"}, // n-ary product = product sign,U+220F ISOamsb -->
    // <!-- prod is NOT the same character as U+03A0 "greek capital letter pi"
    // though the same glyph might be used for both -->
    {"sum", "8721"}, // n-ary summation, U+2211 ISOamsb -->
    // <!-- sum is NOT the same character as U+03A3 "greek capital letter sigma"
    // though the same glyph might be used for both -->
    {"minus", "8722"}, // minus sign, U+2212 ISOtech -->
    {"lowast", "8727"}, // asterisk operator, U+2217 ISOtech -->
    {"radic", "8730"}, // square root = radical sign,U+221A ISOtech -->
    {"prop", "8733"}, // proportional to, U+221D ISOtech -->
    {"infin", "8734"}, // infinity, U+221E ISOtech -->
    {"ang", "8736"}, // angle, U+2220 ISOamso -->
    {"and", "8743"}, // logical and = wedge, U+2227 ISOtech -->
    {"or", "8744"}, // logical or = vee, U+2228 ISOtech -->
    {"cap", "8745"}, // intersection = cap, U+2229 ISOtech -->
    {"cup", "8746"}, // union = cup, U+222A ISOtech -->
    {"int", "8747"}, // integral, U+222B ISOtech -->
    {"there4", "8756"}, // therefore, U+2234 ISOtech -->
    {"sim", "8764"}, // tilde operator = varies with = similar to,U+223C ISOtech -->
    // <!-- tilde operator is NOT the same character as the tilde, U+007E,although
    // the same glyph might be used to represent both -->
    {"cong", "8773"}, // approximately equal to, U+2245 ISOtech -->
    {"asymp", "8776"}, // almost equal to = asymptotic to,U+2248 ISOamsr -->
    {"ne", "8800"}, // not equal to, U+2260 ISOtech -->
    {"equiv", "8801"}, // identical to, U+2261 ISOtech -->
    {"le", "8804"}, // less-than or equal to, U+2264 ISOtech -->
    {"ge", "8805"}, // greater-than or equal to,U+2265 ISOtech -->
    {"sub", "8834"}, // subset of, U+2282 ISOtech -->
    {"sup", "8835"}, // superset of, U+2283 ISOtech -->
    // <!-- note that nsup, "not a superset of, U+2283" is not covered by the
    // Symbol font encoding and is not included. Should it be, for symmetry?
    // It is in ISOamsn -->
    {"nsub", "8836"}, // not a subset of, U+2284 ISOamsn -->
    {"sube", "8838"}, // subset of or equal to, U+2286 ISOtech -->
    {"supe", "8839"}, // superset of or equal to,U+2287 ISOtech -->
    {"oplus", "8853"}, // circled plus = direct sum,U+2295 ISOamsb -->
    {"otimes", "8855"}, // circled times = vector product,U+2297 ISOamsb -->
    {"perp", "8869"}, // up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
    {"sdot", "8901"}, // dot operator, U+22C5 ISOamsb -->
    // <!-- dot operator is NOT the same character as U+00B7 middle dot -->
    // <!-- Miscellaneous Technical -->
    {"lceil", "8968"}, // left ceiling = apl upstile,U+2308 ISOamsc -->
    {"rceil", "8969"}, // right ceiling, U+2309 ISOamsc -->
    {"lfloor", "8970"}, // left floor = apl downstile,U+230A ISOamsc -->
    {"rfloor", "8971"}, // right floor, U+230B ISOamsc -->
    {"lang", "9001"}, // left-pointing angle bracket = bra,U+2329 ISOtech -->
    // <!-- lang is NOT the same character as U+003C "less than" or U+2039 "single left-pointing angle quotation
    // mark" -->
    {"rang", "9002"}, // right-pointing angle bracket = ket,U+232A ISOtech -->
    // <!-- rang is NOT the same character as U+003E "greater than" or U+203A
    // "single right-pointing angle quotation mark" -->
    // <!-- Geometric Shapes -->
    {"loz", "9674"}, // lozenge, U+25CA ISOpub -->
    // <!-- Miscellaneous Symbols -->
    {"spades", "9824"}, // black spade suit, U+2660 ISOpub -->
    // <!-- black here seems to mean filled as opposed to hollow -->
    {"clubs", "9827"}, // black club suit = shamrock,U+2663 ISOpub -->
    {"hearts", "9829"}, // black heart suit = valentine,U+2665 ISOpub -->
    {"diams", "9830"}, // black diamond suit, U+2666 ISOpub -->
    // <!-- Latin Extended-A -->
    {"OElig", "338"}, // -- latin capital ligature OE,U+0152 ISOlat2 -->
    {"oelig", "339"}, // -- latin small ligature oe, U+0153 ISOlat2 -->
    // <!-- ligature is a misnomer, this is a separate character in some languages -->
    {"Scaron", "352"}, // -- latin capital letter S with caron,U+0160 ISOlat2 -->
    {"scaron", "353"}, // -- latin small letter s with caron,U+0161 ISOlat2 -->
    {"Yuml", "376"}, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
    // <!-- Spacing Modifier Letters -->
    {"circ", "710"}, // -- modifier letter circumflex accent,U+02C6 ISOpub -->
    {"tilde", "732"}, // small tilde, U+02DC ISOdia -->
    // <!-- General Punctuation -->
    {"ensp", "8194"}, // en space, U+2002 ISOpub -->
    {"emsp", "8195"}, // em space, U+2003 ISOpub -->
    {"thinsp", "8201"}, // thin space, U+2009 ISOpub -->
    {"zwnj", "8204"}, // zero width non-joiner,U+200C NEW RFC 2070 -->
    {"zwj", "8205"}, // zero width joiner, U+200D NEW RFC 2070 -->
    {"lrm", "8206"}, // left-to-right mark, U+200E NEW RFC 2070 -->
    {"rlm", "8207"}, // right-to-left mark, U+200F NEW RFC 2070 -->
    {"ndash", "8211"}, // en dash, U+2013 ISOpub -->
    {"mdash", "8212"}, // em dash, U+2014 ISOpub -->
    {"lsquo", "8216"}, // left single quotation mark,U+2018 ISOnum -->
    {"rsquo", "8217"}, // right single quotation mark,U+2019 ISOnum -->
    {"sbquo", "8218"}, // single low-9 quotation mark, U+201A NEW -->
    {"ldquo", "8220"}, // left double quotation mark,U+201C ISOnum -->
    {"rdquo", "8221"}, // right double quotation mark,U+201D ISOnum -->
    {"bdquo", "8222"}, // double low-9 quotation mark, U+201E NEW -->
    {"dagger", "8224"}, // dagger, U+2020 ISOpub -->
    {"Dagger", "8225"}, // double dagger, U+2021 ISOpub -->
    {"permil", "8240"}, // per mille sign, U+2030 ISOtech -->
    {"lsaquo", "8249"}, // single left-pointing angle quotation mark,U+2039 ISO proposed -->
    // <!-- lsaquo is proposed but not yet ISO standardized -->
    {"rsaquo", "8250"}, // single right-pointing angle quotation mark,U+203A ISO proposed -->
    // <!-- rsaquo is proposed but not yet ISO standardized -->
    {"euro", "8364"}, // -- euro sign, U+20AC NEW -->
};
/**
 * <p>
 * Entity set holding the entities predefined by XML: the basic markup entities plus
 * <code>apos</code>.
 * </p>
 */
public static final Entities XML;
/**
 * <p>
 * Entity set for HTML 3.2: the basic markup entities plus the ISO 8859-1 names.
 * </p>
 */
public static final Entities HTML32;
/**
 * <p>
 * Entity set for HTML 4.0, populated through {@link #fillWithHtml40Entities(Entities)}.
 * </p>
 */
public static final Entities HTML40;
// All three predefined sets are built in declaration order inside a single initializer.
static {
    Entities xmlSet = new Entities();
    xmlSet.addEntities(BASIC_ARRAY);
    xmlSet.addEntities(APOS_ARRAY);
    XML = xmlSet;

    Entities html32Set = new Entities();
    html32Set.addEntities(BASIC_ARRAY);
    html32Set.addEntities(ISO8859_1_ARRAY);
    HTML32 = html32Set;

    Entities html40Set = new Entities();
    fillWithHtml40Entities(html40Set);
    HTML40 = html40Set;
}
/**
 * <p>
 * Registers the complete HTML 4.0 entity set on the given instance: the basic markup entities,
 * then the ISO 8859-1 names, then the HTML 4.0 additions, in that order.
 * </p>
 *
 * @param entities
 *            the instance that receives the entities
 */
static void fillWithHtml40Entities(Entities entities) {
    String[][][] tables = {BASIC_ARRAY, ISO8859_1_ARRAY, HTML40_ARRAY};
    for (int t = 0; t < tables.length; t++) {
        entities.addEntities(tables[t]);
    }
}
/**
 * Two-way mapping between entity names and their integer character values. The enclosing
 * class delegates all of its name/value lookups to an implementation of this interface.
 */
static interface EntityMap {
/**
 * <p>
 * Add an entry to this entity map.
 * </p>
 *
 * @param name
 *            the entity name, without the leading ampersand and trailing semicolon
 * @param value
 *            the character value the entity stands for
 */
void add(String name, int value);
/**
 * <p>
 * Returns the name of the entity identified by the specified value.
 * </p>
 *
 * @param value
 *            the value to locate
 * @return entity name associated with the specified value; callers in this file treat
 *         <code>null</code> as "no such entity"
 */
String name(int value);
/**
 * <p>
 * Returns the value of the entity identified by the specified name.
 * </p>
 *
 * @param name
 *            the name to locate
 * @return entity value associated with the specified name; the implementations in this file
 *         return <code>-1</code> when the name is unknown
 */
int value(String name);
}
/**
 * Entity map that keeps names in a {@link HashMap} keyed by name and values in an
 * <code>IntHashMap</code> keyed by the primitive value.
 */
static class PrimitiveEntityMap implements EntityMap {
    private final Map mapNameToValue = new HashMap();
    private final IntHashMap mapValueToName = new IntHashMap();

    /**
     * {@inheritDoc}
     */
    public void add(String name, int value) {
        Integer boxedValue = new Integer(value);
        mapNameToValue.put(name, boxedValue);
        mapValueToName.put(value, name);
    }

    /**
     * {@inheritDoc}
     */
    public String name(int value) {
        Object storedName = mapValueToName.get(value);
        return (String) storedName;
    }

    /**
     * {@inheritDoc}
     */
    public int value(String name) {
        Integer boxedValue = (Integer) mapNameToValue.get(name);
        // -1 is the "unknown entity" marker used throughout this class
        return boxedValue != null ? boxedValue.intValue() : -1;
    }
}
/**
 * Base class for entity maps backed by two {@link Map}s; subclasses choose the concrete map
 * type by assigning both fields in their constructor.
 */
static abstract class MapIntMap implements Entities.EntityMap {
    protected Map mapNameToValue;
    protected Map mapValueToName;

    /**
     * {@inheritDoc}
     */
    public void add(String name, int value) {
        Integer forwardValue = new Integer(value);
        mapNameToValue.put(name, forwardValue);
        Integer reverseKey = new Integer(value);
        mapValueToName.put(reverseKey, name);
    }

    /**
     * {@inheritDoc}
     */
    public String name(int value) {
        Integer key = new Integer(value);
        return (String) mapValueToName.get(key);
    }

    /**
     * {@inheritDoc}
     */
    public int value(String name) {
        Integer boxedValue = (Integer) mapNameToValue.get(name);
        // -1 signals that the name is not registered
        return boxedValue != null ? boxedValue.intValue() : -1;
    }
}
/**
 * {@link MapIntMap} whose two lookup tables are {@link HashMap}s.
 */
static class HashEntityMap extends MapIntMap {
    /**
     * Creates an empty map with hash-based storage in both directions.
     */
    public HashEntityMap() {
        this.mapNameToValue = new HashMap();
        this.mapValueToName = new HashMap();
    }
}
/**
 * {@link MapIntMap} whose two lookup tables are {@link TreeMap}s.
 */
static class TreeEntityMap extends MapIntMap {
    /**
     * Creates an empty map with tree-based (sorted) storage in both directions.
     */
    public TreeEntityMap() {
        this.mapNameToValue = new TreeMap();
        this.mapValueToName = new TreeMap();
    }
}
/**
 * {@link PrimitiveEntityMap} that answers name lookups for the values
 * <code>0 .. LOOKUP_TABLE_SIZE - 1</code> from a lazily built array instead of the hash map.
 */
static class LookupEntityMap extends PrimitiveEntityMap {
    /** Number of low values served from the array cache. */
    private static final int LOOKUP_TABLE_SIZE = 256;

    /** Lazily created cache of names indexed by value; <code>null</code> until first needed. */
    private String[] lookupTable;

    /**
     * {@inheritDoc}
     *
     * <p>
     * If the lookup table has already been built, the affected slot is refreshed so that
     * entities added after the first lookup are not hidden by a stale cache.
     * </p>
     */
    public void add(String name, int value) {
        super.add(name, value);
        if (lookupTable != null && value >= 0 && value < LOOKUP_TABLE_SIZE) {
            // Re-read from the backing map so the cache always mirrors what it reports.
            lookupTable[value] = super.name(value);
        }
    }

    /**
     * {@inheritDoc}
     */
    public String name(int value) {
        // Negative values must bypass the array, otherwise they would index out of bounds.
        if (value >= 0 && value < LOOKUP_TABLE_SIZE) {
            return lookupTable()[value];
        }
        return super.name(value);
    }

    /**
     * <p>
     * Returns the lookup table for this entity map, creating it on first use.
     * </p>
     *
     * @return the lookup table
     */
    private String[] lookupTable() {
        if (lookupTable == null) {
            createLookupTable();
        }
        return lookupTable;
    }

    /**
     * <p>
     * Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with the entity
     * names currently registered for those values.
     * </p>
     */
    private void createLookupTable() {
        lookupTable = new String[LOOKUP_TABLE_SIZE];
        for (int i = 0; i < LOOKUP_TABLE_SIZE; ++i) {
            lookupTable[i] = super.name(i);
        }
    }
}
/**
 * Entity map stored in two parallel arrays (<code>names[i]</code> pairs with
 * <code>values[i]</code>); lookups scan the arrays linearly in insertion order.
 */
static class ArrayEntityMap implements EntityMap {
    protected int growBy = 100;
    protected int size = 0;
    protected String[] names;
    protected int[] values;

    /**
     * Creates a map whose arrays start at, and grow by, the default increment of 100.
     */
    public ArrayEntityMap() {
        this(100);
    }

    /**
     * Creates a map with a caller-chosen growth increment.
     *
     * @param growBy
     *            initial length of the arrays and the amount they grow by
     */
    public ArrayEntityMap(int growBy) {
        this.growBy = growBy;
        this.names = new String[growBy];
        this.values = new int[growBy];
    }

    /**
     * {@inheritDoc}
     */
    public void add(String name, int value) {
        ensureCapacity(size + 1);
        int slot = size++;
        names[slot] = name;
        values[slot] = value;
    }

    /**
     * Grows both arrays when they cannot hold <code>capacity</code> entries.
     *
     * @param capacity
     *            number of entries the arrays must be able to hold
     */
    protected void ensureCapacity(int capacity) {
        if (capacity <= names.length) {
            return;
        }
        int grownLength = Math.max(capacity, size + growBy);

        String[] grownNames = new String[grownLength];
        System.arraycopy(names, 0, grownNames, 0, size);
        names = grownNames;

        int[] grownValues = new int[grownLength];
        System.arraycopy(values, 0, grownValues, 0, size);
        values = grownValues;
    }

    /**
     * {@inheritDoc}
     */
    public String name(int value) {
        int idx = 0;
        while (idx < size && values[idx] != value) {
            idx++;
        }
        return idx < size ? names[idx] : null;
    }

    /**
     * {@inheritDoc}
     */
    public int value(String name) {
        int idx = 0;
        while (idx < size && !names[idx].equals(name)) {
            idx++;
        }
        return idx < size ? values[idx] : -1;
    }
}
/**
 * {@link ArrayEntityMap} that keeps its entries sorted by value so that
 * {@link #name(int)} can use a binary search. Each value can be stored only once.
 */
static class BinaryEntityMap extends ArrayEntityMap {
    /**
     * Constructs a new instance of <code>BinaryEntityMap</code>.
     */
    public BinaryEntityMap() {
        super();
    }

    /**
     * Constructs a new instance of <code>BinaryEntityMap</code> specifying the size by which the
     * underlying array should grow.
     *
     * @param growBy
     *            array will be initialized to and will grow by this amount
     */
    public BinaryEntityMap(int growBy) {
        super(growBy);
    }

    /**
     * Performs a binary search of the value array for the specified key. This method is based
     * on code in {@link java.util.Arrays}.
     *
     * @param key
     *            the key to be found
     * @return the index of the key if present; otherwise <code>-(insertionPoint + 1)</code>,
     *         which is always negative
     */
    private int binarySearch(int key) {
        int low = 0;
        int high = size - 1;
        while (low <= high) {
            int mid = (low + high) >>> 1;
            int midVal = values[mid];
            if (midVal < key) {
                low = mid + 1;
            } else if (midVal > key) {
                high = mid - 1;
            } else {
                return mid; // key found
            }
        }
        return -(low + 1); // key not found.
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     * A value that is already present is ignored, so the first name registered for a value
     * wins.
     * </p>
     */
    public void add(String name, int value) {
        ensureCapacity(size + 1);
        int insertAt = binarySearch(value);
        // Any non-negative result means "found", including index 0. The former "> 0" test let
        // a duplicate at index 0 fall through and call arraycopy with a source position of -1.
        if (insertAt >= 0) {
            return; // note: this means you can't insert the same value twice
        }
        insertAt = -(insertAt + 1); // binarySearch returns it negative and off-by-one
        System.arraycopy(values, insertAt, values, insertAt + 1, size - insertAt);
        values[insertAt] = value;
        System.arraycopy(names, insertAt, names, insertAt + 1, size - insertAt);
        names[insertAt] = name;
        size++;
    }

    /**
     * {@inheritDoc}
     */
    public String name(int value) {
        int index = binarySearch(value);
        if (index < 0) {
            return null;
        }
        return names[index];
    }
}
// Backing name <-> value store that addEntity, entityName and entityValue delegate to;
// a LookupEntityMap by default.
// package scoped for testing
EntityMap map = new Entities.LookupEntityMap();
/**
 * <p>
 * Registers every entry of a name/value table with this entity set.
 * </p>
 *
 * @param entityArray
 *            table whose rows hold the entity name at index 0 and its decimal value, as a
 *            string, at index 1
 */
public void addEntities(String[][] entityArray) {
    int rowCount = entityArray.length;
    for (int row = 0; row < rowCount; row++) {
        String[] pair = entityArray[row];
        int entityCode = Integer.parseInt(pair[1]);
        addEntity(pair[0], entityCode);
    }
}
/**
 * <p>
 * Registers a single entity with this entity set by forwarding it to the backing map.
 * </p>
 *
 * @param name
 *            name of the entity
 * @param value
 *            value of the entity
 */
public void addEntity(String name, int value) {
    this.map.add(name, value);
}
/**
 * <p>
 * Looks up the entity name registered for a character value.
 * </p>
 *
 * @param value
 *            the value to locate
 * @return the name the backing map reports for the value
 */
public String entityName(int value) {
    String registeredName = this.map.name(value);
    return registeredName;
}
/**
 * <p>
 * Looks up the character value registered for an entity name.
 * </p>
 *
 * @param name
 *            the name to locate
 * @return the value the backing map reports for the name
 */
public int entityValue(String name) {
    int registeredValue = this.map.value(name);
    return registeredValue;
}
/**
 * <p>
 * Returns a copy of the given <code>String</code> with its characters escaped.
 * </p>
 *
 * <p>
 * For example, after addEntity("foo", 0xA1), escaping the single character U+00A1 yields
 * "&amp;foo;".
 * </p>
 *
 * @param str
 *            The <code>String</code> to escape.
 * @return A new escaped <code>String</code>.
 */
public String escape(String str) {
    StringWriter buffer = createStringWriter(str);
    try {
        escape(buffer, str);
        return buffer.toString();
    } catch (IOException e) {
        // Not expected: the StringWriter methods used by escape(Writer, String) do not throw
        // IOException.
        throw new RuntimeException(e);
    }
}
/**
 * <p>
 * Escapes the characters in the <code>String</code> passed and writes the result to the
 * <code>Writer</code> passed.
 * </p>
 *
 * <p>
 * Characters with a registered entity name are written as <code>&amp;name;</code>. Other
 * characters above 0x7F are written as decimal numeric references, and everything else is
 * copied through unchanged. A valid surrogate pair is combined and written as one reference
 * to the supplementary code point, because a reference to a lone surrogate is not a legal
 * character reference. Unpaired surrogates are still written as their own numeric reference.
 * </p>
 *
 * @param writer
 *            The <code>Writer</code> to write the results of the escaping to. Assumed to be a
 *            non-null value.
 * @param str
 *            The <code>String</code> to escape. Assumed to be a non-null value.
 * @throws IOException
 *             when <code>Writer</code> passed throws the exception from calls to the
 *             {@link Writer#write(int)} methods.
 *
 * @see #escape(String)
 * @see Writer
 */
public void escape(Writer writer, String str) throws IOException {
    int len = str.length();
    for (int i = 0; i < len; i++) {
        int codePoint = str.charAt(i);
        // Combine a high surrogate with the low surrogate that follows it (done arithmetically
        // so the code does not depend on the Java 5 Character code point API).
        if (codePoint >= 0xD800 && codePoint <= 0xDBFF && i + 1 < len) {
            char low = str.charAt(i + 1);
            if (low >= 0xDC00 && low <= 0xDFFF) {
                codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (low - 0xDC00);
                i++; // the low surrogate has been consumed
            }
        }
        String entityName = this.entityName(codePoint);
        if (entityName != null) {
            writer.write("&");
            writer.write(entityName);
            writer.write(";");
        } else if (codePoint > 0x7F) {
            writer.write("&#");
            writer.write(Integer.toString(codePoint, 10));
            writer.write(";");
        } else {
            writer.write(codePoint);
        }
    }
}
/**
 * <p>
 * Returns a copy of the given <code>String</code> with its entities unescaped.
 * </p>
 *
 * <p>
 * For example, after addEntity("foo", 0xA1), unescape("&amp;foo;") returns the single
 * character U+00A1. A string that contains no ampersand is returned as-is.
 * </p>
 *
 * @param str
 *            The <code>String</code> to unescape.
 * @return The unescaped <code>String</code>.
 */
public String unescape(String str) {
    int ampIdx = str.indexOf("&");
    if (ampIdx < 0) {
        // nothing that could be an entity
        return str;
    }
    StringWriter buffer = createStringWriter(str);
    try {
        doUnescape(buffer, str, ampIdx);
    } catch (IOException e) {
        // Not expected: the StringWriter methods used while unescaping do not throw
        // IOException.
        throw new RuntimeException(e);
    }
    return buffer.toString();
}
/**
 * Creates a StringWriter whose initial capacity is the source length plus 10%, so that the
 * writer seldom has to grow.
 *
 * @param str The source string
 * @return A newly created StringWriter
 */
private StringWriter createStringWriter(String str) {
    int sourceLength = str.length();
    int initialCapacity = (int) (sourceLength + (sourceLength * 0.1));
    return new StringWriter(initialCapacity);
}
/**
 * <p>
 * Unescapes the entities in the <code>String</code> passed and writes the result to the
 * <code>Writer</code> passed. Input without any ampersand is written through unchanged.
 * </p>
 *
 * @param writer
 *            The <code>Writer</code> to write the results to; assumed to be non-null.
 * @param str
 *            The source <code>String</code> to unescape; assumed to be non-null.
 * @throws IOException
 *             when <code>Writer</code> passed throws the exception from calls to the
 *             {@link Writer#write(int)} methods.
 *
 * @see #escape(String)
 * @see Writer
 */
public void unescape(Writer writer, String str) throws IOException {
    int ampIdx = str.indexOf("&");
    if (ampIdx >= 0) {
        doUnescape(writer, str, ampIdx);
        return;
    }
    writer.write(str);
}
/**
 * Underlying unescape method that allows the optimisation of not starting from the 0 index
 * again.
 *
 * <p>
 * Each <code>&amp;...;</code> run is resolved either as a numeric reference
 * (<code>&amp;#NNN;</code> or <code>&amp;#xHHH;</code>) or as a registered entity name. A run
 * that cannot be resolved is copied through verbatim, as is an ampersand that has no
 * following semicolon or that is followed by another ampersand before the semicolon. Code
 * points above 0xFFFF are written as a surrogate pair.
 * </p>
 *
 * @param writer
 *            The <code>Writer</code> to write the results to; assumed to be non-null.
 * @param str
 *            The source <code>String</code> to unescape; assumed to be non-null.
 * @param firstAmp
 *            The <code>int</code> index of the first ampersand in the source String.
 * @throws IOException
 *             when <code>Writer</code> passed throws the exception from calls to the
 *             {@link Writer#write(int)} methods.
 */
private void doUnescape(Writer writer, String str, int firstAmp) throws IOException {
    writer.write(str, 0, firstAmp);
    int len = str.length();
    for (int i = firstAmp; i < len; i++) {
        char c = str.charAt(i);
        if (c != '&') {
            writer.write(c);
            continue;
        }
        int nextIdx = i + 1;
        int semiColonIdx = str.indexOf(';', nextIdx);
        if (semiColonIdx == -1) {
            writer.write(c);
            continue;
        }
        int ampersandIdx = str.indexOf('&', nextIdx);
        if (ampersandIdx != -1 && ampersandIdx < semiColonIdx) {
            // Then the text looks like &...&...;
            writer.write(c);
            continue;
        }
        String entityContent = str.substring(nextIdx, semiColonIdx);
        int entityValue = -1;
        if (entityContent.length() > 0) {
            if (entityContent.charAt(0) == '#') {
                // escaped value content is an integer (decimal or hexadecimal)
                entityValue = parseNumericReference(entityContent);
            } else {
                // escaped value content is an entity name
                entityValue = this.entityValue(entityContent);
            }
        }
        if (entityValue == -1) {
            // unresolved: reproduce the original text
            writer.write('&');
            writer.write(entityContent);
            writer.write(';');
        } else if (entityValue > 0xFFFF) {
            // supplementary code point: emit as a UTF-16 surrogate pair
            int offset = entityValue - 0x10000;
            writer.write(0xD800 + (offset >> 10));
            writer.write(0xDC00 + (offset & 0x3FF));
        } else {
            writer.write(entityValue);
        }
        i = semiColonIdx; // move index up to the semi-colon
    }
}

/**
 * Parses the content of a numeric character reference, i.e. the text between the ampersand
 * and the semicolon, starting with <code>#</code>.
 *
 * @param entityContent
 *            reference content such as <code>#169</code> or <code>#xA9</code>
 * @return the referenced code point, or <code>-1</code> if the content has no digits, is not
 *         a valid number, is negative, or lies above U+10FFFF
 */
private static int parseNumericReference(String entityContent) {
    if (entityContent.length() < 2) {
        return -1; // just "#"
    }
    int parsed;
    try {
        char radixMarker = entityContent.charAt(1);
        if (radixMarker == 'x' || radixMarker == 'X') {
            parsed = Integer.parseInt(entityContent.substring(2), 16);
        } else {
            parsed = Integer.parseInt(entityContent.substring(1), 10);
        }
    } catch (NumberFormatException e) {
        return -1;
    }
    // parseInt accepts a leading minus sign; such input is not a character reference
    if (parsed < 0 || parsed > 0x10FFFF) {
        return -1;
    }
    return parsed;
}
}
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Note: originally released under the GNU LGPL v2.1,
* but rereleased by the original author under the ASF license (above).
*/
/**
* <p>A hash map that uses primitive ints for the key rather than objects.</p>
*
* <p>Note that this class is for internal optimization purposes only, and may
* not be supported in future releases of Apache Commons Lang. Utilities of
* this sort may be included in future releases of Apache Commons Collections.</p>
*
* @author Justin Couch
* @author Alex Chaffee (alex@apache.org)
* @author Stephen Colebourne
* @since 2.0
* @version $Revision: 561230 $
* @see java.util.HashMap
*/
class IntHashMap {
    /** Bucket array; each slot heads a singly linked chain of entries. */
    private transient Entry[] table;

    /** Number of key/value mappings currently stored. */
    private transient int count;

    /**
     * The table is rehashed once {@code count} reaches this value, which is
     * always {@code (int) (capacity * loadFactor)}.
     */
    private int threshold;

    /** Load factor supplied at construction time. */
    private final float loadFactor;

    /**
     * <p>One node of a bucket chain.</p>
     */
    private static class Entry {
        int hash;
        int key;
        Object value;
        Entry next;

        /**
         * <p>Creates a new chain node.</p>
         *
         * @param hash  the hash code of the key (equal to the key itself in this map)
         * @param key   the key stored in this node
         * @param value the value mapped to the key; may be <code>null</code>
         * @param next  the node that follows this one in the bucket chain
         */
        protected Entry(int hash, int key, Object value, Entry next) {
            this.hash = hash;
            this.key = key;
            this.value = value;
            this.next = next;
        }
    }

    /**
     * <p>Constructs a new, empty hashtable with a default capacity and load
     * factor, which is <code>20</code> and <code>0.75</code> respectively.</p>
     */
    public IntHashMap() {
        this(20, 0.75f);
    }

    /**
     * <p>Constructs a new, empty hashtable with the specified initial capacity
     * and default load factor, which is <code>0.75</code>.</p>
     *
     * @param initialCapacity the initial capacity of the hashtable.
     * @throws IllegalArgumentException if the initial capacity is less
     *         than zero.
     */
    public IntHashMap(int initialCapacity) {
        this(initialCapacity, 0.75f);
    }

    /**
     * <p>Constructs a new, empty hashtable with the specified initial
     * capacity and the specified load factor.</p>
     *
     * @param initialCapacity the initial capacity of the hashtable; a value of
     *        zero is silently bumped to one so the bucket array is never empty.
     * @param loadFactor the load factor of the hashtable.
     * @throws IllegalArgumentException if the initial capacity is less
     *         than zero, or if the load factor is nonpositive.
     */
    public IntHashMap(int initialCapacity, float loadFactor) {
        super();
        if (initialCapacity < 0) {
            throw new IllegalArgumentException("Illegal Capacity: " + initialCapacity);
        }
        if (loadFactor <= 0) {
            throw new IllegalArgumentException("Illegal Load: " + loadFactor);
        }
        if (initialCapacity == 0) {
            initialCapacity = 1;
        }
        this.loadFactor = loadFactor;
        table = new Entry[initialCapacity];
        threshold = (int) (initialCapacity * loadFactor);
    }

    /**
     * Maps a hash code onto a bucket index. The sign bit is masked off so
     * that negative keys still yield a non-negative index.
     */
    private static int indexFor(int hash, int length) {
        return (hash & 0x7FFFFFFF) % length;
    }

    /**
     * <p>Returns the number of keys in this hashtable.</p>
     *
     * @return the number of keys in this hashtable.
     */
    public int size() {
        return count;
    }

    /**
     * <p>Tests if this hashtable maps no keys to values.</p>
     *
     * @return <code>true</code> if this hashtable maps no keys to values;
     *         <code>false</code> otherwise.
     */
    public boolean isEmpty() {
        return count == 0;
    }

    /**
     * <p>Tests if some key maps into the specified value in this hashtable.
     * This operation scans every bucket and is therefore more expensive than
     * <code>containsKey</code>.</p>
     *
     * <p>Entries whose stored value is <code>null</code> never match; they are
     * skipped instead of causing a failure.</p>
     *
     * @param value a value to search for.
     * @return <code>true</code> if and only if some key maps to the
     *         <code>value</code> argument in this hashtable as
     *         determined by the <tt>equals</tt> method;
     *         <code>false</code> otherwise.
     * @throws NullPointerException if the value is <code>null</code>.
     * @see #containsKey(int)
     * @see #containsValue(Object)
     */
    public boolean contains(Object value) {
        if (value == null) {
            throw new NullPointerException();
        }
        Entry[] tab = table;
        for (int i = tab.length; i-- > 0;) {
            for (Entry e = tab[i]; e != null; e = e.next) {
                // Call equals on the (non-null) argument: put() accepts null
                // values, so e.value may legitimately be null here.
                if (value.equals(e.value)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * <p>Returns <code>true</code> if this map maps one or more keys
     * to this value. Identical in functionality to {@link #contains(Object)}.</p>
     *
     * @param value value whose presence in this map is to be tested.
     * @return <code>true</code> if the value is contained
     * @throws NullPointerException if the value is <code>null</code>.
     */
    public boolean containsValue(Object value) {
        return contains(value);
    }

    /**
     * <p>Tests if the specified key is present in this hashtable.</p>
     *
     * @param key possible key.
     * @return <code>true</code> if and only if the key has a mapping
     *         (even a mapping to <code>null</code>); <code>false</code> otherwise.
     * @see #contains(Object)
     */
    public boolean containsKey(int key) {
        Entry[] tab = table;
        int hash = key;
        for (Entry e = tab[indexFor(hash, tab.length)]; e != null; e = e.next) {
            // hash == key in this map, so comparing hashes compares keys.
            if (e.hash == hash) {
                return true;
            }
        }
        return false;
    }

    /**
     * <p>Returns the value to which the specified key is mapped in this map.</p>
     *
     * @param key a key in the hashtable.
     * @return the value to which the key is mapped in this hashtable;
     *         <code>null</code> if the key is not mapped to any value (or is
     *         mapped to <code>null</code>).
     * @see #put(int, Object)
     */
    public Object get(int key) {
        Entry[] tab = table;
        int hash = key;
        for (Entry e = tab[indexFor(hash, tab.length)]; e != null; e = e.next) {
            if (e.hash == hash) {
                return e.value;
            }
        }
        return null;
    }

    /**
     * <p>Grows the bucket array to <code>2 * oldCapacity + 1</code> slots and
     * redistributes every existing entry into it.</p>
     *
     * <p>Called from {@link #put(int, Object)} when the number of keys
     * reaches the current threshold.</p>
     */
    protected void rehash() {
        int oldCapacity = table.length;
        Entry[] oldMap = table;
        int newCapacity = oldCapacity * 2 + 1;
        Entry[] newMap = new Entry[newCapacity];
        threshold = (int) (newCapacity * loadFactor);
        table = newMap;
        for (int i = oldCapacity; i-- > 0;) {
            for (Entry old = oldMap[i]; old != null;) {
                Entry e = old;
                // Advance before relinking, because e.next is overwritten below.
                old = old.next;
                int index = indexFor(e.hash, newCapacity);
                e.next = newMap[index];
                newMap[index] = e;
            }
        }
    }

    /**
     * <p>Maps the specified <code>key</code> to the specified
     * <code>value</code> in this hashtable. The value may be
     * <code>null</code>.</p>
     *
     * <p>The value can be retrieved by calling the <code>get</code> method
     * with the same key.</p>
     *
     * @param key the hashtable key.
     * @param value the value.
     * @return the previous value of the specified key in this hashtable,
     *         or <code>null</code> if it did not have one.
     * @see #get(int)
     */
    public Object put(int key, Object value) {
        Entry[] tab = table;
        int hash = key;
        int index = indexFor(hash, tab.length);
        // Overwrite in place when the key is already present.
        for (Entry e = tab[index]; e != null; e = e.next) {
            if (e.hash == hash) {
                Object old = e.value;
                e.value = value;
                return old;
            }
        }
        if (count >= threshold) {
            // Grow first; the bucket index must then be recomputed.
            rehash();
            tab = table;
            index = indexFor(hash, tab.length);
        }
        // New entries are pushed onto the front of the bucket chain.
        tab[index] = new Entry(hash, key, value, tab[index]);
        count++;
        return null;
    }

    /**
     * <p>Removes the key (and its corresponding value) from this
     * hashtable.</p>
     *
     * <p>This method does nothing if the key is not present in the
     * hashtable.</p>
     *
     * @param key the key that needs to be removed.
     * @return the value to which the key had been mapped in this hashtable,
     *         or <code>null</code> if the key did not have a mapping.
     */
    public Object remove(int key) {
        Entry[] tab = table;
        int hash = key;
        int index = indexFor(hash, tab.length);
        for (Entry e = tab[index], prev = null; e != null; prev = e, e = e.next) {
            if (e.hash == hash) {
                if (prev != null) {
                    prev.next = e.next;
                } else {
                    tab[index] = e.next;
                }
                count--;
                Object oldValue = e.value;
                e.value = null;
                return oldValue;
            }
        }
        return null;
    }

    /**
     * <p>Clears this hashtable so that it contains no keys.</p>
     *
     * <p>NOTE(review): this is the only <code>synchronized</code> method in
     * the class; no other method takes the lock, so the class as a whole
     * should not be treated as thread-safe.</p>
     */
    public synchronized void clear() {
        Entry[] tab = table;
        for (int index = tab.length; --index >= 0;) {
            tab[index] = null;
        }
        count = 0;
    }
}
Returns true if the argument, a UCS-4 character code, is valid in XML documents.
/*
* $Id: XmlChars.java,v 1.1 2004/08/19 05:30:22 aslom Exp $
*
* The Apache Software License, Version 1.1
*
*
* Copyright (c) 2000 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Crimson" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
* http://www.sun.com. For more information on the Apache Software
* Foundation, please see <http://www.apache.org/>.
*/
/**
* Methods in this class are used to determine whether characters may
* appear in certain roles in XML documents. Such methods are used
* both to parse and to create such documents.
*
* @version 1.8
* @author David Brownell
*/
public class XmlChars
{
    // Static utility holder: instances are never created.
    private XmlChars () { }

    /**
     * Tells whether a UCS-4 code point is a legal character in an XML 1.0
     * document, i.e. whether it matches production [2] <code>Char</code>:
     * tab, line feed, carriage return, U+0020..U+D7FF, U+E000..U+FFFD or
     * U+10000..U+10FFFF. The surrogate block and U+FFFE/U+FFFF are rejected,
     * as is every value outside the Unicode range (including negatives).
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return <code>true</code> if the code point may appear in an XML
     *         document, <code>false</code> otherwise.
     */
    static public boolean isChar (int ucs4char)
    {
        // Below the space character only three control characters are legal.
        if (ucs4char < 0x0020) {
            return ucs4char == 0x0009
                || ucs4char == 0x000A
                || ucs4char == 0x000D;
        }
        // [#x0020-#xD7FF]
        if (ucs4char <= 0xD7FF) {
            return true;
        }
        // Surrogate block #xD800-#xDFFF is excluded.
        if (ucs4char < 0xE000) {
            return false;
        }
        // [#xE000-#xFFFD]
        if (ucs4char <= 0xFFFD) {
            return true;
        }
        // #xFFFE and #xFFFF fall through to false; then [#x10000-#x10FFFF].
        return ucs4char >= 0x10000 && ucs4char <= 0x10ffff;
    }
}
Returns true if the character is a non-initial character in names according to the XML recommendation
/*
* $Id: XmlChars.java,v 1.1 2004/08/19 05:30:22 aslom Exp $
*
* The Apache Software License, Version 1.1
*
*
* Copyright (c) 2000 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Crimson" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
* http://www.sun.com. For more information on the Apache Software
* Foundation, please see <http://www.apache.org/>.
*/
/**
* Methods in this class are used to determine whether characters may appear in
* certain roles in XML documents. Such methods are used both to parse and to
* create such documents.
*
* @version 1.8
* @author David Brownell
*/
public class XmlChars {
    /**
     * Returns true if the character is allowed to be a non-initial character in
     * names according to the XML recommendation.
     *
     * @param c the character to test
     * @return true if <code>c</code> may appear after the first character of an XML Name
     * @see #isNCNameChar
     * @see #isLetter
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender
        if (isLetter2(c)) {
            return true;
        } else if (c == '>') {
            // Fast rejection of the character that usually ends a name in markup.
            return false;
        } else {
            return c == '.' || c == '-' || c == '_' || c == ':' || isExtender(c);
        }
    }

    /**
     * Returns true if the character is allowed to be a non-initial character in
     * unscoped names according to the rules of the XML Namespaces proposed
     * recommendation. Except for precluding the colon (used to separate names
     * from their scopes) these characters are just as allowed by the XML
     * recommendation.
     *
     * @param c the character to test
     * @return true if <code>c</code> is a name character other than the colon
     * @see #isNameChar
     * @see #isLetter
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar(c);
    }

    /**
     * Returns true if the character is allowed where XML supports whitespace
     * characters, false otherwise.
     *
     * @param c the character to test
     * @return true for space, tab, line feed and carriage return only
     */
    public static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }

    /*
     * NOTE: java.lang.Character.getType() values are:
     *
     *   UNASSIGNED = 0,
     *   UPPERCASE_LETTER = 1 (Lu), LOWERCASE_LETTER = 2 (Ll),
     *   TITLECASE_LETTER = 3 (Lt), MODIFIER_LETTER = 4 (Lm),
     *   OTHER_LETTER = 5 (Lo), NON_SPACING_MARK = 6 (Mn),
     *   ENCLOSING_MARK = 7 (Me), COMBINING_SPACING_MARK = 8 (Mc),
     *   DECIMAL_DIGIT_NUMBER = 9 (Nd), LETTER_NUMBER = 10 (Nl),
     *   OTHER_NUMBER = 11 (No), SPACE_SEPARATOR = 12 (Zs),
     *   LINE_SEPARATOR = 13 (Zl), PARAGRAPH_SEPARATOR = 14 (Zp),
     *   CONTROL = 15 (Cc), FORMAT = 16 (Cf),
     *   (17 reserved for proposed Ci category),
     *   PRIVATE_USE = 18 (Co), SURROGATE = 19 (Cs),
     *   DASH_PUNCTUATION = 20 (Pd), START_PUNCTUATION = 21 (Ps),
     *   END_PUNCTUATION = 22 (Pe), CONNECTOR_PUNCTUATION = 23 (Pc),
     *   OTHER_PUNCTUATION = 24 (Po), MATH_SYMBOL = 25 (Sm),
     *   CURRENCY_SYMBOL = 26 (Sc), MODIFIER_SYMBOL = 27 (Sk),
     *   OTHER_SYMBOL = 28 (So)
     */

    /**
     * Returns true if the character is an XML "letter". XML Names must start with
     * Letters or a few other characters, but other characters in names must only
     * satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character to test
     * @return true if <code>c</code> is treated as an XML Letter
     * @see #isNameChar
     * @see #isNCNameChar
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '/') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }
        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
        // app. B footnote says these are "name start chars" ...
        case Character.LOWERCASE_LETTER: // Ll
        case Character.UPPERCASE_LETTER: // Lu
        case Character.OTHER_LETTER: // Lo
        case Character.TITLECASE_LETTER: // Lt
        case Character.LETTER_NUMBER: // Nl
            // OK, here we just have some exceptions to check...
            return !isCompatibilityChar(c)
            // per "5.14 of Unicode", rule out some combiners
                    && !(c >= 0x20dd && c <= 0x20e0);
        default:
            // check for some exceptions: these are "alphabetic"
            return ((c >= 0x02bb && c <= 0x02c1) || c == 0x0559 || c == 0x06e5 || c == 0x06e6);
        }
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets. These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    private static boolean isCompatibilityChar(char c) {
        // the numerous comparisons here seem unavoidable,
        // but the switch (on the high byte of the code unit) can reduce
        // the number which must actually be executed.
        switch ((c >> 8) & 0x0ff) {
        case 0x00:
            // ISO Latin/1 has a few compatibility characters
            return c == 0x00aa || c == 0x00b5 || c == 0x00ba;
        case 0x01:
            // as do Latin Extended A and (parts of) B
            return (c >= 0x0132 && c <= 0x0133) || (c >= 0x013f && c <= 0x0140) || c == 0x0149
                    || c == 0x017f || (c >= 0x01c4 && c <= 0x01cc) || (c >= 0x01f1 && c <= 0x01f3);
        case 0x02:
            // some spacing modifiers
            return (c >= 0x02b0 && c <= 0x02b8) || (c >= 0x02e0 && c <= 0x02e4);
        case 0x03:
            return c == 0x037a; // Greek
        case 0x05:
            return c == 0x0587; // Armenian
        case 0x0e:
            return c >= 0x0edc && c <= 0x0edd; // Laotian
        case 0x11:
            // big chunks of Hangul Jamo are all "compatibility"
            return c == 0x1101 || c == 0x1104 || c == 0x1108 || c == 0x110a || c == 0x110d
                    || (c >= 0x1113 && c <= 0x113b) || c == 0x113d || c == 0x113f
                    || (c >= 0x1141 && c <= 0x114b) || c == 0x114d || c == 0x114f
                    || (c >= 0x1151 && c <= 0x1153) || (c >= 0x1156 && c <= 0x1158) || c == 0x1162
                    || c == 0x1164 || c == 0x1166 || c == 0x1168 || (c >= 0x116a && c <= 0x116c)
                    || (c >= 0x116f && c <= 0x1171) || c == 0x1174 || (c >= 0x1176 && c <= 0x119d)
                    || (c >= 0x119f && c <= 0x11a2) || (c >= 0x11a9 && c <= 0x11aa)
                    || (c >= 0x11ac && c <= 0x11ad) || (c >= 0x11b0 && c <= 0x11b6) || c == 0x11b9
                    || c == 0x11bb || (c >= 0x11c3 && c <= 0x11ea) || (c >= 0x11ec && c <= 0x11ef)
                    || (c >= 0x11f1 && c <= 0x11f8);
        case 0x20:
            return c == 0x207f; // superscript
        case 0x21:
            return
            // various letterlike symbols
            c == 0x2102 || c == 0x2107 || (c >= 0x210a && c <= 0x2113) || c == 0x2115
                    || (c >= 0x2118 && c <= 0x211d) || c == 0x2124 || c == 0x2128
                    || (c >= 0x212c && c <= 0x212d) || (c >= 0x212f && c <= 0x2138)
                    // most Roman numerals (less 1K, 5K, 10K)
                    || (c >= 0x2160 && c <= 0x217f);
        case 0x30:
            // some Hiragana
            return c >= 0x309b && c <= 0x309c;
        case 0x31:
            // all Hangul Compatibility Jamo
            return c >= 0x3131 && c <= 0x318e;
        case 0xf9:
        case 0xfa:
        case 0xfb:
        case 0xfc:
        case 0xfd:
        case 0xfe:
        case 0xff:
            // the whole "compatibility" area is for that purpose!
            return true;
        default:
            // most of Unicode isn't flagged as being for compatibility
            return false;
        }
    }

    // guts of isNameChar/isNCNameChar
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat
        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '>') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }
        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
        // app. B footnote says these are "name start chars" ...
        case Character.LOWERCASE_LETTER: // Ll
        case Character.UPPERCASE_LETTER: // Lu
        case Character.OTHER_LETTER: // Lo
        case Character.TITLECASE_LETTER: // Lt
        case Character.LETTER_NUMBER: // Nl
            // ... and these are name characters "other
            // than name start characters"
        case Character.COMBINING_SPACING_MARK: // Mc
        case Character.ENCLOSING_MARK: // Me
        case Character.NON_SPACING_MARK: // Mn
        case Character.MODIFIER_LETTER: // Lm
        case Character.DECIMAL_DIGIT_NUMBER: // Nd
            // OK, here we just have some exceptions to check...
            return !isCompatibilityChar(c)
            // per "5.14 of Unicode", rule out some combiners
                    && !(c >= 0x20dd && c <= 0x20e0);
        default:
            // added a character ...
            return c == 0x0387;
        }
    }

    // Not called within this class; digits reach isNameChar through the
    // DECIMAL_DIGIT_NUMBER case of isLetter2.
    private static boolean isDigit(char c) {
        // [88] Digit ::= ...
        //
        // java.lang.Character.isDigit is correct from the XML point
        // of view except that it allows "fullwidth" digits.
        //
        return Character.isDigit(c) && !((c >= 0xff10) && (c <= 0xff19));
    }

    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387 || c == 0x0640 || c == 0x0e46
                || c == 0x0ec6 || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
                || (c >= 0x309d && c <= 0x309e) || (c >= 0x30fc && c <= 0x30fe);
    }
}
Returns true if the character is an XML "letter"
/*
* $Id: XmlChars.java,v 1.1 2004/08/19 05:30:22 aslom Exp $
*
* The Apache Software License, Version 1.1
*
*
* Copyright (c) 2000 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Crimson" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
* http://www.sun.com. For more information on the Apache Software
* Foundation, please see <http://www.apache.org/>.
*/
/**
* Methods in this class are used to determine whether characters may appear in
* certain roles in XML documents. Such methods are used both to parse and to
* create such documents.
*
* @version 1.8
* @author David Brownell
*/
public class XmlChars {
/**
* Returns true if the character is allowed to be a non-initial character in
* names according to the XML recommendation.
*
* @see #isNCNameChar
* @see #isLetter
*/
public static boolean isNameChar(char c) {
// [4] NameChar ::= Letter | Digit | "." | "_" | ":"
// | CombiningChar | Extender
if (isLetter2(c))
return true;
else if (c == ">")
return false;
else if (c == "." || c == "-" || c == "_" || c == ":" || isExtender(c))
return true;
else
return false;
}
/**
* Returns true if the character is allowed to be a non-initial character in
* unscoped names according to the rules of the XML Namespaces proposed
* recommendation. Except for precluding the colon (used to separate names
* from their scopes) these characters are just as allowed by the XML
* recommendation.
*
* @see #isNameChar
* @see #isLetter
*/
public static boolean isNCNameChar(char c) {
// [NC 5] NCNameChar ::= Letter | Digit | "." | "_"
// | CombiningChar | Extender
return c != ":" && isNameChar(c);
}
/**
* Returns true if the character is allowed where XML supports whitespace
* characters, false otherwise.
*/
public static boolean isSpace(char c) {
return c == " " || c == "\t" || c == "\n" || c == "\r";
}
/*
* NOTE: java.lang.Character.getType() values are:
*
* UNASSIGNED = 0,
*
* UPPERCASE_LETTER = 1, // Lu LOWERCASE_LETTER = 2, // Ll TITLECASE_LETTER =
* 3, // Lt MODIFIER_LETTER = 4, // Lm OTHER_LETTER = 5, // Lo
* NON_SPACING_MARK = 6, // Mn ENCLOSING_MARK = 7, // Me
* COMBINING_SPACING_MARK = 8, // Mc DECIMAL_DIGIT_NUMBER = 9, // Nd
* LETTER_NUMBER = 10, // Nl OTHER_NUMBER = 11, // No SPACE_SEPARATOR = 12, //
* Zs LINE_SEPARATOR = 13, // Zl PARAGRAPH_SEPARATOR = 14, // Zp CONTROL = 15, //
* Cc FORMAT = 16, // Cf // 17 reserved for proposed Ci category PRIVATE_USE =
* 18, // Co SURROGATE = 19, // Cs DASH_PUNCTUATION = 20, // Pd
* START_PUNCTUATION = 21, // Ps END_PUNCTUATION = 22, // Pe
* CONNECTOR_PUNCTUATION = 23, // Pc OTHER_PUNCTUATION = 24, // Po MATH_SYMBOL =
* 25, // Sm CURRENCY_SYMBOL = 26, // Sc MODIFIER_SYMBOL = 27, // Sk
* OTHER_SYMBOL = 28; // So
*/
/**
* Returns true if the character is an XML "letter". XML Names must start with
* Letters or a few other characters, but other characters in names must only
* satisfy the <em>isNameChar</em> predicate.
*
* @see #isNameChar
* @see #isNCNameChar
*/
public static boolean isLetter(char c) {
// [84] Letter ::= BaseChar | Ideographic
// [85] BaseChar ::= ... too much to repeat
// [86] Ideographic ::= ... too much to repeat
//
// Optimize the typical case.
//
if (c >= "a" && c <= "z")
return true;
if (c == "/")
return false;
if (c >= "A" && c <= "Z")
return true;
//
// Since the tables are too ridiculous to use in code,
// we"re using the footnotes here to drive this test.
//
switch (Character.getType(c)) {
// app. B footnote says these are "name start"
// chars" ...
case Character.LOWERCASE_LETTER: // Ll
case Character.UPPERCASE_LETTER: // Lu
case Character.OTHER_LETTER: // Lo
case Character.TITLECASE_LETTER: // Lt
case Character.LETTER_NUMBER: // Nl
// OK, here we just have some exceptions to check...
return !isCompatibilityChar(c)
// per "5.14 of Unicode", rule out some combiners
&& !(c >= 0x20dd && c <= 0x20e0);
default:
// check for some exceptions: these are "alphabetic"
return ((c >= 0x02bb && c <= 0x02c1) || c == 0x0559 || c == 0x06e5 || c == 0x06e6);
}
}
//
// XML 1.0 discourages "compatibility" characters in names; these
// were defined to permit passing through some information stored in
// older non-Unicode character sets. These always have alternative
// representations in Unicode, e.g. using combining chars.
//
private static boolean isCompatibilityChar(char c) {
// the numerous comparisions here seem unavoidable,
// but the switch can reduce the number which must
// actually be executed.
switch ((c >> 8) & 0x0ff) {
case 0x00:
// ISO Latin/1 has a few compatibility characters
return c == 0x00aa || c == 0x00b5 || c == 0x00ba;
case 0x01:
// as do Latin Extended A and (parts of) B
return (c >= 0x0132 && c <= 0x0133) || (c >= 0x013f && c <= 0x0140) || c == 0x0149
|| c == 0x017f || (c >= 0x01c4 && c <= 0x01cc) || (c >= 0x01f1 && c <= 0x01f3);
case 0x02:
// some spacing modifiers
return (c >= 0x02b0 && c <= 0x02b8) || (c >= 0x02e0 && c <= 0x02e4);
case 0x03:
return c == 0x037a; // Greek
case 0x05:
return c == 0x0587; // Armenian
case 0x0e:
return c >= 0x0edc && c <= 0x0edd; // Laotian
case 0x11:
// big chunks of Hangul Jamo are all "compatibility"
return c == 0x1101 || c == 0x1104 || c == 0x1108 || c == 0x110a || c == 0x110d
|| (c >= 0x1113 && c <= 0x113b) || c == 0x113d || c == 0x113f
|| (c >= 0x1141 && c <= 0x114b) || c == 0x114d || c == 0x114f
|| (c >= 0x1151 && c <= 0x1153) || (c >= 0x1156 && c <= 0x1158) || c == 0x1162
|| c == 0x1164 || c == 0x1166 || c == 0x1168 || (c >= 0x116a && c <= 0x116c)
|| (c >= 0x116f && c <= 0x1171) || c == 0x1174 || (c >= 0x1176 && c <= 0x119d)
|| (c >= 0x119f && c <= 0x11a2) || (c >= 0x11a9 && c <= 0x11aa)
|| (c >= 0x11ac && c <= 0x11ad) || (c >= 0x11b0 && c <= 0x11b6) || c == 0x11b9
|| c == 0x11bb || (c >= 0x11c3 && c <= 0x11ea) || (c >= 0x11ec && c <= 0x11ef)
|| (c >= 0x11f1 && c <= 0x11f8);
case 0x20:
return c == 0x207f; // superscript
case 0x21:
return
// various letterlike symbols
c == 0x2102 || c == 0x2107 || (c >= 0x210a && c <= 0x2113) || c == 0x2115
|| (c >= 0x2118 && c <= 0x211d) || c == 0x2124 || c == 0x2128
|| (c >= 0x212c && c <= 0x212d) || (c >= 0x212f && c <= 0x2138)
// most Roman numerals (less 1K, 5K, 10K)
|| (c >= 0x2160 && c <= 0x217f);
case 0x30:
// some Hiragana
return c >= 0x309b && c <= 0x309c;
case 0x31:
// all Hangul Compatibility Jamo
return c >= 0x3131 && c <= 0x318e;
case 0xf9:
case 0xfa:
case 0xfb:
case 0xfc:
case 0xfd:
case 0xfe:
case 0xff:
// the whole "compatibility" area is for that purpose!
return true;
default:
// most of Unicode isn"t flagged as being for compatibility
return false;
}
}
// guts of isNameChar/isNCNameChar
/**
 * Classifies a character for the name-character checks using its Unicode
 * general category instead of the full XML 1.0 appendix B tables.
 *
 * @param c the character to classify
 * @return true for ASCII letters, for characters whose
 *         {@link Character#getType(char)} is one of the letter, mark or
 *         decimal-digit categories listed below (unless they are
 *         compatibility characters or fall in 0x20dd..0x20e0), and for
 *         0x0387; false otherwise
 */
private static boolean isLetter2(char c) {
    // [84] Letter ::= BaseChar | Ideographic
    // [85] BaseChar ::= ... too much to repeat
    // [86] Ideographic ::= ... too much to repeat
    // [87] CombiningChar ::= ... too much to repeat
    //
    // Optimize the typical case.
    //
    if (c >= 'a' && c <= 'z')
        return true;
    if (c == '>')
        return false;
    if (c >= 'A' && c <= 'Z')
        return true;
    //
    // Since the tables are too ridiculous to use in code,
    // we're using the footnotes here to drive this test.
    //
    switch (Character.getType(c)) {
    // app. B footnote says these are "name start"
    // chars ...
    case Character.LOWERCASE_LETTER: // Ll
    case Character.UPPERCASE_LETTER: // Lu
    case Character.OTHER_LETTER: // Lo
    case Character.TITLECASE_LETTER: // Lt
    case Character.LETTER_NUMBER: // Nl
    // ... and these are name characters "other
    // than name start characters"
    case Character.COMBINING_SPACING_MARK: // Mc
    case Character.ENCLOSING_MARK: // Me
    case Character.NON_SPACING_MARK: // Mn
    case Character.MODIFIER_LETTER: // Lm
    case Character.DECIMAL_DIGIT_NUMBER: // Nd
        // OK, here we just have some exceptions to check...
        return !isCompatibilityChar(c)
                // per "5.14 of Unicode", rule out some combiners
                && !(c >= 0x20dd && c <= 0x20e0);
    default:
        // added a character ...
        return c == 0x0387;
    }
}
/**
 * Tests for production [88] Digit.
 *
 * @param c the character to test
 * @return true when {@link Character#isDigit(char)} accepts {@code c} and
 *         {@code c} is not one of the "fullwidth" digits 0xff10..0xff19
 */
private static boolean isDigit(char c) {
    // [88] Digit ::= ...
    //
    // The fullwidth forms are digits as far as java.lang.Character is
    // concerned, but they are excluded here, so reject them up front.
    //
    final boolean fullwidth = c >= 0xff10 && c <= 0xff19;
    if (fullwidth) {
        return false;
    }
    return Character.isDigit(c);
}
/**
 * Tests for production [89] Extender.
 *
 * @param c the character to test
 * @return true when {@code c} is one of the individually listed extender
 *         code points or lies in one of the three extender ranges
 */
private static boolean isExtender(char c) {
    // [89] Extender ::= ...
    switch (c) {
    // individually listed code points
    case 0x00b7:
    case 0x02d0:
    case 0x02d1:
    case 0x0387:
    case 0x0640:
    case 0x0e46:
    case 0x0ec6:
    case 0x3005:
        return true;
    default:
        break;
    }
    // contiguous ranges
    if (c >= 0x3031 && c <= 0x3035) {
        return true;
    }
    if (c >= 0x309d && c <= 0x309e) {
        return true;
    }
    return c >= 0x30fc && c <= 0x30fe;
}
}
Verify whether the specified character conforms to the XML 1.0 definition of whitespace
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Id: XMLCharacterRecognizer.java 468655 2006-10-28 07:12:06Z minchau $
*/
/**
* Class used to verify whether the specified <var>ch</var>
* conforms to the XML 1.0 definition of whitespace.
* @xsl.usage internal
*/
public class XMLCharacterRecognizer
{
  /**
   * Tells whether a single character is one of the four XML 1.0
   * whitespace characters: space, tab, carriage return or line feed.
   * @param ch Character to check as XML whitespace.
   * @return true if <var>ch</var> is XML whitespace; otherwise false.
   */
  public static boolean isWhiteSpace(char ch)
  {
    switch (ch)
    {
      case 0x20: // space
      case 0x09: // tab
      case 0xD:  // carriage return
      case 0xA:  // line feed
        return true;
      default:
        return false;
    }
  }
  /**
   * Tells whether a slice of a character array holds only XML whitespace.
   *
   * @param ch Character array to check as XML whitespace.
   * @param start Start index of characters in the array
   * @param length Number of characters in the array
   * @return True if every character in the slice is XML whitespace
   *         (an empty slice counts as whitespace); otherwise, false.
   */
  public static boolean isWhiteSpace(char ch[], int start, int length)
  {
    final int limit = start + length;
    int pos = start;
    while (pos < limit)
    {
      if (!isWhiteSpace(ch[pos]))
      {
        return false;
      }
      pos++;
    }
    return true;
  }
  /**
   * Tells whether a buffer holds only XML whitespace.
   *
   * @param buf StringBuffer to check as XML whitespace.
   * @return True if all characters in the buffer are XML whitespace
   *         (an empty buffer counts as whitespace), false otherwise
   */
  public static boolean isWhiteSpace(StringBuffer buf)
  {
    final int count = buf.length();
    int pos = 0;
    while (pos < count)
    {
      if (!isWhiteSpace(buf.charAt(pos)))
      {
        return false;
      }
      pos++;
    }
    return true;
  }
  /**
   * Tells whether a string holds only XML whitespace.
   *
   * @param s String to check as XML whitespace; may be null.
   * @return True if <var>s</var> is null, empty, or made up entirely of
   *         XML whitespace; false otherwise
   */
  public static boolean isWhiteSpace(String s)
  {
    // A null string has no non-whitespace characters, so it passes.
    if (s == null)
    {
      return true;
    }
    final int count = s.length();
    int pos = 0;
    while (pos < count)
    {
      if (!isWhiteSpace(s.charAt(pos)))
      {
        return false;
      }
      pos++;
    }
    return true;
  }
}
XML character properties
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Id$
*/
/**
 * This class defines the basic XML character properties. The data
 * in this class can be used to verify that a character is a valid
 * XML character or if the character is a space, name start, or name
 * character.
 * <p>
 * A series of convenience methods are supplied to ease the burden
 * of the developer. The mask constants are public, but the
 * <code>CHARS</code> table itself is private in this version, so the
 * convenience methods are the only way to query it. All of them
 * return false (rather than throwing) for negative code points.
 *
 * @author Glenn Marcy, IBM
 * @author Andy Clark, IBM
 * @author Eric Ye, IBM
 * @author Arnaud Le Hors, IBM
 * @author Rahul Srivastava, Sun Microsystems Inc.
 *
 * @version $Id: XMLChar.java,v 1.7 2002/01/29 01:15:18 lehors Exp $
 */
public class XMLChar {
    //
    // Constants
    //
    /** Character flags, one byte of mask bits per UTF-16 code unit. */
    private static final byte[] CHARS = new byte[1 << 16];
    /** Valid character mask. */
    public static final int MASK_VALID = 0x01;
    /** Space character mask. */
    public static final int MASK_SPACE = 0x02;
    /** Name start character mask. */
    public static final int MASK_NAME_START = 0x04;
    /** Name character mask. */
    public static final int MASK_NAME = 0x08;
    /** Pubid character mask. */
    public static final int MASK_PUBID = 0x10;
    /**
     * Content character mask. Special characters are those that can
     * be considered the start of markup, such as '&lt;' and '&amp;'.
     * The various newline characters are considered special as well.
     * All other valid XML characters can be considered content.
     * <p>
     * This is an optimization for the inner loop of character scanning.
     */
    public static final int MASK_CONTENT = 0x20;
    /** NCName start character mask. */
    public static final int MASK_NCNAME_START = 0x40;
    /** NCName character mask. */
    public static final int MASK_NCNAME = 0x80;
    //
    // Static initialization
    //
    static {
        //
        // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] |
        //              [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        //
        int[] charRange = {
            0x0009, 0x000A, 0x000D, 0x000D, 0x0020, 0xD7FF, 0xE000, 0xFFFD,
        };
        //
        // [3] S ::= (#x20 | #x9 | #xD | #xA)+
        //
        int[] spaceChar = {
            0x0020, 0x0009, 0x000D, 0x000A,
        };
        //
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
        //                  CombiningChar | Extender
        //
        int[] nameChar = {
            0x002D, 0x002E, // '-' and '.'
        };
        //
        // [5] Name ::= (Letter | '_' | ':') (NameChar)*
        //
        int[] nameStartChar = {
            0x003A, 0x005F, // ':' and '_'
        };
        //
        // [13] PubidChar ::= #x20 | 0xD | 0xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
        //
        int[] pubidChar = {
            0x000A, 0x000D, 0x0020, 0x0021, 0x0023, 0x0024, 0x0025, 0x003D,
            0x005F
        };
        int[] pubidRange = {
            0x0027, 0x003B, 0x003F, 0x005A, 0x0061, 0x007A
        };
        //
        // [84] Letter ::= BaseChar | Ideographic
        //
        int[] letterRange = {
            // BaseChar
            0x0041, 0x005A, 0x0061, 0x007A, 0x00C0, 0x00D6, 0x00D8, 0x00F6,
            0x00F8, 0x0131, 0x0134, 0x013E, 0x0141, 0x0148, 0x014A, 0x017E,
            0x0180, 0x01C3, 0x01CD, 0x01F0, 0x01F4, 0x01F5, 0x01FA, 0x0217,
            0x0250, 0x02A8, 0x02BB, 0x02C1, 0x0388, 0x038A, 0x038E, 0x03A1,
            0x03A3, 0x03CE, 0x03D0, 0x03D6, 0x03E2, 0x03F3, 0x0401, 0x040C,
            0x040E, 0x044F, 0x0451, 0x045C, 0x045E, 0x0481, 0x0490, 0x04C4,
            0x04C7, 0x04C8, 0x04CB, 0x04CC, 0x04D0, 0x04EB, 0x04EE, 0x04F5,
            0x04F8, 0x04F9, 0x0531, 0x0556, 0x0561, 0x0586, 0x05D0, 0x05EA,
            0x05F0, 0x05F2, 0x0621, 0x063A, 0x0641, 0x064A, 0x0671, 0x06B7,
            0x06BA, 0x06BE, 0x06C0, 0x06CE, 0x06D0, 0x06D3, 0x06E5, 0x06E6,
            0x0905, 0x0939, 0x0958, 0x0961, 0x0985, 0x098C, 0x098F, 0x0990,
            0x0993, 0x09A8, 0x09AA, 0x09B0, 0x09B6, 0x09B9, 0x09DC, 0x09DD,
            0x09DF, 0x09E1, 0x09F0, 0x09F1, 0x0A05, 0x0A0A, 0x0A0F, 0x0A10,
            0x0A13, 0x0A28, 0x0A2A, 0x0A30, 0x0A32, 0x0A33, 0x0A35, 0x0A36,
            0x0A38, 0x0A39, 0x0A59, 0x0A5C, 0x0A72, 0x0A74, 0x0A85, 0x0A8B,
            0x0A8F, 0x0A91, 0x0A93, 0x0AA8, 0x0AAA, 0x0AB0, 0x0AB2, 0x0AB3,
            0x0AB5, 0x0AB9, 0x0B05, 0x0B0C, 0x0B0F, 0x0B10, 0x0B13, 0x0B28,
            0x0B2A, 0x0B30, 0x0B32, 0x0B33, 0x0B36, 0x0B39, 0x0B5C, 0x0B5D,
            0x0B5F, 0x0B61, 0x0B85, 0x0B8A, 0x0B8E, 0x0B90, 0x0B92, 0x0B95,
            0x0B99, 0x0B9A, 0x0B9E, 0x0B9F, 0x0BA3, 0x0BA4, 0x0BA8, 0x0BAA,
            0x0BAE, 0x0BB5, 0x0BB7, 0x0BB9, 0x0C05, 0x0C0C, 0x0C0E, 0x0C10,
            0x0C12, 0x0C28, 0x0C2A, 0x0C33, 0x0C35, 0x0C39, 0x0C60, 0x0C61,
            0x0C85, 0x0C8C, 0x0C8E, 0x0C90, 0x0C92, 0x0CA8, 0x0CAA, 0x0CB3,
            0x0CB5, 0x0CB9, 0x0CE0, 0x0CE1, 0x0D05, 0x0D0C, 0x0D0E, 0x0D10,
            0x0D12, 0x0D28, 0x0D2A, 0x0D39, 0x0D60, 0x0D61, 0x0E01, 0x0E2E,
            0x0E32, 0x0E33, 0x0E40, 0x0E45, 0x0E81, 0x0E82, 0x0E87, 0x0E88,
            0x0E94, 0x0E97, 0x0E99, 0x0E9F, 0x0EA1, 0x0EA3, 0x0EAA, 0x0EAB,
            0x0EAD, 0x0EAE, 0x0EB2, 0x0EB3, 0x0EC0, 0x0EC4, 0x0F40, 0x0F47,
            0x0F49, 0x0F69, 0x10A0, 0x10C5, 0x10D0, 0x10F6, 0x1102, 0x1103,
            0x1105, 0x1107, 0x110B, 0x110C, 0x110E, 0x1112, 0x1154, 0x1155,
            0x115F, 0x1161, 0x116D, 0x116E, 0x1172, 0x1173, 0x11AE, 0x11AF,
            0x11B7, 0x11B8, 0x11BC, 0x11C2, 0x1E00, 0x1E9B, 0x1EA0, 0x1EF9,
            0x1F00, 0x1F15, 0x1F18, 0x1F1D, 0x1F20, 0x1F45, 0x1F48, 0x1F4D,
            0x1F50, 0x1F57, 0x1F5F, 0x1F7D, 0x1F80, 0x1FB4, 0x1FB6, 0x1FBC,
            0x1FC2, 0x1FC4, 0x1FC6, 0x1FCC, 0x1FD0, 0x1FD3, 0x1FD6, 0x1FDB,
            0x1FE0, 0x1FEC, 0x1FF2, 0x1FF4, 0x1FF6, 0x1FFC, 0x212A, 0x212B,
            0x2180, 0x2182, 0x3041, 0x3094, 0x30A1, 0x30FA, 0x3105, 0x312C,
            0xAC00, 0xD7A3,
            // Ideographic
            0x3021, 0x3029, 0x4E00, 0x9FA5,
        };
        int[] letterChar = {
            // BaseChar
            0x0386, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0559, 0x06D5,
            0x093D, 0x09B2, 0x0A5E, 0x0A8D, 0x0ABD, 0x0AE0, 0x0B3D, 0x0B9C,
            0x0CDE, 0x0E30, 0x0E84, 0x0E8A, 0x0E8D, 0x0EA5, 0x0EA7, 0x0EB0,
            0x0EBD, 0x1100, 0x1109, 0x113C, 0x113E, 0x1140, 0x114C, 0x114E,
            0x1150, 0x1159, 0x1163, 0x1165, 0x1167, 0x1169, 0x1175, 0x119E,
            0x11A8, 0x11AB, 0x11BA, 0x11EB, 0x11F0, 0x11F9, 0x1F59, 0x1F5B,
            0x1F5D, 0x1FBE, 0x2126, 0x212E,
            // Ideographic
            0x3007,
        };
        //
        // [87] CombiningChar ::= ...
        //
        int[] combiningCharRange = {
            0x0300, 0x0345, 0x0360, 0x0361, 0x0483, 0x0486, 0x0591, 0x05A1,
            0x05A3, 0x05B9, 0x05BB, 0x05BD, 0x05C1, 0x05C2, 0x064B, 0x0652,
            0x06D6, 0x06DC, 0x06DD, 0x06DF, 0x06E0, 0x06E4, 0x06E7, 0x06E8,
            0x06EA, 0x06ED, 0x0901, 0x0903, 0x093E, 0x094C, 0x0951, 0x0954,
            0x0962, 0x0963, 0x0981, 0x0983, 0x09C0, 0x09C4, 0x09C7, 0x09C8,
            0x09CB, 0x09CD, 0x09E2, 0x09E3, 0x0A40, 0x0A42, 0x0A47, 0x0A48,
            0x0A4B, 0x0A4D, 0x0A70, 0x0A71, 0x0A81, 0x0A83, 0x0ABE, 0x0AC5,
            0x0AC7, 0x0AC9, 0x0ACB, 0x0ACD, 0x0B01, 0x0B03, 0x0B3E, 0x0B43,
            0x0B47, 0x0B48, 0x0B4B, 0x0B4D, 0x0B56, 0x0B57, 0x0B82, 0x0B83,
            0x0BBE, 0x0BC2, 0x0BC6, 0x0BC8, 0x0BCA, 0x0BCD, 0x0C01, 0x0C03,
            0x0C3E, 0x0C44, 0x0C46, 0x0C48, 0x0C4A, 0x0C4D, 0x0C55, 0x0C56,
            0x0C82, 0x0C83, 0x0CBE, 0x0CC4, 0x0CC6, 0x0CC8, 0x0CCA, 0x0CCD,
            0x0CD5, 0x0CD6, 0x0D02, 0x0D03, 0x0D3E, 0x0D43, 0x0D46, 0x0D48,
            0x0D4A, 0x0D4D, 0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB4, 0x0EB9,
            0x0EBB, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19, 0x0F71, 0x0F84,
            0x0F86, 0x0F8B, 0x0F90, 0x0F95, 0x0F99, 0x0FAD, 0x0FB1, 0x0FB7,
            0x20D0, 0x20DC, 0x302A, 0x302F,
        };
        int[] combiningCharChar = {
            0x05BF, 0x05C4, 0x0670, 0x093C, 0x094D, 0x09BC, 0x09BE, 0x09BF,
            0x09D7, 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, 0x0ABC, 0x0B3C, 0x0BD7,
            0x0D57, 0x0E31, 0x0EB1, 0x0F35, 0x0F37, 0x0F39, 0x0F3E, 0x0F3F,
            0x0F97, 0x0FB9, 0x20E1, 0x3099, 0x309A,
        };
        //
        // [88] Digit ::= ...
        //
        int[] digitRange = {
            0x0030, 0x0039, 0x0660, 0x0669, 0x06F0, 0x06F9, 0x0966, 0x096F,
            0x09E6, 0x09EF, 0x0A66, 0x0A6F, 0x0AE6, 0x0AEF, 0x0B66, 0x0B6F,
            0x0BE7, 0x0BEF, 0x0C66, 0x0C6F, 0x0CE6, 0x0CEF, 0x0D66, 0x0D6F,
            0x0E50, 0x0E59, 0x0ED0, 0x0ED9, 0x0F20, 0x0F29,
        };
        //
        // [89] Extender ::= ...
        //
        int[] extenderRange = {
            0x3031, 0x3035, 0x309D, 0x309E, 0x30FC, 0x30FE,
        };
        int[] extenderChar = {
            0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005,
        };
        //
        // SpecialChar ::= '<', '&', '\n', '\r', ']'
        //
        int[] specialChar = {
            '<', '&', '\n', '\r', ']',
        };
        //
        // Initialize
        //
        // set valid characters
        addFlagsToRanges(charRange, MASK_VALID | MASK_CONTENT);
        // remove special characters
        for (int i = 0; i < specialChar.length; i++) {
            CHARS[specialChar[i]] = (byte) (CHARS[specialChar[i]] & ~MASK_CONTENT);
        }
        // set space characters
        addFlags(spaceChar, MASK_SPACE);
        // set name start characters (these are also name characters)
        final int startFlags = MASK_NAME_START | MASK_NAME
                | MASK_NCNAME_START | MASK_NCNAME;
        addFlags(nameStartChar, startFlags);
        addFlagsToRanges(letterRange, startFlags);
        addFlags(letterChar, startFlags);
        // set name characters
        final int nameFlags = MASK_NAME | MASK_NCNAME;
        addFlags(nameChar, nameFlags);
        addFlagsToRanges(digitRange, nameFlags);
        addFlagsToRanges(combiningCharRange, nameFlags);
        addFlags(combiningCharChar, nameFlags);
        addFlagsToRanges(extenderRange, nameFlags);
        addFlags(extenderChar, nameFlags);
        // remove ':' from allowable MASK_NCNAME_START and MASK_NCNAME chars
        CHARS[':'] &= ~(MASK_NCNAME_START | MASK_NCNAME);
        // set Pubid characters
        addFlags(pubidChar, MASK_PUBID);
        addFlagsToRanges(pubidRange, MASK_PUBID);
    } // <clinit>()
    //
    // Private helpers
    //
    /** ORs {@code flags} into the table entry of every listed character. */
    private static void addFlags(int[] chars, int flags) {
        for (int i = 0; i < chars.length; i++) {
            CHARS[chars[i]] |= flags;
        }
    }
    /**
     * ORs {@code flags} into the table entries of every character covered
     * by {@code ranges}, which holds inclusive (first, last) pairs.
     */
    private static void addFlagsToRanges(int[] ranges, int flags) {
        for (int i = 0; i < ranges.length; i += 2) {
            for (int j = ranges[i]; j <= ranges[i + 1]; j++) {
                CHARS[j] |= flags;
            }
        }
    }
    /**
     * Returns true if {@code c} lies inside the table (0..0xFFFF) and has
     * at least one bit of {@code mask} set. Out-of-table values, including
     * negative ones, yield false instead of an index error.
     */
    private static boolean hasFlag(int c, int mask) {
        return c >= 0 && c < 0x10000 && (CHARS[c] & mask) != 0;
    }
    /** Returns true if {@code c} is an ASCII letter, digit, '.', '_' or '-'. */
    private static boolean isEncodingNameChar(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
                || (c >= '0' && c <= '9') || c == '.' || c == '_' || c == '-';
    }
    //
    // Public static methods
    //
    /**
     * Returns true if the specified character is a supplemental character.
     *
     * @param c The character to check.
     */
    public static boolean isSupplemental(int c) {
        return (c >= 0x10000 && c <= 0x10FFFF);
    }
    /**
     * Returns the supplemental character corresponding to the given
     * surrogates.
     *
     * @param h The high surrogate.
     * @param l The low surrogate.
     */
    public static int supplemental(char h, char l) {
        return (h - 0xD800) * 0x400 + (l - 0xDC00) + 0x10000;
    }
    /**
     * Returns the high surrogate of a supplemental character
     *
     * @param c The supplemental character to "split".
     */
    public static char highSurrogate(int c) {
        return (char) (((c - 0x00010000) >> 10) + 0xD800);
    }
    /**
     * Returns the low surrogate of a supplemental character
     *
     * @param c The supplemental character to "split".
     */
    public static char lowSurrogate(int c) {
        return (char) (((c - 0x00010000) & 0x3FF) + 0xDC00);
    }
    /**
     * Returns whether the given character is a high surrogate
     *
     * @param c The character to check.
     */
    public static boolean isHighSurrogate(int c) {
        return (0xD800 <= c && c <= 0xDBFF);
    }
    /**
     * Returns whether the given character is a low surrogate
     *
     * @param c The character to check.
     */
    public static boolean isLowSurrogate(int c) {
        return (0xDC00 <= c && c <= 0xDFFF);
    }
    /**
     * Returns true if the specified character is valid. This method
     * also accepts the supplemental range from 0x10000 to 0x10FFFF.
     *
     * @param c The character to check.
     */
    public static boolean isValid(int c) {
        return hasFlag(c, MASK_VALID) || isSupplemental(c);
    } // isValid(int):boolean
    /**
     * Returns true if the specified character is invalid.
     *
     * @param c The character to check.
     */
    public static boolean isInvalid(int c) {
        return !isValid(c);
    } // isInvalid(int):boolean
    /**
     * Returns true if the specified character can be considered content.
     *
     * @param c The character to check.
     */
    public static boolean isContent(int c) {
        return hasFlag(c, MASK_CONTENT) || isSupplemental(c);
    } // isContent(int):boolean
    /**
     * Returns true if the specified character can be considered markup.
     * Markup characters include '&lt;', '&amp;', and '%'.
     *
     * @param c The character to check.
     */
    public static boolean isMarkup(int c) {
        return c == '<' || c == '&' || c == '%';
    } // isMarkup(int):boolean
    /**
     * Returns true if the specified character is a space character
     * as defined by production [3] in the XML 1.0 specification.
     *
     * @param c The character to check.
     */
    public static boolean isSpace(int c) {
        return hasFlag(c, MASK_SPACE);
    } // isSpace(int):boolean
    /**
     * Returns true if the specified character is a valid name start
     * character as defined by production [5] in the XML 1.0
     * specification.
     *
     * @param c The character to check.
     */
    public static boolean isNameStart(int c) {
        return hasFlag(c, MASK_NAME_START);
    } // isNameStart(int):boolean
    /**
     * Returns true if the specified character is a valid name
     * character as defined by production [4] in the XML 1.0
     * specification.
     *
     * @param c The character to check.
     */
    public static boolean isName(int c) {
        return hasFlag(c, MASK_NAME);
    } // isName(int):boolean
    /**
     * Returns true if the specified character is a valid NCName start
     * character as defined by production [4] in Namespaces in XML
     * recommendation.
     *
     * @param c The character to check.
     */
    public static boolean isNCNameStart(int c) {
        return hasFlag(c, MASK_NCNAME_START);
    } // isNCNameStart(int):boolean
    /**
     * Returns true if the specified character is a valid NCName
     * character as defined by production [5] in Namespaces in XML
     * recommendation.
     *
     * @param c The character to check.
     */
    public static boolean isNCName(int c) {
        return hasFlag(c, MASK_NCNAME);
    } // isNCName(int):boolean
    /**
     * Returns true if the specified character is a valid Pubid
     * character as defined by production [13] in the XML 1.0
     * specification.
     *
     * @param c The character to check.
     */
    public static boolean isPubid(int c) {
        return hasFlag(c, MASK_PUBID);
    } // isPubid(int):boolean
    /*
     * [5] Name ::= (Letter | '_' | ':') (NameChar)*
     */
    /**
     * Check to see if a string is a valid Name according to [5]
     * in the XML 1.0 Recommendation
     *
     * @param name string to check; must not be null
     * @return true if name is a valid Name
     */
    public static boolean isValidName(String name) {
        if (name.length() == 0)
            return false;
        if (!isNameStart(name.charAt(0)))
            return false;
        for (int i = 1; i < name.length(); i++) {
            if (!isName(name.charAt(i))) {
                return false;
            }
        }
        return true;
    } // isValidName(String):boolean
    /*
     * from the namespace rec
     * [4] NCName ::= (Letter | '_') (NCNameChar)*
     */
    /**
     * Check to see if a string is a valid NCName according to [4]
     * from the XML Namespaces 1.0 Recommendation
     *
     * @param ncName string to check; must not be null
     * @return true if name is a valid NCName
     */
    public static boolean isValidNCName(String ncName) {
        if (ncName.length() == 0)
            return false;
        if (!isNCNameStart(ncName.charAt(0)))
            return false;
        for (int i = 1; i < ncName.length(); i++) {
            if (!isNCName(ncName.charAt(i))) {
                return false;
            }
        }
        return true;
    } // isValidNCName(String):boolean
    /*
     * [7] Nmtoken ::= (NameChar)+
     */
    /**
     * Check to see if a string is a valid Nmtoken according to [7]
     * in the XML 1.0 Recommendation
     *
     * @param nmtoken string to check; must not be null
     * @return true if nmtoken is a valid Nmtoken
     */
    public static boolean isValidNmtoken(String nmtoken) {
        if (nmtoken.length() == 0)
            return false;
        for (int i = 0; i < nmtoken.length(); i++) {
            if (!isName(nmtoken.charAt(i))) {
                return false;
            }
        }
        return true;
    } // isValidNmtoken(String):boolean
    // encodings
    /**
     * Returns true if the encoding name is a valid IANA encoding.
     * This method does not verify that there is a decoder available
     * for this encoding, only that the characters are valid for an
     * IANA encoding name: an ASCII letter followed by ASCII letters,
     * digits, '.', '_' or '-'.
     *
     * @param ianaEncoding The IANA encoding name; null yields false.
     */
    public static boolean isValidIANAEncoding(String ianaEncoding) {
        if (ianaEncoding == null || ianaEncoding.length() == 0) {
            return false;
        }
        char first = ianaEncoding.charAt(0);
        if (!((first >= 'A' && first <= 'Z') || (first >= 'a' && first <= 'z'))) {
            return false;
        }
        for (int i = 1; i < ianaEncoding.length(); i++) {
            if (!isEncodingNameChar(ianaEncoding.charAt(i))) {
                return false;
            }
        }
        return true;
    } // isValidIANAEncoding(String):boolean
    /**
     * Returns true if the encoding name is a valid Java encoding.
     * This method does not verify that there is a decoder available
     * for this encoding, only that the characters are valid for a
     * Java encoding name: ASCII letters, digits, '.', '_' or '-'.
     * Unlike IANA names, a Java name may start with a digit.
     *
     * @param javaEncoding The Java encoding name; null yields false.
     */
    public static boolean isValidJavaEncoding(String javaEncoding) {
        if (javaEncoding == null || javaEncoding.length() == 0) {
            return false;
        }
        // Start at index 0: the first character must be checked as well.
        for (int i = 0; i < javaEncoding.length(); i++) {
            if (!isEncodingNameChar(javaEncoding.charAt(i))) {
                return false;
            }
        }
        return true;
    } // isValidJavaEncoding(String):boolean
    /**
     * Simple check to determine if qname is legal. If it returns false
     * then <code>str</code> is illegal; if it returns true then
     * <code>str</code> is legal.
     *
     * @param str the qualified name to check; must not be null
     */
    public static boolean isValidQName(String str) {
        final int colon = str.indexOf(":");
        // a leading or trailing colon leaves an empty prefix or local part
        if (colon == 0 || colon == str.length() - 1) {
            return false;
        }
        if (colon > 0) {
            final String prefix = str.substring(0, colon);
            final String localPart = str.substring(colon + 1);
            return isValidNCName(prefix) && isValidNCName(localPart);
        }
        return isValidNCName(str);
    }
} // class XMLChar
XMLEncoder a bean
import java.beans.XMLEncoder;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import javax.swing.JFrame;
/**
 * Demo: shows a small Swing frame and serializes it to Test.xml with
 * java.beans.XMLEncoder.
 */
public class MainClass {
  /**
   * Creates and shows the frame, then writes it to "Test.xml" in the
   * current working directory.
   *
   * @param args ignored
   */
  public static void main(String args[]) {
    JFrame x = new JFrame("Look at me");
    x.setSize(200, 300);
    x.setVisible(true);
    x.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
    XMLEncoder encoder = null;
    try {
      encoder = new XMLEncoder(new BufferedOutputStream(new FileOutputStream("Test.xml")));
      encoder.writeObject(x);
    } catch (Exception ex) {
      // Report the failure instead of silently ignoring it.
      System.err.println("Could not write Test.xml: " + ex);
    } finally {
      // close() flushes the XML and closes the underlying file stream;
      // doing it here also covers a failure inside writeObject.
      if (encoder != null) {
        encoder.close();
      }
    }
  }
}
Xml Encoding Sniffer
/* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Revised from xml beans
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.Charset;
import com.sun.org.apache.xerces.internal.util.EncodingMap;
/**
 * Determines the encoding of an XML document given either as a byte stream
 * or as a character reader, and hands the input back as a stream or a
 * reader. Each of the two sources is handed out at most once.
 */
public class XmlEncodingSniffer
{
    private String _xmlencoding;
    private String _javaencoding;
    private InputStream _stream;
    private Reader _reader;
    /**
     * Sniffs the given XML stream for encoding information.
     *
     * When an override is given it is used as is (after being mapped from
     * a Java name to an IANA name where such a mapping exists) and the
     * stream is left untouched; otherwise the stream is wrapped in a
     * SniffedXmlInputStream, which supplies the encoding.
     *
     * @param stream           The stream to sniff
     * @param encodingOverride The XML (IANA) name for the overriding encoding,
     *                         or null to sniff
     * @throws IOException
     * @throws UnsupportedEncodingException
     */
    public XmlEncodingSniffer(InputStream stream, String encodingOverride)
        throws IOException, UnsupportedEncodingException
    {
        InputStream source = stream;
        String xmlName = null;
        if (encodingOverride != null)
        {
            String iana = EncodingMap.getJava2IANAMapping(encodingOverride);
            xmlName = (iana != null) ? iana : encodingOverride;
        }
        if (xmlName == null)
        {
            SniffedXmlInputStream sniffed = new SniffedXmlInputStream(source);
            xmlName = sniffed.getXmlEncoding();
            assert(xmlName != null);
            source = sniffed;
        }
        // we allow you to use Java's encoding names in XML even though you're
        // not supposed to.
        String javaName = EncodingMap.getIANA2JavaMapping(xmlName);
        _stream = source;
        _xmlencoding = xmlName;
        _javaencoding = (javaName != null) ? javaName : xmlName;
    }
    /**
     * Sniffs the given XML reader for encoding information.
     *
     * The reader is always wrapped in a SniffedXmlReader. If that wrapper
     * reports no encoding, the default is used instead (mapped from a Java
     * name to an IANA name where such a mapping exists).
     *
     * @param reader          The reader to sniff
     * @param encodingDefault The Java name for the default encoding to apply, UTF-8 if null.
     * @throws IOException
     * @throws UnsupportedEncodingException
     */
    public XmlEncodingSniffer(Reader reader, String encodingDefault)
        throws IOException, UnsupportedEncodingException
    {
        final String fallback = (encodingDefault != null) ? encodingDefault : "UTF-8";
        SniffedXmlReader sniffed = new SniffedXmlReader(reader);
        _reader = sniffed;
        String xmlName = sniffed.getXmlEncoding();
        if (xmlName == null)
        {
            String iana = EncodingMap.getJava2IANAMapping(fallback);
            xmlName = (iana != null) ? iana : fallback;
        }
        // we allow you to use Java's encoding names in XML even though you're
        // not supposed to.
        String javaName = EncodingMap.getIANA2JavaMapping(xmlName);
        _xmlencoding = xmlName;
        _javaencoding = (javaName != null) ? javaName : xmlName;
    }
    /** Returns the XML (IANA) encoding name that was determined. */
    public String getXmlEncoding()
    {
        return _xmlencoding;
    }
    /** Returns the Java encoding name that was determined. */
    public String getJavaEncoding()
    {
        return _javaencoding;
    }
    /**
     * Hands out the input as a byte stream: the held stream if there is
     * one, otherwise the held reader wrapped in a ReaderInputStream using
     * the Java encoding. Whichever source is used is released afterwards.
     *
     * @return the stream, or null if neither source is still held
     * @throws UnsupportedEncodingException
     */
    public InputStream getStream()
        throws UnsupportedEncodingException
    {
        InputStream result = null;
        if (_stream != null)
        {
            result = _stream;
            _stream = null;
        }
        else if (_reader != null)
        {
            result = new ReaderInputStream( _reader, _javaencoding );
            _reader = null;
        }
        return result;
    }
    /**
     * Hands out the input as a character reader: the held reader if there
     * is one, otherwise the held stream wrapped in an InputStreamReader
     * using the Java encoding. Whichever source is used is released
     * afterwards.
     *
     * @return the reader, or null if neither source is still held
     * @throws UnsupportedEncodingException
     */
    public Reader getReader ( )
        throws UnsupportedEncodingException
    {
        Reader result = null;
        if (_reader != null)
        {
            result = _reader;
            _reader = null;
        }
        else if (_stream != null)
        {
            result = new InputStreamReader( _stream, _javaencoding );
            _stream = null;
        }
        return result;
    }
}
/* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * An input stream that produces the bytes obtained by encoding the
 * characters of a Reader. Characters are pulled from the reader on demand
 * and pushed, through an OutputStreamWriter, into the output side of the
 * PushedInputStream base class.
 */
class ReaderInputStream extends PushedInputStream
{
    private Reader reader;
    private Writer writer;
    private char[] charBuf;
    public static int defaultBufferSize = 2048;
    /**
     * Creates a stream using {@link #defaultBufferSize} characters per read.
     *
     * @param reader   source of characters
     * @param encoding name of the encoding used to turn characters into bytes
     * @throws UnsupportedEncodingException if the encoding is not supported
     */
    public ReaderInputStream(Reader reader, String encoding) throws UnsupportedEncodingException
    {
        this(reader, encoding, defaultBufferSize);
    }
    /**
     * @param reader     source of characters
     * @param encoding   name of the encoding used to turn characters into bytes
     * @param bufferSize number of characters requested from the reader per read;
     *                   must be positive
     * @throws UnsupportedEncodingException if the encoding is not supported
     * @throws IllegalArgumentException if {@code bufferSize <= 0}
     */
    public ReaderInputStream(Reader reader, String encoding, int bufferSize) throws UnsupportedEncodingException
    {
        if (bufferSize <= 0)
        {
            throw new IllegalArgumentException("Buffer size <= 0");
        }
        this.reader = reader;
        this.writer = new OutputStreamWriter(getOutputStream(), encoding);
        this.charBuf = new char[bufferSize];
    }
    /**
     * Reads characters and writes them through the encoder until at least
     * one byte is available or the reader is exhausted.
     *
     * @param requestedBytes not used by this implementation
     */
    public void fill(int requestedBytes) throws IOException
    {
        for (;;)
        {
            int count = reader.read(charBuf);
            if (count < 0)
            {
                return; // end of input: nothing more to push
            }
            writer.write(charBuf, 0, count);
            writer.flush();
            // keep going in case the encoding didn't produce any bytes yet
            if (available() > 0)
            {
                return;
            }
        }
    }
}
/* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * An input stream whose content is supplied by writing to a linked output
 * stream. When a read needs more data, {@link #fill(int)} is called and the
 * subclass writes bytes to {@link #getOutputStream()}; those bytes are
 * stored in an internal buffer and then served to the reader.
 * Mark/reset is supported.
 */
abstract class PushedInputStream extends InputStream
{
    private static int defaultBufferSize = 2048;
    /** Internal buffer; valid data lives in [readpos, writepos). */
    protected byte buf[];
    /** Index one past the last byte written into buf. */
    protected int writepos;
    /** Index of the next byte to hand to the reader. */
    protected int readpos;
    /** Index of the current mark in buf, or -1 when there is no mark. */
    protected int markpos = -1;
    /** Read limit passed to the last mark() call. */
    protected int marklimit;
    protected OutputStream outputStream = new InternalOutputStream();
    /**
     * Called when more bytes need to be written into this stream
     * (as an OutputStream).
     *
     * This method must write at least one byte if the stream is
     * not ended, and it must not write any bytes if the stream has
     * already ended.
     */
    protected abstract void fill(int requestedBytes) throws IOException;
    /**
     * Returns the linked output stream.
     *
     * This is the output stream that must be written to whenever
     * the fill method is called.
     */
    public final OutputStream getOutputStream()
    {
        return outputStream;
    }
    /** Creates a stream with the default initial buffer size. */
    public PushedInputStream()
    {
        this(defaultBufferSize);
    }
    /**
     * @param size initial buffer size in bytes
     * @throws IllegalArgumentException if size is negative
     */
    public PushedInputStream(int size)
    {
        if (size < 0)
        {
            throw new IllegalArgumentException("Negative initial buffer size");
        }
        buf = new byte[size];
    }
    /**
     * Makes room for cb more bytes of data, either by moving the live data
     * to the front of the buffer or by allocating a larger buffer. Bytes
     * from the mark onwards are kept while the mark is within its limit.
     */
    private void shift(int cb)
    {
        int savepos = readpos;
        // -1 means "no mark"; 0 is a legitimate mark position and must be
        // honoured, otherwise the marked bytes are dropped and reset()
        // rewinds to the wrong data.
        if (markpos >= 0)
        {
            if (readpos - markpos > marklimit)
                markpos = -1;
            else
                savepos = markpos;
        }
        int size = writepos - savepos;
        if (savepos > 0 && buf.length - size >= cb && size <= cb)
        {
            System.arraycopy(buf, savepos, buf, 0, size);
        }
        else
        {
            int newcount = size + cb;
            byte newbuf[] = new byte[Math.max(buf.length << 1, newcount)];
            System.arraycopy(buf, savepos, newbuf, 0, size);
            buf = newbuf;
        }
        if (savepos > 0)
        {
            // everything moved down by savepos
            readpos -= savepos;
            if (markpos >= 0)
                markpos -= savepos;
            writepos -= savepos;
        }
    }
    /**
     * Reads one byte, asking for more data first if the buffer is empty.
     *
     * @return the byte as 0..255, or -1 if fill produced no data
     */
    public synchronized int read() throws IOException
    {
        if (readpos >= writepos)
        {
            fill(1);
            if (readpos >= writepos)
                return -1;
        }
        return buf[readpos++] & 0xff;
    }
    /**
     * Read bytes into a portion of an array, calling fill at most once
     * if fewer than len bytes are buffered.
     *
     * @return the number of bytes copied, or -1 if no data is available
     *         after the fill
     */
    public synchronized int read(byte[] b, int off, int len) throws IOException
    {
        int avail = writepos - readpos;
        if (avail < len)
        {
            fill(len - avail);
            avail = writepos - readpos;
            if (avail <= 0) return -1;
        }
        int cnt = (avail < len) ? avail : len;
        System.arraycopy(buf, readpos, b, off, cnt);
        readpos += cnt;
        return cnt;
    }
    /**
     * Skips up to n bytes, calling fill at most once if fewer than n
     * bytes are buffered.
     *
     * @return the number of bytes actually skipped
     */
    public synchronized long skip(long n) throws IOException
    {
        if (n <= 0)
            return 0;
        long avail = writepos - readpos;
        if (avail < n)
        {
            // Fill in buffer to save bytes for reset
            long req = n - avail;
            if (req > Integer.MAX_VALUE)
                req = Integer.MAX_VALUE;
            fill((int)req);
            avail = writepos - readpos;
            if (avail <= 0)
                return 0;
        }
        long skipped = (avail < n) ? avail : n;
        readpos += skipped;
        return skipped;
    }
    /** Returns the number of buffered bytes not yet read. */
    public synchronized int available()
    {
        return writepos - readpos;
    }
    /** Marks the current read position; readlimit is stored as the mark limit. */
    public synchronized void mark(int readlimit)
    {
        marklimit = readlimit;
        markpos = readpos;
    }
    /**
     * Moves the read position back to the mark.
     *
     * @throws IOException if there is no valid mark
     */
    public synchronized void reset() throws IOException
    {
        if (markpos < 0)
            throw new IOException("Resetting to invalid mark");
        readpos = markpos;
    }
    public boolean markSupported()
    {
        return true;
    }
    /** Output side: appends written bytes to the enclosing stream's buffer. */
    private class InternalOutputStream extends OutputStream
    {
        public synchronized void write(int b) throws IOException
        {
            if (writepos + 1 > buf.length)
            {
                shift(1);
            }
            buf[writepos] = (byte)b;
            writepos += 1;
        }
        public synchronized void write(byte b[], int off, int len)
        {
            if ((off < 0) || (off > b.length) || (len < 0) ||
                ((off + len) > b.length) || ((off + len) < 0))
                throw new IndexOutOfBoundsException();
            else if (len == 0)
                return;
            if (writepos + len > buf.length)
                shift(len);
            System.arraycopy(b, off, buf, writepos, len);
            writepos += len;
        }
    }
}
/**
 * A buffered input stream that peeks at the start of an XML document to
 * determine its character encoding, then rewinds so the caller still sees
 * the document from its first byte.
 *
 * Detection order: a four-byte signature (byte order marks, UCS-4/UTF-16
 * patterns, EBCDIC), then an {@code encoding="..."} pseudo-attribute in
 * the XML declaration, and finally the UTF-8 default.
 */
class SniffedXmlInputStream extends BufferedInputStream
{
    // We don't sniff more than 192 bytes.
    public static int MAX_SNIFFED_BYTES = 192;

    // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
    // with the common charsets.
    private static Charset dummy1 = Charset.forName("UTF-8");
    private static Charset dummy2 = Charset.forName("UTF-16");
    private static Charset dummy3 = Charset.forName("UTF-16BE");
    private static Charset dummy4 = Charset.forName("UTF-16LE");
    private static Charset dummy5 = Charset.forName("ISO-8859-1");
    private static Charset dummy6 = Charset.forName("US-ASCII");
    private static Charset dummy7 = Charset.forName("Cp1252");

    /** Characters skipped between the tokens of a pseudo-attribute. */
    private static char[] WHITESPACE = new char[] { ' ', '\r', '\t', '\n' };

    /** Characters that terminate a pseudo-attribute name. */
    private static char[] NOTNAME = new char[] { '=', ' ', '\r', '\t', '\n', '?', '>', '<', '\'', '"' };

    /** The encoding name determined by the constructor; never null afterwards. */
    private String _encoding;

    /**
     * Wraps {@code stream} and sniffs its encoding. The stream position is
     * restored after sniffing.
     *
     * @param stream the raw XML byte stream
     * @throws IOException if reading or resetting the stream fails
     */
    public SniffedXmlInputStream(InputStream stream) throws IOException
    {
        super(stream);

        // read byte order marks and detect EBCDIC etc
        _encoding = sniffFourBytes();

        if ("IBM037".equals(_encoding))
        {
            // First four bytes suggest EBCDIC with <?xm at start
            String declared = sniffForXmlDecl(_encoding);
            if (declared != null)
                _encoding = declared;
        }

        if (_encoding == null)
        {
            // Haven't yet determined encoding: sniff for <?xml encoding="..."?>
            // assuming we can read it as UTF-8.
            _encoding = sniffForXmlDecl("UTF-8");
        }

        if (_encoding == null)
        {
            // The XML spec says these two things:
            // (1) "In the absence of external character encoding information
            // (such as MIME headers), parsed entities which are stored in an
            // encoding other than UTF-8 or UTF-16 must begin with a text
            // declaration (see 4.3.1 The Text Declaration) containing an
            // encoding declaration:"
            // (2) "In the absence of information provided by an external
            // transport protocol (e.g. HTTP or MIME), it is an error
            // for an entity including an encoding declaration to be
            // presented to the XML processor in an encoding other than
            // that named in the declaration, or for an entity which begins
            // with neither a Byte Order Mark nor an encoding declaration
            // to use an encoding other than UTF-8."
            // Since we're using a sniffed stream, we do not have external
            // character encoding information.
            // Since we're here, we also don't have a recognized byte order
            // mark or an explicit encoding declaration that can be read in
            // either ASCII or EBCDIC style.
            // Therefore, we must use UTF-8.
            _encoding = "UTF-8";
        }
    }

    /** Returns the encoding name determined when the stream was constructed. */
    public String getXmlEncoding()
    {
        return _encoding;
    }

    /**
     * Reads until {@code len} bytes have been stored or end of stream is hit.
     *
     * @return the number of bytes actually stored (possibly fewer than {@code len})
     */
    private int readAsMuchAsPossible(byte[] buf, int startAt, int len) throws IOException
    {
        int total = 0;
        while (total < len)
        {
            int count = read(buf, startAt + total, len - total);
            if (count < 0)
                break;
            total += count;
        }
        return total;
    }

    /**
     * Examines the first four bytes for a byte order mark or another
     * recognisable signature, then rewinds the stream.
     *
     * @return an encoding name, or null when the bytes are inconclusive
     *         (including plain {@code <?xm}, which needs declaration sniffing)
     */
    private String sniffFourBytes() throws IOException
    {
        mark(4);
        try
        {
            byte[] head = new byte[4];
            if (readAsMuchAsPossible(head, 0, 4) < 4)
                return null;

            int magic = ((head[0] & 0xFF) << 24)
                      | ((head[1] & 0xFF) << 16)
                      | ((head[2] & 0xFF) << 8)
                      |  (head[3] & 0xFF);

            switch (magic)
            {
                case 0x0000FEFF:
                case 0xFFFE0000:
                    return "UCS-4";
                case 0x0000003C:
                    return "UCS-4BE";
                case 0x3C000000:
                    return "UCS-4LE";
                case 0x003C003F:
                    return "UTF-16BE";
                case 0x3C003F00:
                    return "UTF-16LE";
                case 0x3C3F786D:
                    return null; // looks like US-ASCII with <?xml: sniff
                case 0x4C6FA794:
                    return "IBM037"; // Sniff for EBCDIC codepage
                default:
                    break;
            }

            // Two-byte UTF-16 byte order marks, either endianness.
            int firstTwo = magic & 0xFFFF0000;
            if (firstTwo == 0xFEFF0000 || firstTwo == 0xFFFE0000)
                return "UTF-16";

            // Three-byte UTF-8 byte order mark.
            if ((magic & 0xFFFFFF00) == 0xEFBBBF00)
                return "UTF-8";

            return null;
        }
        finally
        {
            reset();
        }
    }

    /**
     * Decodes up to {@link #MAX_SNIFFED_BYTES} bytes with the given charset
     * and looks for an encoding pseudo-attribute, then rewinds the stream.
     *
     * @param encoding charset name used to decode the sniffed bytes
     * @return the declared encoding, or null if none was found
     */
    private String sniffForXmlDecl(String encoding) throws IOException
    {
        mark(MAX_SNIFFED_BYTES);
        try
        {
            byte[] bytebuf = new byte[MAX_SNIFFED_BYTES];
            int bytelimit = readAsMuchAsPossible(bytebuf, 0, MAX_SNIFFED_BYTES);

            // BUGBUG in JDK: Charset.forName is not threadsafe.
            Charset charset = Charset.forName(encoding);
            Reader reader = new InputStreamReader(new ByteArrayInputStream(bytebuf, 0, bytelimit), charset);
            char[] buf = new char[bytelimit];
            int limit = 0;
            while (limit < bytelimit)
            {
                int count = reader.read(buf, limit, bytelimit - limit);
                if (count < 0)
                    break;
                limit += count;
            }
            return extractXmlDeclEncoding(buf, 0, limit);
        }
        finally
        {
            reset();
        }
    }

    /**
     * Finds the first {@code <?xml} in the given range and scans the
     * pseudo-attributes that follow it for one named {@code encoding}.
     *
     * @param buf    decoded characters
     * @param offset index of the first character to examine
     * @param size   number of characters to examine
     * @return the value of the encoding pseudo-attribute, or null when there
     *         is no declaration, no encoding, or the declaration is malformed
     *         or truncated
     */
    /* package */ static String extractXmlDeclEncoding(char[] buf, int offset, int size)
    {
        int limit = offset + size;
        int xmlpi = firstIndexOf("<?xml", buf, offset, limit);
        if (xmlpi < 0)
            return null;

        int i = xmlpi + 5;
        ScannedAttribute attr = new ScannedAttribute();
        while (i < limit)
        {
            i = scanAttribute(buf, i, limit, attr);
            if (i < 0)
                return null;
            if (attr.name.equals("encoding"))
                return attr.value;
        }
        return null;
    }

    /**
     * Returns the index of the first occurrence of {@code s} that lies
     * entirely inside {@code [startAt, limit)}, or -1.
     */
    private static int firstIndexOf(String s, char[] buf, int startAt, int limit)
    {
        assert(s.length() > 0);
        char[] lookFor = s.toCharArray();
        // Last index at which a full match still fits (inclusive).
        int lastStart = limit - lookFor.length;
        searching: for (; startAt <= lastStart; startAt++)
        {
            for (int i = 0; i < lookFor.length; i++)
            {
                if (buf[startAt + i] != lookFor[i])
                    continue searching;
            }
            return startAt;
        }
        return -1;
    }

    /** Index of the first char in {@code [startAt, limit)} that is NOT in {@code lookFor}, or -1. */
    private static int nextNonmatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
    {
        searching: for (; startAt < limit; startAt++)
        {
            int thischar = buf[startAt];
            for (int i = 0; i < lookFor.length; i++)
                if (thischar == lookFor[i])
                    continue searching;
            return startAt;
        }
        return -1;
    }

    /** Index of the first char in {@code [startAt, limit)} that IS in {@code lookFor}, or -1. */
    private static int nextMatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
    {
        for (; startAt < limit; startAt++)
        {
            int thischar = buf[startAt];
            for (int i = 0; i < lookFor.length; i++)
                if (thischar == lookFor[i])
                    return startAt;
        }
        return -1;
    }

    /** Index of the first occurrence of {@code lookFor} in {@code [startAt, limit)}, or -1. */
    private static int nextMatchingByte(char lookFor, char[] buf, int startAt, int limit)
    {
        for (; startAt < limit; startAt++)
        {
            if (buf[startAt] == lookFor)
                return startAt;
        }
        return -1;
    }

    /** Holder for one scanned {@code name="value"} pair. */
    private static class ScannedAttribute
    {
        public String name;
        public String value;
    }

    /**
     * Scans one {@code name = "value"} (or single-quoted) pair starting at
     * {@code startAt} and stores it in {@code attr}.
     *
     * @return the index just past the closing quote, or -1 when no
     *         well-formed pair could be read before {@code limit}
     */
    private static int scanAttribute(char[] buf, int startAt, int limit, ScannedAttribute attr)
    {
        int nameStart = nextNonmatchingByte(WHITESPACE, buf, startAt, limit);
        if (nameStart < 0)
            return -1;
        int nameEnd = nextMatchingByte(NOTNAME, buf, nameStart, limit);
        if (nameEnd < 0)
            return -1;
        int equals = nextNonmatchingByte(WHITESPACE, buf, nameEnd, limit);
        if (equals < 0)
            return -1;
        if (buf[equals] != '=')
            return -1;
        int valQuote = nextNonmatchingByte(WHITESPACE, buf, equals + 1, limit);
        // The sniffed data may end right after '=' (truncated declaration);
        // without this check buf[-1] would be indexed.
        if (valQuote < 0)
            return -1;
        if (buf[valQuote] != '\'' && buf[valQuote] != '"')
            return -1;
        int valEndquote = nextMatchingByte(buf[valQuote], buf, valQuote + 1, limit);
        if (valEndquote < 0)
            return -1;
        attr.name = new String(buf, nameStart, nameEnd - nameStart);
        attr.value = new String(buf, valQuote + 1, valEndquote - valQuote - 1);
        return valEndquote + 1;
    }
}
/**
 * A buffered reader that looks ahead at the start of an XML document for
 * an encoding pseudo-attribute in the XML declaration, then rewinds so the
 * caller still reads the document from its first character.
 */
class SniffedXmlReader extends BufferedReader {
    // We don't sniff more than 192 characters.
    public static int MAX_SNIFFED_CHARS = 192;

    // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
    // with the common charsets.
    private static Charset dummy1 = Charset.forName("UTF-8");
    private static Charset dummy2 = Charset.forName("UTF-16");
    private static Charset dummy3 = Charset.forName("UTF-16BE");
    private static Charset dummy4 = Charset.forName("UTF-16LE");
    private static Charset dummy5 = Charset.forName("ISO-8859-1");
    private static Charset dummy6 = Charset.forName("US-ASCII");
    private static Charset dummy7 = Charset.forName("Cp1252");

    /** Encoding named in the XML declaration, or null when none was found. */
    private String _encoding;

    /**
     * Wraps {@code reader} and sniffs the declared encoding.
     *
     * @throws IOException if reading or resetting the reader fails
     */
    public SniffedXmlReader(Reader reader) throws IOException {
        super(reader);
        _encoding = sniffForXmlDecl();
    }

    /** Returns the encoding found in the XML declaration, or null. */
    public String getXmlEncoding() {
        return _encoding;
    }

    /**
     * Reads until {@code len} characters are stored or end of input is hit.
     *
     * @return the number of characters actually stored
     */
    private int readAsMuchAsPossible(char[] buf, int startAt, int len) throws IOException {
        int filled = 0;
        int got;
        while (filled < len && (got = read(buf, startAt + filled, len - filled)) >= 0) {
            filled += got;
        }
        return filled;
    }

    /**
     * Reads up to {@link #MAX_SNIFFED_CHARS} characters, extracts the
     * declared encoding from them, and rewinds to the marked position.
     */
    private String sniffForXmlDecl() throws IOException {
        mark(MAX_SNIFFED_CHARS);
        try {
            char[] sniffed = new char[MAX_SNIFFED_CHARS];
            int count = readAsMuchAsPossible(sniffed, 0, MAX_SNIFFED_CHARS);
            return SniffedXmlInputStream.extractXmlDeclEncoding(sniffed, 0, count);
        } finally {
            reset();
        }
    }
}
/*
* $Id: XmlReader.java,v 1.1 2004/08/19 05:30:22 aslom Exp $
*
* The Apache Software License, Version 1.1
*
*
* Copyright (c) 2000 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Crimson" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
* http://www.sun.com. For more information on the Apache Software
* Foundation, please see <http://www.apache.org/>.
*/
import java.io.*;
import java.util.Hashtable;
/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings. The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec. This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 *
 * <P> Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding. For example,
 * it should never be used on MIME text/xml entities.
 *
 * <P> Note that XML processors are only required to support UTF-8 and
 * UTF-16 character encodings. Autodetection permits the underlying Java
 * implementation to provide support for many other encodings, such as
 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
 *
 * @author David Brownell
 * @version $Revision: 1.1 $
 */
final public class XmlReader extends Reader
{
    /** Size of the pushback buffer; also bounds how far the XML declaration is scanned. */
    private static final int MAXPUSHBACK = 512;

    /** The reader that does the real decoding; null once closed. */
    private Reader in;
    /** Name of the encoding chosen by autodetection. */
    private String assignedEncoding;
    private boolean closed;

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text. It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It's got to do this efficiently: character I/O is solidly on the
    // critical path. (So keep buffer length over 2 Kbytes to avoid
    // excess buffering. Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

    /**
     * Constructs the reader from an input stream, autodetecting
     * the encoding to use according to the heuristic specified
     * in the XML 1.0 recommendation.
     *
     * @param in the input stream from which the reader is constructed
     * @exception IOException on error, such as unrecognized encoding
     */
    public static Reader createReader (InputStream in) throws IOException
    {
        return new XmlReader (in);
    }

    /**
     * Creates a reader supporting the given encoding, mapping
     * from standard encoding names to ones that are understood by
     * Java where necessary.
     *
     * @param in the input stream from which the reader is constructed
     * @param encoding the IETF standard name of the encoding to use;
     *  if null, autodetection is used.
     * @exception IOException on error, including unrecognized encoding
     */
    public static Reader createReader (InputStream in, String encoding)
    throws IOException
    {
        if (encoding == null) {
            return new XmlReader (in);
        }
        if ("UTF-8".equalsIgnoreCase (encoding)
                || "UTF8".equalsIgnoreCase (encoding)) {
            return new Utf8Reader (in);
        }
        if ("US-ASCII".equalsIgnoreCase (encoding)
                || "ASCII".equalsIgnoreCase (encoding)) {
            return new AsciiReader (in);
        }
        if ("ISO-8859-1".equalsIgnoreCase (encoding)
                // plus numerous aliases ...
                ) {
            return new Iso8859_1Reader (in);
        }

        // What we really want is an administerable resource mapping
        // encoding names/aliases to classnames. For example a property
        // file resource, "readers/mapping.props", holding and a set
        // of readers in that (sub)package... defaulting to this call
        // only if no better choice is available.
        //
        return new InputStreamReader (in, std2java (encoding));
    }

    // JDK doesn't know all of the standard encoding names, and
    // in particular none of the EBCDIC ones IANA defines (and
    // which IBM encourages).
    static private final Hashtable charsets = new Hashtable (31);

    static {
        charsets.put ("UTF-16", "Unicode");
        charsets.put ("ISO-10646-UCS-2", "Unicode");

        // NOTE: no support for ISO-10646-UCS-4 yet.

        charsets.put ("EBCDIC-CP-US", "cp037");
        charsets.put ("EBCDIC-CP-CA", "cp037");
        charsets.put ("EBCDIC-CP-NL", "cp037");
        charsets.put ("EBCDIC-CP-WT", "cp037");

        charsets.put ("EBCDIC-CP-DK", "cp277");
        charsets.put ("EBCDIC-CP-NO", "cp277");
        charsets.put ("EBCDIC-CP-FI", "cp278");
        charsets.put ("EBCDIC-CP-SE", "cp278");

        charsets.put ("EBCDIC-CP-IT", "cp280");
        charsets.put ("EBCDIC-CP-ES", "cp284");
        charsets.put ("EBCDIC-CP-GB", "cp285");
        charsets.put ("EBCDIC-CP-FR", "cp297");

        charsets.put ("EBCDIC-CP-AR1", "cp420");
        charsets.put ("EBCDIC-CP-HE", "cp424");
        charsets.put ("EBCDIC-CP-BE", "cp500");
        charsets.put ("EBCDIC-CP-CH", "cp500");

        charsets.put ("EBCDIC-CP-ROECE", "cp870");
        charsets.put ("EBCDIC-CP-YU", "cp870");
        charsets.put ("EBCDIC-CP-IS", "cp871");
        charsets.put ("EBCDIC-CP-AR2", "cp918");

        // IANA also defines two that JDK 1.2 doesn't handle:
        //  EBCDIC-CP-GR        --> CP423
        //  EBCDIC-CP-TR        --> CP905
    }

    // returns an encoding name supported by JDK >= 1.1.6
    // for some cases required by the XML spec
    private static String std2java (String encoding)
    {
        // Encoding names are ASCII; upper-case them without regard to the
        // default locale (a Turkish default would otherwise turn 'i' into
        // a dotted capital I and miss the table entry).
        String temp = encoding.toUpperCase (java.util.Locale.ENGLISH);
        temp = (String) charsets.get (temp);
        return (temp != null) ? temp : encoding;
    }

    /** Returns the standard name of the encoding in use */
    public String getEncoding ()
    {
        return assignedEncoding;
    }

    /**
     * Peeks at the first four bytes of the stream to pick an encoding,
     * pushes them back, and sets up the delegate reader.
     */
    private XmlReader (InputStream stream) throws IOException
    {
        super (stream);

        // Always wrap the stream, even if it already is a
        // PushbackInputStream: e.g. the one used by URL connections
        // only has room for 7 bytes, and pushing back up to MAXPUSHBACK
        // bytes into it would throw.
        PushbackInputStream pb = new PushbackInputStream (stream, MAXPUSHBACK);

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.
        //
        byte buf [] = new byte [4];
        int len = pb.read (buf);
        if (len > 0)
            pb.unread (buf, 0, len);

        if (len == 4) switch (buf [0] & 0x0ff) {
            case 0:
                // 00 3c 00 3f == illegal UTF-16 big-endian
                if (buf [1] == 0x3c && buf [2] == 0x00 && buf [3] == 0x3f) {
                    setEncoding (pb, "UnicodeBig");
                    return;
                }
                // else it's probably UCS-4
                break;

            case '<':      // 0x3c: the most common cases!
                switch (buf [1] & 0x0ff) {
                    // 3c 00 3f 00 == illegal UTF-16 little endian
                    case 0x00:
                        if (buf [2] == 0x3f && buf [3] == 0x00) {
                            setEncoding (pb, "UnicodeLittle");
                            return;
                        }
                        // else probably UCS-4
                        break;

                    // 3c 3f 78 6d == ASCII and supersets "<?xm"
                    case '?':
                        if (buf [2] != 'x' || buf [3] != 'm')
                            break;
                        //
                        // One of several encodings could be used:
                        // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                        //
                        useEncodingDecl (pb, "UTF8");
                        return;

                    // First character is '<'; could be XML without
                    // an XML directive such as "<hello>", "<!-- ...",
                    // and so on.
                    default:
                        break;
                }
                break;

            // 4c 6f a7 94 ... some EBCDIC code page
            case 0x4c:
                if (buf [1] == 0x6f
                        && (0x0ff & buf [2]) == 0x0a7
                        && (0x0ff & buf [3]) == 0x094) {
                    useEncodingDecl (pb, "CP037");
                    return;
                }
                // whoops, treat as UTF-8
                break;

            // UTF-16 big-endian
            case 0xfe:
                if ((buf [1] & 0x0ff) != 0xff)
                    break;
                setEncoding (pb, "UTF-16");
                return;

            // UTF-16 little-endian
            case 0xff:
                if ((buf [1] & 0x0ff) != 0xfe)
                    break;
                setEncoding (pb, "UTF-16");
                return;

            // default ... no XML declaration
            default:
                break;
        }

        //
        // If all else fails, assume XML without a declaration, and
        // using UTF-8 encoding.
        //
        setEncoding (pb, "UTF-8");
    }

    /*
     * Read the encoding decl on the stream, knowing that it should
     * be readable using the specified encoding (basically, ASCII or
     * EBCDIC).  The body of the document may use a wider range of
     * characters than the XML/Text decl itself, so we switch to use
     * the specified encoding as soon as we can.  (ASCII is a subset
     * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
     * has a variety of "code pages" that have these characters as
     * a common subset.)
     */
    private void useEncodingDecl (PushbackInputStream pb, String encoding)
    throws IOException
    {
        byte buffer[] = new byte [MAXPUSHBACK];
        int len;
        Reader r;
        int c;

        //
        // Buffer up a bunch of input, and set up to read it in
        // the specified encoding ... we can skip the first four
        // bytes since we know that "<?xm" was read to determine
        // what encoding to use!
        //
        len = pb.read (buffer, 0, buffer.length);
        if (len < 4) {
            // Defensive: the caller has already seen four bytes, so this
            // is not expected; fall back to the default rather than
            // indexing with a bogus length.
            if (len > 0)
                pb.unread (buffer, 0, len);
            setEncoding (pb, "UTF-8");
            return;
        }
        pb.unread (buffer, 0, len);
        // The third argument is a byte COUNT, not an end index.
        r = new InputStreamReader (
                new ByteArrayInputStream (buffer, 4, len - 4),
                encoding);

        //
        // Next must be "l" (and whitespace) else we conclude
        // error and choose UTF-8.
        //
        if ((c = r.read ()) != 'l') {
            setEncoding (pb, "UTF-8");
            return;
        }

        //
        // Then, we'll skip any
        //  S version="..."     [or single quotes]
        // bit and get any subsequent
        //  S encoding="..."    [or single quotes]
        //
        // We put an arbitrary size limit on how far we read; lots
        // of space will break this algorithm.
        //
        StringBuffer buf = new StringBuffer ();
        StringBuffer keyBuf = null;
        String key = null;
        boolean sawEq = false;
        char quoteChar = 0;
        boolean sawQuestion = false;

        XmlDecl:
        for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
            if ((c = r.read ()) == -1)
                break;

            // ignore whitespace before/between "key = 'value'"
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
                continue;

            // ... but require at least a little!
            if (i == 0)
                break;

            // terminate the loop ASAP
            if (c == '?')
                sawQuestion = true;
            else if (sawQuestion) {
                if (c == '>')
                    break;
                sawQuestion = false;
            }

            // did we get the "key =" bit yet?
            if (key == null || !sawEq) {
                if (keyBuf == null) {
                    if (Character.isWhitespace ((char) c))
                        continue;
                    keyBuf = buf;
                    buf.setLength (0);
                    buf.append ((char) c);
                    sawEq = false;
                } else if (Character.isWhitespace ((char) c)) {
                    key = keyBuf.toString ();
                } else if (c == '=') {
                    if (key == null)
                        key = keyBuf.toString ();
                    sawEq = true;
                    keyBuf = null;
                    quoteChar = 0;
                } else
                    keyBuf.append ((char) c);
                continue;
            }

            // space before quoted value
            if (Character.isWhitespace ((char) c))
                continue;
            if (c == '"' || c == '\'') {
                if (quoteChar == 0) {
                    quoteChar = (char) c;
                    buf.setLength (0);
                    continue;
                } else if (c == quoteChar) {
                    if ("encoding".equals (key)) {
                        assignedEncoding = buf.toString ();

                        // [81] Encname ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
                        for (int j = 0; j < assignedEncoding.length (); j++) {
                            c = assignedEncoding.charAt (j);
                            if ((c >= 'A' && c <= 'Z')
                                    || (c >= 'a' && c <= 'z'))
                                continue;
                            if (j == 0)
                                break XmlDecl;
                            if (c == '-'
                                    || (c >= '0' && c <= '9')
                                    || c == '.' || c == '_')
                                continue;
                            // map illegal names to UTF-8 default
                            break XmlDecl;
                        }

                        setEncoding (pb, assignedEncoding);
                        return;

                    } else {
                        key = null;
                        continue;
                    }
                }
            }
            buf.append ((char) c);
        }

        setEncoding (pb, "UTF-8");
    }

    /** Records the encoding name and creates the delegate reader for it. */
    private void setEncoding (InputStream stream, String encoding)
    throws IOException
    {
        assignedEncoding = encoding;
        in = createReader (stream, encoding);
    }

    /**
     * Reads characters into a portion of the buffer.
     *
     * @return the number of characters read, or -1 on EOF or when this
     *  reader has already been closed. Reaching EOF closes the reader.
     */
    public int read (char buf [], int off, int len) throws IOException
    {
        int val;

        if (closed)
            return -1;  // throw new IOException ("closed");
        val = in.read (buf, off, len);
        if (val == -1)
            close ();
        return val;
    }

    /**
     * Reads a single character. Reaching EOF closes the reader.
     *
     * @exception IOException if the reader has already been closed (note
     *  that, unlike the array form, this does not return -1 in that case)
     */
    public int read () throws IOException
    {
        int val;

        if (closed) {
            throw new IOException ("Stream closed");
        }
        val = in.read ();
        if (val == -1) {
            close ();
        }
        return val;
    }

    /**
     * Returns true iff the reader supports mark/reset.
     */
    public boolean markSupported ()
    {
        return in == null ? false : in.markSupported ();
    }

    /**
     * Sets a mark allowing a limited number of characters to
     * be "peeked", by reading and then resetting.
     * @param value how many characters may be "peeked".
     */
    public void mark (int value) throws IOException
    {
        if (in != null) in.mark (value);
    }

    /**
     * Resets the current position to the last marked position.
     */
    public void reset () throws IOException
    {
        if (in != null) in.reset ();
    }

    /**
     * Skips a specified number of characters.
     */
    public long skip (long value) throws IOException
    {
        return in == null ? 0 : in.skip (value);
    }

    /**
     * Returns true iff input characters are known to be ready.
     */
    public boolean ready () throws IOException
    {
        return in == null ? false : in.ready ();
    }

    /**
     * Closes the reader. The reader is marked closed even if closing the
     * underlying reader throws.
     */
    public void close () throws IOException
    {
        if (closed)
            return;
        try {
            in.close ();
        } finally {
            in = null;
            closed = true;
        }
    }

    //
    // Delegating to a converter module will always be slower than
    // direct conversion.  Use a similar approach for any other
    // readers that need to be particularly fast; only block I/O
    // speed matters to this package.  For UTF-16, separate readers
    // for big and little endian streams make a difference, too;
    // fewer conditionals in the critical path!
    //
    public static abstract class BaseReader extends Reader
    {
        protected InputStream instream;
        protected byte buffer [];
        /** Valid bytes are buffer[start] .. buffer[finish - 1]. */
        protected int start, finish;

        BaseReader (InputStream stream)
        {
            super (stream);

            instream = stream;
            buffer = new byte [8192];
        }

        public abstract String getEncoding ();

        public boolean ready () throws IOException
        {
            return instream == null
                || (finish - start) > 0
                || instream.available () != 0;
        }

        // caller shouldn't read again
        public void close () throws IOException
        {
            if (instream != null) {
                instream.close ();
                start = finish = 0;
                buffer = null;
                instream = null;
            }
        }
    }

    //
    // We want this reader, to make the default encoding be as fast
    // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
    // InputStreamReader works, but 20+% slower speed isn't OK for
    // the default/primary encoding.
    //
    static final class Utf8Reader extends BaseReader
    {
        // 2nd half of a UTF-16 surrogate pair that did not fit in the
        // caller's buffer (0 when nothing is pending)
        private char nextChar;

        Utf8Reader (InputStream stream)
        {
            super (stream);
        }

        public String getEncoding () { return "UTF-8"; }

        /**
         * Decodes UTF-8 bytes into UTF-16 chars, reading from the stream
         * as needed until {@code len} chars are stored or EOF is reached.
         *
         * @return number of chars stored, or -1 on EOF with nothing stored
         * @exception CharConversionException on malformed or truncated input
         */
        public int read (char buf [], int offset, int len) throws IOException
        {
            int i = 0, c = 0;

            if (len <= 0)
                return 0;

            // avoid many runtime bounds checks ... a good optimizer
            // (static or JIT) will now remove checks from the loop.
            if ((offset + len) > buf.length || offset < 0)
                throw new ArrayIndexOutOfBoundsException ();

            // Consume remaining half of any surrogate pair immediately
            if (nextChar != 0) {
                buf [offset + i++] = nextChar;
                nextChar = 0;
            }

            while (i < len) {
                // stop or read data if needed
                if (finish <= start) {
                    if (instream == null) {
                        c = -1;
                        break;
                    }
                    start = 0;
                    finish = instream.read (buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close ();
                        c = -1;
                        break;
                    }
                }

                //
                // RFC 2279 describes UTF-8; there are six encodings.
                // Each encoding takes a fixed number of characters
                // (1-6 bytes) and is flagged by a bit pattern in the
                // first byte.  The five and six byte-per-character
                // encodings address characters which are disallowed
                // in XML documents, as do some four byte ones.
                //

                //
                // Single byte == ASCII.  Common; optimize.
                //
                c = buffer [start] & 0x0ff;
                if ((c & 0x80) == 0x00) {
                    // 0x0000 <= c <= 0x007f
                    start++;
                    buf [offset + i++] = (char) c;
                    continue;
                }

                //
                // Multibyte chars: the lead byte says how long the
                // sequence is.  5 and 6 byte versions are XML WF errors,
                // but typically come from mislabeled encodings.
                //
                int need;
                if ((c & 0xE0) == 0xC0)
                    need = 2;
                else if ((c & 0xF0) == 0xE0)
                    need = 3;
                else if ((c & 0xF8) == 0xF0)
                    need = 4;
                else
                    throw new CharConversionException (
                        "Unconvertible UTF-8 character"
                        + " beginning with 0x"
                        + Integer.toHexString (c)
                        );

                //
                // If the buffer holds only a partial character, compact
                // it and try to read the rest BEFORE decoding, so stale
                // bytes beyond 'finish' are never interpreted.  Worst case
                // involves three single-byte reads -- quite rare.
                //
                if (finish - start < need) {
                    int held = finish - start;
                    System.arraycopy (buffer, start, buffer, 0, held);
                    start = 0;
                    finish = held;
                    int got = instream.read (buffer, finish,
                                             buffer.length - finish);
                    if (got < 0) {
                        this.close ();
                        throw new CharConversionException (
                            "Partial UTF-8 char");
                    }
                    finish += got;
                    continue;
                }

                int off = start;
                if (need == 2) {
                    c = (buffer [off++] & 0x1f) << 6;
                    c += buffer [off++] & 0x3f;
                    // 0x0080 <= c <= 0x07ff
                } else if (need == 3) {
                    c = (buffer [off++] & 0x0f) << 12;
                    c += (buffer [off++] & 0x3f) << 6;
                    c += buffer [off++] & 0x3f;
                    // 0x0800 <= c <= 0xffff
                } else {
                    c = (buffer [off++] & 0x07) << 18;
                    c += (buffer [off++] & 0x3f) << 12;
                    c += (buffer [off++] & 0x3f) << 6;
                    c += buffer [off++] & 0x3f;
                    // 0x0001 0000 <= c <= 0x001f ffff

                    // Unicode supports c <= 0x0010 ffff ...
                    if (c > 0x0010ffff)
                        throw new CharConversionException (
                            "UTF-8 encoding of character 0x00"
                            + Integer.toHexString (c)
                            + " can't be converted to Unicode."
                            );
                }

                //
                // check the format of the non-initial bytes
                //
                for (start++; start < off; start++) {
                    if ((buffer [start] & 0xC0) != 0x80) {
                        this.close ();
                        throw new CharConversionException (
                            "Malformed UTF-8 char -- "
                            + "is an XML encoding declaration missing?"
                            );
                    }
                }

                // Convert UCS-4 char to surrogate pair (UTF-16)
                if (c > 0xffff) {
                    c -= 0x10000;
                    nextChar = (char) (0xDC00 + (c & 0x03ff));
                    c = 0xD800 + (c >> 10);
                }

                //
                // If this needed a surrogate pair, consume ASAP
                //
                buf [offset + i++] = (char) c;
                if (nextChar != 0 && i < len) {
                    buf [offset + i++] = nextChar;
                    nextChar = 0;
                }
            }
            if (i > 0)
                return i;
            return (c == -1) ? -1 : 0;
        }
    }

    //
    // We want ASCII and ISO-8859 Readers since they're the most common
    // encodings in the US and Europe, and we don't want performance
    // regressions for them.  They're also easy to implement efficiently,
    // since they're bitmask subsets of UNICODE.
    //
    // XXX haven't benchmarked these readers vs what we get out of JDK.
    //
    static final class AsciiReader extends BaseReader
    {
        AsciiReader (InputStream in) { super (in); }

        public String getEncoding () { return "US-ASCII"; }

        /**
         * Returns whatever is buffered (refilling once if the buffer is
         * empty), up to {@code len} chars; -1 on EOF or after close.
         *
         * @exception CharConversionException on a byte with the high bit set
         */
        public int read (char buf [], int offset, int len) throws IOException
        {
            if (instream == null) {
                return -1;
            }

            // avoid many runtime bounds checks ... a good optimizer
            // (static or JIT) will now remove checks from the loop.
            if ((offset + len) > buf.length || offset < 0)
                throw new ArrayIndexOutOfBoundsException ();

            /* 07-Mar-2006, TSa: Actually, it's bad idea to try to fill the
             *   whole buffer -- if this is a blocking source (network socket
             *   for example), we may be blocking too early.
             */
            // So, do we need to try to read more?
            int avail = (finish - start);
            if (avail < 1) {
                start = 0;
                finish = instream.read (buffer, 0, buffer.length);
                if (finish <= 0) {
                    this.close ();
                    return -1;
                }
                if (len > finish) {
                    len = finish;
                }
            } else {
                if (len > avail) {
                    len = avail;
                }
            }

            for (int i = 0; i < len; i++) {
                int c = buffer [start++];
                if (c < 0) {
                    throw new CharConversionException ("Illegal ASCII character, 0x"
                                                       + Integer.toHexString (c & 0xff));
                }
                buf [offset + i] = (char) c;
            }
            return len;
        }
    }

    static final class Iso8859_1Reader extends BaseReader
    {
        Iso8859_1Reader (InputStream in) { super (in); }

        public String getEncoding () { return "ISO-8859-1"; }

        /**
         * Returns whatever is buffered (refilling once if the buffer is
         * empty), up to {@code len} chars; -1 on EOF or after close.
         */
        public int read (char buf [], int offset, int len) throws IOException
        {
            if (instream == null)
                return -1;

            // avoid many runtime bounds checks ... a good optimizer
            // (static or JIT) will now remove checks from the loop.
            if ((offset + len) > buf.length || offset < 0)
                throw new ArrayIndexOutOfBoundsException ();

            /* 07-Mar-2006, TSa: Actually, it's bad idea to try to fill the
             *   whole buffer -- if this is a blocking source (network socket
             *   for example), we may be blocking too early.
             */
            // So, do we need to try to read more?
            int avail = (finish - start);
            if (avail < 1) {
                start = 0;
                finish = instream.read (buffer, 0, buffer.length);
                if (finish <= 0) {
                    this.close ();
                    return -1;
                }
                if (len > finish) {
                    len = finish;
                }
            } else {
                if (len > avail) {
                    len = avail;
                }
            }

            for (int i = 0; i < len; i++) {
                // Each byte maps directly to the code point of the same value.
                buf [offset + i] = (char) (buffer [start++] & 0xFF);
            }
            return len;
        }
    }
}