[jdom-interest] Toward beta 9
Rolf Lear
rlear at algorithmics.com
Thu Apr 10 07:44:51 PDT 2003
Since my post-with-attachment is waiting for moderator approval, I thought I
would post the "diff".
Just paste the following at the beginning of the Verifier class.
Rolf
=================================================
// Single-bit masks indexed by bit position (built by buildshifts()); read by
// getbit() and setbit(). Declared first so that it is initialized before the
// two mask builders below run, because those builders call setbit().
private static long[] shifts = buildshifts();
// Bitmask filled in by buildLetterMask(); queried by newisXMLLetter().
private static long[] lettermask = buildLetterMask();
// Bitmask filled in by buildCombinationMask(); queried by newisXMLCombiningChar().
private static long[] combmask = buildCombinationMask();
/**
 * Tests whether the bit for the given index is set in a mask.
 *
 * The upper bits of the index (index >>> 6) select the long in the mask and
 * the low six bits select the bit inside that long.
 *
 * @param mask  the bitmask to query
 * @param index the bit number to test
 * @return true if the bit is set, false otherwise
 */
private static boolean getbit(long[] mask, int index) {
    final long word = mask[index >>> 6];
    final long probe = shifts[index & 0x3f];
    return (word & probe) != 0L;
}
/**
 * Sets the bit for the given index in a mask.
 *
 * The upper bits of the index (index >>> 6) select the long in the mask and
 * the low six bits select the bit inside that long.
 *
 * @param mask  the bitmask to modify
 * @param index the bit number to set
 */
private static void setbit(long[] mask, int index) {
    final int word = index >>> 6;
    final int offset = index & 0x3f;
    mask[word] = mask[word] | shifts[offset];
}
/**
 * Sets every bit from start to end, both inclusive, in a mask.
 *
 * @param mask  the bitmask to modify
 * @param start the first bit number to set
 * @param end   the last bit number to set (inclusive)
 */
private static void setbits(long[] mask, int start, int end) {
    int current = start;
    while (current <= end) {
        setbit(mask, current);
        current++;
    }
}
/**
 * Builds the table of single-bit values: entry i holds a long with only
 * bit i set.
 *
 * @return a new 64-element array of single-bit masks
 */
private static long[] buildshifts() {
    final long[] table = new long[64];
    long bit = 1L;
    for (int pos = 0; pos < table.length; pos++) {
        table[pos] = bit;
        bit <<= 1;
    }
    return table;
}
/**
 * Allocates an empty (all bits clear) mask with one bit for each of the
 * 0xffff + 1 possible char values.
 *
 * @return a new zero-filled array of 0x400 longs
 */
private static long[] newmask() {
    // (0xffff + 1) bits at 64 bits per long gives 0x400 longs.
    final int wordCount = 0x10000 >>> 6;
    return new long[wordCount];
}
/**
 * Builds the bitmask that newisXMLLetter() consults.
 *
 * Starts from an empty mask (newmask()) and sets one bit for every char
 * value listed below. As a side effect it prints a start message and the
 * elapsed build time in milliseconds to System.out.
 *
 * NOTE(review): the ranges look like the XML 1.0 "Letter" production
 * (BaseChar plus Ideographic) - confirm against the specification before
 * relying on this table.
 *
 * @return the populated mask
 */
private static long[] buildLetterMask() {
System.out.println ("Building lettermask");
long tm = System.currentTimeMillis();
long[] mask = newmask();
// setbits() covers an inclusive range of values; setbit() covers a single
// isolated value.
setbits(mask, 0x0041, 0x005a);
setbits(mask, 0x0061, 0x007A);
setbits(mask, 0x00C0, 0x00D6);
setbits(mask, 0x00D8, 0x00F6);
setbits(mask, 0x00F8, 0x00FF);
setbits(mask, 0x0100, 0x0131);
setbits(mask, 0x0134, 0x013E);
setbits(mask, 0x0141, 0x0148);
setbits(mask, 0x014A, 0x017E);
setbits(mask, 0x0180, 0x01C3);
setbits(mask, 0x01CD, 0x01F0);
setbits(mask, 0x01F4, 0x01F5);
setbits(mask, 0x01FA, 0x0217);
setbits(mask, 0x0250, 0x02A8);
setbits(mask, 0x02BB, 0x02C1);
setbit(mask, 0x0386);
setbits(mask, 0x0388, 0x038A);
setbit(mask, 0x038C);
setbits(mask, 0x038E, 0x03A1);
setbits(mask, 0x03A3, 0x03CE);
setbits(mask, 0x03D0, 0x03D6);
setbit(mask, 0x03DA);
setbit(mask, 0x03DC);
setbit(mask, 0x03DE);
setbit(mask, 0x03E0);
setbits(mask, 0x03E2, 0x03F3);
setbits(mask, 0x0401, 0x040C);
setbits(mask, 0x040E, 0x044F);
setbits(mask, 0x0451, 0x045C);
setbits(mask, 0x045E, 0x0481);
setbits(mask, 0x0490, 0x04C4);
setbits(mask, 0x04C7, 0x04C8);
setbits(mask, 0x04CB, 0x04CC);
setbits(mask, 0x04D0, 0x04EB);
setbits(mask, 0x04EE, 0x04F5);
setbits(mask, 0x04F8, 0x04F9);
setbits(mask, 0x0531, 0x0556);
setbit(mask, 0x0559);
setbits(mask, 0x0561, 0x0586);
setbits(mask, 0x05D0, 0x05EA);
setbits(mask, 0x05F0, 0x05F2);
setbits(mask, 0x0621, 0x063A);
setbits(mask, 0x0641, 0x064A);
setbits(mask, 0x0671, 0x06B7);
setbits(mask, 0x06BA, 0x06BE);
setbits(mask, 0x06C0, 0x06CE);
setbits(mask, 0x06D0, 0x06D3);
setbit(mask, 0x06D5);
setbits(mask, 0x06E5, 0x06E6);
setbits(mask, 0x0905, 0x0939);
setbit(mask, 0x093D);
setbits(mask, 0x0958, 0x0961);
setbits(mask, 0x0985, 0x098C);
setbits(mask, 0x098F, 0x0990);
setbits(mask, 0x0993, 0x09A8);
setbits(mask, 0x09AA, 0x09B0);
setbit(mask, 0x09B2);
setbits(mask, 0x09B6, 0x09B9);
setbits(mask, 0x09DC, 0x09DD);
setbits(mask, 0x09DF, 0x09E1);
setbits(mask, 0x09F0, 0x09F1);
setbits(mask, 0x0A05, 0x0A0A);
setbits(mask, 0x0A0F, 0x0A10);
setbits(mask, 0x0A13, 0x0A28);
setbits(mask, 0x0A2A, 0x0A30);
setbits(mask, 0x0A32, 0x0A33);
setbits(mask, 0x0A35, 0x0A36);
setbits(mask, 0x0A38, 0x0A39);
setbits(mask, 0x0A59, 0x0A5C);
setbit(mask, 0x0A5E);
setbits(mask, 0x0A72, 0x0A74);
setbits(mask, 0x0A85, 0x0A8B);
setbit(mask, 0x0A8D);
setbits(mask, 0x0A8F, 0x0A91);
setbits(mask, 0x0A93, 0x0AA8);
setbits(mask, 0x0AAA, 0x0AB0);
setbits(mask, 0x0AB2, 0x0AB3);
setbits(mask, 0x0AB5, 0x0AB9);
setbit(mask, 0x0ABD);
setbit(mask, 0x0AE0);
setbits(mask, 0x0B05, 0x0B0C);
setbits(mask, 0x0B0F, 0x0B10);
setbits(mask, 0x0B13, 0x0B28);
setbits(mask, 0x0B2A, 0x0B30);
setbits(mask, 0x0B32, 0x0B33);
setbits(mask, 0x0B36, 0x0B39);
setbit(mask, 0x0B3D);
setbits(mask, 0x0B5C, 0x0B5D);
setbits(mask, 0x0B5F, 0x0B61);
setbits(mask, 0x0B85, 0x0B8A);
setbits(mask, 0x0B8E, 0x0B90);
setbits(mask, 0x0B92, 0x0B95);
setbits(mask, 0x0B99, 0x0B9A);
setbit(mask, 0x0B9C);
setbits(mask, 0x0B9E, 0x0B9F);
setbits(mask, 0x0BA3, 0x0BA4);
setbits(mask, 0x0BA8, 0x0BAA);
setbits(mask, 0x0BAE, 0x0BB5);
setbits(mask, 0x0BB7, 0x0BB9);
setbits(mask, 0x0C05, 0x0C0C);
setbits(mask, 0x0C0E, 0x0C10);
setbits(mask, 0x0C12, 0x0C28);
setbits(mask, 0x0C2A, 0x0C33);
setbits(mask, 0x0C35, 0x0C39);
setbits(mask, 0x0C60, 0x0C61);
setbits(mask, 0x0C85, 0x0C8C);
setbits(mask, 0x0C8E, 0x0C90);
setbits(mask, 0x0C92, 0x0CA8);
setbits(mask, 0x0CAA, 0x0CB3);
setbits(mask, 0x0CB5, 0x0CB9);
setbit(mask, 0x0CDE);
setbits(mask, 0x0CE0, 0x0CE1);
setbits(mask, 0x0D05, 0x0D0C);
setbits(mask, 0x0D0E, 0x0D10);
setbits(mask, 0x0D12, 0x0D28);
setbits(mask, 0x0D2A, 0x0D39);
setbits(mask, 0x0D60, 0x0D61);
setbits(mask, 0x0E01, 0x0E2E);
setbit(mask, 0x0E30);
setbits(mask, 0x0E32, 0x0E33);
setbits(mask, 0x0E40, 0x0E45);
setbits(mask, 0x0E81, 0x0E82);
setbit(mask, 0x0E84);
setbits(mask, 0x0E87, 0x0E88);
setbit(mask, 0x0E8A);
setbit(mask, 0x0E8D);
setbits(mask, 0x0E94, 0x0E97);
setbits(mask, 0x0E99, 0x0E9F);
setbits(mask, 0x0EA1, 0x0EA3);
setbit(mask, 0x0EA5);
setbit(mask, 0x0EA7);
setbits(mask, 0x0EAA, 0x0EAB);
setbits(mask, 0x0EAD, 0x0EAE);
setbit(mask, 0x0EB0);
setbits(mask, 0x0EB2, 0x0EB3);
setbit(mask, 0x0EBD);
setbits(mask, 0x0EC0, 0x0EC4);
setbits(mask, 0x0F40, 0x0F47);
setbits(mask, 0x0F49, 0x0F69);
setbits(mask, 0x10A0, 0x10C5);
setbits(mask, 0x10D0, 0x10F6);
setbit(mask, 0x1100);
setbits(mask, 0x1102, 0x1103);
setbits(mask, 0x1105, 0x1107);
setbit(mask, 0x1109);
setbits(mask, 0x110B, 0x110C);
setbits(mask, 0x110E, 0x1112);
setbit(mask, 0x113C);
setbit(mask, 0x113E);
setbit(mask, 0x1140);
setbit(mask, 0x114C);
setbit(mask, 0x114E);
setbit(mask, 0x1150);
setbits(mask, 0x1154, 0x1155);
setbit(mask, 0x1159);
setbits(mask, 0x115F, 0x1161);
setbit(mask, 0x1163);
setbit(mask, 0x1165);
setbit(mask, 0x1167);
setbit(mask, 0x1169);
setbits(mask, 0x116D, 0x116E);
setbits(mask, 0x1172, 0x1173);
setbit(mask, 0x1175);
setbit(mask, 0x119E);
setbit(mask, 0x11A8);
setbit(mask, 0x11AB);
setbits(mask, 0x11AE, 0x11AF);
setbits(mask, 0x11B7, 0x11B8);
setbit(mask, 0x11BA);
setbits(mask, 0x11BC, 0x11C2);
setbit(mask, 0x11EB);
setbit(mask, 0x11F0);
setbit(mask, 0x11F9);
setbits(mask, 0x1E00, 0x1E9B);
setbits(mask, 0x1EA0, 0x1EF9);
setbits(mask, 0x1F00, 0x1F15);
setbits(mask, 0x1F18, 0x1F1D);
setbits(mask, 0x1F20, 0x1F45);
setbits(mask, 0x1F48, 0x1F4D);
setbits(mask, 0x1F50, 0x1F57);
setbit(mask, 0x1F59);
setbit(mask, 0x1F5B);
setbit(mask, 0x1F5D);
setbits(mask, 0x1F5F, 0x1F7D);
setbits(mask, 0x1F80, 0x1FB4);
setbits(mask, 0x1FB6, 0x1FBC);
setbit(mask, 0x1FBE);
setbits(mask, 0x1FC2, 0x1FC4);
setbits(mask, 0x1FC6, 0x1FCC);
setbits(mask, 0x1FD0, 0x1FD3);
setbits(mask, 0x1FD6, 0x1FDB);
setbits(mask, 0x1FE0, 0x1FEC);
setbits(mask, 0x1FF2, 0x1FF4);
setbits(mask, 0x1FF6, 0x1FFC);
setbit(mask, 0x2126);
setbits(mask, 0x212A, 0x212B);
setbit(mask, 0x212E);
setbits(mask, 0x2180, 0x2182);
// The entries the author tagged "ideographic"/"ideo" are interleaved with the
// remaining ranges in ascending numeric order.
setbit(mask, 0x3007); // ideographic
setbits(mask, 0x3021, 0x3029); // ideo
setbits(mask, 0x3041, 0x3094);
setbits(mask, 0x30A1, 0x30FA);
setbits(mask, 0x3105, 0x312C);
setbits(mask, 0x4E00, 0x9FA5); // ideo
setbits(mask, 0xAC00, 0xD7A3);
// Report how long the table took to build (wall-clock milliseconds).
System.out.println ("Done in " + (System.currentTimeMillis() - tm) +
"ms.");
return mask;
}
/**
 * Builds the bitmask that newisXMLCombiningChar() consults.
 *
 * Starts from an empty mask (newmask()) and sets one bit for every char
 * value listed below. As a side effect it prints a start message and the
 * elapsed build time in milliseconds to System.out.
 *
 * NOTE(review): the ranges look like the XML 1.0 "CombiningChar"
 * production - confirm against the specification before relying on this
 * table.
 *
 * @return the populated mask
 */
private static long[] buildCombinationMask() {
System.out.println ("Building combinationmask");
long tm = System.currentTimeMillis();
long[] mask = newmask();
// setbits() covers an inclusive range of values; setbit() covers a single
// isolated value.
setbits(mask, 0x0300, 0x0345);
setbits(mask, 0x0360, 0x0361);
setbits(mask, 0x0483, 0x0486);
setbits(mask, 0x0591, 0x05A1);
setbits(mask, 0x05A3, 0x05B9);
setbits(mask, 0x05BB, 0x05BD);
setbit(mask, 0x05BF);
setbits(mask, 0x05C1, 0x05C2);
setbit(mask, 0x05C4);
setbits(mask, 0x064B, 0x0652);
setbit(mask, 0x0670);
// The next three calls are adjacent ranges; together they cover
// 0x06D6 through 0x06E4 without a gap.
setbits(mask, 0x06D6, 0x06DC);
setbits(mask, 0x06DD, 0x06DF);
setbits(mask, 0x06E0, 0x06E4);
setbits(mask, 0x06E7, 0x06E8);
setbits(mask, 0x06EA, 0x06ED);
setbits(mask, 0x0901, 0x0903);
setbit(mask, 0x093C);
setbits(mask, 0x093E, 0x094C);
setbit(mask, 0x094D);
setbits(mask, 0x0951, 0x0954);
setbits(mask, 0x0962, 0x0963);
setbits(mask, 0x0981, 0x0983);
setbit(mask, 0x09BC);
setbit(mask, 0x09BE);
setbit(mask, 0x09BF);
setbits(mask, 0x09C0, 0x09C4);
setbits(mask, 0x09C7, 0x09C8);
setbits(mask, 0x09CB, 0x09CD);
setbit(mask, 0x09D7);
setbits(mask, 0x09E2, 0x09E3);
setbit(mask, 0x0A02);
setbit(mask, 0x0A3C);
setbit(mask, 0x0A3E);
setbit(mask, 0x0A3F);
setbits(mask, 0x0A40, 0x0A42);
setbits(mask, 0x0A47, 0x0A48);
setbits(mask, 0x0A4B, 0x0A4D);
setbits(mask, 0x0A70, 0x0A71);
setbits(mask, 0x0A81, 0x0A83);
setbit(mask, 0x0ABC);
setbits(mask, 0x0ABE, 0x0AC5);
setbits(mask, 0x0AC7, 0x0AC9);
setbits(mask, 0x0ACB, 0x0ACD);
setbits(mask, 0x0B01, 0x0B03);
setbit(mask, 0x0B3C);
setbits(mask, 0x0B3E, 0x0B43);
setbits(mask, 0x0B47, 0x0B48);
setbits(mask, 0x0B4B, 0x0B4D);
setbits(mask, 0x0B56, 0x0B57);
setbits(mask, 0x0B82, 0x0B83);
setbits(mask, 0x0BBE, 0x0BC2);
setbits(mask, 0x0BC6, 0x0BC8);
setbits(mask, 0x0BCA, 0x0BCD);
setbit(mask, 0x0BD7);
setbits(mask, 0x0C01, 0x0C03);
setbits(mask, 0x0C3E, 0x0C44);
setbits(mask, 0x0C46, 0x0C48);
setbits(mask, 0x0C4A, 0x0C4D);
setbits(mask, 0x0C55, 0x0C56);
setbits(mask, 0x0C82, 0x0C83);
setbits(mask, 0x0CBE, 0x0CC4);
setbits(mask, 0x0CC6, 0x0CC8);
setbits(mask, 0x0CCA, 0x0CCD);
setbits(mask, 0x0CD5, 0x0CD6);
setbits(mask, 0x0D02, 0x0D03);
setbits(mask, 0x0D3E, 0x0D43);
setbits(mask, 0x0D46, 0x0D48);
setbits(mask, 0x0D4A, 0x0D4D);
setbit(mask, 0x0D57);
setbit(mask, 0x0E31);
setbits(mask, 0x0E34, 0x0E3A);
setbits(mask, 0x0E47, 0x0E4E);
setbit(mask, 0x0EB1);
setbits(mask, 0x0EB4, 0x0EB9);
setbits(mask, 0x0EBB, 0x0EBC);
setbits(mask, 0x0EC8, 0x0ECD);
setbits(mask, 0x0F18, 0x0F19);
setbit(mask, 0x0F35);
setbit(mask, 0x0F37);
setbit(mask, 0x0F39);
setbit(mask, 0x0F3E);
setbit(mask, 0x0F3F);
setbits(mask, 0x0F71, 0x0F84);
setbits(mask, 0x0F86, 0x0F8B);
setbits(mask, 0x0F90, 0x0F95);
setbit(mask, 0x0F97);
setbits(mask, 0x0F99, 0x0FAD);
setbits(mask, 0x0FB1, 0x0FB7);
setbit(mask, 0x0FB9);
setbits(mask, 0x20D0, 0x20DC);
setbit(mask, 0x20E1);
setbits(mask, 0x302A, 0x302F);
setbit(mask, 0x3099);
setbit(mask, 0x309A);
// Report how long the table took to build (wall-clock milliseconds).
System.out.println ("Done in " + (System.currentTimeMillis() - tm) +
"ms.");
return mask;
}
/**
 * Bitmask-based letter check: reports whether the bit for the given
 * character is set in the letter mask.
 *
 * @param c the character to test
 * @return true if the character's bit is set in the letter mask
 */
public static boolean newisXMLLetter(char c) {
    final int codeUnit = c;
    final boolean inLetterMask = getbit(lettermask, codeUnit);
    return inLetterMask;
}
/**
 * Bitmask-based combining-character check: reports whether the bit for the
 * given character is set in the combination mask.
 *
 * @param c the character to test
 * @return true if the character's bit is set in the combination mask
 */
public static boolean newisXMLCombiningChar(char c) {
    final int codeUnit = c;
    final boolean inCombiningMask = getbit(combmask, codeUnit);
    return inCombiningMask;
}
/**
 * Crude command-line benchmark that compares the existing isXMLLetter /
 * isXMLCombiningChar checks with the bitmask-based newisXMLLetter /
 * newisXMLCombiningChar checks.
 *
 * Each file is read into memory once. For every pass, every character is
 * run through the old pair of checks and then through the new pair; the
 * time per 10000 characters and the number of "true" answers are printed
 * for each. The OLD and NEW counts of a pass should be identical; a
 * difference means the two implementations disagree on some character.
 *
 * @param args args[0] is the number of timing passes per file; every
 *             following argument is the name of a file to read
 * @throws IOException if a file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    if (args.length == 0) {
        System.err.println("Usage: <iterations> <file> [<file> ...]");
        return;
    }
    int cnt = Integer.parseInt(args[0]);
    for (int i = 1; i < args.length; i++) {
        char[] chars = readBenchmarkChars(args[i]);
        for (int c = 0; c < cnt; c++) {
            int oldcnt = 0;
            long tm = System.currentTimeMillis();
            for (int x = 0; x < chars.length; x++) {
                if (isXMLLetter(chars[x])) oldcnt++;
                if (isXMLCombiningChar(chars[x])) oldcnt++;
            }
            tm = System.currentTimeMillis() - tm;
            printBenchmarkLine("OLD", args[i], c, tm, oldcnt, chars.length);

            int newcnt = 0;
            tm = System.currentTimeMillis();
            for (int x = 0; x < chars.length; x++) {
                if (newisXMLLetter(chars[x])) newcnt++;
                if (newisXMLCombiningChar(chars[x])) newcnt++;
            }
            tm = System.currentTimeMillis() - tm;
            // Report the count produced by the NEW checks so that a mismatch
            // with the OLD count is actually visible in the output.
            printBenchmarkLine("NEW", args[i], c, tm, newcnt, chars.length);
        }
    }
}

/** Reads a whole file line by line, joining the lines with "\n", and returns its characters. */
private static char[] readBenchmarkChars(String fileName) throws IOException {
    BufferedReader reader = new BufferedReader(new FileReader(fileName));
    try {
        StringBuffer sb = new StringBuffer();
        String line;
        while ((line = reader.readLine()) != null) {
            sb.append(line).append('\n');
        }
        return sb.toString().toCharArray();
    } finally {
        // Release the file handle even if reading fails part-way through.
        reader.close();
    }
}

/** Prints one benchmark result line: time per 10000 characters (truncated to two decimals) and the number of trues. */
private static void printBenchmarkLine(String label, String fileName, int pass,
        long elapsedMillis, int trues, int length) {
    double rate = (double) (elapsedMillis * 10000) / (double) length;
    rate = (double) ((int) (rate * 100.0)) / 100.0;
    System.out.println(label + " Iteration " + fileName + " count " + pass
            + " took " + rate + "ms/10000 chars, counted " + trues
            + " trues in " + length + " characters .");
}
===========================================
-----Original Message-----
From: Rolf Lear [mailto:rlear at algorithmics.com]
Sent: Thursday, April 10, 2003 10:26 AM
To: 'Elliotte Rusty Harold'; jdom-interest at jdom.org
Subject: RE: [jdom-interest] Toward beta 9
How serious are people about performance in Verifier?
Using a relatively random input source (the characters in various Jars), I
can get a 500% - 1000% performance improvement in Verifier.
This is relatively simple, and "just as logical" as the existing verifier.
Have a look at the attached code, it is a "new" Verifier, with a main method
which has a relatively clunky, but effective performance test comparison
between the existing checks, and the proposed checks.
On my linux box I am getting performance improvements from 5.6 ms/10000
chars to 0.7ms/10000 chars. I know that the numbers are rough, but people
with profilers may be able to substantiate them better.
The basic principle is to build a bitmask representing all the valid
letters/combinations. The bitmask has 0xffff+1 bits, i.e. is 8 KB (relatively
small), and there is one bitmask for each "test". I have done only isXMLLetter
and isXMLCombiningChar. The pre-processing overhead is relatively small (on my
box I measure 23ms).
Have a look-see, and tell me if I am barking up the wrong tree. I haven't
neatened up the code too much, but the principle seems good.
I have been running:
ant package
java -cp build/jdom.jar org.jdom.Verifier 5 lib/*.jar
and getting results:
Building lettermask
Done in 22ms.
Building combinationmask
Done in 0ms.
OLD Iteration lib/ant.jar count 0 took 6.93ms/10000 chars, counted 176182
trues in 732481 characters .
NEW Iteration lib/ant.jar count 0 took 0.76ms/10000 chars, counted 176182
trues in 732481 characters .
OLD Iteration lib/ant.jar count 1 took 5.61ms/10000 chars, counted 176182
trues in 732481 characters .
NEW Iteration lib/ant.jar count 1 took 0.76ms/10000 chars, counted 176182
trues in 732481 characters .
OLD Iteration lib/ant.jar count 2 took 5.66ms/10000 chars, counted 176182
trues in 732481 characters .
NEW Iteration lib/ant.jar count 2 took 0.76ms/10000 chars, counted 176182
trues in 732481 characters .
OLD Iteration lib/ant.jar count 3 took 5.69ms/10000 chars, counted 176182
trues in 732481 characters .
NEW Iteration lib/ant.jar count 3 took 0.76ms/10000 chars, counted 176182
trues in 732481 characters .
OLD Iteration lib/ant.jar count 4 took 5.61ms/10000 chars, counted 176182
trues in 732481 characters .
NEW Iteration lib/ant.jar count 4 took 0.76ms/10000 chars, counted 176182
trues in 732481 characters .
OLD Iteration lib/jaxen-core.jar count 0 took 5.34ms/10000 chars, counted
41039 trues in 160965 characters .
NEW Iteration lib/jaxen-core.jar count 0 took 0.86ms/10000 chars, counted
41039 trues in 160965 characters .
OLD Iteration lib/jaxen-core.jar count 1 took 5.34ms/10000 chars, counted
41039 trues in 160965 characters .
NEW Iteration lib/jaxen-core.jar count 1 took 0.8ms/10000 chars, counted
41039 trues in 160965 characters .
OLD Iteration lib/jaxen-core.jar count 2 took 5.34ms/10000 chars, counted
41039 trues in 160965 characters .
NEW Iteration lib/jaxen-core.jar count 2 took 0.8ms/10000 chars, counted
41039 trues in 160965 characters .
OLD Iteration lib/jaxen-core.jar count 3 took 5.34ms/10000 chars, counted
41039 trues in 160965 characters .
NEW Iteration lib/jaxen-core.jar count 3 took 0.8ms/10000 chars, counted
41039 trues in 160965 characters .
OLD Iteration lib/jaxen-core.jar count 4 took 5.34ms/10000 chars, counted
41039 trues in 160965 characters .
NEW Iteration lib/jaxen-core.jar count 4 took 0.8ms/10000 chars, counted
41039 trues in 160965 characters .
........
Rolf
-----Original Message-----
From: Elliotte Rusty Harold [mailto:elharo at metalab.unc.edu]
Sent: Thursday, April 10, 2003 7:54 AM
To: jdom-interest at jdom.org
Subject: Re: [jdom-interest] Toward beta 9
At 10:55 PM -0700 4/9/03, Philip Nelson wrote:
>Has anybody tried this approach?
>
>create a package protected or inner subclass of DefaultJDOMFactory in
>SAXBuilder. Then in the factory, for example...
>
> private class NoCheckText extends Text
> {
> public void noCheck(String text) {
> value = text;
> }
> }
> public Text text(String text) {
> NoCheckText t = new NoCheckText();
> t.noCheck(text);
> return (Text) t;
> }
That looks like it might actually work without causing too many
problems or further complicating the API, though it does depend on
those protected, do-nothing, no-args constructors that I wish we
didn't have.
--
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://jdom.org/pipermail/jdom-interest/attachments/20030410/a48aab70/attachment.htm
More information about the jdom-interest
mailing list