[jdom-interest] Toward beta 9

Rolf Lear rlear at algorithmics.com
Thu Apr 10 07:44:51 PDT 2003


Since my post-with-attachment is waiting for moderator approval, I thought I
would post the "diff".

Just paste the following at the beginning of the Verifier class.

Rolf

=================================================
    // Lookup tables backing the bit-mask character checks. Each is assigned
    // exactly once during class initialisation and never rebound, hence final.
    // Declaration order matters: static initialisers run top to bottom, and the
    // two mask builders go through setbit(), which reads shifts, so shifts
    // has to be initialised first.

    /** Single-bit values, one per bit position of a long (filled by buildshifts()). */
    private static final long[] shifts = buildshifts();
    /** Packed bit mask consulted by newisXMLLetter (filled by buildLetterMask()). */
    private static final long[] lettermask = buildLetterMask();
    /** Packed bit mask consulted by newisXMLCombiningChar (filled by buildCombinationMask()). */
    private static final long[] combmask = buildCombinationMask();

    /**
     * Reports whether the bit numbered <code>index</code> is set in a packed
     * mask that stores 64 bits per array element.
     *
     * @param mask  the packed bit mask to inspect
     * @param index the bit number; the upper bits select the array element,
     *              the low six bits select the bit inside that element
     * @return <code>true</code> if the bit is set
     */
    private static boolean getbit(long[] mask, int index) {
        final long word = mask[index >>> 6];
        final long probe = shifts[index & 0x3f];
        return (word & probe) != 0L;
    }
    
    /**
     * Turns on the bit numbered <code>index</code> in a packed mask that
     * stores 64 bits per array element. Bits that are already set stay set.
     *
     * @param mask  the packed bit mask to modify in place
     * @param index the bit number to set
     */
    private static void setbit(long[] mask, int index) {
        final int word = index >>> 6;   // which long holds the bit
        final int bit = index & 0x3f;   // position of the bit inside that long
        mask[word] = mask[word] | shifts[bit];
    }

    /**
     * Turns on every bit from <code>start</code> to <code>end</code>, both
     * bounds included. Nothing is set when <code>start</code> is greater
     * than <code>end</code>.
     *
     * @param mask  the packed bit mask to modify in place
     * @param start the first bit number to set
     * @param end   the last bit number to set (inclusive)
     */
    private static void setbits(long[] mask, int start, int end) {
        int index = start;
        while (index <= end) {
            setbit(mask, index);
            index++;
        }
    }

    /**
     * Creates the table of single-bit values used to address one bit inside
     * a long: entry <code>i</code> has only bit <code>i</code> set.
     *
     * @return a new 64-element array where element <code>i</code> equals
     *         <code>1L &lt;&lt; i</code>
     */
    private static long[] buildshifts() {
        final long[] table = new long[64];
        long bit = 1L;
        for (int pos = 0; pos < table.length; pos++) {
            table[pos] = bit;
            // Move the single set bit one position up for the next entry.
            bit <<= 1;
        }
        return table;
    }
    
    /**
     * Allocates an empty (all bits clear) mask with one bit for every
     * possible <code>char</code> value.
     *
     * @return a new zero-filled array of 0x400 longs
     */
    private static long[] newmask() {
        // 0x10000 distinct char values packed 64 to a long: 0x10000 / 64 == 0x400.
        final int words = (0xffff + 1) >>> 6;
        return new long[words];
    }

    /**
     * Builds the packed bit mask that newisXMLLetter looks characters up in.
     * Starting from an empty mask, every setbit call marks one code point and
     * every setbits call marks an inclusive range of code points.
     * <p>
     * The ranges presumably transcribe the Letter production
     * (BaseChar | Ideographic) of the XML 1.0 recommendation; verify against
     * the specification before changing any of them.
     * <p>
     * NOTE(review): the method prints a banner and the elapsed build time to
     * System.out. As it runs from a static initialiser, that output appears
     * whenever the class is initialised; consider dropping it outside of
     * benchmarking.
     *
     * @return the populated mask, one bit per char value
     */
    private static long[] buildLetterMask() {
        System.out.println ("Building lettermask");
        // Start time, used only for the "Done in" report below.
        long tm = System.currentTimeMillis();
        long[] mask = newmask();

        setbits(mask, 0x0041, 0x005a);
        setbits(mask, 0x0061, 0x007A);
        setbits(mask, 0x00C0, 0x00D6);
        setbits(mask, 0x00D8, 0x00F6);
        setbits(mask, 0x00F8, 0x00FF);
        setbits(mask, 0x0100, 0x0131);
        setbits(mask, 0x0134, 0x013E);
        setbits(mask, 0x0141, 0x0148);
        setbits(mask, 0x014A, 0x017E);
        setbits(mask, 0x0180, 0x01C3);
        setbits(mask, 0x01CD, 0x01F0);
        setbits(mask, 0x01F4, 0x01F5);
        setbits(mask, 0x01FA, 0x0217);
        setbits(mask, 0x0250, 0x02A8);
        setbits(mask, 0x02BB, 0x02C1);
        setbit(mask, 0x0386);
        setbits(mask, 0x0388, 0x038A);
        setbit(mask, 0x038C);
        setbits(mask, 0x038E, 0x03A1);
        setbits(mask, 0x03A3, 0x03CE);
        setbits(mask, 0x03D0, 0x03D6);
        setbit(mask, 0x03DA);
        setbit(mask, 0x03DC);
        setbit(mask, 0x03DE);
        setbit(mask, 0x03E0);
        setbits(mask, 0x03E2, 0x03F3);
        setbits(mask, 0x0401, 0x040C);
        setbits(mask, 0x040E, 0x044F);
        setbits(mask, 0x0451, 0x045C);
        setbits(mask, 0x045E, 0x0481);
        setbits(mask, 0x0490, 0x04C4);
        setbits(mask, 0x04C7, 0x04C8);
        setbits(mask, 0x04CB, 0x04CC);
        setbits(mask, 0x04D0, 0x04EB);
        setbits(mask, 0x04EE, 0x04F5);
        setbits(mask, 0x04F8, 0x04F9);
        setbits(mask, 0x0531, 0x0556);
        setbit(mask, 0x0559);
        setbits(mask, 0x0561, 0x0586);
        setbits(mask, 0x05D0, 0x05EA);
        setbits(mask, 0x05F0, 0x05F2);
        setbits(mask, 0x0621, 0x063A);
        setbits(mask, 0x0641, 0x064A);
        setbits(mask, 0x0671, 0x06B7);
        setbits(mask, 0x06BA, 0x06BE);
        setbits(mask, 0x06C0, 0x06CE);
        setbits(mask, 0x06D0, 0x06D3);
        setbit(mask, 0x06D5);
        setbits(mask, 0x06E5, 0x06E6);
        setbits(mask, 0x0905, 0x0939);
        setbit(mask, 0x093D);
        setbits(mask, 0x0958, 0x0961);
        setbits(mask, 0x0985, 0x098C);
        setbits(mask, 0x098F, 0x0990);
        setbits(mask, 0x0993, 0x09A8);
        setbits(mask, 0x09AA, 0x09B0);
        setbit(mask, 0x09B2);
        setbits(mask, 0x09B6, 0x09B9);
        setbits(mask, 0x09DC, 0x09DD);
        setbits(mask, 0x09DF, 0x09E1);
        setbits(mask, 0x09F0, 0x09F1);
        setbits(mask, 0x0A05, 0x0A0A);
        setbits(mask, 0x0A0F, 0x0A10);
        setbits(mask, 0x0A13, 0x0A28);
        setbits(mask, 0x0A2A, 0x0A30);
        setbits(mask, 0x0A32, 0x0A33);
        setbits(mask, 0x0A35, 0x0A36);
        setbits(mask, 0x0A38, 0x0A39);
        setbits(mask, 0x0A59, 0x0A5C);
        setbit(mask, 0x0A5E);
        setbits(mask, 0x0A72, 0x0A74);
        setbits(mask, 0x0A85, 0x0A8B);
        setbit(mask, 0x0A8D);
        setbits(mask, 0x0A8F, 0x0A91);
        setbits(mask, 0x0A93, 0x0AA8);
        setbits(mask, 0x0AAA, 0x0AB0);
        setbits(mask, 0x0AB2, 0x0AB3);
        setbits(mask, 0x0AB5, 0x0AB9);
        setbit(mask, 0x0ABD);
        setbit(mask, 0x0AE0);
        setbits(mask, 0x0B05, 0x0B0C);
        setbits(mask, 0x0B0F, 0x0B10);
        setbits(mask, 0x0B13, 0x0B28);
        setbits(mask, 0x0B2A, 0x0B30);
        setbits(mask, 0x0B32, 0x0B33);
        setbits(mask, 0x0B36, 0x0B39);
        setbit(mask, 0x0B3D);
        setbits(mask, 0x0B5C, 0x0B5D);
        setbits(mask, 0x0B5F, 0x0B61);
        setbits(mask, 0x0B85, 0x0B8A);
        setbits(mask, 0x0B8E, 0x0B90);
        setbits(mask, 0x0B92, 0x0B95);
        setbits(mask, 0x0B99, 0x0B9A);
        setbit(mask, 0x0B9C);
        setbits(mask, 0x0B9E, 0x0B9F);
        setbits(mask, 0x0BA3, 0x0BA4);
        setbits(mask, 0x0BA8, 0x0BAA);
        setbits(mask, 0x0BAE, 0x0BB5);
        setbits(mask, 0x0BB7, 0x0BB9);
        setbits(mask, 0x0C05, 0x0C0C);
        setbits(mask, 0x0C0E, 0x0C10);
        setbits(mask, 0x0C12, 0x0C28);
        setbits(mask, 0x0C2A, 0x0C33);
        setbits(mask, 0x0C35, 0x0C39);
        setbits(mask, 0x0C60, 0x0C61);
        setbits(mask, 0x0C85, 0x0C8C);
        setbits(mask, 0x0C8E, 0x0C90);
        setbits(mask, 0x0C92, 0x0CA8);
        setbits(mask, 0x0CAA, 0x0CB3);
        setbits(mask, 0x0CB5, 0x0CB9);
        setbit(mask, 0x0CDE);
        setbits(mask, 0x0CE0, 0x0CE1);
        setbits(mask, 0x0D05, 0x0D0C);
        setbits(mask, 0x0D0E, 0x0D10);
        setbits(mask, 0x0D12, 0x0D28);
        setbits(mask, 0x0D2A, 0x0D39);
        setbits(mask, 0x0D60, 0x0D61);
        setbits(mask, 0x0E01, 0x0E2E);
        setbit(mask, 0x0E30);
        setbits(mask, 0x0E32, 0x0E33);
        setbits(mask, 0x0E40, 0x0E45);
        setbits(mask, 0x0E81, 0x0E82);
        setbit(mask, 0x0E84);
        setbits(mask, 0x0E87, 0x0E88);
        setbit(mask, 0x0E8A);
        setbit(mask, 0x0E8D);
        setbits(mask, 0x0E94, 0x0E97);
        setbits(mask, 0x0E99, 0x0E9F);
        setbits(mask, 0x0EA1, 0x0EA3);
        setbit(mask, 0x0EA5);
        setbit(mask, 0x0EA7);
        setbits(mask, 0x0EAA, 0x0EAB);
        setbits(mask, 0x0EAD, 0x0EAE);
        setbit(mask, 0x0EB0);
        setbits(mask, 0x0EB2, 0x0EB3);
        setbit(mask, 0x0EBD);
        setbits(mask, 0x0EC0, 0x0EC4);
        setbits(mask, 0x0F40, 0x0F47);
        setbits(mask, 0x0F49, 0x0F69);
        setbits(mask, 0x10A0, 0x10C5);
        setbits(mask, 0x10D0, 0x10F6);
        setbit(mask, 0x1100);
        setbits(mask, 0x1102, 0x1103);
        setbits(mask, 0x1105, 0x1107);
        setbit(mask, 0x1109);
        setbits(mask, 0x110B, 0x110C);
        setbits(mask, 0x110E, 0x1112);
        setbit(mask, 0x113C);
        setbit(mask, 0x113E);
        setbit(mask, 0x1140);
        setbit(mask, 0x114C);
        setbit(mask, 0x114E);
        setbit(mask, 0x1150);
        setbits(mask, 0x1154, 0x1155);
        setbit(mask, 0x1159);
        setbits(mask, 0x115F, 0x1161);
        setbit(mask, 0x1163);
        setbit(mask, 0x1165);
        setbit(mask, 0x1167);
        setbit(mask, 0x1169);
        setbits(mask, 0x116D, 0x116E);
        setbits(mask, 0x1172, 0x1173);
        setbit(mask, 0x1175);
        setbit(mask, 0x119E);
        setbit(mask, 0x11A8);
        setbit(mask, 0x11AB);
        setbits(mask, 0x11AE, 0x11AF);
        setbits(mask, 0x11B7, 0x11B8);
        setbit(mask, 0x11BA);
        setbits(mask, 0x11BC, 0x11C2);
        setbit(mask, 0x11EB);
        setbit(mask, 0x11F0);
        setbit(mask, 0x11F9);
        setbits(mask, 0x1E00, 0x1E9B);
        setbits(mask, 0x1EA0, 0x1EF9);
        setbits(mask, 0x1F00, 0x1F15);
        setbits(mask, 0x1F18, 0x1F1D);
        setbits(mask, 0x1F20, 0x1F45);
        setbits(mask, 0x1F48, 0x1F4D);
        setbits(mask, 0x1F50, 0x1F57);
        setbit(mask, 0x1F59);
        setbit(mask, 0x1F5B);
        setbit(mask, 0x1F5D);
        setbits(mask, 0x1F5F, 0x1F7D);
        setbits(mask, 0x1F80, 0x1FB4);
        setbits(mask, 0x1FB6, 0x1FBC);
        setbit(mask, 0x1FBE);
        setbits(mask, 0x1FC2, 0x1FC4);
        setbits(mask, 0x1FC6, 0x1FCC);
        setbits(mask, 0x1FD0, 0x1FD3);
        setbits(mask, 0x1FD6, 0x1FDB);
        setbits(mask, 0x1FE0, 0x1FEC);
        setbits(mask, 0x1FF2, 0x1FF4);
        setbits(mask, 0x1FF6, 0x1FFC);
        setbit(mask, 0x2126);
        setbits(mask, 0x212A, 0x212B);
        setbit(mask, 0x212E);
        setbits(mask, 0x2180, 0x2182);
        setbit(mask, 0x3007);                          // ideographic
        setbits(mask, 0x3021, 0x3029);  // ideographic
        setbits(mask, 0x3041, 0x3094);
        setbits(mask, 0x30A1, 0x30FA);
        setbits(mask, 0x3105, 0x312C);
        setbits(mask, 0x4E00, 0x9FA5);  // ideographic
        setbits(mask, 0xAC00, 0xD7A3);
        
        // Report how long filling the mask took.
        System.out.println ("Done in " + (System.currentTimeMillis() - tm) +
"ms.");
        return mask;
    }


    /**
     * Builds the packed bit mask that newisXMLCombiningChar looks characters
     * up in. Starting from an empty mask, every setbit call marks one code
     * point and every setbits call marks an inclusive range of code points.
     * <p>
     * The ranges presumably transcribe the CombiningChar production of the
     * XML 1.0 recommendation; verify against the specification before
     * changing any of them.
     * <p>
     * NOTE(review): the method prints a banner and the elapsed build time to
     * System.out. As it runs from a static initialiser, that output appears
     * whenever the class is initialised; consider dropping it outside of
     * benchmarking.
     *
     * @return the populated mask, one bit per char value
     */
    private static long[] buildCombinationMask() {
        System.out.println ("Building combinationmask");
        // Start time, used only for the "Done in" report below.
        long tm = System.currentTimeMillis();
        long[] mask = newmask();
        setbits(mask, 0x0300, 0x0345);
        setbits(mask, 0x0360, 0x0361);
        setbits(mask, 0x0483, 0x0486);
        setbits(mask, 0x0591, 0x05A1);

        setbits(mask, 0x05A3, 0x05B9);
        setbits(mask, 0x05BB, 0x05BD);
        setbit(mask, 0x05BF);
        setbits(mask, 0x05C1, 0x05C2);

        setbit(mask, 0x05C4);
        setbits(mask, 0x064B, 0x0652);
        setbit(mask, 0x0670);
        setbits(mask, 0x06D6, 0x06DC);

        setbits(mask, 0x06DD, 0x06DF);
        setbits(mask, 0x06E0, 0x06E4);
        setbits(mask, 0x06E7, 0x06E8);

        setbits(mask, 0x06EA, 0x06ED);
        setbits(mask, 0x0901, 0x0903);
        setbit(mask, 0x093C);
        setbits(mask, 0x093E, 0x094C);

        setbit(mask, 0x094D);
        setbits(mask, 0x0951, 0x0954);
        setbits(mask, 0x0962, 0x0963);
        setbits(mask, 0x0981, 0x0983);

        setbit(mask, 0x09BC);
        setbit(mask, 0x09BE);
        setbit(mask, 0x09BF);
        setbits(mask, 0x09C0, 0x09C4);
        setbits(mask, 0x09C7, 0x09C8);

        setbits(mask, 0x09CB, 0x09CD);
        setbit(mask, 0x09D7);
        setbits(mask, 0x09E2, 0x09E3);
        setbit(mask, 0x0A02);
        setbit(mask, 0x0A3C);

        setbit(mask, 0x0A3E);
        setbit(mask, 0x0A3F);
        setbits(mask, 0x0A40, 0x0A42);
        setbits(mask, 0x0A47, 0x0A48);

        setbits(mask, 0x0A4B, 0x0A4D);
        setbits(mask, 0x0A70, 0x0A71);
        setbits(mask, 0x0A81, 0x0A83);
        setbit(mask, 0x0ABC);

        setbits(mask, 0x0ABE, 0x0AC5);
        setbits(mask, 0x0AC7, 0x0AC9);
        setbits(mask, 0x0ACB, 0x0ACD);

        setbits(mask, 0x0B01, 0x0B03);
        setbit(mask, 0x0B3C);
        setbits(mask, 0x0B3E, 0x0B43);
        setbits(mask, 0x0B47, 0x0B48);

        setbits(mask, 0x0B4B, 0x0B4D);
        setbits(mask, 0x0B56, 0x0B57);
        setbits(mask, 0x0B82, 0x0B83);

        setbits(mask, 0x0BBE, 0x0BC2);
        setbits(mask, 0x0BC6, 0x0BC8);
        setbits(mask, 0x0BCA, 0x0BCD);
        setbit(mask, 0x0BD7);

        setbits(mask, 0x0C01, 0x0C03);
        setbits(mask, 0x0C3E, 0x0C44);
        setbits(mask, 0x0C46, 0x0C48);

        setbits(mask, 0x0C4A, 0x0C4D);
        setbits(mask, 0x0C55, 0x0C56);
        setbits(mask, 0x0C82, 0x0C83);

        setbits(mask, 0x0CBE, 0x0CC4);
        setbits(mask, 0x0CC6, 0x0CC8);
        setbits(mask, 0x0CCA, 0x0CCD);

        setbits(mask, 0x0CD5, 0x0CD6);
        setbits(mask, 0x0D02, 0x0D03);
        setbits(mask, 0x0D3E, 0x0D43);

        setbits(mask, 0x0D46, 0x0D48);
        setbits(mask, 0x0D4A, 0x0D4D);
        setbit(mask, 0x0D57);
        setbit(mask, 0x0E31);

        setbits(mask, 0x0E34, 0x0E3A);
        setbits(mask, 0x0E47, 0x0E4E);
        setbit(mask, 0x0EB1);
        setbits(mask, 0x0EB4, 0x0EB9);

        setbits(mask, 0x0EBB, 0x0EBC);
        setbits(mask, 0x0EC8, 0x0ECD);
        setbits(mask, 0x0F18, 0x0F19);
        setbit(mask, 0x0F35);

        setbit(mask, 0x0F37);
        setbit(mask, 0x0F39);
        setbit(mask, 0x0F3E);
        setbit(mask, 0x0F3F);
        setbits(mask, 0x0F71, 0x0F84);

        setbits(mask, 0x0F86, 0x0F8B);
        setbits(mask, 0x0F90, 0x0F95);
        setbit(mask, 0x0F97);
        setbits(mask, 0x0F99, 0x0FAD);

        setbits(mask, 0x0FB1, 0x0FB7);
        setbit(mask, 0x0FB9);
        setbits(mask, 0x20D0, 0x20DC);
        setbit(mask, 0x20E1);

        setbits(mask, 0x302A, 0x302F);
        setbit(mask, 0x3099);
        setbit(mask, 0x309A);
        // Report how long filling the mask took.
        System.out.println ("Done in " + (System.currentTimeMillis() - tm) +
"ms.");
        return mask;
    }

    /**
     * Table-driven letter check: looks the character up in the precomputed
     * letter mask instead of evaluating a chain of range comparisons.
     *
     * @param c the character to classify
     * @return <code>true</code> if the bit for <code>c</code> is set in the
     *         letter mask
     */
    public static boolean newisXMLLetter(char c) {
        final int bitNumber = c;   // a char widens to 0..0xffff, the mask's index range
        return getbit(lettermask, bitNumber);
    }
    
    /**
     * Table-driven combining-character check: looks the character up in the
     * precomputed combining-character mask.
     *
     * @param c the character to classify
     * @return <code>true</code> if the bit for <code>c</code> is set in the
     *         combining-character mask
     */
    public static boolean newisXMLCombiningChar(char c) {
        final int bitNumber = c;   // a char widens to 0..0xffff, the mask's index range
        return getbit(combmask, bitNumber);
    }
    
    /**
     * Rough benchmark comparing the existing character checks
     * (isXMLLetter / isXMLCombiningChar) with the mask-based ones
     * (newisXMLLetter / newisXMLCombiningChar).
     * <p>
     * Every file named on the command line is read as text, line by line,
     * with a newline appended after each line. For each of the requested
     * iterations, both pairs of checks are run over all characters of the
     * file; the elapsed time per 10000 characters and the number of
     * <code>true</code> results are printed for the old and the new
     * implementation, so the two counts can be compared.
     *
     * @param args <code>args[0]</code> is the number of iterations per file;
     *             the remaining arguments are the files to read
     * @throws IOException if a file cannot be opened or read
     * @throws NumberFormatException if <code>args[0]</code> is not an integer
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 0) {
            // Without the iteration count there is nothing sensible to run.
            System.err.println("Usage: <iterations> <file> [<file> ...]");
            return;
        }
        int cnt = Integer.parseInt(args[0]);
        for (int i = 1; i < args.length; i++) {
            StringBuffer sb = new StringBuffer();
            BufferedReader reader = new BufferedReader(new
FileReader(args[i]));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line).append('\n');
                }
            } finally {
                // Release the file handle even when reading fails part-way.
                reader.close();
            }
            char[] chars = sb.toString().toCharArray();
            for (int c = 0; c < cnt; c++) {

                int oldcnt = 0;
                int newcnt = 0;

                long tm = System.currentTimeMillis();
                for (int x = 0; x < chars.length; x++) {
                    if (isXMLLetter(chars[x])) oldcnt++;
                    if (isXMLCombiningChar(chars[x])) oldcnt++;
                }
                tm = System.currentTimeMillis() - tm;
                double rate = (double)(tm * 10000) / (double)chars.length;
                // Truncate to two decimal places for display.
                rate = (double)((int)(rate * 100.0)) / 100.0;
                System.out.println("OLD Iteration " + args[i] + " count " +
c + " took " + rate + "ms/10000 chars, counted " + oldcnt + " trues in " +
chars.length + " characters .");

                tm = System.currentTimeMillis();
                for (int x = 0; x < chars.length; x++) {
                    if (newisXMLLetter(chars[x])) newcnt++;
                    if (newisXMLCombiningChar(chars[x])) newcnt++;
                }
                tm = System.currentTimeMillis() - tm;
                rate = (double)(tm * 10000) / (double)chars.length;
                rate = (double)((int)(rate * 100.0)) / 100.0;
                // Report the count produced by the NEW checks; printing the
                // old count here would hide any disagreement between the two.
                System.out.println("NEW Iteration " + args[i] + " count " +
c + " took " + rate + "ms/10000 chars, counted " + newcnt + " trues in " +
chars.length + " characters .");
            }
        }
    }

===========================================


-----Original Message-----
From: Rolf Lear [mailto:rlear at algorithmics.com]
Sent: Thursday, April 10, 2003 10:26 AM
To: 'Elliotte Rusty Harold'; jdom-interest at jdom.org
Subject: RE: [jdom-interest] Toward beta 9


How serious are people about performance in Verifier? 
Using a relatively random input source (the characters in various Jars), I
can get a 500% - 1000% performance improvement in Verifier.
This is relatively simple, and "just as logical" as the existing verifier. 
Have a look at the attached code, it is a "new" Verifier, with a main method
which has a relatively clunky, but effective performance test comparison
between the existing checks, and the proposed checks.
On my linux box I am getting performance improvements from 5.6 ms/10000
chars to 0.7ms/10000 chars. I know that the numbers are rough, but people
with profilers may be able to substantiate them better.
The basic principle is to build a bitmask representing all the valid
letters/combinations. The bitmask has 0xffff+1 bits, i.e. is 8K (relatively
small), and there is one such mask for each "test". I have done only isXMLLetter
and isXMLCombiningChar. The pre-processing overhead is relatively small (on my
box I measure 23ms).
Have a look-see, and tell me if I am barking up the wrong tree. I haven't
neatened up the code too much, but the principle seems good.
I have been running: 
ant package 
java -cp build/jdom.jar org.jdom.Verifier 5 lib/*.jar 
and getting results: 
Building lettermask 
Done in 22ms. 
Building combinationmask 
Done in 0ms. 
OLD Iteration lib/ant.jar count 0 took 6.93ms/10000 chars, counted 176182
trues in 732481 characters . 
NEW Iteration lib/ant.jar count 0 took 0.76ms/10000 chars, counted 176182
trues in 732481 characters . 
OLD Iteration lib/ant.jar count 1 took 5.61ms/10000 chars, counted 176182
trues in 732481 characters . 
NEW Iteration lib/ant.jar count 1 took 0.76ms/10000 chars, counted 176182
trues in 732481 characters . 
OLD Iteration lib/ant.jar count 2 took 5.66ms/10000 chars, counted 176182
trues in 732481 characters . 
NEW Iteration lib/ant.jar count 2 took 0.76ms/10000 chars, counted 176182
trues in 732481 characters . 
OLD Iteration lib/ant.jar count 3 took 5.69ms/10000 chars, counted 176182
trues in 732481 characters . 
NEW Iteration lib/ant.jar count 3 took 0.76ms/10000 chars, counted 176182
trues in 732481 characters . 
OLD Iteration lib/ant.jar count 4 took 5.61ms/10000 chars, counted 176182
trues in 732481 characters . 
NEW Iteration lib/ant.jar count 4 took 0.76ms/10000 chars, counted 176182
trues in 732481 characters . 
OLD Iteration lib/jaxen-core.jar count 0 took 5.34ms/10000 chars, counted
41039 trues in 160965 characters . 
NEW Iteration lib/jaxen-core.jar count 0 took 0.86ms/10000 chars, counted
41039 trues in 160965 characters . 
OLD Iteration lib/jaxen-core.jar count 1 took 5.34ms/10000 chars, counted
41039 trues in 160965 characters . 
NEW Iteration lib/jaxen-core.jar count 1 took 0.8ms/10000 chars, counted
41039 trues in 160965 characters . 
OLD Iteration lib/jaxen-core.jar count 2 took 5.34ms/10000 chars, counted
41039 trues in 160965 characters . 
NEW Iteration lib/jaxen-core.jar count 2 took 0.8ms/10000 chars, counted
41039 trues in 160965 characters . 
OLD Iteration lib/jaxen-core.jar count 3 took 5.34ms/10000 chars, counted
41039 trues in 160965 characters . 
NEW Iteration lib/jaxen-core.jar count 3 took 0.8ms/10000 chars, counted
41039 trues in 160965 characters . 
OLD Iteration lib/jaxen-core.jar count 4 took 5.34ms/10000 chars, counted
41039 trues in 160965 characters . 
NEW Iteration lib/jaxen-core.jar count 4 took 0.8ms/10000 chars, counted
41039 trues in 160965 characters . 
........ 


Rolf 


-----Original Message----- 
From: Elliotte Rusty Harold [mailto:elharo at metalab.unc.edu] 
Sent: Thursday, April 10, 2003 7:54 AM 
To: jdom-interest at jdom.org 
Subject: Re: [jdom-interest] Toward beta 9 


At 10:55 PM -0700 4/9/03, Philip Nelson wrote: 
>Has anybody tried this approach? 
> 
>create a package protected or inner subclass of DefaultJDOMFactory in 
>SAXBuilder.  Then in the factory, for example... 
> 
>     private class NoCheckText extends Text 
>     { 
>        public void noCheck(String text) { 
>           value = text; 
>        } 
>     } 
>     public Text text(String text) { 
>         NoCheckText t = new NoCheckText(); 
>         t.noCheck(text); 
>         return (Text) t; 
>     } 


That looks like it might actually work without causing too many 
problems or further complicating the API, though it does depend on 
those protected, do-nothing, no-args constructors that I wish we 
didn't have. 
-- 
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://jdom.org/pipermail/jdom-interest/attachments/20030410/a48aab70/attachment.htm


More information about the jdom-interest mailing list