[jdom-interest] Verifier Namespace patches

Thu Jun 27 07:49:07 PDT 2002

I've attached two patches for the Verifier and TestVerifier classes 
related to testing namespace URIs. I think these do all the checks that 
are possible using the general URI specification. Specifically:

1. Each URI reference contains only characters allowed by RFC 2396.
2. Each URI reference contains no more than one # character.
3. Each percent sign is followed by two hexadecimal digits.

Since namespace URIs may be relative, and may be URI references, there's 
not a lot more we can do.

For convenience, I've added two new public methods to the Verifier 
class, which are used when testing the content of each URI:

    public static boolean isURICharacter(char c)
    public static boolean isHexDigit(char c)

However, if you'd prefer not to clutter the API, these could easily be 
made private.

--
Elliotte
-------------- next part --------------
Index: src/java/org/jdom/Verifier.java
===================================================================
RCS file: /home/cvspublic/jdom/src/java/org/jdom/Verifier.java,v
retrieving revision 1.40
diff -d -u -r1.40 Verifier.java

--- src/java/org/jdom/Verifier.java	2002/06/26 01:44:26	1.40
+++ src/java/org/jdom/Verifier.java	2002/06/27 22:49:19
@@ -250,31 +250,52 @@
 
     /**
      * This will check the supplied name to see if it is legal for use as
-     * a JDOM <code>{@link Namespace}</code> URI.
+     * a JDOM <code>{@link Namespace}</code> URI. (Technically it checks
+     * to see if this is a legal RFC 2396 URI reference, which is not quite the
+     * same thing as a URI.)
      *
      * @param uri <code>String</code> URI to check.
      * @return <code>String</code> - reason name is illegal, or
      *         <code>null</code> if name is OK.
      */
     public static String checkNamespaceURI(String uri) {
-        // Manually do rules, since URIs can be null or empty
+        // URIs can be null or empty to indicate no namespace
         if ((uri == null) || (uri.equals(""))) {
             return null;
         }
 
-        // Cannot start with a number
-        char first = uri.charAt(0);
-        if (Character.isDigit(first)) {
-            return "Namespace URIs cannot begin with a number";
-        }
-        // Cannot start with a $
-        if (first == '$') {
-            return "Namespace URIs cannot begin with a dollar sign ($)";
-        }
-        // Cannot start with a -
-        if (first == '-') {
-            return "Namespace URIs cannot begin with a hyphen (-)";
-        }
+        // We need to make sure there are no more than one #
+        // in the proposed URI reference
+        int numberOfHashes = 0;
+        for (int i = 0; i < uri.length(); i++) {
+            char test = uri.charAt(i);
+            if (!isURICharacter(test)) {
+                if (test == '#') {
+                  numberOfHashes++;
+                  if (numberOfHashes > 1) {
+                    return "URI references can contain at most one # character.";
+                  }
+                }
+                else {
+                  String msgNumber = "0x" + Integer.toHexString(test);
+                  if (test <= 0x09) msgNumber = "0x0" + Integer.toHexString(test);
+                  return "Namespace URIs cannot contain " + msgNumber;
+                }
+            } // end if
+            if (test == '%') {  // must be followed by two hexadecimal digits
+                   try {
+                       char firstDigit = uri.charAt(i+1);
+                       char secondDigit = uri.charAt(i+2);
+                       if (!isHexDigit(firstDigit) || !isHexDigit(secondDigit)) {
+                           return "Percent signs in URIs must be followed by exactly two hexadecimal digits.";    
+                       }
+		  
+                   }
+                   catch (StringIndexOutOfBoundsException e) {
+                       return "Percent signs in URIs must be followed by exactly two hexadecimal digits.";    
+                   }
+            }
+        } // end for
 
         // If we got here, everything is OK
         return null;
@@ -1075,5 +1096,52 @@
         if (c < 0x0F20) return false;  if (c <= 0x0F29) return true; 
       
         return false;
-    }    
+    }  
+    
+        /**
+     * This is a utility function for determining whether a specified
+     * Unicode character is allowed in 
+     * URIs as determined by RFC 2396. Note that the # character is allowed
+     * in URI references but <b>not</b> URIs. Thus this method returns false
+     * for that character.
+     *
+     * @param c <code>char</code> to check for URI compliance.
+     * @return <code>boolean</code> - true if it's allowed, false otherwise.
+     */
+    public static boolean isURICharacter(char c) {
+
+        if (c <= 0x0020) return false;  if (c <= 0x0021) return true;
+        if (c <= 0x0023) return false;  if (c <= 0x003B) return true;
+        if (c <= 0x003C) return false;  if (c <= 0x003D) return true;
+
+        if (c <= 0x003E) return false;  if (c <= 0x005A) return true;
+        if (c <= 0x005E) return false;  if (c <= 0x005F) return true;
+        if (c <= 0x0060) return false;  if (c <= 0x007A) return true;
+
+        if (c <= 0x007D) return false;  if (c <= 0x007E) return true;
+
+        return false;
+    }
+
+    /**
+     * This is a utility function for determining whether a specified
+     * Unicode character is a hexadecimal digit as defined in RFC 2396;
+     * that is, one of the ASCII characters 0-9, a-f, or A-F
+     *
+     * @param c <code>char</code> to check for hex digit.
+     * @return <code>boolean</code> - true if it's allowed, false otherwise.
+     */
+    public static boolean isHexDigit(char c) {
+
+	// I suspect most characters passed to this method will be
+	// correct hexadecimal digits, so I test for the true cases
+	// first. If this proves to be a performance bottleneck a switch statement
+	// might optimize this. 
+        if (c >= '0' && c <= '9') return true; 
+        if (c >= 'A' && c <= 'F') return true; 
+        if (c >= 'a' && c <= 'f') return true; 
+
+        return false;
+    }
+    
 }
-------------- next part --------------
Index: src/java/org/jdom/test/cases/TestVerifier.java
===================================================================
RCS file: /home/cvspublic/jdom-test/src/java/org/jdom/test/cases/TestVerifier.java,v
retrieving revision 1.7
diff -d -u -r1.7 TestVerifier.java
--- src/java/org/jdom/test/cases/TestVerifier.java	2002/06/26 01:46:54	1.7
+++ src/java/org/jdom/test/cases/TestVerifier.java	2002/06/27 22:50:27
@@ -701,44 +701,65 @@
 
 	}
 	/**
-	 * Tests that checkNamespaceURI validates xml uri's.
-	 * A valid URI is alphanumeric characters and the reserved characters:
+	 * Tests that checkNamespaceURI validates XML URI references.
+	 * A valid URI is composed of alphanumeric ASCII characters and the reserved characters:
 	 * ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |  "$" | ","
-	 *            
-	 * The URI cannot begin with a digit, "-" or "$".  It must have at least
-	 * one ":" separating the scheme from the scheme specific part
-	 *
-	 * XXX:TODO make this match the eventual specs for the Verifier class which is incomplete
+     * and the mark characters
+	 * "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
 	 */
 	public void test_TCM__String_checkNamespaceURI_String() {
-		//invalid start characters
-		assertTrue("validated invalid URI with startin -" , !(Verifier.checkNamespaceURI('-' + "test")== null));
-		assertTrue("validated invalid URI with starting digit" , !(Verifier.checkNamespaceURI("9") == null));
-		assertTrue("validated invalid URI with starting $" , !(Verifier.checkNamespaceURI("$") == null));
-		//valid tests
-		assertTrue("invalidated valid null" , Verifier.checkNamespaceURI(null) == null);
-		assertTrue("invalidated valid URI with :" , Verifier.checkNamespaceURI("test" + ':' + "local") == null);
-		assertTrue("invalidated valid URI with _" , Verifier.checkNamespaceURI("test" + '_') == null);
-		assertTrue("invalidated valid URI with ." , Verifier.checkNamespaceURI("test" + '.' + "URI") == null);
-		assertTrue("invalidated valid URI with digit" , Verifier.checkNamespaceURI("test9") == null);
-		assertTrue("invalidated valid URI with 0x00B7" , Verifier.checkNamespaceURI("test" + (char)0x00B7) == null);
-		assertTrue("invalidated valid URI with 0x4E01" , Verifier.checkNamespaceURI("test" + (char)0x4E01) == null);
-		assertTrue("invalidated valid URI with 0x0301" , Verifier.checkNamespaceURI("test" + (char)0x0301) == null);
-		//check out of range values
-
-		/** skip these tests until the time the checks are implemented
-		assertTrue("validated invalid URI with xmlns" , !(Verifier.checkNamespaceURI("xmlns")== null));
-		assertTrue("validated invalid URI with startin :" , !(Verifier.checkNamespaceURI(':' + "test")== null));
-		assertTrue("validated invalid URI with starting ." , !(Verifier.checkNamespaceURI(".") == null));
-		
+		//invalid characters
+		assertTrue("validated invalid URI with non-ASCII character" , Verifier.checkNamespaceURI("test" + (char)0x4E01) != null);
 		assertTrue("validated invalid URI with null" ,! (Verifier.checkNamespaceURI("test" + (char)0x0) == null));
 		assertTrue("validated invalid URI with null" ,! (Verifier.checkNamespaceURI("test" + (char)0x0 + "ing") == null));
 		assertTrue("validated invalid URI with null" ,! (Verifier.checkNamespaceURI((char)0x0 + "test") == null));
 		assertTrue("validated invalid URI with 0x01" ,! (Verifier.checkNamespaceURI((char)0x01 + "test") == null));
 		assertTrue("validated invalid URI with 0xD800" ,! (Verifier.checkNamespaceURI("test" + (char)0xD800) == null));
 		assertTrue("validated invalid URI with 0xD800" ,! (Verifier.checkNamespaceURI("test" + (char)0xD800 + "ing") == null));
-		assertTrue("validated invalid URI with 0xD800" ,! (Verifier.checkNamespaceURI((char)0xD800 + "test") == null));
-		*/
+		assertTrue("validated invalid URI with 0xD800" ,! (Verifier.checkNamespaceURI((char)0xD800 + "test") == null));        
+		assertTrue("validated invalid URI with 0x00B7" ,! (Verifier.checkNamespaceURI("test" + (char)0x00B7) == null));
+		assertTrue("validated invalid URI with 0x0301" ,! (Verifier.checkNamespaceURI("test" + (char)0x0301) == null));
+        
+        //    unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
+		assertTrue("validated invalid URI with {" , Verifier.checkNamespaceURI("test{") != null);
+		assertTrue("validated invalid URI with }" , Verifier.checkNamespaceURI("test}") != null);
+		assertTrue("validated invalid URI with |" , Verifier.checkNamespaceURI("test|") != null);
+		assertTrue("validated invalid URI with \\" , Verifier.checkNamespaceURI("test\\") != null);
+		assertTrue("validated invalid URI with ^" , Verifier.checkNamespaceURI("test^") != null);
+		assertTrue("validated invalid URI with [" , Verifier.checkNamespaceURI("test[") != null);
+		assertTrue("validated invalid URI with ]" , Verifier.checkNamespaceURI("test]") != null);
+		assertTrue("validated invalid URI with `" , Verifier.checkNamespaceURI("test`") != null);
+        // delims      = "<" | ">" | "#" | "%" | <">
+		assertTrue("validated invalid URI with <" , Verifier.checkNamespaceURI("test<") != null);
+		assertTrue("validated invalid URI with >" , Verifier.checkNamespaceURI("test>") != null);
+		assertTrue("validated invalid URI with \"" , Verifier.checkNamespaceURI("test\"") != null);
+        // # is a special case. It is illegal in a URI, but it is legal in a URI reference,
+        // and namespace URIs are actually URI references. However, a URI reference may contain
+        // at most one of these characters
+        assertTrue("validated invalid URI with multiple #" , Verifier.checkNamespaceURI("test##") != null);
+        assertTrue("validated invalid URI with multiple #" , Verifier.checkNamespaceURI("#test#") != null);
+        assertTrue("validated invalid URI with multiple #" , Verifier.checkNamespaceURI("test#test#test") != null);
+
+        
+		//valid tests
+		assertTrue("invalidated valid URI with starting -" , (Verifier.checkNamespaceURI('-' + "test")== null));
+		assertTrue("invalidated valid URI with starting digit" , (Verifier.checkNamespaceURI("9") == null));
+		assertTrue("invalidated valid URI with starting $" , (Verifier.checkNamespaceURI("$") == null));
+		assertTrue("invalidated valid null" , Verifier.checkNamespaceURI(null) == null);
+		assertTrue("invalidated valid URI with :" , Verifier.checkNamespaceURI("test" + ':' + "local") == null);
+		assertTrue("invalidated valid URI with _" , Verifier.checkNamespaceURI("test" + '_') == null);
+		assertTrue("invalidated valid URI with ." , Verifier.checkNamespaceURI("test" + '.' + "URI") == null);
+		assertTrue("invalidated valid URI with digit" , Verifier.checkNamespaceURI("test9") == null);
+        // # is a special case. It is illegal in a URI, but it is legal in a URI reference,
+        // and namespace URIs are actually URI references
+		assertTrue("invalidated valid URI with #" , Verifier.checkNamespaceURI("test#") == null);
+		assertTrue("invalidated valid URI with #" , Verifier.checkNamespaceURI("#test") == null);
+		assertTrue("invalidated valid URI with #" , Verifier.checkNamespaceURI("test#test") == null);
+       
+        // Check percent escaping
+		assertTrue("invalidated valid URI with %AD" , Verifier.checkNamespaceURI("test%AD") == null);
+		assertTrue("validated invalid URI with % AD" , Verifier.checkNamespaceURI("test% AD") != null);
+		assertTrue("validated invalid URI with %0" , Verifier.checkNamespaceURI("test%0") != null);
 
 
 	}