[jdom-interest] getTextNormalize bug?
Jason Hunter
jhunter at acm.org
Wed Dec 5 19:29:03 PST 2001
The problem is that System.out.println() writes using the platform default
charset, which here is Latin-1 (ISO-8859-1), but XML parsers assume UTF-8
when a file carries no encoding declaration. Chars from 0-127 are encoded
the same in both charsets so there's no problem for ASCII, but a char from
128-255 that is a single byte in Latin-1 requires 2 bytes in UTF-8. So the
parser sees the lone Latin-1 byte (0xa9 for the copyright sign) and
complains that it doesn't properly decode as UTF-8.
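The solution is to output using UTF-8 (for example by writing through an
OutputStreamWriter constructed with the "UTF-8" encoding instead of calling
System.out.println() directly), or to add an XML declaration at the top of
the file to indicate it's ISO-8859-1:
<?xml version="1.0" encoding="ISO-8859-1"?>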
Aren't charsets fun!
-jh-
Tim Daly wrote:
>
> I wrote a simple program to read an XML file and write it to stdout.
> The text of each Element is printed with the call:
>
> Element element;
> ...
> System.out.println(element.getTextNormalize());
>
> During my testing I ran it on the build.xml file in jdom, thus:
>
> java XMLCopy build.xml >foo.xml
>
> Then I renamed foo.xml to build.xml and did:
>
> ./build.sh
>
> The Ant program died with the message:
>
> Character conversion error: "Unconvertible UTF-8 character beginning
> with 0xa9" (line number may be too low).
>
> The original line in the original build.xml file contains:
>
> Copyright ©
>
> (aside: we knew the problem was the copyright :-))
> which got converted by getTextNormalize() into
>
> Copyright (someStrangeCharacter)
>
> Is this a bug in getTextNormalize?
> My source code follows.
>
> Tim Daly
> daly at idsi.net
>
> =====================================================================
>
> package samples;
>
> import org.jdom.*;
> import org.jdom.input.SAXBuilder;
> import org.jdom.input.DOMBuilder;
> import org.jdom.output.*;
> import java.util.*;
>
> public class Count
> {
> static Stack stack = new Stack();
> static int indent = 0;
>
> public static void doIndent(int count)
> { if (count < 0)
> indent=indent+count;
> for(int i=0; i<indent; i++)
> System.out.print(" ");
> if (count > 0)
> indent=indent+count;
> }
>
> public static void main(String[] args)
> { if (args.length == 0)
> { System.out.println("Usage: java Count URL1 URL2...");
> return;
> }
> SAXBuilder saxBuilder = new SAXBuilder();
> DOMBuilder domBuilder = new DOMBuilder();
> DOMOutputter domOutputter = new DOMOutputter();
> Document jdomDocument;
> org.w3c.dom.Element domElement;
> org.jdom.Element jdomElement;
> org.w3c.dom.Document domDocument;
> try
> { jdomDocument = saxBuilder.build(args[0]);
> domElement = domOutputter.output(jdomDocument.getRootElement());
> jdomElement = domBuilder.build(domElement);
> count(jdomElement);
> }
> catch (JDOMException e)
> { System.out.println(args[0] + " is not a well formed XML document.");
> System.out.println(e.getMessage());
> }
> }
>
> public static void printAttributes(List attributes)
> { Iterator iterator = attributes.iterator();
> while (iterator.hasNext())
> { Object o = iterator.next();
> System.out.print(" "+((Attribute)o).getName()+" = \""+
> ((Attribute)o).getValue()+"\"");
> }
> }
>
> public static void count(Element element)
> { doIndent(1);
> System.out.print("<"+element.getName());
> stack.push(element.getName());
> printAttributes(element.getAttributes());
> System.out.println(">");
> String text = element.getTextNormalize();
> if (! text.equals(""))
> { doIndent(0);
> System.out.println(" "+text);
> }
> List children = element.getContent();
> Iterator iterator = children.iterator();
> while (iterator.hasNext())
> { Object o = iterator.next();
> if (o instanceof Element)
> count((Element) o);
> }
> doIndent(-1);
> System.out.println("</"+(String)stack.pop()+">");
> }
> }
> _______________________________________________
> To control your jdom-interest membership:
> http://lists.denveronline.net/mailman/options/jdom-interest/youraddr@yourhost.com
More information about the jdom-interest
mailing list