The following is against JWSDP 1.5 and Java 1.4.2_07-b05 on Windows XP Pro SP2.
When I unmarshal a document using JAXB 1.0 and obtain Strings from the resulting objects, I would expect the Strings to be encoded with the character encoding specified by the document's XML declaration. Is this a correct assumption?
Unfortunately, from the sample code (given below) I am not experiencing this behaviour. I've included the code, sample output, and the RELAX NG schema used.
Any help very welcome.
ian
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Marshaller;
import org.xml.sax.InputSource;
import com.chellomedia.transcoding.Category;
import com.chellomedia.transcoding.ObjectFactory;
import com.chellomedia.transcoding.Schedule;
/**
 * Marshals a one-category schedule in several character encodings, unmarshals
 * each result again, and prints the size of the document and of the category
 * name.
 *
 * <p>A Java {@code String} holds UTF-16 code units regardless of the encoding
 * of the document it was parsed from, so {@code name.length()} is the same for
 * every encoding. An encoding only matters when the string is turned into
 * bytes, which is why the byte length below is computed with an explicit
 * charset rather than the platform default.
 */
public class TranscoderExemplar {

    /** Sample name containing two non-ASCII characters (U+00BF and U+00F1). */
    private static final String CATEGORY_NAME = "\u00bfMa\u00f1ana?";

    /** Context path handed to {@link JAXBContext#newInstance(String)}. */
    private static final String CONTEXT_PATH = "com.chellomedia.transcoding";

    /** Encodings exercised by {@link #main(String[])}, in order. */
    private static final String[] ENCODINGS = { "US-ASCII", "ISO-8859-1", "UTF-8" };

    /**
     * Generates and inspects the sample document once per encoding.
     *
     * <p>Any failure is printed and stops the remaining encodings from being
     * processed.
     *
     * @param args ignored
     * @throws Exception never in practice; kept for interface compatibility
     */
    public static void main(String[] args) throws Exception {
        try {
            for (int i = 0; i < ENCODINGS.length; i++) {
                ByteArrayOutputStream os = generateSampleDocument(ENCODINGS[i]);
                inspectDocument(ENCODINGS[i], os);
            }
        } catch (Exception e) {
            // All failure kinds were handled identically, so one handler suffices.
            e.printStackTrace();
        }
    }

    /**
     * Unmarshals and validates the marshalled document, then prints its byte
     * length and, for each category, the name's character and byte lengths.
     *
     * @param encoding charset name used to parse the document and to encode
     *                 each name when measuring its byte length
     * @param os       buffer holding the marshalled document
     * @throws Exception if unmarshalling or validation fails, or if
     *                   {@code encoding} is not a supported charset name
     */
    private static void inspectDocument(String encoding, ByteArrayOutputStream os) throws Exception {
        byte[] document = os.toByteArray();
        System.out.println(encoding + " document byte length: " + document.length);

        InputSource source = new InputSource(new ByteArrayInputStream(document));
        source.setEncoding(encoding);

        JAXBContext context = JAXBContext.newInstance(CONTEXT_PATH);
        Schedule schedule = (Schedule) context.createUnmarshaller().unmarshal(source);
        context.createValidator().validate(schedule);

        List categories = schedule.getCategories();
        for (int i = 0, n = categories.size(); i < n; i++) {
            Category category = (Category) categories.get(i);
            String name = category.getName();
            System.out.println(encoding + " name string length: " + name.length());
            // Encode with the requested charset; the no-argument getBytes() uses the
            // platform default and therefore reports the same size for every encoding.
            System.out.println(encoding + " name byte length: " + name.getBytes(encoding).length);
        }
    }

    /**
     * Builds a schedule with a single category named {@link #CATEGORY_NAME} and
     * marshals it twice: into an in-memory buffer and into the file
     * {@code <encoding>-representation.xml} in the working directory.
     *
     * @param encoding value for the marshaller's output-encoding property
     * @return buffer containing the marshalled document
     * @throws JAXBException if the object tree cannot be built or marshalled
     * @throws IOException   if the output file cannot be opened or closed
     */
    private static ByteArrayOutputStream generateSampleDocument(String encoding)
            throws JAXBException, IOException {
        ObjectFactory of = new ObjectFactory();
        Schedule schedule = of.createSchedule();

        Category category = of.createCategory();
        category.setId(1);
        category.setName(CATEGORY_NAME);
        schedule.getCategories().add(category);

        Marshaller m = of.createMarshaller();
        m.setProperty(Marshaller.JAXB_ENCODING, encoding);
        m.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE);

        ByteArrayOutputStream os = new ByteArrayOutputStream();
        m.marshal(schedule, os);

        FileOutputStream fos = new FileOutputStream(encoding + "-representation.xml");
        try {
            m.marshal(schedule, fos);
        } finally {
            // Release the file handle even when marshalling fails.
            fos.close();
        }
        return os;
    }
}
...and the schema
<grammar xmlns="http://relaxng.org/ns/structure/1.0"
datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes"
xmlns:jaxb="http://java.sun.com/xml/ns/jaxb"
xmlns:xjc="http://java.sun.com/xml/ns/jaxb/xjc"
jaxb:extensionBindingPrefixes="xjc"
jaxb:version="1.0">
<!-- RELAX NG grammar for a schedule document, annotated with JAXB binding
     customizations. Data types come from the XML Schema datatype library. -->
<!-- Generated binding classes go into the com.chellomedia.transcoding package. -->
<jaxb:schemaBindings>
<jaxb:package name="com.chellomedia.transcoding"/>
</jaxb:schemaBindings>
<!-- The document root is the Schedule pattern. -->
<start>
<ref name="Schedule"/>
</start>
<!-- schedule: one categories element followed by zero or more event elements. -->
<define name="Schedule">
<element name="schedule">
<ref name="Categories"/>
<zeroOrMore>
<ref name="Event"/>
</zeroOrMore>
</element>
</define>
<!-- categories: container for zero or more category elements. -->
<define name="Categories">
<element name="categories">
<zeroOrMore>
<ref name="Category"/>
</zeroOrMore>
</element>
</define>
<!-- category: integer id and a name attribute, plus zero or more subcategory
     children. Attributes declared without a child pattern accept any text. -->
<define name="Category">
<element name="category">
<attribute name="id"><data type="int"/></attribute>
<attribute name="name"/>
<zeroOrMore>
<ref name="Subcategory"/>
</zeroOrMore>
</element>
</define>
<!-- subcategory: id and name attributes; unlike category, id is untyped here. -->
<define name="Subcategory">
<element name="subcategory">
<attribute name="id"/>
<attribute name="name"/>
</element>
</define>
<!-- event: attributes only, all required. The category and subcategory
     attributes are plain text; this grammar does not tie them to the ids
     declared on category/subcategory elements. -->
<define name="Event">
<element name="event">
<attribute name="id"/>
<attribute name="title"/>
<attribute name="description"/>
<attribute name="category"/>
<attribute name="subcategory"/>
</element>
</define>
</grammar>
...with sample output
US-ASCII document byte length: 171
US-ASCII name string length: 8
US-ASCII name byte length: 8
ISO-8859-1 document byte length: 163
ISO-8859-1 name string length: 8
ISO-8859-1 name byte length: 8
UTF-8 document byte length: 160
UTF-8 name string length: 8
UTF-8 name byte length: 8