I can't seem to parse a simple XHTML 1.1 document with XPATH. In the code below, the string
xmlDoesntWork is taken directly from http://www.w3.org/TR/2001/REC-xhtml11-20010531/conformance.html. However, XPATH can't find the
<title> element unless I remove the
DOCTYPE line & the
xmlns attribute from the
<html> element (the
xmlWorks string). XPATH returns
null for the <title> element in the first string, but correctly retrieves the title in the second string. I tried adding a namespace context argument, but that didn't make any difference.
Can anyone see what I'm doing wrong?
import java.io.StringReader;
import javax.xml.namespace.NamespaceContext;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
public class Test
{
//----------------------------------------------------------------------------------------------
public static void main(String[] unused)throws Exception
{
final String path = "/html/head/title";
final String xmlDoesntWork =
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>" +
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" +
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" >" +
"<head>" +
"<title>Virtual Library</title>" +
"</head>" +
"<body>" +
"<p>Moved to <a href=\"http://vlib.org/\">vlib.org</a>.</p>" +
"</body>" +
"</html>";
String title = getText(xmlDoesntWork, path, null);
System.out.println("Title: " + title);
final String xmlWorks =
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>" +
"<html>" +
"<head>" +
"<title>Virtual Library</title>" +
"</head>" +
"<body>" +
"<p>Moved to <a href=\"http://vlib.org/\">vlib.org</a>.</p>" +
"</body>" +
"</html>";
title = getText(xmlWorks, path, null);
System.out.println("Title: " + title);
}
//----------------------------------------------------------------------------------------------
private static String getText(String xml, String path, NamespaceContext context)
throws Exception
{
StringReader reader = new StringReader(xml); // Get input source
InputSource source = new InputSource(reader);
XPath xpath = XPathFactory.newInstance().newXPath();
if (context != null) // If there's a namespace context
xpath.setNamespaceContext(context); // Inform XPATH
XPathExpression expression = xpath.compile(path);
Node node = (Node)expression.evaluate(source, XPathConstants.NODE);
return node == null ? null : node.getTextContent();
}
//----------------------------------------------------------------------------------------------
}