I want to parse log files using the SAX parser. In the log file, at the beginning of the file I have normal text that are not in xml tags, then xml tags, followed by normal text not in xml tags.
How do I make my program ignore the normal text not in xml tags in the file and just parse the xml tags.
This is my code:
import java.io.*;
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
public class LogParser extends DefaultHandler
{
public static void main(String argv[])
{
if (argv.length != 1) {
System.err.println("Usage: cmd filename");
System.exit(1);
}
// Use an instance of ourselves as the SAX event handler
DefaultHandler handler = new LogParser();
// Use the default (non-validating) parser
SAXParserFactory factory = SAXParserFactory.newInstance();
try {
// Set up output stream
out = new OutputStreamWriter(System.out, "UTF8");
// Parse the input
SAXParser saxParser = factory.newSAXParser();
saxParser.parse( new File(argv[0]), handler);
} catch (Throwable t) {
t.printStackTrace();
}
System.exit(0);
}
static private Writer out;
private String indentString = " "; // Amount to indent
private int indentLevel = 0;
//===========================================================
// SAX DocumentHandler methods
//===========================================================
public void startDocument()
throws SAXException
{
nl();
nl();
emit("START DOCUMENT");
nl();
emit("<?xml version='1.0' encoding='UTF-8'?>");
}
public void endDocument()
throws SAXException
{
nl(); emit("END DOCUMENT");
try {
nl();
out.flush();
} catch (IOException e) {
throw new SAXException("I/O error", e);
}
}
public void startElement(String namespaceURI,
String lName, // local name
String qName, // qualified name
Attributes attrs)
throws SAXException
{
indentLevel++;
nl(); emit("ELEMENT: ");
String eName = lName; // element name
if ("".equals(eName)) eName = qName; // namespaceAware = false
emit("<"+eName);
if (attrs != null) {
for (int i = 0; i < attrs.getLength(); i++) {
String aName = attrs.getLocalName(i); // Attr name
if ("".equals(aName)) aName = attrs.getQName(i);
nl();
emit(" ATTR: ");
emit(aName);
emit("\t\"");
emit(attrs.getValue(i));
emit("\"");
}
}
if (attrs.getLength() > 0) nl();
emit(">");
}
public void endElement(String namespaceURI,
String sName, // simple name
String qName // qualified name
)
throws SAXException
{
nl();
emit("END_ELM: ");
emit("</"+sName+">");
indentLevel--;
}
public void characters(char buf[], int offset, int len)
throws SAXException
{
nl(); emit("CHARS: ");
String s = new String(buf, offset, len);
if (!s.trim().equals("")) emit(s);
}
//===========================================================
// Utility Methods ...
//===========================================================
// Wrap I/O exceptions in SAX exceptions, to
// suit handler signature requirements
private void emit(String s)
throws SAXException
{
try {
out.write(s);
out.flush();
} catch (IOException e) {
throw new SAXException("I/O error", e);
}
}
// Start a new line
// and indent the next line appropriately
private void nl()
throws SAXException
{
String lineEnd = System.getProperty("line.separator");
try {
out.write(lineEnd);
for (int i=0; i < indentLevel; i++) out.write(indentString);
} catch (IOException e) {
throw new SAXException("I/O error", e);
}
}
}
When I run the code with my log file I get the following error message:
org.xml.sax.SAXParseException: Content is not allowed in prolog.
at com.sun.org.apache.xerces.internal.util.ErrorHandlerWrapper.createSAXParseException(Unknown Source)
at com.sun.org.apache.xerces.internal.util.ErrorHandlerWrapper.fatalError(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLErrorReporter.reportError(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLErrorReporter.reportError(Unknown Source)
at com.sun.org.apache.xerces.internal.impl.XMLScanner.reportFatalError(Unknown Source)