Getting outOfMemory while using Xpath for 6MB file
807588Feb 14 2009 — edited Feb 15 2009Hi ,
Requirement:
I have thousands of xml files of variable size (mostly around 5MB), Total size is around 20GB .The structure of xml content is as follows.
filename: xaaaa
<file>
<page>
<title>AmericanSamoa</title>
<id>6</id>
<revision>
<id>133452270</id>
<timestamp>2007-05-25T17:12:06Z</timestamp>
<contributor>
<username>Gurch</username>
<id>241822</id>
</contributor>
<minor />
<comment>Revert edit(s) by [[Special:Contributions/Ngaiklin|Ngaiklin]] to last version by [[Special:Contributions/Docu|Docu]]</comment>
<text xml:space="preserve">#REDIRECT [[American Samoa]]{{R from CamelCase}}</text>
</revision>
</page>
My task is to retrieve the ID , filename in which it exists and the position of node in the page, and i have to write it to a file.
ex: 6:xaaaa:1
My approach:
I am using Xpath for this. The code is as follows.
*/*XPathReader.java*/*
package preprocess;
import java.io.IOException;
import javax.xml.XMLConstants;
import javax.xml.namespace.QName;
import javax.xml.parsers.*;
import javax.xml.xpath.*;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
public class XPathReader {
private String xmlFile;
private Document xmlDocument;
private XPath xPath;
public XPathReader(String xmlFile) {
this.xmlFile = xmlFile;
initObjects();
}
private void initObjects(){
try {
xmlDocument = DocumentBuilderFactory.
newInstance().newDocumentBuilder().
parse(xmlFile);
xPath = XPathFactory.newInstance().
newXPath();
} catch (IOException ex) {
ex.printStackTrace();
} catch (SAXException ex) {
ex.printStackTrace();
} catch (ParserConfigurationException ex) {
ex.printStackTrace();
}
}
public Object read(String expression,
QName returnType){
try {
XPathExpression xPathExpression =
xPath.compile(expression);
return xPathExpression.evaluate
(xmlDocument, returnType);
} catch (XPathExpressionException ex) {
ex.printStackTrace();
return null;
}
}
}
XpathReaderTest.java
/* it takes directory name as argument, this directory contains xml file*/
package preprocess;
import java.io.*;
import javax.xml.xpath.XPathConstants;
import org.w3c.dom.*;
public class XPathReaderTest {
public XPathReaderTest() {
}
public static void main(String[] args) throws IOException{
if (args.length <= 0) {
System.out.println(
"Usage: java PreProcess dir_name"
);
return;
}
String dir=null;
if (args.length >= 1) dir = args[0];
int indexno=0;
File directory = new File(dir);
File[] files = directory.listFiles();
FileWriter fstream = new FileWriter("index"+indexno+".txt");
BufferedWriter out = new BufferedWriter(fstream);
XPathReaderTest xt=new XPathReaderTest();
/*for (int index = 0; index < files.length; index++)
{
System.out.println(files[index].toString());
}*/
for (int index = 0,i=1; index < files.length; index++)
{
/*if(index/100>indexno){
indexno++;
out.close();
fstream = new FileWriter("index"+indexno+".txt");
out = new BufferedWriter(fstream);
}*/
xt.extract(files[index].toString(),index,i,out);
System.gc();
}
out.close();
}
public void extract(String completepath,int index,int i,BufferedWriter out)
throws IOException
{
System.out.println(index+" "+completepath);
XPathReader reader = new XPathReader(completepath);
String separator = File.separator;
int pos = completepath.lastIndexOf(separator);
String temp_fname=completepath.substring(0,pos);
pos=temp_fname.lastIndexOf(separator);
String f_name= completepath.substring(pos+1);
i=1;
while(true)
{
String expression = "/file/page["+i+"]/id";
String id_value= (String) reader.read(expression, XPathConstants.STRING);
if(id_value=="")
break;
out.write( id_value + ":"+ f_name+ ":"+i+ "\n" );
i++;
}
}
}
Problem:
This code works fine for xml files < 6MB, but its giving outOfMemory for 6MB and above file.
I have tried with -Xms256m -Xmx512m option.
Please suggest the work around , or any modification to code that will resolve my problem.
I am new to java world , so problem root cause will be very helpful for me.
Thanks