Skip to Main Content

Java APIs

Announcement

For appeals, questions and feedback about Oracle Forums, please email oracle-forums-moderators_us@oracle.com. Technical questions should be asked in the appropriate category. Thank you!

Interested in getting your voice heard by members of the Developer Marketing team at Oracle? Check out this post for AppDev or this post for AI focus group information.

issue: Not getting response when the document is having special chars

818960Mar 15 2011 — edited Mar 16 2011
Please help following issue: Not getting response when the document is having special chars(Use any doc with special char(ex: &, $, <, >,.....) TestErrorFour.doc


Error message:

System.FormatException: Invalid length for a Base-64 char array. at
System.Convert.FromBase64String(String s) at
Summarize.Summarizer.AccumulateBroadcast(String filedata, String givenWords) in
c:\DocumentSummarizer\App_Code\Summarizer.cs:line 66

Code:
--------
File 1:
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Properties;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.hwpf.*;
import org.apache.poi.hwpf.extractor.*;

import com.lowagie.text.Document;
import com.lowagie.text.pdf.PRTokeniser;
import com.lowagie.text.pdf.PdfReader;

public class DocumentSummarizerClient {
	static Properties loadProperties() {
		Properties prop = new Properties();
		try {
			prop.load(DocumentSummarizerClient.class.getClassLoader().getResourceAsStream("vep.properties"));

		} catch (Exception ioe) {
			ioe.printStackTrace();

		}
		return prop;
	}

	public String getSummary(String fileName,String noOfWordsOrPercentage ){
		String summaryInputData ="";
		String summarizedData="";
		String summarizerURL = loadProperties().getProperty("Summarizer.serviceURL");
		try {
			String fileExtension=fileName.substring(fileName.lastIndexOf(".")+1, fileName.length());
			
			if (fileExtension.equalsIgnoreCase("doc")|| fileExtension.equalsIgnoreCase("txt")|| fileExtension.equalsIgnoreCase("pdf")) {
						if (fileExtension.equalsIgnoreCase("txt")) {
							BufferedReader bufferedReader = new BufferedReader(
									new FileReader(fileName));
							String line = null;
							while ((line = bufferedReader.readLine()) != null) {
								summaryInputData += line;
							}
						}
						if(fileExtension.equalsIgnoreCase("doc")){
							POIFSFileSystem fs = null;
							 fs = new POIFSFileSystem(new FileInputStream(fileName)); 
					              
					            HWPFDocument doc = new HWPFDocument(fs);
					 
							  WordExtractor we = new WordExtractor(doc);
					 
							  String[] paragraphs = we.getParagraphText();
					 							 
							  for( int i=0; i<paragraphs .length; i++ ) {
								paragraphs[i] = paragraphs.replaceAll("\\cM?\r?\n","");

summaryInputData+= paragraphs[i];
}

}
if(fileExtension.equalsIgnoreCase("pdf")){
Document document = new Document();
document.open();
PdfReader reader = new PdfReader(fileName);

int pageCount =reader.getNumberOfPages();
for(int i=1;i<=pageCount;i++){
byte[] bytes = reader.getPageContent(i);
PRTokeniser tokenizer = new PRTokeniser(bytes);
StringBuffer buffer = new StringBuffer();
while (tokenizer.nextToken()) {
if (tokenizer.getTokenType() == PRTokeniser.TK_STRING) {
buffer.append(tokenizer.getStringValue());
}
}
summaryInputData += buffer.toString();
}
}
}
else{
System.out.println("This is Invalid document. Presntly we support only text,word and PDF documents ");
}

// String encoded =new String (summaryInputData.getBytes("ISO-8859-1"),"UTF-8");
String encoded=Base64Utils.base64Encode(summaryInputData.getBytes());
// encoded =new String (summaryInputData.getBytes("ISO-8859-1"),"UTF-8");
String parameters= "base64String="+encoded+"&noOfWordsOrPercentage="+noOfWordsOrPercentage;
summarizedData= postRequest(parameters,summarizerURL);

String slength= "<string xmlns=\"http://tempuri.org/\">";
if(summarizedData.contains("</string>")){
summarizedData= summarizedData.substring(summarizedData.indexOf(slength)+slength.length(),summarizedData.indexOf("</string>"));
summarizedData = replaceVal(summarizedData);
//System.out.println("<?xml version=\"1.0\" encoding=\"utf-8\"?><![CDATA["+summarizedData+"]]>");
// System.out.println("Summarized data "+summarizedData);
if(summarizedData.contains("Please enter the percentage")){
summarizedData="Data given cannot be summarized further";
}

}
else{
System.out.println("Data given cannot be summarized further");
summarizedData="";

}

} catch (FileNotFoundException e) {
return("The File is not found \n\n"+e.toString());
} catch (IOException e) {
return("The File is already in use \n\n"+e.toString());
} catch (Exception e) {
return(e.toString());
}

return summarizedData;

}

public static String postRequest(String parameters,String webServiceURL) throws Exception{
Properties systemSettings = System.getProperties();
systemSettings.put("http.proxyHost", loadProperties().getProperty("proxyHost"));
systemSettings.put("http.proxyPort", loadProperties().getProperty("proxyPort"));
System.setProperties(systemSettings);
String responseXML = "";

try {

URL url = new URL(webServiceURL);
URLConnection connection = url.openConnection();
HttpURLConnection httpConn = (HttpURLConnection) connection;
byte[] requestXML = parameters.getBytes();




httpConn.setRequestProperty("Content-Length", String
.valueOf(requestXML.length));
httpConn.setRequestProperty("Content-Type",
"application/x-www-form-urlencoded");

httpConn.setRequestMethod("POST");

httpConn.setDoOutput(true);
httpConn.setDoInput(true);


OutputStream out = httpConn.getOutputStream();
out.write(requestXML, 0, requestXML.length);
out.close();

InputStreamReader isr = new InputStreamReader(httpConn
.getInputStream());
BufferedReader br = new BufferedReader(isr);
String temp;
String tempResponse = "";


while ((temp = br.readLine()) != null)
tempResponse = tempResponse + temp;

responseXML = tempResponse;
br.close();
isr.close();
} catch (java.net.MalformedURLException e) {
System.out
.println("Error in postRequest(): Secure Service Required");
} catch (Exception e) {
System.out.println("Error in postRequest(): " + e.getMessage());
}
return responseXML;
}
public String replaceVal(String value) {
if (value == null) {
value = "";
}
value = value.replace("&lt;", "<");
value = value.replace("&gt;", ">");
value = value.replace("&amp;", "&");

return value;
}

public static void main(String[] args) {
DocumentSummarizerClient testdoc=new DocumentSummarizerClient();
System.out.println("hello");
testdoc.getSummary("C:\\working_folder\\vep\\UnitTestCases\\VEP1.0\\DocumentSummarizerTestData\\TestErrorFour.doc","100%");

}

}
Note: Use any doc with special char(ex: &, $, <, >,.....) TestErrorFour.doc


File 2:
---------
public class Base64Utils {

private static byte[] mBase64EncMap, mBase64DecMap;

/**
* Class initializer. Initializes the Base64 alphabet (specified in RFC-2045).
*/
static {
byte[] base64Map = {
(byte)'A', (byte)'B', (byte)'C', (byte)'D', (byte)'E', (byte)'F',
(byte)'G', (byte)'H', (byte)'I', (byte)'J', (byte)'K', (byte)'L',
(byte)'M', (byte)'N', (byte)'O', (byte)'P', (byte)'Q', (byte)'R',
(byte)'S', (byte)'T', (byte)'U', (byte)'V', (byte)'W', (byte)'X',
(byte)'Y', (byte)'Z',
(byte)'a', (byte)'b', (byte)'c', (byte)'d', (byte)'e', (byte)'f',
(byte)'g', (byte)'h', (byte)'i', (byte)'j', (byte)'k', (byte)'l',
(byte)'m', (byte)'n', (byte)'o', (byte)'p', (byte)'q', (byte)'r',
(byte)'s', (byte)'t', (byte)'u', (byte)'v', (byte)'w', (byte)'x',
(byte)'y', (byte)'z',
(byte)'0', (byte)'1', (byte)'2', (byte)'3', (byte)'4', (byte)'5',
(byte)'6', (byte)'7', (byte)'8', (byte)'9', (byte)'+', (byte)'/'};
mBase64EncMap = base64Map;
mBase64DecMap = new byte[128];
for (int i=0; i<mBase64EncMap.length; i++)
mBase64DecMap[mBase64EncMap[i]] = (byte) i;
}

/**
* This class isn't meant to be instantiated.
*/
private Base64Utils() {
}

/**
* Encodes the given byte[] using the Base64-encoding,
* as specified in RFC-2045 (Section 6.8).
*
* @param aData the data to be encoded
* @return the Base64-encoded <var>aData</var>
* @exception IllegalArgumentException if NULL or empty array is passed
*/
public static String base64Encode(byte[] aData) {
if ((aData == null) || (aData.length == 0))
throw new IllegalArgumentException("Can not encode NULL or empty byte array.");

byte encodedBuf[] = new byte[((aData.length+2)/3)*4];

// 3-byte to 4-byte conversion
int srcIndex, destIndex;
for (srcIndex=0, destIndex=0; srcIndex < aData.length-2; srcIndex += 3) {
encodedBuf[destIndex++] = mBase64EncMap[(aData[srcIndex] >>> 2) & 077];
encodedBuf[destIndex++] = mBase64EncMap[(aData[srcIndex+1] >>> 4) & 017 |
(aData[srcIndex] << 4) & 077];
encodedBuf[destIndex++] = mBase64EncMap[(aData[srcIndex+2] >>> 6) & 003 |
(aData[srcIndex+1] << 2) & 077];
encodedBuf[destIndex++] = mBase64EncMap[aData[srcIndex+2] & 077];
}

// Convert the last 1 or 2 bytes
if (srcIndex < aData.length) {
encodedBuf[destIndex++] = mBase64EncMap[(aData[srcIndex] >>> 2) & 077];
if (srcIndex < aData.length-1) {
encodedBuf[destIndex++] = mBase64EncMap[(aData[srcIndex+1] >>> 4) & 017 |
(aData[srcIndex] << 4) & 077];
encodedBuf[destIndex++] = mBase64EncMap[(aData[srcIndex+1] << 2) & 077];
}
else {
encodedBuf[destIndex++] = mBase64EncMap[(aData[srcIndex] << 4) & 077];
}
}

// Add padding to the end of encoded data
while (destIndex < encodedBuf.length) {
encodedBuf[destIndex] = (byte) '=';
destIndex++;
}

String result = new String(encodedBuf);
return result;
}


/**
* Decodes the given Base64-encoded data,
* as specified in RFC-2045 (Section 6.8).
*
* @param aData the Base64-encoded aData.
* @return the decoded <var>aData</var>.
* @exception IllegalArgumentException if NULL or empty data is passed
*/
public static byte[] base64Decode(String aData) {
if ((aData == null) || (aData.length() == 0))
throw new IllegalArgumentException("Can not decode NULL or empty string.");

byte[] data = aData.getBytes();

// Skip padding from the end of encoded data
int tail = data.length;
while (data[tail-1] == '=')
tail--;

byte decodedBuf[] = new byte[tail - data.length/4];

// ASCII-printable to 0-63 conversion
for (int i = 0; i < data.length; i++)
data[i] = mBase64DecMap[data[i]];

// 4-byte to 3-byte conversion
int srcIndex, destIndex;
for (srcIndex = 0, destIndex=0; destIndex < decodedBuf.length-2;
srcIndex += 4, destIndex += 3) {
decodedBuf[destIndex] = (byte) ( ((data[srcIndex] << 2) & 255) |
((data[srcIndex+1] >>> 4) & 003) );
decodedBuf[destIndex+1] = (byte) ( ((data[srcIndex+1] << 4) & 255) |
((data[srcIndex+2] >>> 2) & 017) );
decodedBuf[destIndex+2] = (byte) ( ((data[srcIndex+2] << 6) & 255) |
(data[srcIndex+3] & 077) );
}

// Handle last 1 or 2 bytes
if (destIndex < decodedBuf.length)
decodedBuf[destIndex] = (byte) ( ((data[srcIndex] << 2) & 255) |
((data[srcIndex+1] >>> 4) & 003) );
if (++destIndex < decodedBuf.length)
decodedBuf[destIndex] = (byte) ( ((data[srcIndex+1] << 4) & 255) |
((data[srcIndex+2] >>> 2) & 017) );

return decodedBuf;
}

}
issue 2: Exception when passing 2MB .txt file
------------------------------------------------------------------
Steps to reproduce:

Call getSummary() with 2MB .txt file

Actual:

The following exception has occured:
------------------------------------
1. Error in postRequest(): Unexpected end of file from server
java.lang.NullPointerException


Please provide your precious feedback/suggestions.


Thanks in advance…..

Edited by: EJP on 15/03/2011 16:52: added code tags. Please use them. Code is unreadable otherwise.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
Comments
Locked Post
New comments cannot be posted to this locked post.
Post Details
Locked on Apr 13 2011
Added on Mar 15 2011
3 comments
512 views