How do I add an ICC to an existing PDF document - java

I have an existing PDF document that is using CMYK colors. It was created using a specific ICC profile, which I have obtained. The colors are obviously different if I open the document with the profile active than without. From what I can tell using a variety of tools, there is no ICC profile embedded in the document. What I would like to do is embed the ICC profile in the PDF so that it can be opened and viewed with the correct colors by third parties. My understanding is that this is possible to do with the PDF format, but nothing I have tried seems to work.
I wrote a small program using PDFBox based on looking at some examples, but it seems to have no effect. I feel like I am missing a step somewhere.
package com.mapsherpa.tools.addicc;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.InputStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.graphics.color.PDOutputIntent;
import java.io.FileInputStream;
import java.io.IOException;
/**
 * Command-line tool that loads a PDF, attaches an ICC colour profile to it as an
 * output intent, and saves the result under a new file name.
 */
public class AddICC {

    /** Creates the tool; it keeps no state between runs. */
    public AddICC() {
    }

    /**
     * Entry point.
     *
     * @param args exactly three values: input PDF path, output PDF path and ICC
     *             profile path; any other count prints the usage line instead
     */
    public static void main(String[] args) {
        AddICC app = new AddICC();
        try {
            if (args.length != 3) {
                app.usage();
            } else {
                app.doIt(args[0], args[1], args[2]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds the ICC profile as an output intent on the document catalog and saves
     * the document.
     *
     * @param input  path of the PDF to read
     * @param output path the modified PDF is written to
     * @param icc    path of the ICC profile file
     * @throws IOException if closing the document fails; failures while loading,
     *                     embedding or saving are printed and not rethrown
     */
    private void doIt(String input, String output, String icc) throws IOException {
        System.out.printf("Adding %s to %s and saving as %s\n", icc, input, output);
        PDDocument doc = null;
        try {
            doc = PDDocument.load(new File(input));
            PDDocumentCatalog cat = doc.getDocumentCatalog();
            // NOTE(review): this installs a fresh, empty metadata stream on the catalog,
            // replacing whatever metadata the input had - confirm that is intended.
            PDMetadata metadata = new PDMetadata(doc);
            cat.setMetadata(metadata);
            // The profile stream is kept open until the document has been saved and is
            // then closed on every path (it was previously never closed).
            try (InputStream colorProfile = new FileInputStream(icc)) {
                PDOutputIntent oi = new PDOutputIntent(doc, colorProfile);
                oi.setInfo("SWOP (Coated), 20%, GCR, None");
                oi.setOutputCondition("SWOP (Coated), 20%, GCR, None");
                oi.setOutputConditionIdentifier("SWOP (Coated), 20%, GCR, None");
                oi.setRegistryName("http://www.color.org");
                cat.addOutputIntent(oi);
                doc.save(output);
            }
            System.out.println("Finished adding color profile");
        } catch (Exception e) {
            System.out.println("Exception processing color profile");
            e.printStackTrace();
        } finally {
            if (doc != null) {
                doc.close();
            }
        }
    }

    /** Prints the expected command-line arguments to standard error. */
    private void usage() {
        System.err.println("Usage: " + this.getClass().getName() + " <input-file> <output-file> <icc-file>");
    }
}
I'm not a Java expert but I did manage to get this to run and it seems to do something but I still am not seeing the correct colors and there is no indication using imagemagick or pdfinfo that it has a color profile.
I feel like somehow I should be indicating that the document color space is ICCBased but I can't see any obvious way to do that using the PDFBox API.
Any help would be appreciated (even being told that it won't work!)
EDIT:
I believe that this is working as written in that it adds the required output intent to the document. However, I have also discovered that this is not what I need - I now believe that I need it to add an /ICCBased stream to the PDF - sigh. The updated code below is based on this stackoverflow question's updated createColorSpace function.
/**
 * Builds a PDFBox colour space for the given AWT colour space.
 *
 * <p>sRGB maps to the shared {@code PDDeviceRGB} instance. An {@code ICC_ColorSpace}
 * becomes an {@code ICCBased} array whose stream holds the raw profile bytes.
 *
 * @param doc document that owns the newly created stream
 * @param cs  colour space to convert
 * @return the PDFBox colour space
 * @throws IOException if {@code cs} is neither sRGB nor ICC based, or if writing
 *                     the profile bytes fails
 */
private static PDColorSpace createColorSpace( PDDocument doc, ColorSpace cs ) throws IOException
{
    if( cs.isCS_sRGB() )
    {
        return PDDeviceRGB.INSTANCE;
    }
    if( !( cs instanceof ICC_ColorSpace ) )
    {
        throw new IOException( "Not yet implemented:" + cs );
    }
    ICC_ColorSpace ics = (ICC_ColorSpace)cs;
    // The [/ICCBased stream] array is assembled by hand and passed to the
    // array-based PDICCBased constructor.
    // NOTE(review): the stream is still empty when PDICCBased is constructed; the
    // stack trace quoted with this code suggests PDFBox 2.x reads the profile in
    // that constructor - confirm against the PDFBox version actually linked.
    COSArray cosArray = new COSArray();
    cosArray.add(COSName.ICCBASED);
    PDStream pdStream = new PDStream(doc);
    cosArray.add(pdStream.getStream());
    PDICCBased pdCS = new PDICCBased( cosArray );
    PDStream iccData = pdCS.getPDStream();
    // try-with-resources closes the stream on every path and keeps a write failure
    // from being replaced by a failure in close().
    try( OutputStream output = ((COSStream)iccData.getCOSObject()).createFilteredStream() )
    {
        output.write( ics.getProfile().getData() );
    }
    pdCS.setNumberOfComponents( cs.getNumComponents() );
    return pdCS;
}
/**
 * Loads the input PDF, creates an ICC based colour space from the profile file
 * inside that document, and saves the document to the output path.
 *
 * @param input  path of the PDF to read
 * @param output path the PDF is written to
 * @param icc    path of the ICC profile file
 * @throws IOException if closing the document fails; any other failure is printed
 *                     and not rethrown
 */
private void doIt(String input, String output, String icc) throws IOException {
    System.out.printf("Adding %s to %s and saving as %s\n", icc, input, output);
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(new File(input));
        ICC_Profile profile = ICC_Profile.getInstance(icc);
        // NOTE(review): the returned colour space is not stored or attached to
        // anything here - confirm whether something else is expected to reference it.
        createColorSpace(pdf, new ICC_ColorSpace(profile));
        pdf.save(output);
        System.out.println("Finished adding color profile");
    } catch (Exception e) {
        System.out.println("Exception processing color profile");
        e.printStackTrace();
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}
This code now has an exception:
java.io.IOException: Unknown color space number of components:-1
at org.apache.pdfbox.pdmodel.graphics.color.PDICCBased.getAlternateColorSpace(PDICCBased.java:269)
at org.apache.pdfbox.pdmodel.graphics.color.PDICCBased.loadICCProfile(PDICCBased.java:151)
at org.apache.pdfbox.pdmodel.graphics.color.PDICCBased.<init>(PDICCBased.java:89)
at com.mapsherpa.tools.addicc.AddICC.createColorSpace(AddICC.java:65)
at com.mapsherpa.tools.addicc.AddICC.doIt(AddICC.java:109)
at com.mapsherpa.tools.addicc.AddICC.main(AddICC.java:39)
at this line of code:
cosArray.add(pdStream.getStream());
The only difference I can see between this code and the other answer is that I am loading an existing PDF document rather than creating a new empty one.
For testing, I'm using the US Web (Coated) SWOP v2 icc profile from Adobe, but it is the same exception with any profile I test. From my understanding of reading the PDFBox source, it isn't a problem with the profile but with reading the stream from the document (which doesn't have an /ICCBased stream, the whole point of this question :))
EDIT 2: the code above does actually run without exceptions if used with PDFBox 1.8.10 - apparently I had linked in 2.0.0 RC2 without realizing it (total Java newbie).

Related

Open office sdk create xls, doc,ppt files in Java

How to create a xls file using open office sdk? Please give Java example source code. Also needed to create word and power point files. I’m not able to get any examples
Below is the code I tried. It tries to open in Open office App, that I don't want. I want to generate the .ods file in the WebSphere App Server under AIX environment, using Java. I'm using it to generate a report and download it to front end (web app).
import ooo.connector.BootstrapSocketConnector;
import com.sun.star.beans.PropertyValue;
import com.sun.star.comp.helper.BootstrapException;
import com.sun.star.container.XIndexAccess;
import com.sun.star.frame.XComponentLoader;
import com.sun.star.lang.XComponent;
import com.sun.star.lang.XMultiComponentFactory;
import com.sun.star.sheet.XSpreadsheet;
import com.sun.star.sheet.XSpreadsheetDocument;
import com.sun.star.sheet.XSpreadsheets;
import com.sun.star.table.XCell;
import com.sun.star.uno.UnoRuntime;
import com.sun.star.uno.XComponentContext;
/**
 * Connects to a local OpenOffice installation, opens an empty Calc document and
 * fills its first sheet with a small table of monthly figures plus a per-row sum.
 */
public class Test {

    /** Column captions written to row 0, starting at column 1. */
    private static final String[] HEADERS = {
        "JAN", "FEB", "MAR", "APR", "MAI", "JUN", "JUL",
        "AUG", "SEP", "OCT", "NOV", "DEC", "SUM"
    };

    /** One entry per data row: the label for column 0 followed by twelve monthly values. */
    private static final String[][] ROWS = {
        {"Smith", "42", "58.9", "-66.5", "43.4", "44.5", "45.3",
            "-67.3", "30.5", "23.2", "-97.3", "22.4", "23.5"},
        {"Jones", "21", "40.9", "-57.5", "-23.4", "34.5", "59.3",
            "27.3", "-38.5", "43.2", "57.3", "25.4", "28.5"},
        {"Brown", "31.45", "-20.9", "-117.5", "23.4", "-114.5", "115.3",
            "-171.3", "89.5", "41.2", "71.3", "25.4", "38.5"},
    };

    /**
     * Runs the demo. Stops early, with a message on standard error, when the office
     * connection, the document or the first sheet cannot be obtained.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        XComponentContext xContext = null;
        // get the remote office component context
        try {
            String folder = "C:\\Program Files (x86)\\OpenOffice 4\\program";
            xContext = BootstrapSocketConnector.bootstrap(folder);
        } catch (BootstrapException e) {
            e.printStackTrace();
        }
        if (xContext == null) {
            System.err.println("Could not connect to OpenOffice");
            return;
        }

        System.out.println("Opening an empty Calc document");
        XSpreadsheetDocument myDoc = openCalc(xContext);
        if (myDoc == null) {
            System.err.println("Could not open a Calc document");
            return;
        }

        XSpreadsheet xSheet = null;
        try {
            System.out.println("Getting spreadsheet");
            XSpreadsheets xSheets = myDoc.getSheets();
            XIndexAccess oIndexSheets = (XIndexAccess) UnoRuntime
                    .queryInterface(XIndexAccess.class, xSheets);
            xSheet = (XSpreadsheet) UnoRuntime.queryInterface(
                    XSpreadsheet.class, oIndexSheets.getByIndex(0));
        } catch (Exception e) {
            System.out.println("Couldn't get Sheet " + e);
            e.printStackTrace(System.err);
        }
        if (xSheet == null) {
            return;
        }

        System.out.println("Creating the Header") ;
        for (int col = 0; col < HEADERS.length; col++) {
            insertIntoCell(col + 1, 0, HEADERS[col], xSheet, "");
        }

        System.out.println("Fill the lines");
        for (int r = 0; r < ROWS.length; r++) {
            String[] row = ROWS[r];
            int sheetRow = r + 1;
            insertIntoCell(0, sheetRow, row[0], xSheet, "");
            for (int col = 1; col < row.length; col++) {
                insertIntoCell(col, sheetRow, row[col], xSheet, "V");
            }
            // Spreadsheet row numbers are 1-based, so sheet row index n is row n + 1.
            // Every row sums its own twelve value columns B..M; the third row used
            // to sum A..L, which included the label and skipped December.
            int a1Row = sheetRow + 1;
            insertIntoCell(13, sheetRow, "=SUM(B" + a1Row + ":M" + a1Row + ")", xSheet, "");
        }
    }

    /**
     * Writes one cell of the sheet.
     *
     * @param CellX    zero-based column index
     * @param CellY    zero-based row index
     * @param theValue text to store; parsed as a number when {@code flag} is "V"
     * @param TT1      sheet to write to
     * @param flag     "V" stores {@code theValue} as a numeric value, anything else
     *                 stores it through {@code setFormula}
     */
    public static void insertIntoCell(int CellX, int CellY, String theValue,
            XSpreadsheet TT1, String flag) {
        XCell xCell;
        try {
            xCell = TT1.getCellByPosition(CellX, CellY);
        } catch (com.sun.star.lang.IndexOutOfBoundsException ex) {
            System.err.println("Could not get Cell");
            ex.printStackTrace(System.err);
            // Without a cell there is nothing to write; continuing would only fail
            // with a NullPointerException.
            return;
        }
        if (flag.equals("V")) {
            // Parse straight to double: going through float would store e.g.
            // 58.9 as 58.900001525878906.
            xCell.setValue(Double.parseDouble(theValue));
        } else {
            xCell.setFormula(theValue);
        }
    }

    /**
     * Opens a new, empty Calc document through the office desktop service.
     *
     * @param xContext component context of the running office
     * @return the spreadsheet document, or {@code null} when anything fails (the
     *         failure is printed to standard error)
     */
    public static XSpreadsheetDocument openCalc(XComponentContext xContext) {
        XSpreadsheetDocument xSpreadSheetDoc = null;
        try {
            // get the service manager from the office
            XMultiComponentFactory xMCF = xContext.getServiceManager();
            // create a new instance of the desktop
            Object oDesktop = xMCF.createInstanceWithContext(
                    "com.sun.star.frame.Desktop", xContext);
            // query the desktop object for the XComponentLoader
            XComponentLoader xCLoader = (XComponentLoader) UnoRuntime.queryInterface(
                    XComponentLoader.class, oDesktop);
            PropertyValue[] szEmptyArgs = new PropertyValue[0];
            String strDoc = "private:factory/scalc";
            XComponent xComp = xCLoader.loadComponentFromURL(strDoc, "_blank", 0,
                    szEmptyArgs);
            xSpreadSheetDoc = (XSpreadsheetDocument) UnoRuntime.queryInterface(
                    XSpreadsheetDocument.class, xComp);
        } catch (Exception e) {
            System.err.println(" Exception " + e);
            e.printStackTrace(System.err);
        }
        return xSpreadSheetDoc;
    }
}

How to persist changes with docx4j Java library to word document

I am using the docx4j Java library for the first time and am having difficulty finding a good reference. What I need to start with is a simple Java class that enforces read-only protection on a Word document. I have got as far as being able to read the protection mode and set it, but when I save the Word document the changes are not written to it.
/**
 * Proof of concept that reports and enforces read-only protection on a Word
 * document using docx4j.
 */
public class Doc4JPOC {

    /**
     * Reports the protection state of a fixed document, protects it, and reports again.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String docName = "/Users/petervannes/Desktop/Unprotected document.docx";
        // String docName = "/Users/petervannes/Desktop/Protected document.docx" ;
        Doc4JPOC d4j = new Doc4JPOC();
        d4j.isProtected(docName);
        d4j.protect(docName);
        d4j.isProtected(docName);
    }

    /** Looks up the settings part through the main document part's relationships. */
    private static DocumentSettingsPart settingsPartOf(WordprocessingMLPackage pkg) {
        MainDocumentPart mdp = pkg.getMainDocumentPart();
        Relationship rs = mdp.getRelationshipsPart().getRelationshipByType(Namespaces.SETTINGS);
        return (DocumentSettingsPart) mdp.getRelationshipsPart().getPart(rs);
    }

    /**
     * Enforces read-only protection on the document and saves it in place.
     *
     * <p>The protection element is set on the settings part's content. The earlier
     * approach only modified {@code w:documentProtection} nodes found via XPath, so
     * a document without such a node was saved unchanged.
     *
     * @param filename path of the .docx file to modify
     */
    private void protect(String filename) {
        try {
            WordprocessingMLPackage wordMLPackage = Docx4J.load(new java.io.File(filename));
            DocumentSettingsPart dsp = settingsPartOf(wordMLPackage);
            CTDocProtect cdtP = new CTDocProtect();
            cdtP.setEnforcement(Boolean.TRUE);
            cdtP.setEdit(STDocProtect.READ_ONLY);
            dsp.getContents().setDocumentProtection(cdtP);
            Docx4J.save(wordMLPackage, new java.io.File(filename), 0);
            System.out.println("Protected document " + filename) ;
        } catch (Docx4JException ex) {
            Logger.getLogger(Doc4JPOC.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Prints whether protection is enforced on the document and, if so, in which mode.
     *
     * @param filename path of the .docx file to inspect
     */
    private void isProtected(String filename) {
        boolean isProtectionEnforced = false;
        STDocProtect editMode = STDocProtect.NONE;
        try {
            WordprocessingMLPackage wordMLPackage = Docx4J.load(new java.io.File(filename));
            DocumentSettingsPart dsp = settingsPartOf(wordMLPackage);
            System.out.println("Partname : " + dsp.getPartName());
            List<Object> nodes = dsp.getJAXBNodesViaXPath("//w:documentProtection", true);
            for (Object obj : nodes) {
                CTDocProtect cdtP = ((CTDocProtect) obj);
                // Boolean.TRUE.equals tolerates an absent (null) enforcement attribute.
                isProtectionEnforced = Boolean.TRUE.equals(cdtP.isEnforcement());
                editMode = cdtP.getEdit();
                System.out.println("Enforced: " + cdtP.isEnforcement());
                System.out.println("Edit: " + cdtP.getEdit());
            }
            if (isProtectionEnforced) {
                // Plain concatenation prints "null" instead of throwing when no edit mode is set.
                System.out.println("Protection is enabled , protection mode is " + editMode);
            } else {
                System.out.println("Protection is disabled");
            }
        } catch (Docx4JException ex) {
            Logger.getLogger(Doc4JPOC.class.getName()).log(Level.SEVERE, null, ex);
        } catch (JAXBException jex) {
            Logger.getLogger(Doc4JPOC.class.getName()).log(Level.SEVERE, null, jex);
        }
    }
}
When executing this class i do get the following output;
Partname : /word/settings.xml
Protection is disabled
Protected document /Users/petervannes/Desktop/Unprotected document.docx
Partname : /word/settings.xml
Protection is disabled
So i suspect that i am not updating the WordprocessingMLPackage or DocumentSettingsPart correctly in the protect method, but have currently no clue where it goes wrong.
Resolved it. Instead of adding the DocumentSettingsPart to the loaded WordprocessingMLPackage. The CTDocProtect instance needs to be used to set the document protection on the content.
CTDocProtect cdtP = new CTDocProtect();
cdtP.setEnforcement(Boolean.TRUE);
cdtP.setEdit(STDocProtect.READ_ONLY);
dsp.getContents().setDocumentProtection(cdtP);
Docx4J.save(wordMLPackage, new java.io.File(filename), 0);
For docx4j v3.3.0, please see http://www.docx4java.org/forums/docx-java-f6/password-set-for-documentprotection-not-accepted-in-msword-t2427.html#p8290

Read Shapes (Rectangle,Square,Circle,Arrow etc), Clip Arts from MS Word File using java

I am able to get image from ms word table but unable to get shapes and clip-arts.
/**
 * Exports the pictures of a fixed sample document; any failure is printed to
 * standard error instead of being propagated.
 */
public static void main(String[] args) throws Exception {
    try {
        // Path of the document to process.
        final String documentPath = "E://test//demo.docx";
        generatePicturesAsImages(documentPath);
    } catch (Exception failure) {
        failure.printStackTrace();
    }
}
/**
 * Renders every picture found in the document to its own JPEG file, named
 * {@code <sourcePath>_output_<index>.jpeg}. Failures are printed, not rethrown.
 *
 * @param sourcePath path of the Word document to read
 */
public static void generatePicturesAsImages(String sourcePath) {
    try {
        Document doc = new Document(sourcePath);
        ImageSaveOptions jpegOptions = new ImageSaveOptions(SaveFormat.JPEG);
        jpegOptions.setJpegQuality(100);
        jpegOptions.setResolution(100);
        // jpegOptions.setUseHighQualityRendering(true);
        List<ShapeRenderer> renderers = getAllPictures(doc);
        if (renderers == null) {
            return;
        }
        int index = 0;
        for (ShapeRenderer renderer : renderers) {
            renderer.save(sourcePath + "_output_" + index + ".jpeg", jpegOptions);
            index++;
        }
    } catch (Exception failure) {
        failure.printStackTrace();
    }
}
private static List<ShapeRenderer> getAllPictures(final Document document) throws Exception {
List<ShapeRenderer> images = null;
#SuppressWarnings("unchecked")
NodeCollection<DrawingML> nodeCollection = document.getChildNodes(NodeType.DRAWING_ML, Boolean.TRUE);
if (nodeCollection.getCount() > 0) {
images = new ArrayList<ShapeRenderer>();
for (DrawingML drawingML : nodeCollection) {
images.add(drawingML.getShapeRenderer());
}
}
return images;
}
Above program is getting images from table so what should i add more to get the shapes.. Please suggest me any help will be appreciate !
You are using an older version of Aspose.Words. If you want to use older version of Aspose.Words, please get the collection of Shape and DrawingML nodes using Document.getChildNodes in your getAllPictures method.
NodeCollection<DrawingML> drwingmlnodes = document.getChildNodes(NodeType.DRAWING_ML, Boolean.TRUE);
NodeCollection<Shape> shapenodes = document.getChildNodes(NodeType.SHAPE, Boolean.TRUE);
Note that we removed the DrawingML from our APIs in Aspose.Words 15.2.0. If you want to use latest version of Aspose.Words v16.5.0, please only use NodeType.SHAPE.
I work with Aspose as Developer evangelist.

Error while trying to classify new instance using Java with Weka-No output instance format defined

I am trying to use Weka in my project to classify text documents using a Naïve Bayes classifier. I found the two classes below on this site.
The first class MyFilteredLearner builds, trains, evaluates, and saves the classifier to disk, this all works fine.
The second class MyFilteredClassifier loads the single text string from a text file and makes it into an instance successfully. It also restores the classifier from disk. What it fails to do is to classify the instance with the method classify(), it instead returns the exception message ‘No output instance format defined’.
I have spend ages searching for an answer, have tried installing the developer and stable versions of Weka, but still get the same issue.
Does anybody know what is incorrect in the code or needs to be added/done differently? The file details and code are as follows:
ARFF file (spam.ARFF) used to train the classifier:
@relation sms_test
@attribute spamclass {spam,ham}
@attribute text String
@data
ham,'Go until jurong point, crazy.. Available only in bugis n great world la e buffet...Cine there got amore wat...'
etc……………………………………………………………………
Single line text file (toClassify.txt) for the new instance:
this is spam or not, who knows?
Code of MyFilteredLearner:
/**
 * Trains, evaluates and saves a Naive Bayes text classifier that is wrapped in a
 * {@code FilteredClassifier} with a {@code StringToWordVector} filter.
 */
public class MyFilteredLearner {
    /** Training data loaded by {@link #loadDataset(String)}. */
    Instances trainData;
    /** Filter used by the classifier built in {@link #learn()}. */
    StringToWordVector filter;
    /** Classifier built by {@link #learn()} and written by {@link #saveModel(String)}. */
    FilteredClassifier classifier;

    /**
     * Reads an ARFF file into {@code trainData}.
     *
     * @param fileName path of the ARFF file
     */
    public void loadDataset(String fileName) {
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            ArffReader arff = new ArffReader(reader);
            trainData = arff.getData();
            System.out.println("===== Loaded dataset: " + fileName + " =====");
        }
        catch (IOException e) {
            System.out.println("Problem found when reading: " + fileName);
            e.printStackTrace();
        }
    }

    /** Builds the classifier on the full training data; attribute 0 is the class. */
    public void learn() {
        try {
            trainData.setClassIndex(0);
            classifier = new FilteredClassifier();
            filter = new StringToWordVector();
            filter.setAttributeIndices("last");
            classifier.setFilter(filter);
            classifier.setClassifier(new NaiveBayes());
            classifier.buildClassifier(trainData);
            System.out.println("===== Training on filtered (training) dataset done =====");
        }
        catch (Exception e) {
            System.out.println("Problem found when training");
            e.printStackTrace();
        }
    }

    /**
     * Runs a 4-fold cross-validation and prints the summary and per-class details.
     *
     * <p>The evaluation uses its own, local classifier and filter. It used to
     * overwrite the {@code classifier} field with an unbuilt instance, so calling
     * it after {@link #learn()} made {@link #saveModel(String)} write an untrained
     * model.
     */
    public void evaluate() {
        try {
            trainData.setClassIndex(0);
            StringToWordVector evalFilter = new StringToWordVector();
            evalFilter.setAttributeIndices("last");
            FilteredClassifier evalClassifier = new FilteredClassifier();
            evalClassifier.setFilter(evalFilter);
            evalClassifier.setClassifier(new NaiveBayes());
            Evaluation eval = new Evaluation(trainData);
            eval.crossValidateModel(evalClassifier, trainData, 4, new Random(1));
            System.out.println(eval.toSummaryString());
            System.out.println(eval.toClassDetailsString());
            System.out.println("===== Evaluating on filtered (training) dataset done =====");
        }
        catch (Exception e) {
            System.out.println("Problem found when evaluating");
            e.printStackTrace();
        }
    }

    /**
     * Serialises the current {@code classifier} field to a file.
     *
     * @param fileName path of the file to write
     */
    public void saveModel(String fileName) {
        try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(fileName))) {
            out.writeObject(classifier);
            System.out.println("Saved model: " + out.toString());
        }
        catch (IOException e) {
            System.out.println("Problem found when writing: " + fileName);
            e.printStackTrace();
            return;
        }
        // Reported only once the stream has been closed successfully.
        System.out.println("===== Saved model: " + fileName + "=====");
    }
}
Code of MyFilteredClassifier:
/**
 * Loads a saved {@code FilteredClassifier} and a text file, wraps the text in a
 * single-instance dataset and prints the predicted class.
 */
public class MyFilteredClassifier {
    /** Text to classify, as read by {@link #load(String)}. */
    String text;
    /** Dataset holding the single instance built by {@link #makeInstance()}. */
    Instances instances;
    /** Model restored by {@link #loadModel(String)}. */
    FilteredClassifier classifier;
    StringToWordVector filter;

    /**
     * Reads the whole file into {@code text}, putting a single space before each line.
     *
     * @param fileName path of the text file
     */
    public void load(String fileName) {
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            text = "";
            StringBuilder joined = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                joined.append(' ').append(line);
            }
            text = joined.toString();
            System.out.println("===== Loaded text data: " + fileName + " =====");
            System.out.println(text);
        }
        catch (IOException e) {
            System.out.println("Problem found when reading: " + fileName);
            e.printStackTrace();
        }
    }

    /**
     * Builds a dataset with a nominal class attribute (spam/ham) at index 0 and a
     * string attribute, and adds one instance carrying {@code text}.
     */
    public void makeInstance() {
        FastVector fvNominalVal = new FastVector(2);
        fvNominalVal.addElement("spam");
        fvNominalVal.addElement("ham");
        Attribute attribute1 = new Attribute("class", fvNominalVal);
        Attribute attribute2 = new Attribute("text",(FastVector) null);
        FastVector fvWekaAttributes = new FastVector(2);
        fvWekaAttributes.addElement(attribute1);
        fvWekaAttributes.addElement(attribute2);
        instances = new Instances("Test relation", fvWekaAttributes,1);
        instances.setClassIndex(0);
        DenseInstance instance = new DenseInstance(2);
        instance.setValue(attribute2, text);
        instances.add(instance);
        System.out.println("===== Instance created with reference dataset =====");
        System.out.println(instances);
    }

    /**
     * Restores the classifier from a file written with Java serialisation.
     *
     * @param fileName path of the model file
     */
    public void loadModel(String fileName) {
        // NOTE(review): Java deserialisation executes code from the file's object
        // graph; only load model files from a trusted source.
        try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(fileName))) {
            Object tmp = in.readObject();
            classifier = (FilteredClassifier) tmp;
            System.out.println("===== Loaded model: " + fileName + "=====");
        }
        catch (Exception e) {
            System.out.println("Problem found when reading: " + fileName);
            e.printStackTrace();
        }
    }

    /** Classifies the first instance of {@code instances} and prints the predicted label. */
    public void classify() {
        try {
            double pred = classifier.classifyInstance(instances.instance(0));
            System.out.println("===== Classified instance =====");
            System.out.println("Class predicted: " + instances.classAttribute().value((int) pred));
        }
        catch (Exception e) {
            System.out.println("Error: " + e.getMessage());
        }
    }

    /**
     * Trains and saves a model, then classifies the text in {@code toClassify.txt}.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        MyFilteredLearner c = new MyFilteredLearner();
        c.loadDataset("spam.ARFF");
        // Evaluate first and train last: evaluate() replaces the learner's classifier
        // with an unbuilt one, so training must be the final step before saving.
        c.evaluate();
        c.learn();
        c.saveModel("spamClassifier.binary");
        MyFilteredClassifier c1 = new MyFilteredClassifier();
        c1.load("toClassify.txt");
        c1.loadModel("spamClassifier.binary");
        c1.makeInstance();
        c1.classify();
    }
}
It seems you change the code from the blog's GitHub repository in one detail and it is the cause of your error:
c.learn();
c.evaluate();
vs
c.evaluate();
c.learn();
The evaluate() method resets the classifier with the line:
classifier = new FilteredClassifier();
but doesn't build a model. The actual evaluation uses a copy of the passed classifier, so the original classifier (the one in your class) remains untrained.
// weka/classifiers/Evaluation.java (method: crossValidateModel)
Classifier copiedClassifier = Classifier.makeCopy(classifier);
copiedClassifier.buildClassifier(train);
So you first build your model, but then overwrite it when evaluating it and then save the uninitialized model. Switch them around so you train it directly before saving it to a file, then it works.

Error While Reading Large Excel Files (xlsx) Via Apache POI

I am trying to read large excel files xlsx via Apache POI, say 40-50 MB. I am getting out of memory exception. The current heap memory is 3GB.
I can read smaller Excel files without any issues. I need a way to read large Excel files and then send them back as the response via a Spring Excel view.
public class FetchExcel extends AbstractView {
#Override
protected void renderMergedOutputModel(
Map model, HttpServletRequest request, HttpServletResponse response)
throws Exception {
String fileName = "SomeExcel.xlsx";
response.setContentType("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
OPCPackage pkg = OPCPackage.open("/someDir/SomeExcel.xlsx");
XSSFWorkbook workbook = new XSSFWorkbook(pkg);
ServletOutputStream respOut = response.getOutputStream();
pkg.close();
workbook.write(respOut);
respOut.flush();
workbook = null;
response.setHeader("Content-disposition", "attachment;filename=\"" +fileName+ "\"");
}
}
I first started off using XSSFWorkbook workbook = new XSSFWorkbook(FileInputStream in);
but that was costly per Apache POI API, so I switched to OPC package way but still the same effect. I don't need to parse or process the file, just read it and return it.
Here is an example of reading a large .xlsx file using a SAX parser.
/**
 * Streams every sheet of an .xlsx file through {@code processSheet} using the
 * POI event API instead of loading the workbook into memory.
 *
 * @param file the .xlsx file to read
 * @throws IOException if the package or a sheet stream cannot be read or closed
 */
public void parseExcel(File file) throws IOException {
    // Open read-only and close on every path; the package was previously opened
    // with the default access mode and never closed.
    try (OPCPackage container = OPCPackage.open(
            file.getAbsolutePath(), org.apache.poi.openxml4j.opc.PackageAccess.READ)) {
        ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container);
        XSSFReader xssfReader = new XSSFReader(container);
        StylesTable styles = xssfReader.getStylesTable();
        XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
        while (iter.hasNext()) {
            // Each sheet stream is closed even when processing it fails.
            try (InputStream stream = iter.next()) {
                processSheet(styles, strings, stream);
            }
        }
    } catch (InvalidFormatException e) {
        e.printStackTrace();
    } catch (SAXException e) {
        e.printStackTrace();
    } catch (OpenXML4JException e) {
        e.printStackTrace();
    }
}
protected void processSheet(StylesTable styles, ReadOnlySharedStringsTable strings, InputStream sheetInputStream) throws IOException, SAXException {
InputSource sheetSource = new InputSource(sheetInputStream);
SAXParserFactory saxFactory = SAXParserFactory.newInstance();
try {
SAXParser saxParser = saxFactory.newSAXParser();
XMLReader sheetParser = saxParser.getXMLReader();
ContentHandler handler = new XSSFSheetXMLHandler(styles, strings, new SheetContentsHandler() {
#Override
public void startRow(int rowNum) {
}
#Override
public void endRow() {
}
#Override
public void cell(String cellReference, String formattedValue) {
}
#Override
public void headerFooter(String text, boolean isHeader, String tagName) {
}
},
false//means result instead of formula
);
sheetParser.setContentHandler(handler);
sheetParser.parse(sheetSource);
} catch (ParserConfigurationException e) {
throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
}
You don't mention whether you need to modify the spreadsheet or not.
This may be obvious, but if you don't need to modify the spreadsheet, then you don't need to parse it and write it back out, you can simply read bytes from the file, and write out bytes, as you would with, say an image, or any other binary format.
If you do need to modify the spreadsheet before sending it to the user, then to my knowledge, you may have to take a different approach.
Every library that I'm aware of for reading Excel files in Java reads the whole spreadsheet into memory, so you'd have to have 50MB of memory available for every spreadsheet that could possibly be concurrently processed. This involves, as others have pointed out, adjusting the heap available to the VM.
If you need to process a large number of spreadsheets concurrently, and can't allocate enough memory, consider using a format that can be streamed, instead of read all at once into memory. CSV format can be opened by Excel, and I've had good results in the past by setting the content-type to application/vnd.ms-excel, setting the attachment filename to something ending in ".xls", but actually returning CSV content. I haven't tried this in a couple of years, so YMMV.
In the example below I include complete code showing how to parse an entire Excel file (60 MB in my case) into a list of objects without any "out of memory" problem; it works fine:
import java.util.ArrayList;
import java.util.List;
/**
 * Parses a large .xlsx file sheet by sheet with the POI event API and collects
 * the rows into {@code resultMapping}.
 */
class DistinctByProperty {
    private static PrintStream output= System.out;
    // NOTE(review): this list is passed to MappingFromXml, whose constructor is
    // shown elsewhere taking a List of a different element type - confirm the
    // element types agree.
    private static List<MassUpdateMonitoringRow> resultMapping = new ArrayList<>();

    /**
     * Prints the size of a fixed file in megabytes and parses it.
     *
     * @param args unused
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws IOException {
        File file = new File("C:\\Users\\aberguig032018\\Downloads\\your_excel.xlsx");
        double bytes = file.length();
        double kilobytes = (bytes / 1024);
        double megabytes = (kilobytes / 1024);
        System.out.println("Size "+megabytes);
        parseExcel(file);
    }

    /**
     * Opens the workbook read-only and feeds each sheet to {@code processSheet}.
     *
     * @param file the .xlsx file to read
     * @throws IOException if the package or a sheet stream cannot be read or closed
     */
    public static void parseExcel(File file) throws IOException {
        // The package is closed on every path; it used to be kept in a static
        // field and never closed.
        try (OPCPackage xlsxPackage = OPCPackage.open(file.getAbsolutePath(), PackageAccess.READ)) {
            ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(xlsxPackage);
            XSSFReader xssfReader = new XSSFReader(xlsxPackage);
            StylesTable styles = xssfReader.getStylesTable();
            XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
            int index = 0;
            while (iter.hasNext()) {
                try (InputStream stream = iter.next()) {
                    String sheetName = iter.getSheetName();
                    output.println();
                    output.println(sheetName + " [index=" + index + "]:");
                    processSheet(styles, strings, new MappingFromXml(resultMapping), stream);
                }
                ++index;
            }
        } catch (InvalidFormatException e) {
            e.printStackTrace();
        } catch (OpenXML4JException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses one sheet's XML and hands rows and cells to the given mapping handler.
     *
     * @param styles           styles table of the workbook
     * @param strings          shared strings table of the workbook
     * @param mappingFromXml   handler that receives the rows and cells
     * @param sheetInputStream raw XML stream of the sheet
     * @throws IOException  if reading the stream fails
     * @throws SAXException if the sheet XML cannot be parsed
     */
    private static void processSheet(StylesTable styles, ReadOnlySharedStringsTable strings, MappingFromXml mappingFromXml, InputStream sheetInputStream) throws IOException, SAXException {
        DataFormatter formatter = new DataFormatter();
        InputSource sheetSource = new InputSource(sheetInputStream);
        try {
            XMLReader sheetParser = SAXHelper.newXMLReader();
            ContentHandler handler = new XSSFSheetXMLHandler(
                    styles, null, strings, mappingFromXml, formatter, false);
            sheetParser.setContentHandler(handler);
            sheetParser.parse(sheetSource);
            System.out.println("Size of Array "+resultMapping.size());
        } catch(ParserConfigurationException e) {
            // Keep the original exception as the cause so its stack trace is not lost.
            throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
        }
    }
}
You also have to add a class that implements
SheetContentsHandler
import com.sun.org.apache.xpath.internal.operations.Bool;
import org.apache.poi.ss.util.CellAddress;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.usermodel.XSSFComment;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
public class MappingFromXml implements SheetContentsHandler {
private List<myObject> result = new ArrayList<>();
private myObject myObject = null;
private int lineNumber = 0;
/**
* Number of columns to read starting with leftmost
*/
private int minColumns = 25;
/**
* Destination for data
*/
private PrintStream output = System.out;
public MappingFromXml(List<myObject> list) {
this.result = list;
}
#Override
public void startRow(int i) {
output.println("iii " + i);
lineNumber = i;
myObject = new myObject();
}
#Override
public void endRow(int i) {
output.println("jjj " + i);
result.add(myObject);
myObject = null;
}
#Override
public void cell(String cellReference, String formattedValue, XSSFComment comment) {
int columnIndex = (new CellReference(cellReference)).getCol();
if(lineNumber > 0){
switch (columnIndex) {
case 0: {//Tech id
if (formattedValue != null && !formattedValue.isEmpty())
myObject.setId(Integer.parseInt(formattedValue));
}
break;
//TODO add other cell
}
}
}
#Override
public void headerFooter(String s, boolean b, String s1) {
}
}
For more information, visit this link.
I too faced the same issue of OOM while parsing xlsx file...after two days of struggle, I finally found out the below code that was really perfect;
This code is based on sjxlsx. It reads the xlsx and stores in a HSSF sheet.
[code=java]
// Read the xlsx file row by row with sjxlsx and copy every non-null cell, as a
// string, into a newly created HSSF sheet.
// The declaration below previously had no variable name, although `workbook` is
// what the getSheet call further down refers to.
SimpleXLSXWorkbook workbook = new SimpleXLSXWorkbook(new File("C:/test.xlsx"));
HSSFWorkbook hsfWorkbook = new HSSFWorkbook();
org.apache.poi.ss.usermodel.Sheet hsfSheet = hsfWorkbook.createSheet();
Sheet sheetToRead = workbook.getSheet(0, false);
SheetRowReader reader = sheetToRead.newReader();
Cell[] row;
int rowPos = 0;
while ((row = reader.readRow()) != null) {
    org.apache.poi.ss.usermodel.Row hfsRow = hsfSheet.createRow(rowPos);
    int cellPos = 0;
    for (Cell cell : row) {
        // Null entries are skipped but still advance the column position.
        if(cell != null){
            org.apache.poi.ss.usermodel.Cell hfsCell = hfsRow.createCell(cellPos);
            hfsCell.setCellType(org.apache.poi.ss.usermodel.Cell.CELL_TYPE_STRING);
            hfsCell.setCellValue(cell.getValue());
        }
        cellPos++;
    }
    rowPos++;
}
return hsfSheet;[/code]

Categories