My mission is pretty simple: converting every single page of a PDF file into images. I tried using the open-source version of ICEpdf to generate the images, but it does not render them with the correct font, so I started using PDFBox instead. The code is the following:
// Load the PDF and fetch the page list from its document catalog.
PDDocument document = PDDocument.load(new File("testing.pdf"));
List<PDPage> pages = document.getDocumentCatalog().getAllPages();
// Render each page in document order and store it as a numbered PNG file.
for (PDPage singlePage : pages) {
    // NOTE(review): 8 and 12 are handed straight to convertToImage; they look like an
    // image type and a resolution, but that helper is not visible here -- confirm.
    BufferedImage buffImage = convertToImage(singlePage, 8, 12);
    // `count` is declared outside this snippet and is advanced once per written page.
    File target = new File(PdfUtil.DATA_OUTPUT_DIR + (count++) + ".png");
    ImageIO.write(buffImage, "png", target);
}
The font looks good, but the pictures within the PDF file look faded (see the attachment). I looked into the source code, but I still have no clue how to fix it. Do you have any idea what's going on? Please help. Thanks!
Convert PDF file 04-Request-Headers.pdf to image using pdfbox.
Download this file and paste it in Documents folder.
Example:
package com.pdf.pdfbox.test;
import java.awt.HeadlessException;
import java.awt.Toolkit;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFImageWriter;
public class ConvertPDFPageToImageWithoutText {
public static void main(String[] args) {
try {
String oldPath = "C:/Documents/04-Request-Headers.pdf";
File oldFile = new File(oldPath);
if (oldFile.exists()) {
PDDocument document = PDDocument.load(oldPath);
#SuppressWarnings("unchecked")
List<PDPage> list = document.getDocumentCatalog().getAllPages();
String fileName = oldFile.getName().replace(".pdf", "");
String imageFormat = "png";
String password = "";
int startPage = 1;
int endPage = list.size();
String outputPrefix = "C:/Documents/PDFCopy/";//converted images saved here
File file = new File(outputPrefix);
if (!file.exists()) {
file.mkdirs();
}
int imageType = 24;
String color = "rgb";
int resolution;
try {
resolution = Toolkit.getDefaultToolkit().getScreenResolution();
} catch (HeadlessException e) {
resolution = 96;
}
if ("bilevel".equalsIgnoreCase(color)) {
imageType = BufferedImage.TYPE_BYTE_BINARY;
} else if ("indexed".equalsIgnoreCase(color)) {
imageType = BufferedImage.TYPE_BYTE_INDEXED;
} else if ("gray".equalsIgnoreCase(color)) {
imageType = BufferedImage.TYPE_BYTE_GRAY;
} else if ("rgb".equalsIgnoreCase(color)) {
imageType = BufferedImage.TYPE_INT_RGB;
} else if ("rgba".equalsIgnoreCase(color)) {
imageType = BufferedImage.TYPE_INT_ARGB;
} else {
System.err.println("Error: the number of bits per pixel must be 1, 8 or 24.");
}
PDFImageWriter pdfImageWriter = new PDFImageWriter();
boolean imageWriter = pdfImageWriter.writeImage(document, imageFormat, password, startPage, endPage, outputPrefix + fileName, imageType, resolution);
if (!imageWriter) {
throw new Exception("No writer found for format '" + imageFormat + "'");
}
document.close();
} else {
System.err.println(oldPath +" File Can't be found");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
OR
Alternatively, try the solution below to convert PDF files to an image format.
How to Convert PDF to image with resolution in java Using PDF Renderer
Use the following code for conversions it works fine!
import java.awt.HeadlessException;
import java.awt.Toolkit;
import java.awt.image.BufferedImage;
import javax.imageio.ImageIO;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFImageWriter;
/**
 * Convert a PDF document to an image.
 *
 * @author Ben Litchfield
 * @version $Revision: 1.6 $
 */
public class PDFToImage
{
    private static final String PASSWORD = "-password";
    private static final String START_PAGE = "-startPage";
    private static final String END_PAGE = "-endPage";
    private static final String IMAGE_FORMAT = "-imageType";
    private static final String OUTPUT_PREFIX = "-outputPrefix";
    private static final String COLOR = "-color";
    private static final String RESOLUTION = "-resolution";

    /**
     * private constructor.
     */
    private PDFToImage()
    {
        //static class
    }

    /**
     * Infamous main method.
     *
     * Without arguments the built-in default PDF and output prefix are used. A PDF given
     * on the command line replaces the default; if no output prefix accompanies it, the
     * prefix is derived from the PDF name.
     *
     * @param args Command line arguments, should be one and a reference to a file.
     *
     * @throws Exception If there is an error parsing the document.
     */
    public static void main( String[] args ) throws Exception
    {
        String password = "";
        boolean passwordSupplied = false;
        String pdfFile = "D:/docoverview.pdf";
        boolean pdfFileSupplied = false;
        String outputPrefix = "D:/printdata/pdfimages/";
        boolean outputPrefixSupplied = false;
        String imageFormat = "jpg";
        int startPage = 1;
        int endPage = Integer.MAX_VALUE;
        String color = "rgb";
        int resolution;
        try
        {
            resolution = Toolkit.getDefaultToolkit().getScreenResolution();
        }
        catch( HeadlessException e )
        {
            // No display available: fall back to a common screen resolution.
            resolution = 96;
        }

        for( int i = 0; i < args.length; i++ )
        {
            if( args[i].equals( PASSWORD ) )
            {
                password = optionValue( args, ++i );
                passwordSupplied = true;
            }
            else if( args[i].equals( START_PAGE ) )
            {
                startPage = Integer.parseInt( optionValue( args, ++i ) );
            }
            else if( args[i].equals( END_PAGE ) )
            {
                endPage = Integer.parseInt( optionValue( args, ++i ) );
            }
            else if( args[i].equals( IMAGE_FORMAT ) )
            {
                imageFormat = optionValue( args, ++i );
            }
            else if( args[i].equals( OUTPUT_PREFIX ) )
            {
                outputPrefix = optionValue( args, ++i );
                outputPrefixSupplied = true;
            }
            else if( args[i].equals( COLOR ) )
            {
                color = optionValue( args, ++i );
            }
            else if( args[i].equals( RESOLUTION ) )
            {
                resolution = Integer.parseInt( optionValue( args, ++i ) );
            }
            else if( !pdfFileSupplied )
            {
                // The first positional argument replaces the built-in default PDF.
                pdfFile = args[i];
                pdfFileSupplied = true;
            }
        }

        if( pdfFileSupplied && !outputPrefixSupplied )
        {
            // Write next to the supplied PDF: its name without the extension.
            int dot = pdfFile.lastIndexOf( '.' );
            outputPrefix = dot > 0 ? pdfFile.substring( 0, dot ) : pdfFile;
        }

        PDDocument document = null;
        try
        {
            document = PDDocument.load( pdfFile );
            if( document.isEncrypted() )
            {
                try
                {
                    document.decrypt( password );
                }
                catch( InvalidPasswordException e )
                {
                    if( passwordSupplied )//they supplied the wrong password
                    {
                        System.err.println( "Error: The supplied password is incorrect." );
                        System.exit( 2 );
                    }
                    else
                    {
                        //they didn't supply a password and the default of "" was wrong.
                        System.err.println( "Error: The document is encrypted." );
                        usage();
                    }
                }
            }

            int imageType = imageTypeFor( color );
            if( imageType < 0 )
            {
                System.err.println( "Error: unknown color '" + color
                        + "' (valid: bilevel, indexed, gray, rgb, rgba)." );
                System.exit( 2 );
            }

            //Make the call
            PDFImageWriter imageWriter = new PDFImageWriter();
            boolean success = imageWriter.writeImage(document, imageFormat, password,
                    startPage, endPage, outputPrefix, imageType, resolution);
            if (!success)
            {
                System.err.println( "Error: no writer found for image format '"
                        + imageFormat + "'" );
                System.exit(1);
            }
        }
        catch (Exception e)
        {
            System.err.println(e);
        }
        finally
        {
            if( document != null )
            {
                document.close();
            }
        }
    }

    /**
     * Returns the value that follows an option, or prints the usage and exits when the
     * option is the last token on the command line.
     *
     * @param args the command line arguments
     * @param index position at which the option value is expected
     * @return the option value
     */
    private static String optionValue( String[] args, int index )
    {
        if( index >= args.length )
        {
            usage();
        }
        return args[index];
    }

    /**
     * Maps a color name to the matching {@code BufferedImage} type constant.
     *
     * @param color one of bilevel, indexed, gray, rgb or rgba (case-insensitive)
     * @return the image type, or -1 when the name is not recognised
     */
    private static int imageTypeFor( String color )
    {
        if ("bilevel".equalsIgnoreCase(color))
        {
            return BufferedImage.TYPE_BYTE_BINARY;
        }
        if ("indexed".equalsIgnoreCase(color))
        {
            return BufferedImage.TYPE_BYTE_INDEXED;
        }
        if ("gray".equalsIgnoreCase(color))
        {
            return BufferedImage.TYPE_BYTE_GRAY;
        }
        if ("rgb".equalsIgnoreCase(color))
        {
            return BufferedImage.TYPE_INT_RGB;
        }
        if ("rgba".equalsIgnoreCase(color))
        {
            return BufferedImage.TYPE_INT_ARGB;
        }
        return -1;
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage()
    {
        System.err.println( "Usage: java org.apache.pdfbox.PDFToImage [OPTIONS] <PDF file>\n" +
            " -password <password> Password to decrypt document\n" +
            " -imageType <image type> (" + getImageFormats() + ")\n" +
            " -outputPrefix <output prefix> Filename prefix for image files\n" +
            " -startPage <number> The first page to start extraction(1 based)\n" +
            " -endPage <number> The last page to extract(inclusive)\n" +
            " -color <string> The color depth (valid: bilevel, indexed, gray, rgb, rgba)\n" +
            " -resolution <number> The bitmap resolution in dpi\n" +
            " <PDF file> The PDF document to use\n"
            );
        System.exit(1);
    }

    /**
     * Lists the image formats ImageIO can write, since this tool produces image files.
     *
     * @return the comma-separated writer format names
     */
    private static String getImageFormats()
    {
        StringBuilder retval = new StringBuilder();
        String[] formats = ImageIO.getWriterFormatNames();
        for( int i = 0; i < formats.length; i++ )
        {
            retval.append( formats[i] );
            if( i + 1 < formats.length )
            {
                retval.append( "," );
            }
        }
        return retval.toString();
    }
}
I ended up trying the different PDF libraries out there. The best solution is to use "JPedal", but only a trial version is available for free. You can also try ICEpdf for free, but it might not render the correct font.
Related
I need to extract images from a PDF, and I am doing it via PDFBox (v 1.8.9).
It works well in 90% of cases, but some images are saved with a black background (or are completely white) when extracted, even though they look perfectly good in the original PDF. I imagine it has something to do with those JPG files. What should I check in the JPGs?
I am trying to see If I can upload an example pdf
This is the relevant (quite standard) piece of code...
String pdfFile = promptForPDFFile(jf, "Select PDF file");
// Open the chosen PDF and take its page list.
PDDocument document = PDDocument.load(pdfFile);
List pages = document.getDocumentCatalog().getAllPages();
int pagetot = pages.size();
int pagenum = 1;
// Walk the pages; each page's images are written under a name built from the page text.
for (Object pageEntry : pages)
{
    PDPage page = (PDPage) pageEntry;
    PDResources resources = page.getResources();
    // Restrict text extraction to the current page only.
    PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.setStartPage(pagenum);
    textStripper.setEndPage(pagenum);
    Map images = resources.getImages();
    // The page text with line breaks stripped serves as the output file name.
    String pagecontent = textStripper.getText(document)
            .replaceAll("\n", "")
            .replaceAll("\r", "");
    if (images != null)
    {
        for (Object entryKey : images.keySet())
        {
            String key = (String) entryKey;
            PDXObjectImage pageImage = (PDXObjectImage) images.get(key);
            File outputDir = new File(tempPath+"/temp/");
            outputDir.mkdirs();
            // NOTE(review): the name depends only on the page text, so every image of a
            // page is written under the same name -- confirm this is intended.
            String target = tempPath+"/temp/"+pagecontent;
            //System.out.println( "Writing image:" + target );
            pageImage.write2file(target);
        }
    }
    pagenum++;
    // Progress output whenever the counter reaches a multiple of ten.
    if (pagenum % 10 == 0)
    {
        System.out.print("\n--- "+ pagenum +"/"+pagetot);
    }
}
Thanks in advance
I ran ExtractImages.java against the two files you sent me. The problem file has CMYK images, as can be seen with this screenshot from PDFDebugger:
The problem is that the 1.8 version doesn't handle CMYK images properly.
But there's a trick:
The images are encoded with the DCTDecode filter, which is JPEG. You have "real JPEGs" in the PDF.
I am able to extract your images properly by using the "-directJPEG" option of that tool, which bypasses the decoding mechanism of PDFBox, and just saves the JPEG files "as is".
Note that while this works nicely with your files, it doesn't work properly if the images have an external colorspace specified in the PDF.
Here's the full source code. See writeJpeg2file() for the raw extraction details.
/**
 * Command line tool that extracts the images embedded in a PDF document, optionally
 * copying JPEG (DCTDecode) streams to disk without decoding them.
 */
public class ExtractImages
{
    // Running number used to build unique output file names.
    private int imageCounter = 1;

    private static final String PASSWORD = "-password";
    private static final String PREFIX = "-prefix";
    private static final String ADDKEY = "-addkey";
    private static final String NONSEQ = "-nonSeq";
    private static final String DIRECTJPEG = "-directJPEG";

    // Largest valid command line: -password <p> -prefix <x> -addkey -nonSeq -directJPEG <file>.
    private static final int MAX_ARGS = 8;

    // Filters that are left in place when a JPEG stream is copied out raw.
    private static final List<String> DCT_FILTERS = new ArrayList<String>();

    static
    {
        DCT_FILTERS.add( COSName.DCT_DECODE.getName() );
        DCT_FILTERS.add( COSName.DCT_DECODE_ABBREVIATION.getName() );
    }

    private ExtractImages()
    {
    }

    /**
     * This is the entry point for the application.
     *
     * @param args The command-line arguments.
     *
     * @throws Exception If there is an error decrypting the document.
     */
    public static void main( String[] args ) throws Exception
    {
        ExtractImages extractor = new ExtractImages();
        extractor.extractImages( args );
    }

    /**
     * Parses the command line, opens the document and writes out the images of every page.
     *
     * @param args The command-line arguments.
     *
     * @throws Exception If the document cannot be opened, decrypted or read, or if
     *                   extraction is not permitted.
     */
    private void extractImages( String[] args ) throws Exception
    {
        if( args.length < 1 || args.length > MAX_ARGS )
        {
            usage();
            return;
        }

        String pdfFile = null;
        String password = "";
        String prefix = null;
        boolean addKey = false;
        boolean useNonSeqParser = false;
        boolean directJPEG = false;
        for( int i=0; i<args.length; i++ )
        {
            if( args[i].equals( PASSWORD ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                password = args[i];
            }
            else if( args[i].equals( PREFIX ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                prefix = args[i];
            }
            else if( args[i].equals( ADDKEY ) )
            {
                addKey = true;
            }
            else if( args[i].equals( NONSEQ ) )
            {
                useNonSeqParser = true;
            }
            else if( args[i].equals( DIRECTJPEG ) )
            {
                directJPEG = true;
            }
            else if( pdfFile == null )
            {
                pdfFile = args[i];
            }
        }

        if( pdfFile == null )
        {
            usage();
            return;
        }

        if( prefix == null )
        {
            // Default prefix: the file name without its 4-character extension; very short
            // names are used unchanged so the prefix is never null.
            prefix = pdfFile.length() > 4
                    ? pdfFile.substring( 0, pdfFile.length() - 4 )
                    : pdfFile;
        }

        PDDocument document = null;
        try
        {
            if (useNonSeqParser)
            {
                document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
            }
            else
            {
                document = PDDocument.load( pdfFile );
                if( document.isEncrypted() )
                {
                    StandardDecryptionMaterial spm = new StandardDecryptionMaterial(password);
                    document.openProtection(spm);
                }
            }
            AccessPermission ap = document.getCurrentAccessPermission();
            if( ! ap.canExtractContent() )
            {
                throw new IOException(
                    "Error: You do not have permission to extract images." );
            }

            List pages = document.getDocumentCatalog().getAllPages();
            for( Object entry : pages )
            {
                PDPage page = (PDPage)entry;
                PDResources resources = page.getResources();
                // extract all XObjectImages which are part of the page resources
                processResources(resources, prefix, addKey, directJPEG);
            }
        }
        finally
        {
            if( document != null )
            {
                document.close();
            }
        }
    }

    /**
     * Copies the JPEG data of an image to {@code filename + ".jpg"} without decoding it.
     *
     * @param image the JPEG image whose stream is copied
     * @param filename target file name without extension
     *
     * @throws IOException If reading the stream or writing the file fails.
     */
    public void writeJpeg2file(PDJpeg image, String filename) throws IOException
    {
        FileOutputStream out = null;
        InputStream data = null;
        try
        {
            out = new FileOutputStream(filename + ".jpg");
            data = image.getPDStream().getPartiallyFilteredStream(DCT_FILTERS);
            byte[] buf = new byte[1024];
            int amountRead;
            while ((amountRead = data.read(buf)) != -1)
            {
                out.write(buf, 0, amountRead);
            }
            out.flush();
        }
        finally
        {
            // Close the source stream on the failure path as well.
            IOUtils.closeQuietly(data);
            if (out != null)
            {
                out.close();
            }
        }
    }

    /**
     * Writes every image found in the given resources and recurses into form XObjects.
     *
     * @param resources the resources to scan; {@code null} is ignored
     * @param prefix file name prefix for the written images
     * @param addKey whether the internal image key is appended to the prefix
     * @param directJPEG whether JPEG images are copied raw instead of being decoded
     *
     * @throws IOException If an image cannot be written.
     */
    private void processResources(PDResources resources, String prefix,
            boolean addKey, boolean directJPEG) throws IOException
    {
        if (resources == null)
        {
            return;
        }
        Map<String, PDXObject> xobjects = resources.getXObjects();
        if( xobjects != null )
        {
            for( Map.Entry<String, PDXObject> entry : xobjects.entrySet() )
            {
                String key = entry.getKey();
                PDXObject xobject = entry.getValue();
                // write the images
                if (xobject instanceof PDXObjectImage)
                {
                    PDXObjectImage image = (PDXObjectImage)xobject;
                    String name;
                    if (addKey)
                    {
                        name = getUniqueFileName( prefix + "_" + key, image.getSuffix() );
                    }
                    else
                    {
                        name = getUniqueFileName( prefix, image.getSuffix() );
                    }
                    System.out.println( "Writing image:" + name );
                    if (directJPEG && "jpg".equals(image.getSuffix()))
                    {
                        writeJpeg2file((PDJpeg) image, name);
                    }
                    else
                    {
                        image.write2file(name);
                    }
                    image.clear(); // PDFBOX-2101 get rid of cache ASAP
                }
                // maybe there are more images embedded in a form object
                else if (xobject instanceof PDXObjectForm)
                {
                    PDXObjectForm xObjectForm = (PDXObjectForm)xobject;
                    PDResources formResources = xObjectForm.getResources();
                    processResources(formResources, prefix, addKey, directJPEG);
                }
            }
        }
        resources.clear();
    }

    /**
     * Finds the next {@code prefix-N} name for which no file with the given suffix exists.
     *
     * @param prefix file name prefix
     * @param suffix file extension without the dot
     * @return the unique name without extension
     */
    private String getUniqueFileName( String prefix, String suffix )
    {
        String uniqueName = null;
        File f = null;
        while( f == null || f.exists() )
        {
            uniqueName = prefix + "-" + imageCounter;
            f = new File( uniqueName + "." + suffix );
            imageCounter++;
        }
        return uniqueName;
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage()
    {
        System.err.println( "Usage: java org.apache.pdfbox.ExtractImages [OPTIONS] <PDF file>\n" +
            " -password <password> Password to decrypt document\n" +
            " -prefix <image-prefix> Image prefix(default to pdf name)\n" +
            " -addkey add the internal image key to the file name\n" +
            " -nonSeq Enables the new non-sequential parser\n" +
            " -directJPEG Forces the direct extraction of JPEG images regardless of colorspace\n" +
            " <PDF file> The PDF document to use\n"
            );
        System.exit( 1 );
    }
}
After reading endless documents and trying to understand the OpenCV/JavaCV examples for extracting keypoints and computing features with some DescriptorExtractors (in order to match an input image against a bunch of images and see whether the input image is one of them, or part of one of them), I think we should be storing the Mat objects after computing them.
I will use Emily Webb's code as an example:
// Paths of the two images to compare: a resized copy and the full-size original.
String smallUrl = "rsz_our-mobile-planet-us-infographic_infographics_lg_unberela.jpg";
String largeUrl = "our-mobile-planet-us-infographic_infographics_lg.jpg";
// Load both images as they are stored (no colour conversion requested).
IplImage image = cvLoadImage(largeUrl,CV_LOAD_IMAGE_UNCHANGED );
IplImage image2 = cvLoadImage(smallUrl,CV_LOAD_IMAGE_UNCHANGED );
// Output matrices that will receive the descriptors of each image.
CvMat descriptorsA = new CvMat(null);
CvMat descriptorsB = new CvMat(null);
// NOTE(review): the arguments (40, true) are presumably the FAST threshold and the
// non-maximum-suppression flag -- confirm against the JavaCV version in use.
final FastFeatureDetector ffd = new FastFeatureDetector(40, true);
final KeyPoint keyPoints = new KeyPoint();
final KeyPoint keyPoints2 = new KeyPoint();
// Detect keypoints in each image; the null argument means no mask is passed.
ffd.detect(image, keyPoints, null);
ffd.detect(image2, keyPoints2, null);
System.out.println("keyPoints.size() : "+keyPoints.size());
System.out.println("keyPoints2.size() : "+keyPoints2.size());
// Alternative extractors that were tried are kept commented out below.
// BRISK extractor = new BRISK();
//BriefDescriptorExtractor extractor = new BriefDescriptorExtractor();
FREAK extractor = new FREAK();
// Compute the descriptors at the detected keypoints of each image.
extractor.compute(image, keyPoints, descriptorsA);
extractor.compute(image2, keyPoints2, descriptorsB);
System.out.println("descriptorsA.size() : "+descriptorsA.size());
System.out.println("descriptorsB.size() : "+descriptorsB.size());
// Receives the matches produced by the matcher below.
DMatch dmatch = new DMatch();
//FlannBasedMatcher matcher = new FlannBasedMatcher();
//DescriptorMatcher matcher = new DescriptorMatcher();
BFMatcher matcher = new BFMatcher();
// Match the descriptors of the first image against those of the second (no mask).
matcher.match(descriptorsA, descriptorsB, dmatch, null);
System.out.println(dmatch.capacity());
My question is :
How can I store descriptorsA (or descriptorsB) in a DB --in java implementation of opencv- ? (They are Mat objects obtained after extractor.compute(image, keyPoints, descriptorsA); )
I am aware that Mat objects are not serializable in the Java implementation, but surely, if you want to match an image against a set of archive images, you have to extract the descriptors of your archive and store them somewhere for future use.
After some more search I have found some links in http://answers.opencv.org/question/8873/best-way-to-store-a-mat-object-in-android/
Although the answers are mainly for android devices and referring to earlier questions about saving keypoints ( Saving ORB feature vectors using OpenCV4Android (java API)), the answer "from Mat object to xml and xml to Mat object" in the code below seems to be working:
import org.opencv.core.CvType;
import org.opencv.core.Mat;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.util.Locale;
import java.util.Scanner;
/**
 * Minimal XML storage for OpenCV {@code Mat} objects. Each matrix is stored below the
 * {@code opencv_storage} root as an element with {@code rows}, {@code cols}, {@code dt}
 * and {@code data} children.
 *
 * <p>Supported element types are CV_32F ("f"), CV_32S ("i"), CV_16S ("s") and CV_8U ("b").
 * Errors are reported on the standard streams rather than thrown.
 */
public class TaFileStorage {
    // Open-mode flags for open(String, int).
    public static final int READ = 0;
    public static final int WRITE = 1;

    // State of the currently opened storage.
    private File file;
    private boolean isWrite;
    private Document doc;
    private Element rootElement;

    public TaFileStorage() {
        file = null;
        isWrite = false;
        doc = null;
        rootElement = null;
    }

    /**
     * Opens a storage file for reading or writing.
     *
     * @param filePath path of the XML file
     * @param flags {@link #READ} to read an existing file; any other value creates a new one
     */
    public void open(String filePath, int flags ) {
        try {
            if( flags == READ ) {
                open(filePath);
            }
            else {
                create(filePath);
            }
        } catch(Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens an existing storage file for reading. A missing file is reported on stderr.
     *
     * @param filePath path of the XML file to parse
     */
    public void open(String filePath) {
        try {
            file = new File(filePath);
            if( !file.isFile() ) {
                System.err.println("Can not open file: " + filePath );
            }
            else {
                isWrite = false;
                doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(file);
                doc.getDocumentElement().normalize();
            }
        } catch(Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prepares a new, empty storage document for writing; nothing is written to disk
     * until {@link #release()} is called.
     *
     * @param filePath path of the XML file to be written on release
     */
    public void create(String filePath) {
        try {
            file = new File(filePath);
            Document created = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
            Element root = created.createElement("opencv_storage");
            created.appendChild(root);
            // Switch to write mode only once the document really exists.
            doc = created;
            rootElement = root;
            isWrite = true;
        } catch(Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the matrix stored under the given tag.
     *
     * @param tag element name the matrix was written with
     * @return the matrix, or {@code null} when the storage is not readable, the tag is
     *         missing or the element type is unsupported
     */
    public Mat readMat(String tag) {
        if( isWrite ) {
            System.err.println("Try read from file with write flags");
            return null;
        }
        if( doc == null ) {
            // open() failed or was never called; without this guard the lookup below
            // would throw a NullPointerException.
            System.err.println("Try read from file that is not open");
            return null;
        }
        NodeList nodelist = doc.getElementsByTagName(tag);
        Mat readMat = null;
        for( int i = 0 ; i<nodelist.getLength() ; i++ ) {
            Node node = nodelist.item(i);
            if( node.getNodeType() == Node.ELEMENT_NODE ) {
                Element element = (Element)node;
                String type_id = element.getAttribute("type_id");
                if( "opencv-matrix".equals(type_id) == false) {
                    System.out.println("Fault type_id ");
                }
                String rowsStr = element.getElementsByTagName("rows").item(0).getTextContent();
                String colsStr = element.getElementsByTagName("cols").item(0).getTextContent();
                String dtStr = element.getElementsByTagName("dt").item(0).getTextContent();
                String dataStr = element.getElementsByTagName("data").item(0).getTextContent();
                int rows = Integer.parseInt(rowsStr);
                int cols = Integer.parseInt(colsStr);
                int type = CvType.CV_8U;
                Scanner s = new Scanner(dataStr);
                // Numbers were written with String.valueOf, i.e. with a '.' decimal separator.
                s.useLocale(Locale.US);
                if( "f".equals(dtStr) ) {
                    type = CvType.CV_32F;
                    readMat = new Mat( rows, cols, type );
                    float fs[] = new float[1];
                    for( int r=0 ; r<rows ; r++ ) {
                        for( int c=0 ; c<cols ; c++ ) {
                            if( s.hasNextFloat() ) {
                                fs[0] = s.nextFloat();
                            }
                            else {
                                fs[0] = 0;
                                System.err.println("Unmatched number of float value at rows="+r + " cols="+c);
                            }
                            readMat.put(r, c, fs);
                        }
                    }
                }
                else if( "i".equals(dtStr) ) {
                    type = CvType.CV_32S;
                    readMat = new Mat( rows, cols, type );
                    int is[] = new int[1];
                    for( int r=0 ; r<rows ; r++ ) {
                        for( int c=0 ; c<cols ; c++ ) {
                            if( s.hasNextInt() ) {
                                is[0] = s.nextInt();
                            }
                            else {
                                is[0] = 0;
                                System.err.println("Unmatched number of int value at rows="+r + " cols="+c);
                            }
                            readMat.put(r, c, is);
                        }
                    }
                }
                else if( "s".equals(dtStr) ) {
                    type = CvType.CV_16S;
                    readMat = new Mat( rows, cols, type );
                    short ss[] = new short[1];
                    for( int r=0 ; r<rows ; r++ ) {
                        for( int c=0 ; c<cols ; c++ ) {
                            if( s.hasNextShort() ) {
                                ss[0] = s.nextShort();
                            }
                            else {
                                ss[0] = 0;
                                System.err.println("Unmatched number of short value at rows="+r + " cols="+c);
                            }
                            readMat.put(r, c, ss);
                        }
                    }
                }
                else if( "b".equals(dtStr) ) {
                    // type stays CV_8U, the default chosen above.
                    readMat = new Mat( rows, cols, type );
                    byte bs[] = new byte[1];
                    for( int r=0 ; r<rows ; r++ ) {
                        for( int c=0 ; c<cols ; c++ ) {
                            if( s.hasNextByte() ) {
                                bs[0] = s.nextByte();
                            }
                            else {
                                bs[0] = 0;
                                System.err.println("Unmatched number of byte value at rows="+r + " cols="+c);
                            }
                            readMat.put(r, c, bs);
                        }
                    }
                }
                else {
                    // Report the unsupported type instead of silently returning nothing.
                    System.err.println("Unsupported dt value: " + dtStr);
                }
            }
        }
        return readMat;
    }

    /**
     * Appends a matrix to the storage document under the given tag.
     *
     * @param tag element name to store the matrix under
     * @param mat matrix to store; unsupported element types are written as "unknown"
     */
    public void writeMat(String tag, Mat mat) {
        try {
            if( isWrite == false) {
                System.err.println("Try write to file with no write flags");
                return;
            }
            Element matrix = doc.createElement(tag);
            matrix.setAttribute("type_id", "opencv-matrix");
            rootElement.appendChild(matrix);
            Element rows = doc.createElement("rows");
            rows.appendChild( doc.createTextNode( String.valueOf(mat.rows()) ));
            Element cols = doc.createElement("cols");
            cols.appendChild( doc.createTextNode( String.valueOf(mat.cols()) ));
            Element dt = doc.createElement("dt");
            String dtStr;
            int type = mat.type();
            if(type == CvType.CV_32F ) { // type == CvType.CV_32FC1
                dtStr = "f";
            }
            else if( type == CvType.CV_32S ) { // type == CvType.CV_32SC1
                dtStr = "i";
            }
            else if( type == CvType.CV_16S ) { // type == CvType.CV_16SC1
                dtStr = "s";
            }
            else if( type == CvType.CV_8U ){ // type == CvType.CV_8UC1
                dtStr = "b";
            }
            else {
                dtStr = "unknown";
            }
            dt.appendChild( doc.createTextNode( dtStr ));
            Element data = doc.createElement("data");
            String dataStr = dataStringBuilder( mat );
            data.appendChild( doc.createTextNode( dataStr ));
            // append all to matrix
            matrix.appendChild( rows );
            matrix.appendChild( cols );
            matrix.appendChild( dt );
            matrix.appendChild( data );
        } catch(Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Renders the matrix values as text: values separated by spaces, one matrix row per line.
     *
     * @param mat matrix to render
     * @return the textual data block, or "unknown type\n" for unsupported element types
     */
    private String dataStringBuilder(Mat mat) {
        StringBuilder sb = new StringBuilder();
        int rows = mat.rows();
        int cols = mat.cols();
        int type = mat.type();
        if( type == CvType.CV_32F ) {
            float fs[] = new float[1];
            for( int r=0 ; r<rows ; r++ ) {
                for( int c=0 ; c<cols ; c++ ) {
                    mat.get(r, c, fs);
                    sb.append( String.valueOf(fs[0]));
                    sb.append( ' ' );
                }
                sb.append( '\n' );
            }
        }
        else if( type == CvType.CV_32S ) {
            int is[] = new int[1];
            for( int r=0 ; r<rows ; r++ ) {
                for( int c=0 ; c<cols ; c++ ) {
                    mat.get(r, c, is);
                    sb.append( String.valueOf(is[0]));
                    sb.append( ' ' );
                }
                sb.append( '\n' );
            }
        }
        else if( type == CvType.CV_16S ) {
            short ss[] = new short[1];
            for( int r=0 ; r<rows ; r++ ) {
                for( int c=0 ; c<cols ; c++ ) {
                    mat.get(r, c, ss);
                    sb.append( String.valueOf(ss[0]));
                    sb.append( ' ' );
                }
                sb.append( '\n' );
            }
        }
        else if( type == CvType.CV_8U ) {
            byte bs[] = new byte[1];
            for( int r=0 ; r<rows ; r++ ) {
                for( int c=0 ; c<cols ; c++ ) {
                    mat.get(r, c, bs);
                    sb.append( String.valueOf(bs[0]));
                    sb.append( ' ' );
                }
                sb.append( '\n' );
            }
        }
        else {
            sb.append("unknown type\n");
        }
        return sb.toString();
    }

    /**
     * Writes the document built by {@link #writeMat(String, Mat)} to the file given to
     * {@link #create(String)}. Does nothing but report an error when not in write mode.
     */
    public void release() {
        try {
            if( isWrite == false) {
                System.err.println("Try release of file with no write flags");
                return;
            }
            DOMSource source = new DOMSource(doc);
            StreamResult result = new StreamResult(file);
            // write to xml file
            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            // do it
            transformer.transform(source, result);
        } catch(Exception e) {
            e.printStackTrace();
        }
    }
}
As the code proposed by Thorben was too slow in my case, I came up with the following code using serialization.
public final void saveMat(String path, Mat mat) {
File file = new File(path).getAbsoluteFile();
file.getParentFile().mkdirs();
try {
int cols = mat.cols();
float[] data = new float[(int) mat.total() * mat.channels()];
mat.get(0, 0, data);
try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(path))) {
oos.writeObject(cols);
oos.writeObject(data);
oos.close();
}
} catch (IOException | ClassCastException ex) {
System.err.println("ERROR: Could not save mat to file: " + path);
Logger.getLogger(this.class.getName()).log(Level.SEVERE, null, ex);
}
}
public final Mat loadMat(String path) {
try {
int cols;
float[] data;
try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(path))) {
cols = (int) ois.readObject();
data = (float[]) ois.readObject();
}
Mat mat = new Mat(data.length / cols, cols, CvType.CV_32F);
mat.put(0, 0, data);
return mat;
} catch (IOException | ClassNotFoundException | ClassCastException ex) {
System.err.println("ERROR: Could not load mat from file: " + path);
Logger.getLogger(this.class.getName()).log(Level.SEVERE, null, ex);
}
return null;
}
For descriptors, OpenCV uses Mats of floats; in other cases you have to modify the code according to this list (found here):
CV_8U and CV_8S -> byte[]
CV_16U and CV_16S -> short[]
CV_32S -> int[]
CV_32F -> float[]
CV_64F-> double[]
After searching through all of the answers, I edited some of the code and it seems to work. I use it to store SIFT descriptors in HBase.
/**
 * Turns the values of a matrix into a Java-serialized {@code float[]}.
 *
 * @param mat matrix whose values are read into a {@code float[]}
 * @return the serialized bytes, or {@code null} if serialization fails
 */
public static byte[] serializeMat(Mat mat) {
    final int valueCount = (int) mat.total() * mat.channels();
    final float[] values = new float[valueCount];
    mat.get(0, 0, values);

    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    try (ObjectOutput sink = new ObjectOutputStream(buffer)) {
        sink.writeObject(values);
    } catch (IOException ioe) {
        ioe.printStackTrace();
        return null;
    }
    // The object stream is closed at this point, so the buffer holds the complete data.
    return buffer.toByteArray();
}
Here is the challenge I'm currently facing.
I have a lot of PDFs and I have to remove the blank pages inside them and display only the pages with content (text or images).
The problem is that those pdfs are scanned documents.
So the blank pages have some dirt left behind by the scanner.
I did some research and ended up with this code that checks for 99% of the page as white or light gray.
I needed the gray factor as the scanned documents sometimes are not pure white.
/**
 * Decides whether a page is blank by rendering it and counting near-white pixels.
 *
 * @param pdfPage the page to render and inspect
 * @return {@code true} when at least 99% of the pixels are neutral (red, green and blue
 *         equal) with a level of 248 or more
 * @throws IOException if the page cannot be rendered
 */
private static Boolean isBlank(PDPage pdfPage) throws IOException {
    final BufferedImage rendered = pdfPage.convertToImage();
    final int rows = rendered.getHeight();
    final int columns = rendered.getWidth();
    final double threshold = (columns * rows) * 0.99;

    long lightPixels = 0;
    for (int x = 0; x < columns; x++) {
        for (int y = 0; y < rows; y++) {
            final Color pixel = new Color(rendered.getRGB(x, y));
            final int level = pixel.getRed();
            // Scanned "white" is rarely pure white, so anything from 248 up counts.
            if (level < 248) {
                continue;
            }
            // Only neutral grays qualify; tinted pixels are treated as content.
            if (level != pixel.getGreen() || level != pixel.getBlue()) {
                continue;
            }
            lightPixels++;
        }
    }
    return lightPixels >= threshold;
}
@Shoyo's code works fine for PDFBox versions < 2.0. For future readers: there is not much change, but, just in case, here is the code for PDFBox 2.0+ to make your life easier.
In your main (By main, I mean the place where you are loading your PDF into PDDocument) method:
// try-with-resources closes the document once every page has been checked.
try (PDDocument document = PDDocument.load(new File("/home/codemantra/Downloads/tetml_ct_access/C.pdf"))) {
    PDFRenderer renderedDoc = new PDFRenderer(document);
    for (int pageNumber = 0; pageNumber < document.getNumberOfPages(); pageNumber++) {
        if (isBlank(renderedDoc.renderImage(pageNumber))) {
            // Parenthesised so that 1 is added arithmetically; without the parentheses
            // the string concatenation prints "01", "11", ... instead of 1, 2, ...
            System.out.println("Blank Page Number : " + (pageNumber + 1));
        }
    }
} catch (Exception e) {
    e.printStackTrace();
}
And isBlank method will just have BufferedImage passed in:
/**
 * Decides whether a rendered page image is blank by counting near-white pixels.
 *
 * @param pageImage the rendered page
 * @return {@code true} when at least 99% of the pixels are neutral (red, green and blue
 *         equal) with a level of 248 or more
 * @throws IOException never thrown by this implementation; kept for callers that handle it
 */
private static Boolean isBlank(BufferedImage pageImage) throws IOException {
    final int columns = pageImage.getWidth();
    final int rows = pageImage.getHeight();

    long nearWhite = 0;
    for (int x = 0; x < columns; x++) {
        for (int y = 0; y < rows; y++) {
            // Split the packed RGB value into its three colour channels.
            final int rgb = pageImage.getRGB(x, y);
            final int red = (rgb >> 16) & 0xFF;
            final int green = (rgb >> 8) & 0xFF;
            final int blue = rgb & 0xFF;
            final boolean neutral = red == green && red == blue;
            if (neutral && red >= 248) {
                nearWhite++;
            }
        }
    }

    final double required = (columns * rows) * 0.99;
    return nearWhite >= required;
}
All credit goes to @Shoyo
Update:
Some PDFs have pages reading "This Page was Intentionally Left Blank", which the above code considers blank. If that is your requirement, feel free to use the above code. My requirement, however, was only to filter out pages that are completely blank (no images present and no fonts used). So I ended up using this code (plus, this code runs faster :P):
/**
 * Prints the 1-based numbers of all pages that reference neither XObjects (such as
 * images) nor fonts in their resources.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    // try-with-resources closes the document when the scan is finished.
    try (PDDocument document = PDDocument.load(new File("/home/codemantra/Downloads/CTP2040.pdf"))) {
        int pageNumber = 1;
        for (PDPage page : document.getPages()) {
            // A page without any resources dictionary has no fonts or images either;
            // getResources() may return null in that case, so treat it as blank.
            boolean blank = true;
            if (page.getResources() != null) {
                Iterable<COSName> xObjects = page.getResources().getXObjectNames();
                Iterable<COSName> fonts = page.getResources().getFontNames();
                // Emptiness is checked through the iterator, which works for any Iterable.
                blank = !xObjects.iterator().hasNext() && !fonts.iterator().hasNext();
            }
            if (blank) {
                System.out.println(pageNumber);
            }
            pageNumber++;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
This will return the page numbers of those pages which are completely blank.
Hope this helps someone! :)
@Pramesh Bajracharya, your solution for finding blank pages in a PDF document works well!
If the requirement is to remove the blank pages, the same code can be enhanced as below:
// Collect the blank pages themselves rather than their numbers: removing by index would
// shift the positions of the remaining pages after every removal.
List<PDPage> blankPageList = new ArrayList<PDPage>();
for( PDPage page : allPages )
{
    // condition to determine if the page is a blank page: no resources at all, or
    // resources that reference neither XObjects nor fonts
    boolean blank = true;
    if( page.getResources() != null )
    {
        Iterable<COSName> xObjects = page.getResources().getXObjectNames();
        Iterable<COSName> fonts = page.getResources().getFontNames();
        blank = !xObjects.iterator().hasNext() && !fonts.iterator().hasNext();
    }
    if( blank )
    {
        blankPageList.add( page );
    }
    // pageNumber is declared by the surrounding code and is still advanced here.
    pageNumber++;
}
// remove the blank pages from the pdf document using the collected page objects
for( PDPage blankPage : blankPageList )
{
    document.removePage( blankPage );
}
http://www.rgagnon.com/javadetails/java-detect-and-remove-blank-page-in-pdf.html
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.io.RandomAccessSourceFactory;
import com.itextpdf.text.pdf.PdfCopy;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfImportedPage;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.RandomAccessFileOrArray;
/**
 * Copies a PDF to a new file while leaving out the pages that look blank.
 *
 * A page is kept only when its resource dictionary declares a /Font or an
 * /XObject entry AND its content stream is larger than {@link #BLANK_THRESHOLD}
 * bytes; every other page is treated as blank and skipped.
 */
public class RemoveBlankPageFromPDF {
    // value where we can consider that this is a blank image
    // can be much higher or lower depending of what is considered as a blank page
    public static final int BLANK_THRESHOLD = 160;

    /**
     * Writes a copy of {@code source} without its blank pages to {@code destination}.
     *
     * @param source      path of the PDF to read
     * @param destination path of the PDF to create
     * @throws IOException       if reading the source or writing the destination fails
     * @throws DocumentException if iText cannot build the output document
     */
    public static void removeBlankPdfPages(String source, String destination)
            throws IOException, DocumentException
    {
        PdfReader r = null;
        RandomAccessSourceFactory rasf = null;
        RandomAccessFileOrArray raf = null;
        Document document = null;
        PdfCopy writer = null;
        try {
            r = new PdfReader(source);
            // deprecated
            // RandomAccessFileOrArray raf
            // = new RandomAccessFileOrArray(pdfSourceFile);
            // itext 5.4.1
            rasf = new RandomAccessSourceFactory();
            raf = new RandomAccessFileOrArray(rasf.createBestSource(source));
            document = new Document(r.getPageSizeWithRotation(1));
            writer = new PdfCopy(document, new FileOutputStream(destination));
            document.open();
            PdfImportedPage page = null;
            for (int i = 1; i <= r.getNumberOfPages(); i++) {
                // first check, examine the resource dictionary for /Font or
                // /XObject keys. If neither is present the page is treated as blank.
                PdfDictionary pageDict = r.getPageN(i);
                // getAsDict resolves indirect references; a plain cast of get(...)
                // fails when /Resources is stored as an indirect object.
                PdfDictionary resDict = pageDict.getAsDict( PdfName.RESOURCES );
                boolean noFontsOrImages = true;
                if (resDict != null) {
                    noFontsOrImages = resDict.get( PdfName.FONT ) == null &&
                            resDict.get( PdfName.XOBJECT ) == null;
                }
                System.out.println(i + " noFontsOrImages " + noFontsOrImages);
                if (!noFontsOrImages) {
                    // second check: a tiny content stream means nothing visible is drawn
                    int contentSize = r.getPageContent(i, raf).length;
                    boolean aboveThreshold = contentSize > BLANK_THRESHOLD;
                    // Start the concatenation with a String so the page number and
                    // the size are printed separately instead of being summed.
                    System.out.println
                        (i + " size " + contentSize + " > BLANK_THRESHOLD " + aboveThreshold);
                    if (aboveThreshold) {
                        page = writer.getImportedPage(r, i);
                        writer.addPage(page);
                    }
                }
            }
        }
        finally {
            if (document != null) document.close();
            if (writer != null) writer.close();
            if (raf != null) raf.close();
            if (r != null) r.close();
        }
    }

    public static void main (String ... args) throws Exception {
        removeBlankPdfPages
            ("C://temp//documentwithblank.pdf", "C://temp//documentwithnoblank.pdf");
    }
}
I'm trying to load an excel file(xlsx) into a Workbook Object using apache POI 3.10.
I'm receiving a java.lang.OutofMemoryError.
I'm using Java 8 with the -Xmx2g argument on the JVM.
All 4 cores(64bit System) and my RAM(4gb) are maxed out when I run the program.
The excel sheet has 43 columns and 166,961 Rows which equal 7,179,323 Cells.
I'm using Apache POI's WorkbookFactory.create(File) because it uses less memory than creating the workbook from an InputStream.
Does anyone have any ideas how to optimize memory usage or another way to create the Workbook?
Below is my test Reader class, don't judge, it's rough and includes debugging statements:
import java.io.File;
import java.io.IOException;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
/**
 * Rough test harness: tries to load an Excel file into a POI {@link Workbook}
 * and reports on stdout how far it got.
 */
public class Reader {
    private Workbook wb;

    /**
     * Attempts to open the given file; on failure the reason is printed and
     * the workbook stays {@code null}.
     *
     * @param excel the spreadsheet file to load
     */
    public Reader(File excel) {
        System.out.println("CONSTRUCTOR");
        Workbook loaded = null;
        try {
            loaded = WorkbookFactory.create(excel);
        } catch (IOException | InvalidFormatException failure) {
            // Same two-line report as before, label chosen by failure type.
            String label = (failure instanceof IOException) ? "IO Exception" : "Invalid Format";
            System.out.println(label);
            System.out.println(failure.getMessage());
        }
        this.wb = loaded;
    }

    /** @return {@code true} when the workbook was loaded successfully */
    public boolean exists() {
        return wb != null;
    }

    /** Placeholder, prints nothing yet. */
    public void print() {
    }

    public static void main(String[] args) {
        System.out.println("START PRG");
        File input = new File("filename.xlsx");
        System.out.println("PATH:" + input.getAbsoluteFile());
        boolean present = input.exists();
        if (!present) {
            System.out.println("File does not exist.");
            System.exit(0);
        }
        System.out.println("FILE");
        Reader reader = new Reader(input);
        System.out.println("Reader");
        reader.print();
        System.out.println("PRG DONE");
    }
}
apparently loading a 24mb file shouldn't be causing OOM...
At first glance it appears to me that, although Xmx is set to 2G, there is actually not that much free memory in the system. In other words, the OS and other processes may have taken more than 2G out of the 4G of physical memory! Check the available physical memory first. If it is below what is expected, try closing some other running apps/processes.
If that is not the case and there is indeed enough memory left, it is really hard to identify the real cause without profiling. Use a profiling tool to check the JVM status, starting with memory. You may simply use jconsole (as it comes with the JDK). See this on how to activate JMX.
once you are connected, check readings related to memory, specifically below memory spaces:
old gen
young gen
perm gen
monitor these spaces and see where it's struggling. I assume this is a standalone application. in case this is deployed on server (as web or services), you may consider '-XX:NewRatio' option for distributing heap spaces effectively and efficiently. #see tuning related details here.
Please confirm these before proceeding,
Is there any infinite execution in looping(for/while)
Ensure your physical storage size
Maximize buffer memory
Note
As per my understanding Apache POI will not consume that much amount of memory.
I am just a beginner, but may I ask you some questions.
Why not use XSSFWorkbook class to open XLSX file. I mean, I always use it to handle XLSX files, and this time I tried with a file(7 MB; that was the largest I could find in my computer), and it worked perfectly.
Why not use newer File API(NIO, Java 7). Again, I do not know if this will make any difference or not. But, it worked for me.
Windows 7 Ultimate | 64 bit | Intel 2nd Gen Core i3|Eclipse Juno|JDK 1.7.45|Apache POI 3.9
Path file = Paths.get("XYZABC.xlsx");
// try-with-resources: the input stream is closed even if the workbook
// cannot be parsed, instead of being leaked.
try (java.io.InputStream in = Files.newInputStream(file, StandardOpenOption.READ)) {
    XSSFWorkbook wb = new XSSFWorkbook(in);
} catch (IOException e) {
    System.out.println("Some IO Error!!!!");
}
Do, tell if it works for you or not.
Did you try using SXSSFWorkbook? We also used Apache POI to handle relatively big XLSX files, and we also had memory problems when using plain XSSFWorkbook. Although we didn't have to read the files in, we were just writing tens of thousands of lines of information. Using this, our memory problems were solved. You can pass an XSSFWorkbook to its constructor, together with the amount of data you want to keep in memory.
Java 1.8
based on HSSF and XSSF Limitations
my poi version is 3.17 POI Examples
This class launches my code:
/**
 * Drives the stress test: loads the spreadsheet into an {@code EX} instance,
 * writes a log of validation errors, and maps the rows onto objects.
 */
public class Controller {
    EX stressTest;

    /**
     * Loads the stress-test spreadsheet and the header-to-property mapping
     * into {@link #stressTest}. On an I/O failure the stack trace is printed
     * and {@link #stressTest} is left unchanged.
     */
    public void fineFile() {
        String stresstest = "C:\\Stresstest.xlsx";
        HashMap<String, String[]> stressTestMap = new HashMap<>();
        stressTestMap.put("aaaa", new String[]{"myField", "The field"});
        stressTestMap.put("bbbb", new String[]{"other", "Other value"});
        // try-with-resources: the stream is released even when EX fails to build.
        try (InputStream stressTestIS = new FileInputStream(stresstest)) {
            stressTest = new EX(stresstest, stressTestIS, stressTestMap);
        } catch (IOException exp) {
            // Report the failure instead of silently swallowing it.
            exp.printStackTrace();
        }
    }

    /**
     * Writes the validation errors, if any, to a time-stamped log file.
     */
    public void printErr() {
        if (stressTest.thereAreErrors()) {
            String logName = "C:\\logErrorsStressTest" +
                    (new SimpleDateFormat("ddMMyyyyHHmmss")).format(new Date()) +
                    ".txt";
            // try-with-resources: the writer is closed even if write() throws.
            try (FileWriter myWriter = new FileWriter(logName)) {
                myWriter.write(stressTest.getBodyFileErrors());
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Converts the spreadsheet rows to {@code OneObjectWhatever} instances
     * (an empty list when validation errors exist) and reads every declared
     * field of each instance reflectively.
     */
    public void createBD() {
        List<OneObjectWhatever> entitiesList =
                (
                        !stressTest.thereAreErrors()
                                ? ((List<OneObjectWhatever>) stressTest.toListCustomerObject(OneObjectWhatever.class))
                                : new ArrayList<>()
                );
        entitiesList.forEach(entity -> {
            Field[] fields = entity.getClass().getDeclaredFields();
            // NOTE(review): the assembled text is not consumed anywhere in this snippet.
            String valueString = "";
            for (Field attr : fields) {
                try {
                    attr.setAccessible(true);
                    // Read the value from the entity itself; passing the Field[]
                    // array here made every read throw IllegalArgumentException.
                    valueString += " StressTest:" + attr.getName() + ": -" + attr.get(entity) + "- ";
                    // Restore the access check once the value has been read.
                    attr.setAccessible(false);
                } catch (Exception reflectionError) {
                    System.out.println(reflectionError);
                }
            }
        });
    }
}
MY CODE
/**
 * Reads the first sheet of an Excel file (XLS or XLSX) into a sparse
 * row/column map, treats the top-most populated row as the header, records
 * empty cells under non-empty headers as errors, and can map the data rows
 * onto objects through reflection.
 */
public class EX {
    /** Data rows: row index -> (column index -> cell text); the header row is removed. */
    private HashMap<Integer, HashMap<Integer, String>> rows;
    /** First entry is a fixed title line; every further entry is one empty-cell error. */
    private List<String> errors;
    private int maxColOfHeader, minColOfHeader;
    /** Header row: column index -> header text. */
    private HashMap<Integer, String> header;
    /** Header text -> { property name, description }. */
    private HashMap<String, String[]> relationHeaderClassPropertyDescription;

    /**
     * Parses the file and splits the result into header and data rows.
     *
     * @param name path of the spreadsheet; also used as label in the error list
     * @param file stream over the same file, only used to sniff the file type
     * @throws NoSuchElementException if no populated row could be read
     */
    private void initVariables(String name, InputStream file) {
        this.rows = new HashMap<>();
        this.header = new HashMap<>();
        this.errors = new ArrayList<>();
        this.errors.add("[" + name + "] empty cells in position -> ");
        try {
            InputStream is = FileMagic.prepareToCheckMagic(file);
            FileMagic fm = FileMagic.valueOf(is);
            is.close();
            switch (fm) {
                case OLE2:
                    XLS2CSVmra xls2csv = new XLS2CSVmra(name, 50, rows);
                    xls2csv.process();
                    System.out.println("OLE2");
                    break;
                case OOXML:
                    File flatFile = new File(name);
                    // try-with-resources: the package is closed even when parsing fails.
                    try (OPCPackage p = OPCPackage.open(flatFile, PackageAccess.READ)) {
                        XLSX2CSV xlsx2csv = new XLSX2CSV(p, System.out, 50, this.rows);
                        xlsx2csv.process();
                    }
                    System.out.println("OOXML");
                    break;
                default:
                    System.out.println("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
                    break;
            }
        } catch (IOException | EncryptedDocumentException | SAXException | OpenXML4JException exp) {
            System.out.println(exp);
            exp.printStackTrace();
        }
        // The header is the populated row with the SMALLEST index. HashMap
        // iteration order is unspecified, so findFirst() could pick any row.
        int rowHeader = rows.keySet().stream()
                .mapToInt(Integer::intValue).min()
                .orElseThrow(() -> new NoSuchElementException("[" + name + "] no rows could be read"));
        this.header.putAll(rows.get(rowHeader));
        this.rows.remove(rowHeader);
        // Same reasoning for the column bounds: take the real minimum and maximum.
        this.minColOfHeader = this.header.keySet().stream()
                .mapToInt(Integer::intValue).min()
                .orElseThrow(NoSuchElementException::new);
        this.maxColOfHeader = this.header.keySet().stream()
                .mapToInt(Integer::intValue).max()
                .orElseThrow(NoSuchElementException::new);
    }

    /**
     * Loads and validates the spreadsheet.
     *
     * @param name path of the spreadsheet
     * @param file stream over the same file, used to detect the file type
     * @param relationHeaderClassPropertyDescription_ header text -> { property name, description }
     */
    public EX(String name, InputStream file, HashMap<String, String[]> relationHeaderClassPropertyDescription_) {
        this.relationHeaderClassPropertyDescription = relationHeaderClassPropertyDescription_;
        initVariables(name, file);
        validate();
    }

    /**
     * Records an error for every empty or missing cell that sits under a
     * non-empty header, then prints a mapping template for each header.
     */
    private void validate() {
        rows.forEach((inx, row) -> {
            for (int i = minColOfHeader; i <= maxColOfHeader; i++) {
                boolean cellEmpty = (!row.containsKey(i)) || row.get(i).trim().isEmpty();
                boolean headerPresent = header.containsKey(i) && (!header.get(i).trim().isEmpty());
                if (cellEmpty && headerPresent) {
                    String description = getRelationHeaders(i, 1);
                    errors.add(" [" + header.get(i) + "]{" + description + "} = fila: " + (inx + 1) + " - columna: " + CellReference.convertNumToColString(i));
                }
            }
        });
        header.forEach((i, v) -> {
            System.out.println("stressTestMap.put(\"" + v + "\", new String[]{\"{" + i + "}\",\"Mi descripcion XD\"});");
        });
    }

    /** @return all error lines joined with the platform line separator */
    public String getBodyFileErrors()
    {
        return String.join(System.lineSeparator(), errors);
    }

    /** @return {@code true} when at least one error follows the title line */
    public boolean thereAreErrors() {
        return errors.size() > 1;
    }

    /**
     * Creates one instance of {@code type} per data row and fills the fields
     * named by the header mapping. A value that does not fit the field type
     * is replaced by that type's default. A row whose object cannot be
     * created or whose mapped field does not exist is skipped.
     *
     * @param type class to instantiate; needs an accessible no-arg constructor
     * @return the populated objects
     */
    public<T extends Class> List<? extends Object> toListCustomerObject(T type) {
        List<Object> list = new ArrayList<>();
        rows.forEach((inx, row) -> {
            try {
                Object obj = type.newInstance();
                for (int i = minColOfHeader; i <= maxColOfHeader; i++) {
                    if (row.containsKey(i) && !row.get(i).trim().isEmpty()) {
                        if (header.containsKey(i) && !header.get(i).trim().isEmpty()) {
                            if (relationHeaderClassPropertyDescription.containsKey(header.get(i))) {
                                String nameProperty = getRelationHeaders(i, 0);
                                Field field = type.getDeclaredField(nameProperty);
                                try {
                                    field.setAccessible(true);
                                    field.set(obj, (isConvertibleTo(field.getType(), row.get(i)) ? toObject(field.getType(), row.get(i)) : defaultValue(field.getType())));
                                    field.setAccessible(false);
                                } catch (Exception ignored) {
                                    // best effort: leave this field untouched and go on
                                    continue;
                                }
                            }
                        }
                    }
                }
                list.add(obj);
            } catch (Exception ignored) {
                // best effort: a row that cannot be mapped is left out of the result
            }
        });
        return list;
    }

    /** Parses {@code value} into the wrapper matching {@code clazz}; other types get the text itself. */
    private Object toObject( Class clazz, String value ) {
        if( Boolean.class == clazz || Boolean.TYPE == clazz) return Boolean.parseBoolean( value );
        if( Byte.class == clazz || Byte.TYPE == clazz) return Byte.parseByte( value );
        if( Short.class == clazz || Short.TYPE == clazz) return Short.parseShort( value );
        if( Integer.class == clazz || Integer.TYPE == clazz) return Integer.parseInt( value );
        if( Long.class == clazz || Long.TYPE == clazz) return Long.parseLong( value );
        if( Float.class == clazz || Float.TYPE == clazz) return Float.parseFloat( value );
        if( Double.class == clazz || Double.TYPE == clazz) return Double.parseDouble( value );
        return value;
    }

    /**
     * Checks by regular expression whether {@code value} looks like a value
     * of {@code clazz}. Types without a pattern accept any text.
     */
    private boolean isConvertibleTo( Class clazz, String value ) {
        String ptn = "";
        if( Boolean.class == clazz || Boolean.TYPE == clazz) ptn = ".*";
        if( Byte.class == clazz || Byte.TYPE == clazz) ptn = "^\\d+$";
        if( Short.class == clazz || Short.TYPE == clazz) ptn = "^\\d+$";
        if( Integer.class == clazz || Integer.TYPE == clazz) ptn = "^\\d+$";
        if( Long.class == clazz || Long.TYPE == clazz) ptn = "^\\d+$";
        if( Float.class == clazz || Float.TYPE == clazz) ptn = "^\\d+(\\.\\d+)?$";
        if( Double.class == clazz || Double.TYPE == clazz) ptn = "^\\d+(\\.\\d+)?$";
        Pattern pattern = Pattern.compile(ptn, Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(value);
        return matcher.find();
    }

    /** Returns the zero/false value for primitive-like types and the empty string otherwise. */
    private Object defaultValue( Class clazz) {
        if( Boolean.class == clazz || Boolean.TYPE == clazz) return Boolean.parseBoolean( "false" );
        if( Byte.class == clazz || Byte.TYPE == clazz) return Byte.parseByte( "0" );
        if( Short.class == clazz || Short.TYPE == clazz) return Short.parseShort( "0" );
        if( Integer.class == clazz || Integer.TYPE == clazz) return Integer.parseInt( "0" );
        if( Long.class == clazz || Long.TYPE == clazz) return Long.parseLong( "0" );
        if( Float.class == clazz || Float.TYPE == clazz) return Float.parseFloat( "0.0" );
        if( Double.class == clazz || Double.TYPE == clazz) return Double.parseDouble( "0.0" );
        return "";
    }

    /**
     * Looks up the mapping entry for the header of the given column.
     *
     * @param columnIndexHeader column index of the header
     * @param TypeOrDescription 0 for the property name, 1 for the description
     * @return the requested mapping entry, or the header text itself when no
     *         mapping entry is available
     */
    private String getRelationHeaders(Integer columnIndexHeader, Integer TypeOrDescription /*0 - Type, 1 - Description*/) {
        String headerName = header.get(columnIndexHeader);
        // Explicit checks replace the former catch-everything fallback.
        if (relationHeaderClassPropertyDescription != null && TypeOrDescription != null) {
            String[] relation = relationHeaderClassPropertyDescription.get(headerName);
            if (relation != null && TypeOrDescription >= 0 && TypeOrDescription < relation.length) {
                return relation[TypeOrDescription];
            }
        }
        return headerName;
    }
}
these are the modifications I made to the examples:
XLSX2CSV
public class XLSX2CSV {
/**
* Uses the XSSF Event SAX helpers to do most of the work
* of parsing the Sheet XML, and outputs the contents
* as a (basic) CSV.
*/
private class SheetToCSV implements SheetContentsHandler {
private boolean firstCellOfRow = false;
private int currentRow = -1;
private int currentCol = -1;
HashMap<Integer, String> valuesCell;
private void outputMissingRows(int number) {
for (int i=0; i<number; i++) {
for (int j=0; j<minColumns; j++) {
output.append(',');
}
output.append('\n');
}
}
#Override
public void startRow(int rowNum) {
// If there were gaps, output the missing rows
outputMissingRows(rowNum-currentRow-1);
// Prepare for this row
firstCellOfRow = true;
currentRow = rowNum;
currentCol = -1;
valuesCell = new HashMap<>();
}
#Override
public void endRow(int rowNum) {
// Ensure the minimum number of columns
for (int i = currentCol; i < minColumns; i++) {
output.append(',');
}
output.append('\n');
if (!valuesCell.isEmpty())
_rows.put(rowNum, valuesCell);
}
#Override
public void cell(String cellReference, String formattedValue,
XSSFComment comment) {
if (firstCellOfRow) {
firstCellOfRow = false;
} else {
output.append(',');
}
// gracefully handle missing CellRef here in a similar way as XSSFCell does
if (cellReference == null) {
cellReference = new CellAddress(currentRow, currentCol).formatAsString();
}
// Did we miss any cells?
int thisCol = (new CellReference(cellReference)).getCol();
int missedCols = thisCol - currentCol - 1;
for (int i = 0; i < missedCols; i++) {
output.append(',');
}
currentCol = thisCol;
if (!formattedValue.isEmpty())
valuesCell.put(thisCol, formattedValue);
// Number or string?
output.append(formattedValue);
/*try {
//noinspection ResultOfMethodCallIgnored
Double.parseDouble(formattedValue);
output.append(formattedValue);
} catch (NumberFormatException e) {
output.append('"');
output.append(formattedValue);
output.append('"');
}*/
}
#Override
public void headerFooter(String text, boolean isHeader, String tagName) {
// Skip, no headers or footers in CSV
}
}
///////////////////////////////////////
private final OPCPackage xlsxPackage;
/**
* Number of columns to read starting with leftmost
*/
private final int minColumns;
/**
* Destination for data
*/
private final PrintStream output;
public HashMap<Integer, HashMap<Integer, String>> _rows;
/**
* Creates a new XLSX -> CSV converter
*
* #param pkg The XLSX package to process
* #param output The PrintStream to output the CSV to
* #param minColumns The minimum number of columns to output, or -1 for no minimum
*/
public XLSX2CSV(OPCPackage pkg, PrintStream output, int minColumns, HashMap<Integer, HashMap<Integer, String> > __rows) {
this.xlsxPackage = pkg;
this.output = output;
this.minColumns = minColumns;
this._rows = __rows;
}
/**
* Parses and shows the content of one sheet
* using the specified styles and shared-strings tables.
*
* #param styles The table of styles that may be referenced by cells in the sheet
* #param strings The table of strings that may be referenced by cells in the sheet
* #param sheetInputStream The stream to read the sheet-data from.
* #exception java.io.IOException An IO exception from the parser,
* possibly from a byte stream or character stream
* supplied by the application.
* #throws SAXException if parsing the XML data fails.
*/
public void processSheet(
StylesTable styles,
ReadOnlySharedStringsTable strings,
SheetContentsHandler sheetHandler,
InputStream sheetInputStream) throws IOException, SAXException {
DataFormatter formatter = new DataFormatter();
InputSource sheetSource = new InputSource(sheetInputStream);
try {
XMLReader sheetParser = SAXHelper.newXMLReader();
ContentHandler handler = new XSSFSheetXMLHandler(
styles, null, strings, sheetHandler, formatter, false);
sheetParser.setContentHandler(handler);
sheetParser.parse(sheetSource);
} catch(ParserConfigurationException e) {
throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
}
}
/**
* Initiates the processing of the XLS workbook file to CSV.
*
* #throws IOException If reading the data from the package fails.
* #throws SAXException if parsing the XML data fails.
*/
public void process() throws IOException, OpenXML4JException, SAXException {
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(this.xlsxPackage);
XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
StylesTable styles = xssfReader.getStylesTable();
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
int index = 0;
while (iter.hasNext()) {
InputStream stream = iter.next();
String sheetName = iter.getSheetName();
this.output.println();
this.output.println(sheetName + " [index=" + index + "]:");
processSheet(styles, strings, new SheetToCSV(), stream);
stream.close();
++index;
break;
}
}
}
XLS2CSVmra
public class XLS2CSVmra implements HSSFListener {
private int minColumns;
private POIFSFileSystem fs;
private PrintStream output;
public HashMap<Integer, HashMap<Integer, String>> _rows;
private HashMap<Integer, String> valuesCell;
private int lastRowNumber;
private int lastColumnNumber;
/** Should we output the formula, or the value it has? */
private boolean outputFormulaValues = false;
/** For parsing Formulas */
private SheetRecordCollectingListener workbookBuildingListener;
private HSSFWorkbook stubWorkbook;
// Records we pick up as we process
private SSTRecord sstRecord;
private FormatTrackingHSSFListener formatListener;
/** So we known which sheet we're on */
private int sheetIndex = -1;
private BoundSheetRecord[] orderedBSRs;
private List<BoundSheetRecord> boundSheetRecords = new ArrayList<BoundSheetRecord>();
// For handling formulas with string results
private int nextRow;
private int nextColumn;
private boolean outputNextStringRecord;
/**
* Creates a new XLS -> CSV converter
* #param fs The POIFSFileSystem to process
* #param output The PrintStream to output the CSV to
* #param minColumns The minimum number of columns to output, or -1 for no minimum
*/
public XLS2CSVmra(POIFSFileSystem fs, PrintStream output, int minColumns, HashMap<Integer, HashMap<Integer, String>> __rows) {
this.fs = fs;
this.output = output;
this.minColumns = minColumns;
this._rows = __rows;
this.valuesCell = new HashMap<>();
}
/**
* Creates a new XLS -> CSV converter
* #param filename The file to process
* #param minColumns The minimum number of columns to output, or -1 for no minimum
* #throws IOException
* #throws FileNotFoundException
*/
public XLS2CSVmra(String filename, int minColumns, HashMap<Integer, HashMap<Integer, String>> __rows) throws IOException, FileNotFoundException {
this(
new POIFSFileSystem(new FileInputStream(filename)),
System.out, minColumns,
__rows
);
}
/**
* Initiates the processing of the XLS file to CSV
*/
public void process() throws IOException {
MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
formatListener = new FormatTrackingHSSFListener(listener);
HSSFEventFactory factory = new HSSFEventFactory();
HSSFRequest request = new HSSFRequest();
if(outputFormulaValues) {
request.addListenerForAllRecords(formatListener);
} else {
workbookBuildingListener = new SheetRecordCollectingListener(formatListener);
request.addListenerForAllRecords(workbookBuildingListener);
}
factory.processWorkbookEvents(request, fs);
}
/**
* Main HSSFListener method, processes events, and outputs the
* CSV as the file is processed.
*/
#Override
public void processRecord(Record record) {
if(sheetIndex>0)
return;
int thisRow = -1;
int thisColumn = -1;
String thisStr = null;
switch (record.getSid())
{
case BoundSheetRecord.sid:
if(sheetIndex==-1)
boundSheetRecords.add((BoundSheetRecord)record);
break;
case BOFRecord.sid:
BOFRecord br = (BOFRecord)record;
if(br.getType() == BOFRecord.TYPE_WORKSHEET && sheetIndex==-1) {
// Create sub workbook if required
if(workbookBuildingListener != null && stubWorkbook == null) {
stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
}
// Output the worksheet name
// Works by ordering the BSRs by the location of
// their BOFRecords, and then knowing that we
// process BOFRecords in byte offset order
sheetIndex++;
if(orderedBSRs == null) {
orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
}
output.println();
output.println(
orderedBSRs[sheetIndex].getSheetname() +
" [" + (sheetIndex+1) + "]:"
);
}
break;
case SSTRecord.sid:
sstRecord = (SSTRecord) record;
break;
case BlankRecord.sid:
BlankRecord brec = (BlankRecord) record;
thisRow = brec.getRow();
thisColumn = brec.getColumn();
thisStr = "";
break;
case BoolErrRecord.sid:
BoolErrRecord berec = (BoolErrRecord) record;
thisRow = berec.getRow();
thisColumn = berec.getColumn();
thisStr = "";
break;
case FormulaRecord.sid:
FormulaRecord frec = (FormulaRecord) record;
thisRow = frec.getRow();
thisColumn = frec.getColumn();
if(outputFormulaValues) {
if(Double.isNaN( frec.getValue() )) {
// Formula result is a string
// This is stored in the next record
outputNextStringRecord = true;
nextRow = frec.getRow();
nextColumn = frec.getColumn();
} else {
thisStr = formatListener.formatNumberDateCell(frec);
}
} else {
thisStr = '"' +
HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
}
break;
case StringRecord.sid:
if(outputNextStringRecord) {
// String for formula
StringRecord srec = (StringRecord)record;
thisStr = srec.getString();
thisRow = nextRow;
thisColumn = nextColumn;
outputNextStringRecord = false;
}
break;
case LabelRecord.sid:
LabelRecord lrec = (LabelRecord) record;
thisRow = lrec.getRow();
thisColumn = lrec.getColumn();
thisStr = '"' + lrec.getValue() + '"';
break;
case LabelSSTRecord.sid:
LabelSSTRecord lsrec = (LabelSSTRecord) record;
thisRow = lsrec.getRow();
thisColumn = lsrec.getColumn();
if(sstRecord == null) {
thisStr = '"' + "(No SST Record, can't identify string)" + '"';
} else {
thisStr = '"' + sstRecord.getString(lsrec.getSSTIndex()).toString() + '"';
}
break;
case NoteRecord.sid:
NoteRecord nrec = (NoteRecord) record;
thisRow = nrec.getRow();
thisColumn = nrec.getColumn();
// TODO: Find object to match nrec.getShapeId()
thisStr = '"' + "(TODO)" + '"';
break;
case NumberRecord.sid:
NumberRecord numrec = (NumberRecord) record;
thisRow = numrec.getRow();
thisColumn = numrec.getColumn();
// Format
thisStr = formatListener.formatNumberDateCell(numrec);
break;
case RKRecord.sid:
RKRecord rkrec = (RKRecord) record;
thisRow = rkrec.getRow();
thisColumn = rkrec.getColumn();
thisStr = '"' + "(TODO)" + '"';
break;
default:
break;
}
// Handle new row
if(thisRow != -1 && thisRow != lastRowNumber) {
lastColumnNumber = -1;
}
// Handle missing column
if(record instanceof MissingCellDummyRecord) {
MissingCellDummyRecord mc = (MissingCellDummyRecord)record;
thisRow = mc.getRow();
thisColumn = mc.getColumn();
thisStr = "";
}
// If we got something to print out, do so
if(thisStr != null) {
if (thisColumn > 0) {
output.print(',');
}
if (!thisStr.isEmpty())
valuesCell.put(thisColumn, thisStr);
output.print(thisStr);
}
// Update column and row count
if(thisRow > -1)
lastRowNumber = thisRow;
if(thisColumn > -1)
lastColumnNumber = thisColumn;
// Handle end of row
if(record instanceof LastCellOfRowDummyRecord) {
// Print out any missing commas if needed
if(minColumns > 0) {
// Columns are 0 based
if(lastColumnNumber == -1) { lastColumnNumber = 0; }
for(int i=lastColumnNumber; i<(minColumns); i++) {
output.print(',');
}
}
// We're onto a new row
lastColumnNumber = -1;
// End the row
output.println();
if(!valuesCell.isEmpty()) {
HashMap<Integer, String> newRow = new HashMap<>();
valuesCell.forEach((inx,vStr) -> {
newRow.put(inx, vStr);
});
_rows.put(lastRowNumber, newRow);
valuesCell = new HashMap<>();
}
}
}
}
Closed. This question is opinion-based. It is not currently accepting answers.
Want to improve this question? Update the question so it can be answered with facts and citations by editing this post.
Closed 4 years ago.
Improve this question
I would like to know a way to extract individual line of measures. I am not sure if an algorithm for this already exists so I've thought of scanning a sheet music from left to right, extract all the white spaces from above and below a line of measures.
I am not looking for a way to convert the sheet music into MusicXML or extract other useful information. No, essentially what I am dealing with is a regular document. I need to separate the paragraphs. I am not interested in the information conveyed by the paragraph but simply chunking them separately from the regions of the document. In this case a paragraph would be one line of measures. I don't need individual measures but all the measure on each line of sheet music.
This is one of the output I would like from the full sheet music but without the title, composer and etc.
Supposing you have the sheet music in a PDF file, I would use Apache PDFBox to extract the images from the input PDF file containing the sheet music and then locate the coordinates of the whole line of measures you need. Then, with the selected image, define the coordinates to crop the image and manipulate it until you get the desired result.
// Walks every page of the PDF and writes out each image found in the
// page's resources.
PDDocument document = null;
try {
    document = PDDocument.load(inFile);
    List pages = document.getDocumentCatalog().getAllPages();
    for (Object pageObject : pages) {
        PDPage page = (PDPage) pageObject;
        PDResources resources = page.getResources();
        if (resources == null) {
            // page declares no resources of its own, so nothing to extract
            continue;
        }
        Map pageImages = resources.getImages();
        if (pageImages != null) {
            for (Object imageObject : pageImages.values()) {
                PDXObjectImage image = (PDXObjectImage) imageObject;
                image.write2OutputStream(/* some output stream */);
            }
        }
    }
} finally {
    // always release the document, even when extraction fails
    if (document != null) {
        document.close();
    }
}
Here is a sample code available in Apache PDFBox.
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
/**
 * This will read a pdf and extract images. <br/><br/>
 *
 * usage: java org.apache.pdfbox.ExtractImages &lt;pdffile&gt; &lt;password&gt; [imageprefix]
 *
 * @author Ben Litchfield
 * @version $Revision: 1.7 $
 */
public class ExtractImages
{
    /** Next number to try when building an output file name. */
    private int imageCounter = 1;
    private static final String PASSWORD = "-password";
    private static final String PREFIX = "-prefix";
    private static final String ADDKEY = "-addkey";
    private static final String NONSEQ = "-nonSeq";

    private ExtractImages()
    {
    }

    /**
     * This is the entry point for the application.
     *
     * @param args The command-line arguments.
     *
     * @throws Exception If there is an error decrypting the document.
     */
    public static void main( String[] args ) throws Exception
    {
        ExtractImages extractor = new ExtractImages();
        extractor.extractImages( args );
    }

    /**
     * Parses the command line, opens the document and writes out every image
     * reachable from the page resources.
     *
     * @param args The command-line arguments.
     *
     * @throws Exception If the document cannot be opened or decrypted, or if
     *                   extracting content is not permitted.
     */
    private void extractImages( String[] args ) throws Exception
    {
        if( args.length < 1 || args.length > 4 )
        {
            usage();
        }
        else
        {
            String pdfFile = null;
            String password = "";
            String prefix = null;
            boolean addKey = false;
            boolean useNonSeqParser = false;
            for( int i=0; i<args.length; i++ )
            {
                if( args[i].equals( PASSWORD ) )
                {
                    i++;
                    if( i >= args.length )
                    {
                        usage();
                    }
                    password = args[i];
                }
                else if( args[i].equals( PREFIX ) )
                {
                    i++;
                    if( i >= args.length )
                    {
                        usage();
                    }
                    prefix = args[i];
                }
                else if( args[i].equals( ADDKEY ) )
                {
                    addKey = true;
                }
                else if( args[i].equals( NONSEQ ) )
                {
                    useNonSeqParser = true;
                }
                else
                {
                    // only the first free-standing argument is taken as the file
                    if( pdfFile == null )
                    {
                        pdfFile = args[i];
                    }
                }
            }
            if(pdfFile == null)
            {
                usage();
            }
            else
            {
                if( prefix == null )
                {
                    // Default prefix: the file name without its 4-character
                    // extension. A name too short to carry one is used as is;
                    // otherwise the prefix stays null and the images are
                    // written as "null-1", "null-2", ...
                    prefix = pdfFile.length() > 4
                            ? pdfFile.substring( 0, pdfFile.length() - 4 )
                            : pdfFile;
                }
                PDDocument document = null;
                try
                {
                    if (useNonSeqParser)
                    {
                        document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
                    }
                    else
                    {
                        document = PDDocument.load( pdfFile );
                        if( document.isEncrypted() )
                        {
                            StandardDecryptionMaterial spm = new StandardDecryptionMaterial(password);
                            document.openProtection(spm);
                        }
                    }
                    AccessPermission ap = document.getCurrentAccessPermission();
                    if( ! ap.canExtractContent() )
                    {
                        throw new IOException(
                                "Error: You do not have permission to extract images." );
                    }
                    List pages = document.getDocumentCatalog().getAllPages();
                    for( Object pageObject : pages )
                    {
                        PDPage page = (PDPage)pageObject;
                        PDResources resources = page.getResources();
                        // extract all XObjectImages which are part of the page resources
                        processResources(resources, prefix, addKey);
                    }
                }
                finally
                {
                    if( document != null )
                    {
                        document.close();
                    }
                }
            }
        }
    }

    /**
     * Writes every image XObject of the given resources to a file and
     * descends into form XObjects to find nested images.
     *
     * @param resources The resources to scan; may be null.
     * @param prefix The file name prefix for the written images.
     * @param addKey Whether the internal XObject key becomes part of the file name.
     *
     * @throws IOException If writing an image fails.
     */
    private void processResources(PDResources resources, String prefix, boolean addKey) throws IOException
    {
        if (resources == null)
        {
            return;
        }
        Map<String, PDXObject> xobjects = resources.getXObjects();
        if( xobjects != null )
        {
            // entrySet() yields key and value together, no second lookup per key
            for( Map.Entry<String, PDXObject> entry : xobjects.entrySet() )
            {
                String key = entry.getKey();
                PDXObject xobject = entry.getValue();
                // write the images
                if (xobject instanceof PDXObjectImage)
                {
                    PDXObjectImage image = (PDXObjectImage)xobject;
                    String name = null;
                    if (addKey)
                    {
                        name = getUniqueFileName( prefix + "_" + key, image.getSuffix() );
                    }
                    else
                    {
                        name = getUniqueFileName( prefix, image.getSuffix() );
                    }
                    System.out.println( "Writing image:" + name );
                    image.write2file( name );
                }
                // maybe there are more images embedded in a form object
                else if (xobject instanceof PDXObjectForm)
                {
                    PDXObjectForm xObjectForm = (PDXObjectForm)xobject;
                    PDResources formResources = xObjectForm.getResources();
                    processResources(formResources, prefix, addKey);
                }
            }
        }
    }

    /**
     * Finds the next "prefix-N" name for which no "prefix-N.suffix" file
     * exists yet.
     *
     * @param prefix The file name prefix.
     * @param suffix The file extension without the dot.
     *
     * @return The unused name, without the extension.
     */
    private String getUniqueFileName( String prefix, String suffix )
    {
        String uniqueName = null;
        File f = null;
        while( f == null || f.exists() )
        {
            uniqueName = prefix + "-" + imageCounter;
            f = new File( uniqueName + "." + suffix );
            imageCounter++;
        }
        return uniqueName;
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage()
    {
        System.err.println( "Usage: java org.apache.pdfbox.ExtractImages [OPTIONS] <PDF file>\n" +
                " -password <password> Password to decrypt document\n" +
                " -prefix <image-prefix> Image prefix(default to pdf name)\n" +
                " -addkey add the internal image key to the file name\n" +
                " -nonSeq Enables the new non-sequential parser\n" +
                " <PDF file> The PDF document to use\n"
        );
        System.exit( 1 );
    }
}
Now to crop image you can use:
/**
 * Turns the main image into a JPEG thumbnail: the image is loaded, made
 * opaque, cropped to the given rectangle, scaled, and encoded, in that order.
 *
 * @param mainImageStream stream holding the source image
 * @param crop rectangle to cut out of the source image
 * @return a stream over the JPEG bytes of the thumbnail
 * @throws IllegalStateException if any of the steps fails
 */
public InputStream cropAndScale(InputStream mainImageStream,
        CropRectangle crop) {
    try {
        // load + flatten, then crop + scale, then encode
        RenderedOp flattened = makeImageOpaque(loadImage(mainImageStream));
        RenderedOp thumbnail = scaleImage(cropImage(flattened, crop));
        return new ByteArrayInputStream(encodeAsJpeg(thumbnail));
    } catch (Exception e) {
        throw new IllegalStateException("Failed to scale the image", e);
    }
}
This method is available on this page and in the project.
There is another option for parsing the images inside a PDF file; take a look at this code, especially this part: