I was able to merge multiple PDF files into a single PDF using the code below -
/**
 * Merges four fixed input PDFs into {@code outputImages/merged.pdf}.
 *
 * <p>The source files are passed to the merger directly via {@code addSource(File)}; no
 * {@code PDDocument} is opened here, so there is nothing to close on the error path.
 * I/O failures are printed to stderr and not propagated.
 */
public void mergePDF() {
    File[] sources = {
        new File("inputPDF/001.pdf"),
        new File("inputPDF/002.pdf"),
        new File("inputPDF/003.pdf"),
        new File("inputPDF/004.pdf")
    };
    try {
        PDFMergerUtility PDFmerger = new PDFMergerUtility();
        PDFmerger.setDestinationFileName("outputImages/merged.pdf");
        System.out.println("Destination path set to "+PDFmerger.getDestinationFileName());
        for (File source : sources) {
            PDFmerger.addSource(source);
        }
        // Merging the documents
        PDFmerger.mergeDocuments();
        System.out.println("Done!");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
However, my requirement is to merge multiple images (JPG, PNG) to a single PDF as well.
Is it possible to merge multiple images to a single PDF using PDFBox?
Since I struggled with this task, here's my code. The merged document is PDF/A-1b compliant
import com.google.common.io.Resources;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Calendar;
import java.util.List;
import javax.xml.transform.TransformerException;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDOutputIntent;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.preflight.parser.PreflightParser;
import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.schema.DublinCoreSchema;
import org.apache.xmpbox.schema.PDFAIdentificationSchema;
import org.apache.xmpbox.schema.XMPBasicSchema;
import org.apache.xmpbox.type.BadFieldValueException;
import org.apache.xmpbox.xml.XmpSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public final class PDFMerger {
private static final Logger LOG = LoggerFactory.getLogger(PDFMerger3.class);
private static final String OUTPUT_CONDITION_IDENTIFIER = "sRGB IEC61966-2.1";
public static final String DOCUMENT_CREATOR = "Mr. Meeseeks";
public static final String DOCUMENT_SUBJECT = "Great subject";
public static final String DOCUMENT_TITLE = "Here goes your title";
/**
* Creates a compound PDF document from a list of input documents.
* <p>
* The merged document is PDF/A-1b compliant
*
* #param sources list of source PDF document streams.
* #return compound PDF document as a readable input stream.
* #throws IOException if anything goes wrong during PDF merge.
*/
public static ByteArrayOutputStream mergeFiles(final List<InputStream> sources) throws IOException {
Path mergeDirectory = Files.createTempDirectory("merge-" + System.currentTimeMillis());
try (ByteArrayOutputStream mergedPDFOutputStream = new ByteArrayOutputStream()) {
LOG.debug("Merging {} source documents into one PDF", sources.size());
PDFMergerUtility mixedPdfMerger = createMixedPdfMerger(sources, mergedPDFOutputStream, mergeDirectory);
mergeFileStreams(mergedPDFOutputStream, mixedPdfMerger);
return mergedPDFOutputStream;
} catch (Exception e) {
if (!(e instanceof IOException)) {
throw new IOException("PDF merge problem", e);
}
throw (IOException) e;
} finally {
FileUtils.deleteDirectory(mergeDirectory.toFile());
sources.forEach(IOUtils::closeQuietly);
}
}
private static void mergeFileStreams(ByteArrayOutputStream mergedPDFOutputStream, PDFMergerUtility pdfMerger)
throws IOException, BadFieldValueException, TransformerException {
LOG.debug("Initialising PDF merge utility");
try (COSStream cosStream = new COSStream()) {
// PDF and XMP properties must be identical, otherwise document is not PDF/A compliant
pdfMerger.setDestinationDocumentInformation(createPDFDocumentInfo());
pdfMerger.setDestinationMetadata(createXMPMetadata(cosStream));
pdfMerger.mergeDocuments(MemoryUsageSetting.setupTempFileOnly());
LOG.debug("PDF merge successful, size = {} bytes", mergedPDFOutputStream.size());
}
}
#SuppressWarnings("UnstableApiUsage")
private static PDFMergerUtility createMixedPdfMerger(List<InputStream> sources, ByteArrayOutputStream mergedPDFOutputStream, Path mergeDirectory) throws IOException {
PDFMergerUtility pdfMerger = new PDFMergerUtility();
byte[] colorProfile = org.apache.commons.io.IOUtils.toByteArray(Resources.getResource("sRGB.icc"));
for (InputStream source : sources) {
File file = streamToFile(mergeDirectory, source);
if (isPdf(file)) {
pdfMerger.addSource(file);
} else {
pdfMerger.addSource(imageToPDDocument(mergeDirectory, file, colorProfile));
}
}
pdfMerger.setDestinationStream(mergedPDFOutputStream);
return pdfMerger;
}
private static PDDocumentInformation createPDFDocumentInfo() {
LOG.debug("Setting document info (title, author, subject) for merged PDF");
PDDocumentInformation documentInformation = new PDDocumentInformation();
documentInformation.setTitle(DOCUMENT_TITLE);
documentInformation.setCreator(DOCUMENT_CREATOR);
documentInformation.setSubject(DOCUMENT_SUBJECT);
return documentInformation;
}
private static PDMetadata createXMPMetadata(COSStream cosStream)
throws BadFieldValueException, TransformerException, IOException {
LOG.debug("Setting XMP metadata (title, author, subject) for merged PDF");
XMPMetadata xmpMetadata = XMPMetadata.createXMPMetadata();
// PDF/A-1b properties
PDFAIdentificationSchema pdfaSchema = xmpMetadata.createAndAddPFAIdentificationSchema();
pdfaSchema.setPart(1);
pdfaSchema.setConformance("B");
pdfaSchema.setAboutAsSimple("");
// Dublin Core properties
DublinCoreSchema dublinCoreSchema = xmpMetadata.createAndAddDublinCoreSchema();
dublinCoreSchema.setTitle(DOCUMENT_TITLE);
dublinCoreSchema.addCreator(DOCUMENT_CREATOR);
dublinCoreSchema.setDescription(DOCUMENT_SUBJECT);
// XMP Basic properties
XMPBasicSchema basicSchema = xmpMetadata.createAndAddXMPBasicSchema();
Calendar creationDate = Calendar.getInstance();
basicSchema.setCreateDate(creationDate);
basicSchema.setModifyDate(creationDate);
basicSchema.setMetadataDate(creationDate);
basicSchema.setCreatorTool(DOCUMENT_CREATOR);
// Create and return XMP data structure in XML format
try (ByteArrayOutputStream xmpOutputStream = new ByteArrayOutputStream();
OutputStream cosXMPStream = cosStream.createOutputStream()) {
new XmpSerializer().serialize(xmpMetadata, xmpOutputStream, true);
cosXMPStream.write(xmpOutputStream.toByteArray());
return new PDMetadata(cosStream);
}
}
private static File imageToPDDocument(Path mergeDirectory, File file, byte[] colorProfile) throws IOException {
try (PDDocument doc = new PDDocument()) {
PDImageXObject pdImage = PDImageXObject.createFromFileByContent(file, doc);
drawPage(doc, pdImage);
doc.getDocumentCatalog().addOutputIntent(createColorScheme(doc, colorProfile));
File pdfFile = Files.createTempFile(mergeDirectory, String.valueOf(System.currentTimeMillis()), ".tmp").toFile();
doc.save(pdfFile);
return pdfFile;
}
}
private static void drawPage(PDDocument doc, PDImageXObject pdImage) throws IOException {
PDPage page;
pdImage.getCOSObject().setItem(COSName.SMASK, COSName.NONE);
boolean isLandscapeMode = pdImage.getWidth() > pdImage.getHeight();
if (isLandscapeMode) {
page = new PDPage(new PDRectangle(PDRectangle.A4.getHeight(), PDRectangle.A4.getWidth()));
float scale = Math.min(Math.min(PDRectangle.A4.getWidth() / pdImage.getHeight(), PDRectangle.A4.getHeight() / pdImage.getWidth()), 1);
float width = pdImage.getWidth() * scale;
float height = pdImage.getHeight() * scale;
// center the image
float startWidth = (PDRectangle.A4.getHeight() - width) / 2;
float startHeight = (PDRectangle.A4.getWidth() - height) / 2;
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
contentStream.drawImage(pdImage, startWidth, startHeight, width, height);
}
} else {
page = new PDPage(PDRectangle.A4);
float scale = Math.min(Math.min(PDRectangle.A4.getWidth() / pdImage.getWidth(), PDRectangle.A4.getHeight() / pdImage.getHeight()), 1);
float width = pdImage.getWidth() * scale;
float height = pdImage.getHeight() * scale;
// try to center the image
float startWidth = (PDRectangle.A4.getWidth() - width) / 2;
float startHeight = (PDRectangle.A4.getHeight() - height) / 2;
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
contentStream.drawImage(pdImage, startWidth, startHeight, width, height);
}
}
doc.addPage(page);
}
private static PDOutputIntent createColorScheme(PDDocument doc, byte[] colorProfile) throws IOException {
PDOutputIntent intent = new PDOutputIntent(doc, new ByteArrayInputStream(colorProfile));
intent.setInfo(OUTPUT_CONDITION_IDENTIFIER);
intent.setOutputCondition(OUTPUT_CONDITION_IDENTIFIER);
intent.setOutputConditionIdentifier(OUTPUT_CONDITION_IDENTIFIER);
intent.setRegistryName("http://www.color.org");
return intent;
}
private static boolean isPdf(File file) {
try {
PreflightParser preflightParser = new PreflightParser(file);
preflightParser.parse();
return true;
} catch (Exception e) {
return false;
}
}
private static File streamToFile(Path tempDirectory, InputStream in) throws IOException {
final Path tempFile = Files.createTempFile(tempDirectory, String.valueOf(System.currentTimeMillis()), ".tmp");
try (FileOutputStream out = new FileOutputStream(tempFile.toFile())) {
IOUtils.copy(in, out);
}
return tempFile.toFile();
}
}
You can take a look at this gist for an option to merge pdf files as well.
You need to convert the images to a PDF first. See How can I convert a PNG file to PDF using java? or Create PDF from a PNG image Or Java Panel for an example on how to do this.
After that, use pdfbox to merge the resulting pdfs.
I have used the iText library to combine images and convert them into a single PDF.
Here is the code:
// Collect the images first: listFiles() returns null when `path` is not a readable
// directory, and failing here avoids leaving a half-written PDF behind.
File[] files = new File(path).listFiles();
if (files == null) {
    throw new IllegalArgumentException("Not a readable directory: " + path);
}

Document document = new Document();
PdfWriter.getInstance(document, new FileOutputStream(image_path+"\\"+image_name+".pdf"));
document.open();

// One full-width, single-column table with one image cell per file.
PdfPTable table = new PdfPTable(1);
table.setWidthPercentage(100);
for (File file : files) {
    table.addCell(createImageCell(file.getAbsolutePath()));
}
document.add(table);
document.close();
Hope It helps
Related
I am trying to export 3 HTML pages (all with same content) into a PDF using iText7.1.0 and pdfHTML2.0.0 using this example. For some reason, the pages have formatting issue at the footer. The jsFiddle link to my HTML code that is being used by PDF renderer.
Below is the Java code used for rendering the PDF (Test.html is the same HTML code in the fiddle):
package com.itextpdf.htmlsamples.chapter01;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import com.itextpdf.html2pdf.ConverterProperties;
import com.itextpdf.html2pdf.HtmlConverter;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.kernel.geom.PageSize;
import com.itextpdf.kernel.utils.PdfMerger;
import com.itextpdf.licensekey.LicenseKey;
/**
* Can we parse different HTML files and combine them into one PDF?
* Yes, this can be done in different ways. This example shows how
* to create a PDF in memory for each HTML, then use PdfMerger to
* merge the different PDFs into one, on a page per page basis.
*/
public class C07E01_CombineHtml {

    /** The Base URI of the HTML page. */
    public static final String BASEURI = "src/main/resources/html/";

    /** An array containing the paths to different HTML files. */
    public static final String[] SRC = {
        String.format("%sTest.html", BASEURI),
        String.format("%sTest.html", BASEURI),
        String.format("%sTest.html", BASEURI)
    };

    /** The target folder for the result. */
    public static final String TARGET = "target/results/ch07/";

    /** The path to the resulting PDF file. */
    public static final String DEST = String.format("%sbundle.pdf", TARGET);

    /** Not read or written anywhere in this class; kept for subclasses. */
    protected PageSize A4;

    /**
     * The main method of this example.
     *
     * @param args no arguments are needed to run this example.
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public static void main(String[] args) throws IOException {
        LicenseKey.loadLicenseFile("C://Users//Sparks//Desktop//itextkey-0.xml");
        File file = new File(TARGET);
        file.mkdirs();
        new C07E01_CombineHtml().createPdf(BASEURI, SRC, DEST);
    }

    /**
     * Creates the PDF file: each HTML source is converted to an in-memory A4 PDF, which is
     * then re-read and merged page by page into the destination document.
     *
     * @param baseUri the base URI
     * @param src an array with the paths to different source HTML files
     * @param dest the path to the resulting PDF
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public void createPdf(String baseUri, String[] src, String dest) throws IOException {
        ConverterProperties properties = new ConverterProperties();
        properties.setBaseUri(baseUri);
        PdfWriter writer = new PdfWriter(dest);
        PdfDocument pdf = new PdfDocument(writer);
        PdfMerger merger = new PdfMerger(pdf);
        for (String html : src) {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            PdfDocument temp = new PdfDocument(new PdfWriter(baos));
            PageSize pageSize = PageSize.A4;
            temp.setDefaultPageSize(pageSize);
            // Close the HTML input explicitly instead of relying on the converter to do it.
            try (FileInputStream htmlStream = new FileInputStream(html)) {
                HtmlConverter.convertToPdf(htmlStream, temp, properties);
            }
            // Re-open the rendered bytes in reading mode so the pages can be merged.
            PdfDocument rendered = new PdfDocument(new PdfReader(new ByteArrayInputStream(baos.toByteArray())));
            merger.merge(rendered, 1, rendered.getNumberOfPages());
            rendered.close();
        }
        pdf.close();
    }
}
The output PDF file has 6 pages without footer. It should have 3 pages each of 'A4' size.
Any suggestions would be helpful.
Changing the PageSize to one that is larger should solve this specific issue.
Afterward you can scale the page down in order to get a PDF with A4 pages.
Take a look at the code sample below to get an idea about how you can do this.
/**
 * Renders the sample HTML to an in-memory PDF and writes a scaled-down copy to {@code DEST}.
 *
 * @param args unused.
 * @throws IOException if rendering or writing fails.
 */
public static void main(String[] args) throws IOException {
    // Going from A3 to A4 shrinks each side to about 71% of its length.
    final float a3ToA4 = 0.7071f;
    ByteArrayOutputStream rendered = createPdf("src/main/resources/SO47869248/html.html");
    ByteArrayInputStream renderedBytes = new ByteArrayInputStream(rendered.toByteArray());
    new SO47869248().scalePdf(DEST, renderedBytes, a3ToA4);
}
/**
 * Converts the given HTML file to PDF three times on A3 pages and merges the results
 * into one in-memory document.
 *
 * @param htmlSrc path of the HTML file; its parent directory is used as the base URI.
 * @return the stream holding the merged PDF bytes.
 * @throws IOException if the HTML cannot be read or a PDF cannot be written.
 */
public static ByteArrayOutputStream createPdf(String htmlSrc) throws IOException {
    ByteArrayOutputStream output = new ByteArrayOutputStream();
    ConverterProperties converterProperties = new ConverterProperties();
    converterProperties.setBaseUri(new File(htmlSrc).getParent());
    PdfWriter writer = new PdfWriter(output);
    PdfDocument pdfDocument = new PdfDocument(writer);
    PdfMerger merger = new PdfMerger(pdfDocument);
    for (int x = 0; x < 3; x++) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        PdfDocument temp = new PdfDocument(new PdfWriter(baos));
        temp.setDefaultPageSize(PageSize.A3);
        // Close the HTML input explicitly; the original left one open stream per iteration.
        try (FileInputStream htmlStream = new FileInputStream(htmlSrc)) {
            HtmlConverter.convertToPdf(htmlStream, temp, converterProperties);
        }
        // Re-open the rendered bytes in reading mode so the pages can be merged.
        PdfDocument rendered = new PdfDocument(new PdfReader(new ByteArrayInputStream(baos.toByteArray())));
        merger.merge(rendered, 1, rendered.getNumberOfPages());
        rendered.close();
    }
    pdfDocument.close();
    return output;
}
/**
 * Copies every page of the input PDF into a new document at {@code dest}, drawing each
 * page as a form XObject scaled by {@code scale}.
 *
 * @param dest path of the PDF to write.
 * @param input bytes of the source PDF.
 * @param scale factor applied to both axes of the page content.
 * @throws IOException if the source cannot be read or the destination cannot be written.
 */
public void scalePdf(String dest, ByteArrayInputStream input, float scale) throws IOException {
    PdfDocument source = new PdfDocument(new PdfReader(input));
    PdfDocument target = new PdfDocument(new PdfWriter(dest));

    ScaleDownEventHandler resizer = new ScaleDownEventHandler(scale);
    target.addEventHandler(PdfDocumentEvent.START_PAGE, resizer);

    final int pageCount = source.getNumberOfPages();
    int pageNo = 1;
    while (pageNo <= pageCount) {
        // The handler must know the source page before addNewPage() fires START_PAGE.
        resizer.setPageDict(source.getPage(pageNo).getPdfObject());
        PdfCanvas pageCanvas = new PdfCanvas(target.addNewPage());
        PdfFormXObject pageAsXObject = source.getPage(pageNo).copyAsFormXObject(target);
        pageCanvas.addXObject(pageAsXObject, scale, 0f, 0f, scale, 0f, 0f);
        pageNo++;
    }

    target.close();
    source.close();
}
protected class ScaleDownEventHandler implements IEventHandler {
protected float scale = 1;
protected PdfDictionary pageDict;
public ScaleDownEventHandler(float scale) {
this.scale = scale;
}
public void setPageDict(PdfDictionary pageDict) {
this.pageDict = pageDict;
}
#Override
public void handleEvent(Event event) {
PdfDocumentEvent docEvent = (PdfDocumentEvent) event;
PdfPage page = docEvent.getPage();
page.put(PdfName.Rotate, pageDict.getAsNumber(PdfName.Rotate));
scaleDown(page, pageDict, PdfName.MediaBox, scale);
scaleDown(page, pageDict, PdfName.CropBox, scale);
}
protected void scaleDown(PdfPage destPage, PdfDictionary pageDictSrc, PdfName box, float scale) {
PdfArray original = pageDictSrc.getAsArray(box);
if (original != null) {
float width = original.getAsNumber(2).floatValue() - original.getAsNumber(0).floatValue();
float height = original.getAsNumber(3).floatValue() - original.getAsNumber(1).floatValue();
PdfArray result = new PdfArray();
result.add(new PdfNumber(0));
result.add(new PdfNumber(0));
result.add(new PdfNumber(width * scale));
result.add(new PdfNumber(height * scale));
destPage.put(box, result);
}
}
}
For this example I picked the A3 pagesize constant. You can also create a PageSize object using specific measurements. As shown below:
Constructor:
public PageSize(float width, float height)
Example:
PageSize pageSize = new PageSize(750, 1000);
PdfDocument temp = new PdfDocument(pageSize);
try this.
style="page-break-after: always; width: 320pt;" in
I am using PDFBox 2.0.6. I have been successful in extracting images from a PDF file, but right now my code creates one image per PDF page. The issue is that a page can contain any number of images, and I want each embedded image to be extracted as a separate image file.
Here is the code:
import java.awt.image.BufferedImage;
import java.io.File;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
/** Renders every page of a PDF to a JPEG file, one image per page. */
public class DemoPdf {

    /**
     * Loads the PDF, renders each page and writes it as {@code <pageIndex>.jpg} into the
     * image folder.
     *
     * @param args unused.
     * @throws Exception if the PDF cannot be read or an image cannot be written.
     */
    public static void main(String args[]) throws Exception {
        //Loading an existing PDF document
        File file = new File("C:/Users/ADMIN/Downloads/Vehicle_Photographs.pdf");
        File imageFolder = new File("C:/Users/ADMIN/Desktop/image");
        // ImageIO cannot create missing directories, so make sure the target folder exists.
        imageFolder.mkdirs();

        // try-with-resources closes the document even when rendering or writing fails
        try (PDDocument document = PDDocument.load(file)) {
            //Instantiating the PDFRenderer class
            PDFRenderer renderer = new PDFRenderer(document);
            for (int page = 0; page < document.getNumberOfPages(); ++page) {
                //Rendering an image from the PDF document
                BufferedImage image = renderer.renderImage(page);
                //Writing the image to a file
                ImageIO.write(image, "JPEG", new File(imageFolder+"/" + page +".jpg"));
                System.out.println("Image created"+ page);
            }
        }
    }
}
Is it possible with PDFBox to extract all embedded images as separate images? Thanks.
Yes. It is possible to extract all images from all the pages of a PDF.
You may refer to this link: extract images from pdf using PDFBox.
The basic idea is to extend PDFStreamEngine and override the processOperator method. Call PDFStreamEngine.processPage for every page. If the object passed to processOperator is an image object, get a BufferedImage from it and save it.
Extend PDFStreamEngine and override processOperator, something like this:
#Override
protected void processOperator( Operator operator, List<COSBase> operands) throws IOException
{
String operation = operator.getName();
if( "Do".equals(operation) )
{
COSName objectName = (COSName) operands.get( 0 );
PDXObject xobject = getResources().getXObject( objectName );
if( xobject instanceof PDImageXObject)
{
PDImageXObject image = (PDImageXObject)xobject;
int imageWidth = image.getWidth();
int imageHeight = image.getHeight();
// same image to local
BufferedImage bImage = new BufferedImage(imageWidth,imageHeight,BufferedImage.TYPE_INT_ARGB);
bImage = image.getImage();
ImageIO.write(bImage,"PNG",new File("c:\\temp\\image_"+imageNumber+".png"));
imageNumber++;
}
else
{
}
}
else
{
super.processOperator( operator, operands);
}
}
This answer is similar to @jprism's, but it is intended for anyone who just wants ready-to-use code, with a demo, to copy and paste.
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.UUID;
public class ExtractImagesUseCase extends PDFStreamEngine{
private final String filePath;
private final String outputDir;
// Constructor
public ExtractImagesUseCase(String filePath,
String outputDir){
this.filePath = filePath;
this.outputDir = outputDir;
}
// Execute
public void execute(){
try{
File file = new File(filePath);
PDDocument document = PDDocument.load(file);
for(PDPage page : document.getPages()){
processPage(page);
}
}catch(IOException e){
e.printStackTrace();
}
}
#Override
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException{
String operation = operator.getName();
if("Do".equals(operation)){
COSName objectName = (COSName) operands.get(0);
PDXObject pdxObject = getResources().getXObject(objectName);
if(pdxObject instanceof PDImageXObject){
// Image
PDImageXObject image = (PDImageXObject) pdxObject;
BufferedImage bImage = image.getImage();
// File
String randomName = UUID.randomUUID().toString();
File outputFile = new File(outputDir,randomName + ".png");
// Write image to file
ImageIO.write(bImage, "PNG", outputFile);
}else if(pdxObject instanceof PDFormXObject){
PDFormXObject form = (PDFormXObject) pdxObject;
showForm(form);
}
}
else super.processOperator(operator, operands);
}
}
Demo
public class ExtractImageDemo{
public static void main(String[] args){
String filePath = "C:\\Users\\John\\Downloads\\Documents\\sample-file.pdf";
String outputDir = "C:\\Users\\John\\Downloads\\Documents\\Output";
ExtractImagesUseCase useCase = new ExtractImagesUseCase(
filePath,
outputDir
);
useCase.execute();
}
}
I'd like to know whether there is any way to replace images in PPT presentations via Apache POI.
I have a template in which I've placed the elements (text fields and images). I found out how to replace text, but didn't find anything for images.
Replacing images can be done in two ways:
Simply replace the image inside the pptx file, which is a zip file, under the path /ppt/media. Check out this post on how to do it.
Alternatively, the POI approach is to remove the image part and add a new one, and maybe change a few other image properties (width, height, ...). See below for an example:
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.OutputStream;
import javax.xml.namespace.QName;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSheet;
import org.apache.xmlbeans.XmlCursor;
/**
 * Replaces the embedded pictures of a presentation: for every blip found on a slide the
 * old media part is removed, a new JPEG part is added and the blip is pointed at it.
 */
public class ReplaceImageInPptx {

    /**
     * Reads {@code test2.pptx}, swaps its embedded images for {@code smiley.jpg} and writes
     * the result to {@code test3.pptx}.
     *
     * @param args unused.
     * @throws Exception if the presentation or the image cannot be read or written.
     */
    public static void main(String[] args) throws Exception {
        XMLSlideShow pptx;
        try (FileInputStream fis = new FileInputStream("test2.pptx")) {
            pptx = new XMLSlideShow(fis);
        }

        String[] blipNS = {
            "http://schemas.openxmlformats.org/drawingml/2006/main",
            "http://schemas.openxmlformats.org/presentationml/2006/main"
        };
        // The r:embed attribute holds the relationship id of the picture part.
        QName relName = new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "embed");

        for (XSLFSheet slide : pptx.getSlides()) {
            PackagePart packPart = slide.getPackagePart();
            for (String ns : blipNS) {
                XmlCursor picCur = slide.getXmlObject().newCursor();
                try {
                    picCur.selectPath("declare namespace p='"+ns+"' .//p:blip"); // or blipFill
                    while (picCur.toNextSelection()) {
                        String relId = picCur.getAttributeText(relName);
                        if (relId == null) {
                            // blip without an embedded picture (no r:embed): nothing to replace
                            continue;
                        }

                        // remove old media file and reference
                        PackageRelationship packRel = packPart.getRelationship(relId);
                        PackagePartName oldPartName = PackagingURIHelper.createPartName(packRel.getTargetURI());
                        packPart.getPackage().removePart(oldPartName);

                        // add something new
                        // NOTE(review): the part name is fixed, so a second blip would try to
                        // create the same part again - confirm how POI handles that.
                        PackagePartName partName = PackagingURIHelper.createPartName("/ppt/media/smiley.jpg");
                        PackagePart part = pptx.getPackage().createPart(partName, "image/jpeg");
                        try (FileInputStream imageIn = new FileInputStream("src/test/resources/smiley.jpg");
                             OutputStream partOs = part.getOutputStream()) {
                            byte[] buf = new byte[1024];
                            int readBytes;
                            while ((readBytes = imageIn.read(buf)) != -1) {
                                partOs.write(buf, 0, readBytes);
                            }
                        }

                        PackageRelationship prs = slide.getPackagePart().addRelationship(partName, TargetMode.INTERNAL, "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image");
                        picCur.setAttributeText(relName, prs.getId());
                    }
                } finally {
                    picCur.dispose();
                }
            }
        }

        try (FileOutputStream fos = new FileOutputStream("test3.pptx")) {
            pptx.write(fos);
        }
    }
}
I'm trying to extract images from a PDF using PDFBox. The example PDF is here.
But I'm getting only blank images.
The code I'm trying:
/**
 * Entry point: runs the image extraction and prints any I/O failure to stdout.
 *
 * @param args unused.
 */
public static void main(String[] args) {
    PDFImageExtract extractor = new PDFImageExtract();
    try {
        extractor.read_pdf();
    } catch (IOException ioFailure) {
        System.out.println("" + ioFailure);
    }
}
/**
 * Writes every image listed in the page resources of the PDF to a numbered file.
 *
 * <p>A failure to load the document is propagated to the caller. The original code
 * printed it and then dereferenced the still-null document.
 *
 * @throws IOException if the PDF cannot be loaded or an image cannot be written.
 */
void read_pdf() throws IOException {
    PDDocument document = PDDocument.load("C:\\Users\\Pradyut\\Documents\\MCS-034.pdf");
    try {
        int i = 1;
        for (Object pageObject : document.getDocumentCatalog().getAllPages()) {
            PDPage page = (PDPage) pageObject;
            PDResources resources = page.getResources();
            if (resources == null) {
                continue; // page without a resource dictionary has no images
            }
            Map<?, ?> pageImages = resources.getImages();
            if (pageImages == null) {
                continue;
            }
            for (Object imageObject : pageImages.values()) {
                PDXObjectImage image = (PDXObjectImage) imageObject;
                image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
                i++;
            }
        }
    } finally {
        // release the document even when an image could not be written
        document.close();
    }
}
Thanks
Here is code using PDFBox 2.0.1 that will get a list of all images from the PDF. This is different than the other code in that it will recurse through the document instead of trying to get the images from the top level.
/**
 * Collects the images of all pages of the document, in page order.
 *
 * @param document the PDF to scan.
 * @return every image found through the pages' resources.
 * @throws IOException if an image cannot be read.
 */
public List<RenderedImage> getImagesFromPDF(PDDocument document) throws IOException {
    final List<RenderedImage> collected = new ArrayList<>();
    for (PDPage currentPage : document.getPages()) {
        PDResources pageResources = currentPage.getResources();
        List<RenderedImage> pageImages = getImagesFromResources(pageResources);
        collected.addAll(pageImages);
    }
    return collected;
}
/**
 * Collects the images referenced by the given resources, descending into form XObjects.
 *
 * @param resources resources of a page or form; may be {@code null}, which yields an empty list.
 * @return the images found, in the order the XObject names are listed.
 * @throws IOException if an XObject or image cannot be read.
 */
private List<RenderedImage> getImagesFromResources(PDResources resources) throws IOException {
    List<RenderedImage> images = new ArrayList<>();
    if (resources == null) {
        // pages and form XObjects are not required to carry a resource dictionary
        return images;
    }
    for (COSName xObjectName : resources.getXObjectNames()) {
        PDXObject xObject = resources.getXObject(xObjectName);
        if (xObject instanceof PDFormXObject) {
            // forms can nest further images in their own resources
            images.addAll(getImagesFromResources(((PDFormXObject) xObject).getResources()));
        } else if (xObject instanceof PDImageXObject) {
            images.add(((PDImageXObject) xObject).getImage());
        }
    }
    return images;
}
The GetImagesFromPDF Java class below gets all images in the 04-Request-Headers.pdf file and saves them into the destination folder PDFCopy.
import java.io.File;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
#SuppressWarnings({ "unchecked", "rawtypes", "deprecation" })
public class GetImagesFromPDF {
public static void main(String[] args) {
try {
String sourceDir = "C:/PDFCopy/04-Request-Headers.pdf";// Paste pdf files in PDFCopy folder to read
String destinationDir = "C:/PDFCopy/";
File oldFile = new File(sourceDir);
if (oldFile.exists()) {
PDDocument document = PDDocument.load(sourceDir);
List<PDPage> list = document.getDocumentCatalog().getAllPages();
String fileName = oldFile.getName().replace(".pdf", "_cover");
int totalImages = 1;
for (PDPage page : list) {
PDResources pdResources = page.getResources();
Map pageImages = pdResources.getImages();
if (pageImages != null) {
Iterator imageIter = pageImages.keySet().iterator();
while (imageIter.hasNext()) {
String key = (String) imageIter.next();
PDXObjectImage pdxObjectImage = (PDXObjectImage) pageImages.get(key);
pdxObjectImage.write2file(destinationDir + fileName+ "_" + totalImages);
totalImages++;
}
}
}
} else {
System.err.println("File not exists");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
For PDFBox 2.0.1, pudaykiran's answer must be slightly modified since some APIs have been changed.
/**
 * Writes every image XObject found directly in the page resources of the test PDF to a
 * PNG file named after the current {@code System.nanoTime()} value.
 *
 * @throws Exception if the PDF cannot be read or an image cannot be written.
 */
public static void testPDFBoxExtractImages() throws Exception {
    // try-with-resources closes the document, which the original never did
    try (PDDocument document = PDDocument.load(new File("D:/Temp/Test.pdf"))) {
        PDPageTree list = document.getPages();
        for (PDPage page : list) {
            PDResources pdResources = page.getResources();
            if (pdResources == null) {
                continue; // page without a resource dictionary has no images
            }
            for (COSName c : pdResources.getXObjectNames()) {
                PDXObject o = pdResources.getXObject(c);
                if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                    File file = new File("D:/Temp/" + System.nanoTime() + ".png");
                    ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject)o).getImage(), "png", file);
                }
            }
        }
    }
}
Just add the .jpeg to the end of your path:
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i + ".jpeg");
That works for me.
You can use the PDPage.convertToImage() function, which converts a PDF page into a BufferedImage. You can then use that BufferedImage to create an image file.
Use the following reference for further detail:
You can find all PDF-related classes of PDFBox in the
Apache PDFBox 1.8.3 API.
There you can see the PDPage-related documentation.
And do not forget to look for the PDPage.convertToImage() function in the PDPage class.
This is a Kotlin version of @Matt's answer.
/**
 * Applies [block] to every image reachable from these resources, descending into
 * form XObjects, and returns the results in traversal order.
 */
fun <R> PDResources.onImageResources(block: (RenderedImage) -> (R)): List<R> {
    val results = mutableListOf<R>()
    for (name in xObjectNames) {
        when (val candidate = getXObject(name)) {
            // a form carries its own resources, which may hold further images
            is PDFormXObject -> results.addAll(candidate.resources.onImageResources(block))
            is PDImageXObject -> results.add(block(candidate.image))
            else -> Unit
        }
    }
    return results
}
You can use it on PDPage Resources like this:
// Writes each image to its own temp file and yields the file's path.
page.resources.onImageResources { image ->
    Files.createTempFile("image", "xxx").also { path ->
        // ImageIO.write returns false when no writer exists for the requested format
        if (!ImageIO.write(image, "xxx", path.toFile()))
            throw IllegalStateException("Couldn't write image to file")
    }
}
Where "xxx" is the format you need (like "jpeg")
For someone who want just copy and paste this ready to use code
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.UUID;
public class ExtractImagesUseCase extends PDFStreamEngine{
private final String filePath;
private final String outputDir;
// Constructor
public ExtractImagesUseCase(String filePath,
String outputDir){
this.filePath = filePath;
this.outputDir = outputDir;
}
// Execute
public void execute(){
try{
File file = new File(filePath);
PDDocument document = PDDocument.load(file);
for(PDPage page : document.getPages()){
processPage(page);
}
}catch(IOException e){
e.printStackTrace();
}
}
#Override
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException{
String operation = operator.getName();
if("Do".equals(operation)){
COSName objectName = (COSName) operands.get(0);
PDXObject pdxObject = getResources().getXObject(objectName);
if(pdxObject instanceof PDImageXObject){
// Image
PDImageXObject image = (PDImageXObject) pdxObject;
BufferedImage bImage = image.getImage();
// File
String randomName = UUID.randomUUID().toString();
File outputFile = new File(outputDir,randomName + ".png");
// Write image to file
ImageIO.write(bImage, "PNG", outputFile);
}else if(pdxObject instanceof PDFormXObject){
PDFormXObject form = (PDFormXObject) pdxObject;
showForm(form);
}
}
else super.processOperator(operator, operands);
}
}
Demo
public class ExtractImageDemo{
public static void main(String[] args){
String filePath = "C:\\Users\\John\\Downloads\\Documents\\sample-file.pdf";
String outputDir = "C:\\Users\\John\\Downloads\\Documents\\Output";
ExtractImagesUseCase useCase = new ExtractImagesUseCase(
filePath,
outputDir
);
useCase.execute();
}
}
Instead of calling
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
You can use the ImageIO.write() static method to write the RGB image out in whatever format you need. Here I've used PNG:
// Build the target path first, then hand ImageIO a File created from it.
String pngPath = "C:\\Users\\Pradyut\\Documents\\image" + i + ".png";
ImageIO.write(image.getRGBImage(), "png", new File(pngPath));
I am currently using PDFBox and reading from within a.pdf which is found in folder 1
I first list all the Pdf files found within the folder.
Then I check the number of pages that each file has.
Now I want to go to the very end of the last page, below the footer, and add an image that the printer can recognise as an end-of-file marker so that it staples the pages.
I have arrived till getting list of files and the number of pages.
What command do i use to go to the end of the last page and write there.
Should i transform the .pdf file into text or
Should i be able to use PDPageContentStream
This is the code I am currently using. I am trying to test whether an "AAA" string will be inserted into the last page of the PDF file. The project executes with no errors, but for some reason the text is not being inserted into the PDF.
package pdfviewer;
import java.io.*;
import java.util.*;
import java.util.List;
import java.io.IOException;
import org.apache.pdfbox.PDFReader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
/**
 * Lists the PDF files in a fixed folder and stamps the text "AAA" near the
 * bottom of the last page of each one, saving the result as a new file next
 * to the source.
 */
public class Main {

    /** Prefix prepended to the source file name to form the output file name. */
    private static final String MODIFIED_PREFIX = "modified-";

    /**
     * Collects the regular files in {@code C:/1} whose name ends with
     * {@code .pdf} or {@code .PDF}.
     *
     * @return the matching files; empty when the folder cannot be listed
     */
    public static List<File> flist()
    {
        List<File> listfile = new ArrayList<File>();
        String path = "C:/1";
        File folder = new File(path);
        File[] listOfFiles = folder.listFiles();
        // listFiles() returns null when the folder does not exist or is unreadable.
        if (listOfFiles != null)
        {
            for (File candidate : listOfFiles)
            {
                String name = candidate.getName();
                if (candidate.isFile() && (name.endsWith(".pdf") || name.endsWith(".PDF")))
                {
                    listfile.add(candidate);
                }
            }
        }
        System.out.println(listfile);
        return listfile;
    }

    /**
     * Writes "AAA", horizontally centred and 30 units above the bottom edge,
     * onto the last page of every document in the list. Each modified document
     * is saved beside its source under a "modified-" prefixed name; the source
     * file itself is left untouched.
     *
     * @param a entries whose {@code toString()} is the path of a PDF file
     */
    public static void CheckPages(List<?> a)
    {
        PDFont font = PDType1Font.HELVETICA_BOLD;
        float fontSize = 12.0f;
        for (Object entry : a)
        {
            String dir = entry.toString();
            System.out.println(dir);
            PDDocument pdoc = null;
            try {
                pdoc = PDDocument.load(dir);
                List allPages = pdoc.getDocumentCatalog().getAllPages();
                int pages = pdoc.getNumberOfPages();
                System.out.println(allPages);
                System.out.println(pages);
                if (allPages.isEmpty())
                {
                    System.err.println("Skipping " + dir + ": the document has no pages.");
                    continue;
                }
                // Always target the last page of this document; the index must not
                // depend on the position of the file in the list.
                PDPage page = (PDPage) allPages.get(allPages.size() - 1);
                PDRectangle pageSize = page.findMediaBox();
                float stringWidth = font.getStringWidth( "AAA" );
                // getStringWidth is in 1/1000 text-space units, hence the scaling.
                float centeredPosition = (pageSize.getWidth() - (stringWidth*fontSize)/1000f)/2f;
                PDPageContentStream contentStream = new PDPageContentStream(pdoc,page,true,true);
                try {
                    contentStream.beginText();
                    contentStream.setFont( font, fontSize );
                    contentStream.moveTextPositionByAmount( centeredPosition, 30 );
                    contentStream.drawString( "AAA" );
                    contentStream.endText();
                } finally {
                    contentStream.close();
                }
                // Without an explicit save the appended text exists only in memory.
                File source = new File(dir);
                File target = new File(source.getParentFile(), MODIFIED_PREFIX + source.getName());
                pdoc.save(target.getAbsolutePath());
            }
            catch (Exception e)
            {
                System.err.println("An exception occured in parsing the PDF Document."+ e.getMessage());
            }
            finally
            {
                if (pdoc != null)
                {
                    try {
                        pdoc.close();
                    } catch (IOException ignored) {
                        // Best effort: nothing useful can be done if closing fails.
                    }
                }
            }
        }
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args)
    {
        List<File> l = pdfviewer.Main.flist();
        pdfviewer.Main.CheckPages(l);
    }
}
Thanks for your attention
The code I was using above is correct.
The problem is that the PDF files being generated are version 1.2, which is why I am not allowed to edit the PDF document.
Does anyone know what I should do if I'm using version 1.2, since I can't really upgrade it?
You can look at the examples supplied with the library.
There are two files that are of interest to you:
1- AddImageToPDF.java AddImageToPDF.java on google code search
2- AddMessageToEachPage.java AddMessageToEachPage.java on google code search
The second one adds a message to every page, but you can modify it to work with the last page only. According to the PDFBox user guide, both files should be found under the folder: src/main/java/org/apache/pdfbox/examples
I have added links on google code search in case you have trouble locating the files.
I have not worked with the library or tried the examples and I am quite sure you will need to modify the code a little to suit your needs for the location of the added line/image.
In any case, if this helps you and you get a working solution, you can add the solution so that others can benefit from it.
EDIT:
After seeing the code posted by the question author, I add a modification to make it work.
I allowed myself also to make few changes for clarity.
import java.io.File;
import java.io.FileFilter;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
/**
 * Stamps a text near the bottom of the last page of every PDF in a folder and
 * saves each result as a new, prefixed file beside its source.
 */
public class Main {

    /** Accepts regular files whose name ends with ".pdf", ignoring case. */
    public static final FileFilter pdfFileFilter = new FileFilter() {
        public boolean accept(File file) {
            return file.isFile() && file.getName().toLowerCase().endsWith(".pdf");
        }
    };

    /**
     * Closes the document, ignoring any failure; a {@code null} argument is a no-op.
     *
     * @param doc the document to close, may be {@code null}
     */
    public static void closeQuietly(PDDocument doc) {
        if (doc != null) {
            try {
                doc.close();
            } catch (Exception ignored) {
                // Deliberately best effort; add logging here if desired.
            }
        }
    }

    /**
     * Writes {@code textToInsert}, horizontally centred and 30 units above the
     * bottom edge, onto the last page of each source file, then saves the
     * result in the same folder under {@code prefix + originalName}.
     *
     * @param sourcePdfFiles the PDF files to process; {@code null} is treated as empty
     * @param textToInsert   the text to draw on the last page
     * @param prefix         prefix for the name of each saved file
     */
    public static void CheckPages(File[] sourcePdfFiles, String textToInsert, String prefix) {
        if (sourcePdfFiles == null) {
            // File.listFiles returns null for a missing or unreadable folder.
            return;
        }
        PDFont font = PDType1Font.HELVETICA_BOLD;
        float fontSize = 12.0f;
        for (File sourcePdfFile : sourcePdfFiles) {
            PDDocument pdoc = null;
            try {
                pdoc = PDDocument.load(sourcePdfFile);
                List allPages = pdoc.getDocumentCatalog().getAllPages();
                if (allPages.isEmpty()) {
                    System.err.println("Skipping " + sourcePdfFile + ": the document has no pages.");
                    continue;
                }
                PDPage lastPage = (PDPage) allPages.get(allPages.size() - 1);
                PDRectangle pageSize = lastPage.findMediaBox();
                float stringWidth = font.getStringWidth(textToInsert);
                // getStringWidth is in 1/1000 text-space units, hence the scaling.
                float centeredPosition = (pageSize.getWidth() - (stringWidth * fontSize) / 1000f) / 2f;
                PDPageContentStream contentStream = new PDPageContentStream(pdoc, lastPage, true, true);
                try {
                    contentStream.beginText();
                    contentStream.setFont(font, fontSize);
                    contentStream.moveTextPositionByAmount(centeredPosition, 30);
                    contentStream.drawString(textToInsert);
                    contentStream.endText();
                } finally {
                    // Release the stream even when drawing fails part-way.
                    contentStream.close();
                }
                File resultFile = new File(sourcePdfFile.getParentFile(), prefix + sourcePdfFile.getName());
                pdoc.save(resultFile.getAbsolutePath());
            } catch (Exception e) {
                System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
            } finally {
                closeQuietly(pdoc);
            }
        }
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        File pdfFilesFolder = new File("C:\\1");
        File[] pdfFiles = pdfFilesFolder.listFiles(pdfFileFilter);
        if (pdfFiles == null) {
            System.err.println("Cannot list PDF files in " + pdfFilesFolder.getAbsolutePath());
            return;
        }
        //when a file is processed, the result will be saved in a new file having the location of the source file
        //and the same name of source file prefixed with this
        String modifiedFilePrefix = "modified-";
        CheckPages(pdfFiles, "AAA", modifiedFilePrefix);
    }
}