Flatten vector graphics inside pdf and extract using java - java

I am trying to get the sizes (width and height) of images embedded in a PDF file. The images in the PDF are all high-resolution vector images.
I tried using PDFBox. The PDFBox libraries extract normal (raster) graphics perfectly, but when they encounter a vector image, they extract its layers as separate images.
I have also read about iText, but iText can only convert the whole page into one rasterized image, whereas my PDF page actually consists of multiple images and I need to extract (or get the size of) each of them separately.
I am attaching my PDFBox image extraction code here. Please let me know how I can get one vector image as a single image and not as separate layers.
My code is as follows:
package com.abp.pdf.util;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
/**
 * Walks every page of a PDF, writes each embedded image XObject to disk and prints its
 * pixel width and height.
 *
 * <p>Only entries of a page's (or a nested form's) XObject resource map are visited, so
 * anything that is not stored as an image XObject is never seen by this class.
 * Written against the PDFBox 1.8 API ({@code PDXObjectImage}, {@code getAllPages()}).
 */
public class ExtractImages {
    /** Running number appended to output file names so that no existing file is overwritten. */
    private int imageCounter = 1;

    private ExtractImages() {
    }

    public static void main(String[] args) throws Exception {
        ExtractImages extractor = new ExtractImages();
        extractor.extractImages(args);
    }

    /**
     * Opens the (currently hard-coded) PDF and extracts the images of every page.
     *
     * @param args command-line arguments; currently ignored
     * @throws Exception if the document cannot be read, may not be extracted from, or an
     *         image cannot be written
     */
    private void extractImages(String[] args) throws Exception {
        String pdfFile = "/home/suvankar/Resources/myfile.pdf";
        String password = "";
        boolean addKey = false;
        boolean useNonSeqParser = true;

        String prefix = defaultPrefix(pdfFile);
        // The "extracted/images" directory is not guaranteed to exist; without it every
        // image write would fail.
        File outputDir = new File(prefix).getParentFile();
        if (outputDir != null && !outputDir.isDirectory() && !outputDir.mkdirs()) {
            throw new IOException("Error: Could not create output directory " + outputDir);
        }

        PDDocument document = null;
        try {
            if (useNonSeqParser) {
                document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
            } else {
                document = PDDocument.load(pdfFile);
                if (document.isEncrypted()) {
                    StandardDecryptionMaterial spm = new StandardDecryptionMaterial(password);
                    document.openProtection(spm);
                }
            }
            AccessPermission ap = document.getCurrentAccessPermission();
            if (!ap.canExtractContent()) {
                throw new IOException(
                        "Error: You do not have permission to extract images.");
            }
            List<?> pages = document.getDocumentCatalog().getAllPages();
            for (Object entry : pages) {
                PDPage page = (PDPage) entry;
                processResources(page.getResources(), prefix, addKey);
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * Builds the default output prefix {@code <pdf dir>/extracted/images/<pdf name without extension>}.
     * Works for paths without a directory part or without an extension.
     */
    private static String defaultPrefix(String pdfFile) {
        int slash = pdfFile.lastIndexOf('/');
        String directory = pdfFile.substring(0, slash + 1);
        String baseName = pdfFile.substring(slash + 1);
        int dot = baseName.lastIndexOf('.');
        if (dot > 0) {
            baseName = baseName.substring(0, dot);
        }
        return directory + "extracted/images/" + baseName;
    }

    /**
     * Writes every image XObject in {@code resources} to disk and recurses into form
     * XObjects, which may carry further images in their own resources.
     *
     * @param resources resource dictionary to scan; {@code null} is ignored
     * @param prefix path prefix of the files to write
     * @param addKey whether the internal XObject key becomes part of the file name
     * @throws IOException if an image cannot be written
     */
    private void processResources(PDResources resources, String prefix,
            boolean addKey) throws IOException {
        if (resources == null) {
            return;
        }
        Map<String, PDXObject> xobjects = resources.getXObjects();
        if (xobjects == null) {
            return;
        }
        for (Map.Entry<String, PDXObject> entry : xobjects.entrySet()) {
            PDXObject xobject = entry.getValue();
            if (xobject instanceof PDXObjectImage) {
                PDXObjectImage image = (PDXObjectImage) xobject;
                String base = addKey ? prefix + "_" + entry.getKey() : prefix;
                String name = getUniqueFileName(base, image.getSuffix());
                System.out.println("Writing image:" + name + "\nHeight - " + image.getHeight()
                        + "\nWidth - " + image.getWidth());
                image.write2file(name);
            } else if (xobject instanceof PDXObjectForm) {
                // maybe there are more images embedded in a form object
                PDXObjectForm xObjectForm = (PDXObjectForm) xobject;
                processResources(xObjectForm.getResources(), prefix, addKey);
            }
        }
    }

    /**
     * Returns {@code prefix-<n>} for the first counter value {@code n} for which
     * {@code prefix-<n>.<suffix>} does not exist yet. The returned name carries no suffix.
     */
    private String getUniqueFileName(String prefix, String suffix) {
        String uniqueName = null;
        File f = null;
        while (f == null || f.exists()) {
            uniqueName = prefix + "-" + imageCounter;
            f = new File(uniqueName + "." + suffix);
            imageCounter++;
        }
        return uniqueName;
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage() {
        System.err
                .println("Usage: java org.apache.pdfbox.ExtractImages [OPTIONS] <PDF file>\n"
                        + " -password <password> Password to decrypt document\n"
                        + " -prefix <image-prefix> Image prefix(default to pdf name)\n"
                        + " -addkey add the internal image key to the file name\n"
                        + " -nonSeq Enables the new non-sequential parser\n"
                        + " <PDF file> The PDF document to use\n");
        System.exit(1);
    }
}

Related

Converting html files with *ngFor to pdf in Java

I have to generate documents in my Java Web application (Maven, runs on a server) and have to insert data from a Java class into this document.
I would like to be able to write a HTML file with placeholders. The placeholder should be replaced from the application with data from a Java class.
I would also like to be able to use structural directives like Angular's *ngFor (e.g. to insert a list into an HTML element) or *ngIf (or attributes with a similar function).
Does anyone know a library for this?
I have a good knowledge of Java, HTML etc. so using such a library (if there is one) will not be a problem for me
In the meantime I've written a little script myself. In case someone needs something similar, I've included it in an answer
In the meantime I have searched further and unfortunately I have not found a suitable solution so far. Therefore, I have now set about programming a solution myself. Much less effort is required than expected. Here is my current code. It is currently a rough draft and certainly needs some improvement.
package com.XYZ.file.bo;
import java.beans.IntrospectionException;
import java.beans.PropertyDescriptor;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.InputStream;
import java.io.StringWriter;
import java.lang.reflect.InvocationTargetException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.XYZ.file.service.FileService;
import com.XYZ.servicelocator.ServiceLocator;
import com.XYZ.util.TechnicalException;
import com.itextpdf.html2pdf.ConverterProperties;
import com.itextpdf.html2pdf.HtmlConverter;
public class TemplateGeneratorBO {
private FileService fileService;
private static final String DOC_TEMPLATE_DIR = FileBO.BASE_DIR + "templates/";
public File generateDoc(String tempFileName, String saveFolder, String saveFileName, Object entity) {
String htmlDoc = parseHtmlDoc(tempFileName, entity);
htmlDoc = replaceSpecialChars(htmlDoc);
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
ConverterProperties converterProperties = new ConverterProperties();
HtmlConverter.convertToPdf(htmlDoc, outStream, converterProperties);
InputStream inStream = new ByteArrayInputStream(outStream.toByteArray());
saveFolder += "/" + callGetter(entity, "id") + "/templates";
if (!getFileService().createAndSaveFile(saveFolder, saveFileName + ".pdf", inStream)) {
int counter = 0;
boolean success = false;
do {
counter++;
success = getFileService().createAndSaveFile(saveFolder, saveFileName + "-" + counter + ".pdf",
inStream);
} while (!success);
return getFileService().getFile(saveFolder, saveFileName + "-" + counter + ".pdf");
}
return getFileService().getFile(saveFolder, saveFileName + ".pdf");
}
private String parseHtmlDoc(String fileName, Object entity) {
try {
File htmlFile = new File(DOC_TEMPLATE_DIR + fileName);
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
Document doc = dBuilder.parse(htmlFile);
doc.getDocumentElement().normalize();
Element elm = doc.getDocumentElement();
NodeList headList = elm.getElementsByTagName("head");
NodeList bodyList = elm.getElementsByTagName("body");
verifyTemplate(fileName, elm, headList, bodyList);
Node head = headList.item(0);
String html = "<html>" + xmlToString(head) + "<body>";
html += nodeToString(bodyList.item(0),
newTempEntList(new TemplateGenEntity(entity.getClass().getSimpleName(), entity)));
html += "</body></html>";
return html;
} catch (Exception exc) {
throw new TechnicalException("DocGenerator Exception with file " + fileName, exc);
}
}
private String nodeToString(Node parentNode, List<TemplateGenEntity> entities)
throws TransformerException, ScriptException {
NodeList nodes = parentNode.getChildNodes();
StringBuilder string = new StringBuilder("");
for (int i = 0; i < nodes.getLength(); i++) {
Node node = nodes.item(i);
if (node.getNodeType() == Node.ELEMENT_NODE) {
Element elm = (Element) node;
string.append(elementToString(elm, entities));
} else {
string.append(insertValues(xmlToString(node), entities));
}
}
return string.toString();
}
private String elementToString(Element elm, List<TemplateGenEntity> entities)
throws ScriptException, TransformerException {
if (!proofNgIf(elm, entities)) {
return "";
}
if (elm.hasAttribute("ngFor")) {
return ngForElementToString(elm, entities);
}
return "<" + elm.getNodeName() + getElementAttributes(elm) + ">" + nodeToString(elm, entities) + "</"
+ elm.getNodeName() + ">";
}
#SuppressWarnings("unchecked")
private String ngForElementToString(Element elm, List<TemplateGenEntity> entities)
throws ScriptException, TransformerException {
String attrs = getElementAttributes(elm);
String ngFor = elm.getAttribute("ngFor");
String[] ngForList = ngFor.split(" of ");
StringBuilder string = new StringBuilder();
ScriptEngineManager factory = new ScriptEngineManager();
ScriptEngine engine = factory.getEngineByName("JavaScript");
for (TemplateGenEntity ent : entities) {
engine.put(ent.getEntityName(), ent.getEntity());
}
List<Object> list = (List<Object>) engine.eval(ngForList[1]);
for (Object obj : list) {
string.append("<" + elm.getNodeName() + attrs + ">"
+ nodeToString(elm, newTempEntList(entities, new TemplateGenEntity(ngForList[0], obj))) + "</"
+ elm.getNodeName() + ">");
}
return string.toString();
}
/**
*
* #return true if no ngIf or ngIf condition is true
* #throws ScriptException
*/
private boolean proofNgIf(Element elm, List<TemplateGenEntity> entities) throws ScriptException {
if (!elm.hasAttribute("ngIf")) {
return true;
}
String ngIf = elm.getAttribute("ngIf");
if (ngIf.isBlank()) {
throw new TechnicalException("Document template contains empty ngIf!");
}
ScriptEngineManager factory = new ScriptEngineManager();
ScriptEngine engine = factory.getEngineByName("JavaScript");
for (TemplateGenEntity ent : entities) {
engine.put(ent.getEntityName(), ent.getEntity());
}
return (boolean) engine.eval(ngIf);
}
private String insertValues(String strIn, List<TemplateGenEntity> entities) throws ScriptException {
StringBuilder str = new StringBuilder(strIn);
int begin = str.indexOf("{{");
int end = str.indexOf("}}") + 2;
while (begin != -1 && end != 1) {
String var = str.substring(begin, end);
var = var.replace("{{", "");
var = var.replace("}}", "");
ScriptEngineManager factory = new ScriptEngineManager();
ScriptEngine engine = factory.getEngineByName("JavaScript");
for (TemplateGenEntity ent : entities) {
engine.put(ent.getEntityName(), ent.getEntity());
}
Object val = engine.eval(var);
String valStr = objectToStr(val);
str = str.replace(begin, end, valStr);
begin = str.indexOf("{{");
end = str.indexOf("}}") + 2;
}
return str.toString();
}
private String xmlToString(Node node) throws TransformerException {
StringWriter writer = new StringWriter();
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
transformer.setOutputProperty(OutputKeys.INDENT, "no");
transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, "yes");
transformer.transform(new DOMSource(node), new StreamResult(writer));
return writer.toString();
}
private String replaceSpecialChars(String str) {
str = str.replace(">", ">");
return str;
}
private String getElementAttributes(Element elm) {
StringBuilder attrStr = new StringBuilder();
NamedNodeMap attrs = elm.getAttributes();
for (int i = 0; i < attrs.getLength(); i++) {
Attr attr = (Attr) attrs.item(i);
String attrName = attr.getName();
String attrVal = attr.getValue();
if (attrName.equals("ngIf") || attrName.equals("ngFor")) {
continue;
}
attrStr.append(" " + attrName + "=\"" + attrVal + "\"");
}
return attrStr.toString();
}
private void verifyTemplate(String fileName, Element elm, NodeList head, NodeList body) {
if (!elm.getNodeName().equalsIgnoreCase("html")) {
throw new TechnicalException("Document template " + fileName + " doesn't starts with html node!");
}
if (head.getLength() != 1 || head.item(0) == null) {
throw new TechnicalException("Document template " + fileName + " doesn't contains head!");
}
if (body.getLength() != 1 || head.item(0) == null) {
throw new TechnicalException("Document template " + fileName + " doesn't contains body!");
}
}
private FileService getFileService() {
if (fileService == null) {
fileService = ServiceLocator.locateService(FileService.class);
}
return fileService;
}
private Object callGetter(Object obj, String fieldName) {
PropertyDescriptor pd;
try {
pd = new PropertyDescriptor(fieldName, obj.getClass());
return pd.getReadMethod().invoke(obj);
} catch (IntrospectionException | IllegalAccessException | IllegalArgumentException
| InvocationTargetException e) {
throw new TechnicalException(e.getMessage(), e);
}
}
private String objectToStr(Object obj) {
if (obj instanceof Date) {
return new SimpleDateFormat("dd.MM.yyyy").format(obj);
}
return obj.toString();
}
public static List<TemplateGenEntity> newTempEntList(TemplateGenEntity entity) {
List<TemplateGenEntity> list = new ArrayList<>();
list.add(entity);
return list;
}
public static List<TemplateGenEntity> newTempEntList(List<TemplateGenEntity> entities, TemplateGenEntity entity) {
List<TemplateGenEntity> list = new ArrayList<>();
for (TemplateGenEntity ent : entities) {
list.add(ent);
}
list.add(entity);
return list;
}
public class TemplateGenEntity {
private String entityName;
private Object entity;
public TemplateGenEntity(String entityName, Object entity) {
this.entityName = entityName;
this.entity = entity;
}
public String getEntityName() {
return entityName;
}
public void setEntityName(String entityName) {
this.entityName = entityName;
}
public Object getEntity() {
return entity;
}
public void setEntity(Object entity) {
this.entity = entity;
}
}
}
Use FreeMarker for the placeholder replacement and PD4ML for the HTML conversion; this combination works well for me.

Image compression or conversion

Simply put, I want to apply image compression to a PNG/JPEG/Bitmap file.
On Android we have Bitmap.CompressFormat to compress a bitmap file and use it for further operations.
Bitmap.CompressFormat class allow to compress in 3 format as below :
JPEG
PNG
WEBP
My question is: I want to compress the file in any one of the formats below:
JBIG2
TIFF G4
TIFF LZW
I have found some image compression libraries like ImageIO and ImageMagick, but I didn't have any success with them. I want to upload the resulting file to an Amazon server. Please guide me on how to achieve this, or tell me whether there is any other option for uploading an image to an Amazon server.
Thanks for your time.
I don't know about compressing to those file formats, but I created this class to upload files programmatically into an Amazon S3 bucket using the Amazon SDK API:
package com.amazon.util;
import com.amazonaws.AmazonClientException;
import com.amazonaws.auth.PropertiesCredentials;
import com.amazonaws.regions.Region;
import com.amazonaws.regions.Regions;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.CannedAccessControlList;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.amazonaws.services.s3.model.PutObjectResult;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Uploads files to and deletes objects from one fixed Amazon S3 bucket.
 *
 * <p>NOTE(review): the access key and secret key are embedded in this class; they should be
 * loaded from outside the source code instead.
 */
public class AmazonS3Files {
    private static final String existingBucketName = "bucketName";
    private final String strProperties = "accessKey = MYACESSKEY \n"
            + "secretKey = my+secret+key";
    private static final String urlRegion = "https://s3.amazonaws.com/";
    public static final String urlPathS3 = urlRegion + existingBucketName;

    /**
     * Uploads without overwriting: an already existing name gets a "(n)" suffix.
     *
     * @see #UploadFile(InputStream, String, String, boolean)
     */
    public String UploadFile(InputStream inFile, String pathDocument, String fileName) {
        return UploadFile(inFile, pathDocument, fileName, false);
    }

    /**
     * Deletes the objects listed for the key prefix {@code folderPath}.
     *
     * <p>Backslashes are turned into slashes and one leading and one trailing slash are
     * dropped. An empty prefix is refused because it would match every object in the bucket.
     * NOTE(review): only the first result page of {@code listObjects} is processed - confirm
     * whether folders can hold more objects than one page returns.
     *
     * @param folderPath key prefix of the objects to delete
     */
    public void deleteObjectsInFolder(String folderPath) {
        String prefix = trimSlashes(folderPath.replace('\\', '/'), true);
        if (prefix.isEmpty()) {
            System.out.println("Refusing to delete with an empty folder path");
            return;
        }
        try {
            AmazonS3 s3Client = newClient();
            for (S3ObjectSummary file : s3Client.listObjects(existingBucketName, prefix).getObjectSummaries()) {
                s3Client.deleteObject(existingBucketName, file.getKey());
            }
        } catch (IOException | AmazonClientException e) {
            System.out.println(e);
        }
    }

    /**
     * Deletes the object stored under {@code filePath}; backslashes are turned into slashes
     * and one leading slash is dropped. An empty path is ignored.
     *
     * @param filePath key of the object to delete
     */
    public void deleteFile(String filePath) {
        String key = trimSlashes(filePath.replace('\\', '/'), false);
        if (key.isEmpty()) {
            System.out.println("Refusing to delete with an empty file path");
            return;
        }
        try {
            newClient().deleteObject(existingBucketName, key);
        } catch (IOException | AmazonClientException e) {
            System.out.println(e);
        }
    }

    /**
     * Uploads {@code inFile} with a public-read ACL.
     *
     * <p>Everything except ASCII letters and digits is removed from the name part of
     * {@code fileName}; the extension (if any) is kept. Unless {@code bOverwiteFile} is set,
     * "(1)", "(2)", ... is appended to the name until {@link #serverFileExists(String)}
     * reports that the name is free.
     *
     * @param inFile content to upload
     * @param pathDocument folder inside the bucket; may be empty
     * @param fileName target file name, with or without extension
     * @param bOverwiteFile whether an existing object with the same name may be replaced
     * @return the path of the uploaded file inside the bucket, or {@code null} if the upload failed
     */
    public String UploadFile(InputStream inFile, String pathDocument, String fileName, boolean bOverwiteFile) {
        // A name without '.' used to throw StringIndexOutOfBoundsException before the try block.
        int dot = fileName.lastIndexOf('.');
        String extension = dot >= 0 ? fileName.substring(dot) : "";
        String baseName = (dot >= 0 ? fileName.substring(0, dot) : fileName).replaceAll("[^A-Za-z0-9]", "");
        fileName = baseName + extension;
        pathDocument = trimSlashes(pathDocument.replace('\\', '/'), true);
        try {
            String bucketLocation = pathDocument.isEmpty()
                    ? existingBucketName
                    : existingBucketName + "/" + pathDocument;
            AmazonS3 s3Client = newClient();
            s3Client.setRegion(Region.getRegion(Regions.SA_EAST_1));
            ObjectMetadata objectMetadata = new ObjectMetadata();
            // NOTE(review): available() is only the full length for in-memory streams - confirm callers.
            objectMetadata.setContentLength(inFile.available());
            if (!bOverwiteFile) {
                int suffix = 0;
                String candidate = fileName;
                while (serverFileExists(urlRegion + bucketLocation + "/" + candidate)) {
                    suffix++;
                    candidate = baseName + "(" + suffix + ")" + extension;
                }
                fileName = candidate;
            }
            String contentType = contentTypeFor(extension);
            if (contentType != null) {
                objectMetadata.setContentType(contentType);
            }
            PutObjectRequest putObjectRequest = new PutObjectRequest(bucketLocation, fileName, inFile, objectMetadata)
                    .withCannedAcl(CannedAccessControlList.PublicRead);
            s3Client.putObject(putObjectRequest);
            return (pathDocument.isEmpty() ? "" : "/" + pathDocument) + "/" + fileName;
        } catch (Exception e) {
            // Report the failure the same way the delete methods do instead of hiding it.
            System.out.println(e);
            return null;
        }
    }

    /**
     * Sends a HEAD request and reports whether the server answers 200 OK.
     * Redirects are not followed; any failure counts as "does not exist".
     *
     * @param URLName URL to probe
     * @return {@code true} only for an HTTP 200 response
     */
    public boolean serverFileExists(String URLName) {
        HttpURLConnection con = null;
        try {
            con = (HttpURLConnection) new URL(URLName).openConnection();
            // Per-connection setting; the static setFollowRedirects would change every connection in the JVM.
            con.setInstanceFollowRedirects(false);
            con.setRequestMethod("HEAD");
            return (con.getResponseCode() == HttpURLConnection.HTTP_OK);
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /** Creates an S3 client from the embedded credential properties. */
    private AmazonS3 newClient() throws IOException {
        InputStream credentials = new ByteArrayInputStream(strProperties.getBytes());
        return new AmazonS3Client(new PropertiesCredentials(credentials));
    }

    /** Drops one leading slash and, if requested, one trailing slash; safe for empty strings. */
    private static String trimSlashes(String path, boolean trailingToo) {
        String result = path;
        if (trailingToo && result.endsWith("/")) {
            result = result.substring(0, result.length() - 1);
        }
        if (result.startsWith("/")) {
            result = result.substring(1);
        }
        return result;
    }

    /** Maps a file extension (with leading dot, any case) to a content type, or {@code null} if unknown. */
    private static String contentTypeFor(String extension) {
        if (extension.equalsIgnoreCase(".jpg")) {
            return "image/jpeg";
        }
        if (extension.equalsIgnoreCase(".png")) {
            return "image/png";
        }
        if (extension.equalsIgnoreCase(".gif")) {
            return "image/gif";
        }
        if (extension.equalsIgnoreCase(".gmap")) {
            return "text/plain";
        }
        return null;
    }
}
And for usage with your file:
// UploadFile is an instance method, so it has to be called on an AmazonS3Files object.
AmazonS3Files uploader = new AmazonS3Files();
BufferedImage img = null;
try {
    img = ImageIO.read(new File("file.jpg"));
    if (img == null) {
        // ImageIO.read returns null when no registered reader understands the file.
        throw new IOException("file.jpg could not be decoded as an image");
    }
    // Encode the image as JPEG first: the raster's raw pixel buffer is not a valid .jpg file,
    // and it is not a DataBufferByte for every image type.
    java.io.ByteArrayOutputStream encoded = new java.io.ByteArrayOutputStream();
    if (!ImageIO.write(img, "jpg", encoded)) {
        throw new IOException("No JPEG writer available for this image");
    }
    String strReturn = uploader.UploadFile(new ByteArrayInputStream(encoded.toByteArray()), "path/to/file", "newfilename.jpg"); //Returns null if the upload doesn't work or the s3 file path of the uploaded file
} catch (IOException e) {
    //Handle Exception
    System.out.println(e);
}

How to parse HTML with java properly?

Scenario/Requirement:
Download html page from some URL
Download images that were mentioned in html tags.
Change tags for images in my file, so I can open it with my browser offline and see them.
I have completed the first two points, but I am having difficulties with the third one. The tags do not change. What am I doing wrong?
The job is to open a file, find each img src attribute and replace it with another value. Can you give me an example?
Code:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import java.awt.image.BufferedImage;
import java.net.URL;
import java.net.URLConnection;
import javax.imageio.ImageIO;
import javax.swing.text.AttributeSet;
import javax.swing.text.html.HTMLDocument;
/**
 * Saves a web page locally, downloads the images it references and rewrites the
 * {@code img src} attributes of the saved copy so that they point at the downloaded files.
 */
public class ExtractAllImages {
    static String result_doc = "/home/foo/index.html";
    static String home_folder = "/home/foo/";
    static String start_webURL = "http://www.oracle.com/";

    public static void main(String args[]) throws Exception {
        String webUrl = start_webURL;

        // 1. Fetch and parse the page with the Swing HTML parser.
        HTMLEditorKit htmlKit = new HTMLEditorKit();
        HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
        URLConnection connection = new URL(webUrl).openConnection();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
            HTMLEditorKit.Parser parser = new ParserDelegator();
            HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
            parser.parse(br, callback, true);
        }

        // 2. Save the page. It is written as UTF-8 because it is read back as UTF-8 below.
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(result_doc), "UTF-8")) {
            htmlKit.write(writer, htmlDoc, 0, htmlDoc.getLength());
        }

        // 3. Download every referenced image that has a known image extension.
        for (HTMLDocument.Iterator iterator = htmlDoc.getIterator(HTML.Tag.IMG); iterator.isValid(); iterator.next()) {
            AttributeSet attributes = iterator.getAttributes();
            String imgSrc = (String) attributes.getAttribute(HTML.Attribute.SRC);
            System.out.println("img_src = " + imgSrc);
            if (imgSrc != null && hasImageExtension(imgSrc)) {
                try {
                    downloadImage(webUrl, imgSrc);
                } catch (IOException ex) {
                    System.out.println(ex.getMessage());
                }
            }
        }

        // 4. Point every img at its local copy. The local name is derived from the tag's own
        //    src, so there is no second list that could get out of step with the tags.
        File input = new File(result_doc);
        Document doc = Jsoup.parse(input, "UTF-8");
        for (Element img : doc.select("img[src]")) {
            String oldSrc = img.attr("src");
            String fileName = localFileName(oldSrc);
            if (fileName.isEmpty()) {
                continue;
            }
            img.attr("src", home_folder + fileName);
            System.out.println(oldSrc + " -> " + img.attr("src"));
        }

        // 5. Write the modified document back; without this step the changes exist only in memory.
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(input), "UTF-8")) {
            writer.write(doc.outerHtml());
        }
    }

    /** Returns whether {@code src} ends with one of the handled image extensions (any case). */
    private static boolean hasImageExtension(String src) {
        String lower = src.toLowerCase();
        return lower.endsWith(".jpg") || lower.endsWith(".png") || lower.endsWith(".jpeg")
                || lower.endsWith(".bmp") || lower.endsWith(".ico");
    }

    /** Returns the part of {@code src} after its last slash. */
    private static String localFileName(String src) {
        return src.substring(src.lastIndexOf('/') + 1);
    }

    /**
     * Downloads one image into {@code home_folder}, keeping its bytes unchanged.
     *
     * <p>The image address is resolved against the page URL, which covers absolute,
     * root-relative and relative {@code src} values. The bytes are copied as they are,
     * so formats without an ImageIO writer are stored as well.
     *
     * @param url URL of the page that references the image
     * @param imgSrc value of the image's {@code src} attribute
     * @throws IOException if the image cannot be fetched or stored
     */
    private static void downloadImage(String url, String imgSrc) throws IOException {
        URL imageUrl = new URL(new URL(url), imgSrc);
        File target = new File(home_folder + localFileName(imgSrc));
        try (InputStream in = imageUrl.openStream();
                OutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
    }
}
Solved.
I didn't save the changes. The following code needs to be added before "downloadImage()":
// Rewrite each img src from the collected names, then save the document back to disk.
int i = 0;
File input = new File(result_doc);
Document doc = Jsoup.parse(input, "UTF-8");
for( Element img : doc.select("img[src]") ) {
    if (i >= array.length) {
        // More tags than collected entries: leave the remaining tags untouched.
        break;
    }
    if (array[i] != null) {
        img.attr("src",array[i]); // set attribute 'src' to 'your-source-here'
    }
    ++i;
}
// The writer is declared here and closed by try-with-resources even if the write fails.
try (BufferedWriter bw = new BufferedWriter(new FileWriter(result_doc))) {
    String strmb = doc.outerHtml();
    bw.write(strmb);
}
catch (Exception ex) {
    System.out.println("Program stopped. The problem is " + "\"" +
    ex.getMessage()+"\"");
}
You can go with JSOUP
Try something like below
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public static void getAllTags(){
try {
File input=new File("H:\\html pages\\index1.html");
Document document=Jsoup.parse(input, "UTF-8");
Document parse=Jsoup.parse(document.html());
Elements body=parse.select("body");
Elements bodyTags=body.select("*");
for (Element element : bodyTags) {
//Do what you want with tag
System.out.println(element.tagName());
}
} catch (Exception e) {
e.printStackTrace();
}
If you want to parse html then try this
/**
 * Parses a local HTML file with jsoup and, for every element found under the div
 * elements, prints the text of the elements selected by the tag that FilterHtml reports.
 *
 * NOTE(review): FilterHtml is not defined in this snippet; setHtmlTAG, ParseXml and
 * getXmlTAG presumably share static state - confirm before reordering any of these calls.
 */
public static void parseHTML(){
try {
File input = new File("H:\\html\\index1.html");
Document document = Jsoup.parse(input, "UTF-8");
// The serialized document is parsed a second time; all selections below use this copy.
Document parse = Jsoup.parse(document.html());
Elements bodyElements = parse.select("div");
// "*" selects every element in the div subtrees, the divs themselves included.
Elements elements = bodyElements.select("*");
for (Element element : elements) {
// Hand the current tag name to FilterHtml and let it process it.
FilterHtml.setHtmlTAG(element.tagName());
FilterHtml.ParseXml();
Elements body = bodyElements.select(FilterHtml.getXmlTAG());
if (body.is(FilterHtml.getXmlTAG())) {
// The text is collected from the whole document, not only from the div subtrees.
Elements tag = parse.select(FilterHtml.getXmlTAG());
//Do something meaning full with tag
System.out.println(tag.text());
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
I hope this helps. If it does, please mark the answer as accepted.

Using PDFbox to determine the coordinates of words in a document

I'm using PDFbox to extract the coordinates of words/strings in a PDF document, and have so far had success determining the position of individual characters. this is the code thus far, from the PDFbox doc:
package printtextlocations;
import java.io.*;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import java.io.IOException;
import java.util.List;
public class PrintTextLocations extends PDFTextStripper {
public PrintTextLocations() throws IOException {
super.setSortByPosition(true);
}
public static void main(String[] args) throws Exception {
PDDocument document = null;
try {
File input = new File("C:\\path\\to\\PDF.pdf");
document = PDDocument.load(input);
if (document.isEncrypted()) {
try {
document.decrypt("");
} catch (InvalidPasswordException e) {
System.err.println("Error: Document is encrypted with a password.");
System.exit(1);
}
}
PrintTextLocations printer = new PrintTextLocations();
List allPages = document.getDocumentCatalog().getAllPages();
for (int i = 0; i < allPages.size(); i++) {
PDPage page = (PDPage) allPages.get(i);
System.out.println("Processing page: " + i);
PDStream contents = page.getContents();
if (contents != null) {
printer.processStream(page, page.findResources(), page.getContents().getStream());
}
}
} finally {
if (document != null) {
document.close();
}
}
}
/**
* #param text The text to be processed
*/
#Override /* this is questionable, not sure if needed... */
protected void processTextPosition(TextPosition text) {
System.out.println("String[" + text.getXDirAdj() + ","
+ text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale="
+ text.getXScale() + " height=" + text.getHeightDir() + " space="
+ text.getWidthOfSpace() + " width="
+ text.getWidthDirAdj() + "]" + text.getCharacter());
}
}
This produces a series of lines containing the position of each character, including spaces, that looks like this:
String[202.5604,41.880127 fs=1.0 xscale=13.98 height=9.68814 space=3.8864403 width=9.324661]P
Where 'P' is the character. I have not been able to find a function in PDFbox to find words, and I am not familiar enough with Java to be able to accurately concatenate these characters back into words to search through even though the spaces are also included. Has anyone else been in a similar situation, and if so how did you approach it? I really only need the coordinate of the first character in the word so that parts simplified, but as to how I'm going to match a string against that kind of output is beyond me.
There is no function in PDFBox that allows you to extract words automatically. I'm currently working on extracting data to gather it into blocks and here is my process:
I extract all the characters of the document (called glyphs) and store them in a list.
I do an analysis of the coordinates of each glyph, looping over the list. If they overlap (if the top of the current glyph is contained between the top and bottom of the preceding/or the bottom of the current glyph is contained between the top and bottom of the preceding one), I add it to the same line.
At this point, I have extracted the different lines of the document (be careful, if your document is multi-column, the expression "lines" means all the glyphs that overlap vertically, ie the text of all the columns that have the same vertical coordinates).
Then, you can compare the left coordinate of the current glyph to the right coordinate of the preceding one to determine whether they belong to the same word or not. The PDFTextStripper class provides a getSpacingTolerance() method that gives you, based on trial and error, the value of a "normal" space. If the difference between the right and the left coordinates is lower than this value, both glyphs belong to the same word.
I applied this method to my work and it works well.
Based on the original idea here is a version of the text search for PDFBox 2. The code itself is rough, but simple. It should get you started fairly quickly.
import java.io.IOException;
import java.io.Writer;
import java.util.List;
import java.util.Set;
import lu.abac.pdfclient.data.PDFTextLocation;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
public class PrintTextLocator extends PDFTextStripper {
private final Set<PDFTextLocation> locations;
public PrintTextLocator(PDDocument document, Set<PDFTextLocation> locations) throws IOException {
super.setSortByPosition(true);
this.document = document;
this.locations = locations;
this.output = new Writer() {
#Override
public void write(char[] cbuf, int off, int len) throws IOException {
}
#Override
public void flush() throws IOException {
}
#Override
public void close() throws IOException {
}
};
}
public Set<PDFTextLocation> doSearch() throws IOException {
processPages(document.getDocumentCatalog().getPages());
return locations;
}
#Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
super.writeString(text);
String searchText = text.toLowerCase();
for (PDFTextLocation textLoc:locations) {
int start = searchText.indexOf(textLoc.getText().toLowerCase());
if (start!=-1) {
// found
TextPosition pos = textPositions.get(start);
textLoc.setFound(true);
textLoc.setPage(getCurrentPageNo());
textLoc.setX(pos.getXDirAdj());
textLoc.setY(pos.getYDirAdj());
}
}
}
}
Take a look at this; I think it's what you need.
https://jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/
Here is the code:
import java.io.File;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
public class PrintTextLocations extends PDFTextStripper {
public static StringBuilder tWord = new StringBuilder();
public static String seek;
public static String[] seekA;
public static List wordList = new ArrayList();
public static boolean is1stChar = true;
public static boolean lineMatch;
public static int pageNo = 1;
public static double lastYVal;
public PrintTextLocations()
throws IOException {
super.setSortByPosition(true);
}
public static void main(String[] args)
throws Exception {
PDDocument document = null;
seekA = args[1].split(",");
seek = args[1];
try {
File input = new File(args[0]);
document = PDDocument.load(input);
if (document.isEncrypted()) {
try {
document.decrypt("");
} catch (InvalidPasswordException e) {
System.err.println("Error: Document is encrypted with a password.");
System.exit(1);
}
}
PrintTextLocations printer = new PrintTextLocations();
List allPages = document.getDocumentCatalog().getAllPages();
for (int i = 0; i < allPages.size(); i++) {
PDPage page = (PDPage) allPages.get(i);
PDStream contents = page.getContents();
if (contents != null) {
printer.processStream(page, page.findResources(), page.getContents().getStream());
}
pageNo += 1;
}
} finally {
if (document != null) {
System.out.println(wordList);
document.close();
}
}
}
#Override
protected void processTextPosition(TextPosition text) {
String tChar = text.getCharacter();
System.out.println("String[" + text.getXDirAdj() + ","
+ text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale="
+ text.getXScale() + " height=" + text.getHeightDir() + " space="
+ text.getWidthOfSpace() + " width="
+ text.getWidthDirAdj() + "]" + text.getCharacter());
String REGEX = "[,.\\[\\](:;!?)/]";
char c = tChar.charAt(0);
lineMatch = matchCharLine(text);
if ((!tChar.matches(REGEX)) && (!Character.isWhitespace(c))) {
if ((!is1stChar) && (lineMatch == true)) {
appendChar(tChar);
} else if (is1stChar == true) {
setWordCoord(text, tChar);
}
} else {
endWord();
}
}
protected void appendChar(String tChar) {
tWord.append(tChar);
is1stChar = false;
}
protected void setWordCoord(TextPosition text, String tChar) {
tWord.append("(").append(pageNo).append(")[").append(roundVal(Float.valueOf(text.getXDirAdj()))).append(" : ").append(roundVal(Float.valueOf(text.getYDirAdj()))).append("] ").append(tChar);
is1stChar = false;
}
protected void endWord() {
String newWord = tWord.toString().replaceAll("[^\\x00-\\x7F]", "");
String sWord = newWord.substring(newWord.lastIndexOf(' ') + 1);
if (!"".equals(sWord)) {
if (Arrays.asList(seekA).contains(sWord)) {
wordList.add(newWord);
} else if ("SHOWMETHEMONEY".equals(seek)) {
wordList.add(newWord);
}
}
tWord.delete(0, tWord.length());
is1stChar = true;
}
protected boolean matchCharLine(TextPosition text) {
Double yVal = roundVal(Float.valueOf(text.getYDirAdj()));
if (yVal.doubleValue() == lastYVal) {
return true;
}
lastYVal = yVal.doubleValue();
endWord();
return false;
}
protected Double roundVal(Float yVal) {
DecimalFormat rounded = new DecimalFormat("0.0'0'");
Double yValDub = new Double(rounded.format(yVal));
return yValDub;
}
}
Dependencies:
PDFBox,
FontBox,
Apache Common Logging Interface.
You can run it by typing on command line:
javac PrintTextLocations.java
sudo java PrintTextLocations file.pdf WORD1,WORD2,....
the output is similar to:
[(1)[190.3 : 286.8] WORD1, (1)[283.3 : 286.8] WORD2, ...]
For those who still need assistance, this is what I used in my code and should be useful to start with. It uses PDFBox 2.0.16
public class PDFTextLocator extends PDFTextStripper {
private static String key_string;
private static float x;
private static float y;
public PDFTextLocator() throws IOException {
x = -1;
y = -1;
}
/**
* Takes in a PDF Document, phrase to find, and page to search and returns the x,y in float array
* #param document
* #param phrase
* #param page
* #return
* #throws IOException
*/
public static float[] getCoordiantes(PDDocument document, String phrase, int page) throws IOException {
key_string = phrase;
PDFTextStripper stripper = new PDFTextLocator();
stripper.setSortByPosition(true);
stripper.setStartPage(page);
stripper.setEndPage(page);
stripper.writeText(document, new OutputStreamWriter(new ByteArrayOutputStream()));
y = document.getPage(page).getMediaBox().getHeight()-y;
return new float[]{x,y};
}
/**
* Override the default functionality of PDFTextStripper.writeString()
*/
#Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
if(string.contains(key_string)) {
TextPosition text = textPositions.get(0);
if(x == -1) {
x = text.getXDirAdj();
y = text.getYDirAdj();
}
}
}
}
Below is the Maven dependency details,
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.16</version>
</dependency>
I got this working using the IKVM conversion PDFBox.NET 1.8.9. in C# and .NET.
I finally figured out the character (glyph) coordinates are private to the .NET assembly, but can be accessed using System.Reflection.
I posted a full example of getting the coordinates of WORDS and drawing them back on images of PDF's using SVG and HTML here: https://github.com/tsamop/PDF_Interpreter
For the example below you need PDFbox.NET: http://www.squarepdf.net/pdfbox-in-net, and include references to it in your project.
It took me quite a while to figure it out, so I really hope it saves someone else time!!
If you just need to know where to look for the characters & coordinates, a very abridged version would be:
using System;
using System.Reflection;
using org.apache.pdfbox.pdmodel;
using org.apache.pdfbox.util;
// to test run pdfTest.RunTest(@"C:\temp\test_2.pdf");
class pdfTest
{
    // Simple example for getting character (glyph) coordinates out of a PDF doc.
    // A more complete example is here: https://github.com/tsamop/PDF_Interpreter
    //
    // The stripper keeps its glyphs in the non-public "charactersByArticle" field,
    // so everything below is read through reflection (see the GetField extension).
    public static void RunTest(string sFilename)
    {
        //probably a better way to get page count, but I cut this out of a bigger project.
        PDDocument oDoc = PDDocument.load(sFilename);
        object[] oPages = oDoc.getDocumentCatalog().getAllPages().toArray();
        int iPageNo = 0; //1's based!!
        foreach (object oPage in oPages)
        {
            iPageNo++;
            //feed the stripper a page.
            PDFTextStripper tStripper = new PDFTextStripper();
            tStripper.setStartPage(iPageNo);
            tStripper.setEndPage(iPageNo);
            tStripper.getText(oDoc);

            //This gets the "charactersByArticle" private object in PDF Box.
            FieldInfo charactersByArticleInfo = typeof(PDFTextStripper).GetField("charactersByArticle", BindingFlags.NonPublic | BindingFlags.Instance);
            object charactersByArticle = charactersByArticleInfo.GetValue(tStripper);
            // "elementData" is the backing array of the Java list; unused slots are null.
            object[] aoArticles = (object[])charactersByArticle.GetField("elementData");
            foreach (object oArticle in aoArticles)
            {
                if (oArticle != null)
                {
                    //THE CHARACTERS within the article
                    object[] aoCharacters = (object[])oArticle.GetField("elementData");
                    foreach (object oChar in aoCharacters)
                    {
                        /*properties I caught using reflection:
                         * endX, endY, font, fontSize, fontSizePt, maxTextHeight, pageHeight, pageWidth, rot, str textPos, unicodCP, widthOfSpace, widths, wordSpacing, x, y
                         *
                         */
                        if (oChar != null)
                        {
                            //this is a really quick test.
                            // for a more complete solution that pulls the characters into words and displays the word positions on the page, try this: https://github.com/tsamop/PDF_Interpreter
                            //the Y's appear to be the bottom of the char?
                            double mfMaxTextHeight = Convert.ToDouble(oChar.GetField("maxTextHeight")); //I think this is the height of the character/word
                            char mcThisChar = oChar.GetField("str").ToString().ToCharArray()[0];
                            double mfX = Convert.ToDouble(oChar.GetField("x"));
                            double mfY = Convert.ToDouble(oChar.GetField("y")) - mfMaxTextHeight;
                            //CALCULATE THE OTHER SIDE OF THE GLYPH
                            double mfWidth0 = ((Single[])oChar.GetField("widths"))[0];
                            double mfXend = mfX + mfWidth0; // Convert.ToDouble(oChar.GetField("endX"));
                            //CALCULATE THE BOTTOM OF THE GLYPH.
                            double mfYend = mfY + mfMaxTextHeight; // Convert.ToDouble(oChar.GetField("endY"));
                            double mfPageHeight = Convert.ToDouble(oChar.GetField("pageHeight"));
                            double mfPageWidth = Convert.ToDouble(oChar.GetField("pageWidth"));
                            System.Diagnostics.Debug.Print(@"add some stuff to test {0}, {1}, {2}", mcThisChar, mfX, mfY);
                        }
                    }
                }
            }
        }
    }
}
using System.Reflection;
/// <summary>
/// Reflection shortcut for reading non-public instance fields, needed because the
/// converted PDFBox types do not expose the glyph data publicly. ~mwr
/// </summary>
public static class GetField_Extension
{
    /// <summary>
    /// Returns the value of the non-public instance field <paramref name="sFieldName"/>
    /// declared on the runtime type of <paramref name="randomPDFboxObject"/>.
    /// </summary>
    public static object GetField(this object randomPDFboxObject, string sFieldName)
    {
        const BindingFlags hiddenInstance = BindingFlags.NonPublic | BindingFlags.Instance;
        var runtimeType = randomPDFboxObject.GetType();
        var hiddenField = runtimeType.GetField(sFieldName, hiddenInstance);
        return hiddenField.GetValue(randomPDFboxObject);
    }
}

extract images from pdf using pdfbox

I'm trying to extract images from a PDF using PDFBox. The example PDF is here.
But I'm getting only blank images.
The code i m trying:-
/**
 * Entry point: runs the image extraction and prints any I/O failure to
 * standard output.
 */
public static void main(String[] args) {
    PDFImageExtract extractor = new PDFImageExtract();
    try {
        extractor.read_pdf();
    } catch (IOException failure) {
        System.out.println("" + failure);
    }
}
/**
 * Loads the hard-coded PDF and writes every image found in the page-level
 * resources to numbered files (PDFBox 1.8 API).
 *
 * <p>A load failure is propagated to the caller instead of being printed and
 * followed by a NullPointerException, and the document is always closed.
 *
 * @throws IOException if the document cannot be loaded or an image cannot be written
 */
void read_pdf() throws IOException {
    PDDocument document = PDDocument.load("C:\\Users\\Pradyut\\Documents\\MCS-034.pdf");
    try {
        List pages = document.getDocumentCatalog().getAllPages();
        int i = 1;
        for (Iterator iter = pages.iterator(); iter.hasNext();) {
            PDPage page = (PDPage) iter.next();
            PDResources resources = page.getResources();
            if (resources == null) {
                // Page without a resource dictionary: nothing to extract.
                continue;
            }
            Map pageImages = resources.getImages();
            if (pageImages == null) {
                continue;
            }
            for (Iterator imageIter = pageImages.keySet().iterator(); imageIter.hasNext();) {
                String key = (String) imageIter.next();
                PDXObjectImage image = (PDXObjectImage) pageImages.get(key);
                image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
                i++;
            }
        }
    } finally {
        // Release the file handle and scratch data held by the document.
        document.close();
    }
}
Thanks
Here is code using PDFBox 2.0.1 that will get a list of all images from the PDF. This is different than the other code in that it will recurse through the document instead of trying to get the images from the top level.
/**
 * Collects the images reachable from the resources of every page of the document.
 *
 * @param document the document to scan
 * @return all images found, in page order
 * @throws IOException if an image cannot be read
 */
public List<RenderedImage> getImagesFromPDF(PDDocument document) throws IOException {
    final List<RenderedImage> collected = new ArrayList<>();
    for (PDPage currentPage : document.getPages()) {
        PDResources pageResources = currentPage.getResources();
        collected.addAll(getImagesFromResources(pageResources));
    }
    return collected;
}
/**
 * Returns the images referenced by a resource dictionary, descending into
 * form XObjects so that images nested inside forms are found as well.
 *
 * @param resources the resources to scan; {@code null} (a page or form without
 *                  a resource dictionary) yields an empty list
 * @return the images found, possibly empty
 * @throws IOException if an XObject or image cannot be read
 */
private List<RenderedImage> getImagesFromResources(PDResources resources) throws IOException {
    List<RenderedImage> images = new ArrayList<>();
    if (resources == null) {
        // Pages and form XObjects are not required to carry resources.
        return images;
    }
    for (COSName xObjectName : resources.getXObjectNames()) {
        PDXObject xObject = resources.getXObject(xObjectName);
        if (xObject instanceof PDFormXObject) {
            images.addAll(getImagesFromResources(((PDFormXObject) xObject).getResources()));
        } else if (xObject instanceof PDImageXObject) {
            images.add(((PDImageXObject) xObject).getImage());
        }
    }
    return images;
}
The GetImagesFromPDF Java class below gets all images in the 04-Request-Headers.pdf file and saves them into the destination folder PDFCopy.
import java.io.File;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
#SuppressWarnings({ "unchecked", "rawtypes", "deprecation" })
public class GetImagesFromPDF {
public static void main(String[] args) {
try {
String sourceDir = "C:/PDFCopy/04-Request-Headers.pdf";// Paste pdf files in PDFCopy folder to read
String destinationDir = "C:/PDFCopy/";
File oldFile = new File(sourceDir);
if (oldFile.exists()) {
PDDocument document = PDDocument.load(sourceDir);
List<PDPage> list = document.getDocumentCatalog().getAllPages();
String fileName = oldFile.getName().replace(".pdf", "_cover");
int totalImages = 1;
for (PDPage page : list) {
PDResources pdResources = page.getResources();
Map pageImages = pdResources.getImages();
if (pageImages != null) {
Iterator imageIter = pageImages.keySet().iterator();
while (imageIter.hasNext()) {
String key = (String) imageIter.next();
PDXObjectImage pdxObjectImage = (PDXObjectImage) pageImages.get(key);
pdxObjectImage.write2file(destinationDir + fileName+ "_" + totalImages);
totalImages++;
}
}
}
} else {
System.err.println("File not exists");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
For PDFBox 2.0.1, pudaykiran's answer must be slightly modified since some APIs have been changed.
/**
 * Writes every image XObject found in the page-level resources of
 * {@code D:/Temp/Test.pdf} to a PNG file in {@code D:/Temp/} (PDFBox 2.x API).
 * Images nested inside form XObjects are not visited.
 *
 * @throws Exception if the document cannot be loaded or an image cannot be written
 */
public static void testPDFBoxExtractImages() throws Exception {
    PDDocument document = PDDocument.load(new File("D:/Temp/Test.pdf"));
    try {
        PDPageTree list = document.getPages();
        for (PDPage page : list) {
            PDResources pdResources = page.getResources();
            if (pdResources == null) {
                // Page without a resource dictionary: nothing to extract.
                continue;
            }
            for (COSName c : pdResources.getXObjectNames()) {
                PDXObject o = pdResources.getXObject(c);
                if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                    // nanoTime is used only to get a distinct file name per image.
                    File file = new File("D:/Temp/" + System.nanoTime() + ".png");
                    ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(), "png", file);
                }
            }
        }
    } finally {
        // Always release the document, even when an image fails to write.
        document.close();
    }
}
Just add the .jpeg to the end of your path:
// Same call as in the question, with ".jpeg" appended to the target name.
// NOTE(review): write2file may add its own suffix based on the image type — confirm the resulting file name.
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i + ".jpeg");
That works for me.
You can use the PDPage.convertToImage() function, which converts the PDF page into a BufferedImage. You can then use the BufferedImage to create an Image.
Use the following reference for further detail:
You can find all PDF-related classes of PDFBox in the
Apache PDFBox 1.8.3 API
Here you can see PDPage related documentation.
And do not forget to look for PDPage.convertToImage() function in PDPage class.
This is a Kotlin version of @Matt's answer.
/**
 * Applies [block] to every image reachable from these resources, descending
 * into form XObjects, and returns the results in traversal order.
 *
 * A form XObject without its own resources contributes nothing.
 */
fun <R> PDResources.onImageResources(block: (RenderedImage) -> (R)): List<R> =
    this.xObjectNames.flatMap {
        when (val xObject = this.getXObject(it)) {
            // getResources() is a Java call and may return null; treat that as "no images".
            is PDFormXObject -> xObject.resources?.onImageResources(block).orEmpty()
            is PDImageXObject -> listOf(block(xObject.image))
            else -> emptyList()
        }
    }
You can use it on PDPage Resources like this:
// Writes each image of the page to its own temp file and yields the file paths.
page.resources.onImageResources { image ->
    Files.createTempFile("image", "xxx").also { path ->
        // ImageIO.write returns false when no writer exists for the format;
        // the exception must actually be thrown for that to surface.
        if (!ImageIO.write(image, "xxx", path.toFile()))
            throw IllegalStateException("Couldn't write image to file")
    }
}
Where "xxx" is the format you need (like "jpeg")
For anyone who just wants to copy and paste ready-to-use code:
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.UUID;
public class ExtractImagesUseCase extends PDFStreamEngine{
private final String filePath;
private final String outputDir;
// Constructor
public ExtractImagesUseCase(String filePath,
String outputDir){
this.filePath = filePath;
this.outputDir = outputDir;
}
// Execute
public void execute(){
try{
File file = new File(filePath);
PDDocument document = PDDocument.load(file);
for(PDPage page : document.getPages()){
processPage(page);
}
}catch(IOException e){
e.printStackTrace();
}
}
#Override
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException{
String operation = operator.getName();
if("Do".equals(operation)){
COSName objectName = (COSName) operands.get(0);
PDXObject pdxObject = getResources().getXObject(objectName);
if(pdxObject instanceof PDImageXObject){
// Image
PDImageXObject image = (PDImageXObject) pdxObject;
BufferedImage bImage = image.getImage();
// File
String randomName = UUID.randomUUID().toString();
File outputFile = new File(outputDir,randomName + ".png");
// Write image to file
ImageIO.write(bImage, "PNG", outputFile);
}else if(pdxObject instanceof PDFormXObject){
PDFormXObject form = (PDFormXObject) pdxObject;
showForm(form);
}
}
else super.processOperator(operator, operands);
}
}
Demo
public class ExtractImageDemo{
public static void main(String[] args){
String filePath = "C:\\Users\\John\\Downloads\\Documents\\sample-file.pdf";
String outputDir = "C:\\Users\\John\\Downloads\\Documents\\Output";
ExtractImagesUseCase useCase = new ExtractImagesUseCase(
filePath,
outputDir
);
useCase.execute();
}
}
Instead of calling
// The call used in the question: the target name carries no file extension.
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
You can use the ImageIO.write() static method to write the RGB image out in whatever format you need. Here I've used PNG:
// Build the target file explicitly so the ".png" extension is chosen here.
File outputFile = new File( "C:\\Users\\Pradyut\\Documents\\image" + i + ".png");
// Encode the image's RGB form as PNG through ImageIO instead of write2file.
ImageIO.write( image.getRGBImage(), "png", outputFile);

Categories