Open a PDF file from shared path in Java

Open a PDF file from shared path in Java - java

I have a PDF file in my shared path. I have tried accessing it through the normal method but it's not happening. How to access that file?
This is the code I have tried.
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
/**
* This class is used to read an existing
* pdf file using iText jar.
* #author codesjava
*/
public class PDFReadExample {
public static void main(String args[]){
try {
//Create PdfReader instance.
PdfReader pdfReader = new PdfReader("D:\\testFile.pdf");
//Get the number of pages in pdf.
int pages = pdfReader.getNumberOfPages();
//Iterate the pdf through pages.
for(int i=1; i<=pages; i++) {
//Extract the page content using PdfTextExtractor.
String pageContent =
PdfTextExtractor.getTextFromPage(pdfReader, i);
//Print the page content on console.
System.out.println("Content on Page "
+ i + ": " + pageContent);
}
//Close the PdfReader.
pdfReader.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
But what if my path is like this:
http://team.net/po/kc ape/Platform%20Symbol/form%7D

PdfReader has a constructor which accepts an InputStream.
To access content behind an URL you can use the class URL:
import java.net.URL;
import java.io.InputStream;
import com.itextpdf.text.pdf.PdfReader;
URL url=new URL("http://...").
InputStream is = url.openStream();
PdfReader reader=new PdfReader(is);

Related

Huge white space after header in PDF using Flying Saucer

I am trying to export an HTML page into a PDF using Flying Saucer. For some reason, the pages have a large white space after the header (id = "divTemplateHeaderPage1") divisions.
The jsFiddle link to my HTML code that is being used by PDF renderer: https://jsfiddle.net/Sparks245/uhxqdta6/.
Below is the Java code used for rendering the PDF (Test.html is the same HTML code in the fiddle) and rendering only one page.
import java.io.IOException;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.json.HTTP;
import org.json.JSONException;
import org.json.*;
import org.json.simple.JSONArray;
import org.json.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import org.json.simple.parser.*;
import org.xhtmlrenderer.pdf.ITextRenderer;
import com.lowagie.text.DocumentException;
import com.lowagie.text.List;
import com.sun.xml.internal.bind.v2.runtime.unmarshaller.XsiNilLoader.Array;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
#WebServlet("/PLPDFExport")
public class PLPDFExport extends HttpServlet
{
//Option for Serialization
private static final long serialVersionUID = 1L;
public PLPDFExport()
{
super();
}
//Get method
protected void doGet(HttpServletRequest request,
HttpServletResponse response)
throws ServletException,
IOException
{
}
//Post method
protected void doPost(HttpServletRequest request,
HttpServletResponse response)
throws ServletException,
IOException
{
StringBuffer jb = new StringBuffer();
String line = null;
int Pages;
String[] newArray = null;
try
{
BufferedReader reader = request.getReader();
while ((line = reader.readLine()) != null)
{ jb.append(line);
}
} catch (Exception e) { /*report an error*/ }
try
{
JSONObject obj = new JSONObject(jb.toString());
Pages = obj.getInt("Pages");
newArray = new String[1];
for(int cnt = 1; cnt <= 1; cnt++)
{
StringBuffer buf = new StringBuffer();
String base = "C:/Users/Sparks/Desktop/";
buf.append(readFile(base + "Test.html"));
newArray[0] = buf.toString();
}
}
catch (JSONException e)
{
// crash and burn
throw new IOException("Error parsing JSON request string");
}
//Get the parameters
OutputStream os = null;
try {
final File outputFile = File.createTempFile("FlyingSacuer.PDFRenderToMultiplePages", ".pdf");
os = new FileOutputStream(outputFile);
ITextRenderer renderer = new ITextRenderer();
// we need to create the target PDF
// we'll create one page per input string, but we call layout for the first
renderer.setScaleToFit(true);
renderer.isScaleToFit();
renderer.setDocumentFromString(newArray[0]);
renderer.layout();
try {
renderer.createPDF(os, false);
} catch (DocumentException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// each page after the first we add using layout() followed by writeNextDocument()
for (int i = 1; i < newArray.length; i++) {
renderer.setScaleToFit(true);
renderer.isScaleToFit();
renderer.setDocumentFromString(newArray[i]);
renderer.layout();
try {
renderer.writeNextDocument();
} catch (DocumentException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
// complete the PDF
renderer.finishPDF();
System.out.println("PDF Downloaded to " + outputFile );
System.out.println(newArray[0]);
}
finally {
if (os != null) {
try {
os.close();
} catch (IOException e) { /*ignore*/ }
}
}
//Return
response.setContentType("application/json");
response.setCharacterEncoding("UTF-8");
response.getWriter().write("File Uploaded");
}
String readFile(String fileName) throws IOException {
BufferedReader br = new BufferedReader(new FileReader(fileName));
try {
StringBuilder sb = new StringBuilder();
String line = br.readLine();
while (line != null) {
sb.append(line);
sb.append("\n");
line = br.readLine();
}
return sb.toString();
} finally {
br.close();
}
}
}
The link for exported PDF: https://drive.google.com/file/d/13CmlJK0ZDLolt7C3yLN2k4uJqV3TX-4B/view?usp=sharing
I tried adding css properties like page-break-inside: avoid to the header divisions but it didn't work. Also I tried adding absolute positions and top margins to the body division (id = "divTemplateBodyPage1") just below the header div, but the white space continues to exist.
Any suggestions would be helpful.

Please take a look at the metadata of your PDF:
You are using an old third party tool that is not endorsed by iText Group, and that uses iText 2.1.7, a version of iText dating from 2009 that should no longer be used.
It would probably have been OK to complain and to write "My code isn't working" about 7 years ago, but if you would use the most recent version of iText, the result of converting your HTML to PDF would look like this:
I only needed a single line of code to get this result:
HtmlConverter.convertToPdf(new File(src), new File(dest));
In this line src is the path the the source HTML and dest is the path to the resulting PDF.
I only had to apply one minor change to your HTML. I change the #page properties like this:
#page {
size: 27cm 38cm;
margin: 0.2cm;
}
If I hadn't changed this part of the CSS, the page size would have been A4, and in that case, not all the content would have fitted the page. I also added a small margin because I didn't like the fact that the border was sticking to close to the sides of the page.
Morale: don't use old versions of libraries! Download the latest version of iText and the pdfHTML add-on. You need iText 7 core and the pdfHTML add-on. You might also want to read the HTML to PDF tutorial.

JAVA Read sample doc file, fill with data and generate PDF

I am triing to make an automatization program in JAVA.
I have a sample doc file. I need to fill the blank parts or the <> "signed" parts from database,
than create pdf files.
I tried to read the word :
import java.io.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
public class ReadDocFile {
public static void main(String[] args) {
File file = null;
WordExtractor extractor = null ;
try {
file = new File("c:\\New.doc");
FileInputStream fis=new FileInputStream(file.getAbsolutePath());
HWPFDocument document=new HWPFDocument(fis);
extractor = new WordExtractor(document);
String [] fileData = extractor.getParagraphText();
for(int i=0;i<fileData.length;i++){
if(fileData[i] != null)
System.out.println(fileData[i]);
}
}
catch(Exception exep){}
}
}
but this attemption is bad in many way cause i only need to write some of the parts, and this method make a single test from the doc.
So can you advice me some api that write in a word doc eg: after Name : or in the 5 row write this:
And when it finish with the word it should generate a pdf and do it again ...
I am looking a solution wich i found xssfworkbook with some extra function ( generate pdf of the doc ).
Or read the sample pdf and fill with datas and save to a new pdf.
Thx

Use Itext (http://sourceforge.net/projects/itext/)
and Apache POI Project http://poi.apache.org/index.html
A sample code :
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hwpf.extractor.WordExtractor;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Paragraph;
import com.itextpdf.text.pdf.PdfWriter;
public static void main(String[] args) {
String pdfPath = "C:/";
String pdfDocPath = null;
try {
InputStream is = new BufferedInputStream(new FileInputStream("C:/Test.doc"));
WordExtractor wd = new WordExtractor(is);
String text = wd.getText();
/* FOR DOCX
// IMPORT
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
// CODE
XWPFDocument hdoc = new XWPFDocument(is);
extractor = new XWPFWordExtractor(hdoc);
String text = extractor.getText();
*/
Document document = new Document();
PdfWriter.getInstance(document, new FileOutputStream(pdfPath + "viewDoc.pdf"));
document.open();
document.add(new Paragraph(text));
document.close();
pdfDocPath = pdfPath + "viewDoc.pdf";
System.out.println("Pdf document path is" + pdfDocPath);
}
catch (FileNotFoundException e1) {
System.out.println("File does not exist.");
}
catch (IOException ioe) {
System.out.println("IO Exception");
}
catch (DocumentException e) {
e.printStackTrace();
}
}

How to get raw text from pdf file using java

I have some pdf files, Using pdfbox i have converted them into text and stored into text files, Now from the text files i want to remove
Hyperlinks
All special characters
Blank lines
headers footers of pdf files
“1)”,“2)”, “a)”, “bullets”, etc.
I want to get valid text line by line like this:
We propose OntoGain, a method for ontology learning from multi-word concept terms extracted from plain text. OntoGain follows an ontology learning process dened by distinct processing layers. Building upon plain term extraction a con-cept hierarchy is formed by clustering the extracted concepts. The derived term taxonomy is then enriched with non-taxonomic relations. Several dierent state-of-the-art methods have been examined for implementing each layer. OntoGain is based upon multi-word term concepts, as multi-word or compound terms are vested with more solid and distinctive semantics than plain single word terms. We opted for a hierarchical clustering method and Formal Concept Analysis (FCA) algorithm for building the term taxonomy. Furthermore an association rule algorithm is applied for revealing non-taxonomic relations. A method which tries to carry out the most appropriate generalization level between a relation's concepts is also implemented. To show proof of concept, a system prototype is implemented. The OntoGain allows transformation of the derived ontology into OWL using Jena Semantic Web Frame-work1. OntoGain is applied on two separate data sources a medical and computer corpus and its results are compared with similar results obtained by Text2Onto, a state-of-the-art-ontology learning method. The analysis of 11.5 CCD1.1 results indicates that OntoGain performs better than Text2Onto in terms of precision extracts more correct concepts while being more selective extracts fewer but more reasonable concepts.
How can I achieve this?

Using pdfbox we can achive this
Example :
public static void main(String args[]) {
PDFParser parser = null;
PDDocument pdDoc = null;
COSDocument cosDoc = null;
PDFTextStripper pdfStripper;
String parsedText;
String fileName = "E:\\Files\\Small Files\\PDF\\JDBC.pdf";
File file = new File(fileName);
try {
parser = new PDFParser(new FileInputStream(file));
parser.parse();
cosDoc = parser.getDocument();
pdfStripper = new PDFTextStripper();
pdDoc = new PDDocument(cosDoc);
parsedText = pdfStripper.getText(pdDoc);
System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", ""));
} catch (Exception e) {
e.printStackTrace();
try {
if (cosDoc != null)
cosDoc.close();
if (pdDoc != null)
pdDoc.close();
} catch (Exception e1) {
e1.printStackTrace();
}
}
}

Hi we can extract the pdf files using Apache Tika
The Example is :
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
public class WebPagePdfExtractor {
public Map<String, Object> processRecord(String url) {
DefaultHttpClient httpclient = new DefaultHttpClient();
Map<String, Object> map = new HashMap<String, Object>();
try {
HttpGet httpGet = new HttpGet(url);
HttpResponse response = httpclient.execute(httpGet);
HttpEntity entity = response.getEntity();
InputStream input = null;
if (entity != null) {
try {
input = entity.getContent();
BodyContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
AutoDetectParser parser = new AutoDetectParser();
ParseContext parseContext = new ParseContext();
parser.parse(input, handler, metadata, parseContext);
map.put("text", handler.toString().replaceAll("\n|\r|\t", " "));
map.put("title", metadata.get(TikaCoreProperties.TITLE));
map.put("pageCount", metadata.get("xmpTPg:NPages"));
map.put("status_code", response.getStatusLine().getStatusCode() + "");
} catch (Exception e) {
e.printStackTrace();
} finally {
if (input != null) {
try {
input.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
} catch (Exception exception) {
exception.printStackTrace();
}
return map;
}
public static void main(String arg[]) {
WebPagePdfExtractor webPagePdfExtractor = new WebPagePdfExtractor();
Map<String, Object> extractedMap = webPagePdfExtractor.processRecord("http://math.about.com/library/q20.pdf");
System.out.println(extractedMap.get("text"));
}
}

You can use iText for do such things
//iText imports
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
for example:
try {
PdfReader reader = new PdfReader(INPUTFILE);
int n = reader.getNumberOfPages();
String str=PdfTextExtractor.getTextFromPage(reader, 2); //Extracting the content from a particular page.
System.out.println(str);
reader.close();
} catch (Exception e) {
System.out.println(e);
}
another one
try {
PdfReader reader = new PdfReader("c:/temp/test.pdf");
System.out.println("This PDF has "+reader.getNumberOfPages()+" pages.");
String page = PdfTextExtractor.getTextFromPage(reader, 2);
System.out.println("Page Content:\n\n"+page+"\n\n");
System.out.println("Is this document tampered: "+reader.isTampered());
System.out.println("Is this document encrypted: "+reader.isEncrypted());
} catch (IOException e) {
e.printStackTrace();
}
the above examples can only extract the text, but you need to do some more to remove hyperlinks, bullets, heading & numbers.

For the newer versions of Apache pdfbox. Here is the example from the original source
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.examples.util;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.text.PDFTextStripper;
/**
* This is a simple text extraction example to get started. For more advance usage, see the
* ExtractTextByArea and the DrawPrintTextLocations examples in this subproject, as well as the
* ExtractText tool in the tools subproject.
*
* #author Tilman Hausherr
*/
public class ExtractTextSimple
{
private ExtractTextSimple()
{
// example class should not be instantiated
}
/**
* This will print the documents text page by page.
*
* #param args The command line arguments.
*
* #throws IOException If there is an error parsing or extracting the document.
*/
public static void main(String[] args) throws IOException
{
if (args.length != 1)
{
usage();
}
try (PDDocument document = PDDocument.load(new File(args[0])))
{
AccessPermission ap = document.getCurrentAccessPermission();
if (!ap.canExtractContent())
{
throw new IOException("You do not have permission to extract text");
}
PDFTextStripper stripper = new PDFTextStripper();
// This example uses sorting, but in some cases it is more useful to switch it off,
// e.g. in some files with columns where the PDF content stream respects the
// column order.
stripper.setSortByPosition(true);
for (int p = 1; p <= document.getNumberOfPages(); ++p)
{
// Set the page interval to extract. If you don't, then all pages would be extracted.
stripper.setStartPage(p);
stripper.setEndPage(p);
// let the magic happen
String text = stripper.getText(document);
// do some nice output with a header
String pageStr = String.format("page %d:", p);
System.out.println(pageStr);
for (int i = 0; i < pageStr.length(); ++i)
{
System.out.print("-");
}
System.out.println();
System.out.println(text.trim());
System.out.println();
// If the extracted text is empty or gibberish, please try extracting text
// with Adobe Reader first before asking for help. Also read the FAQ
// on the website:
// https://pdfbox.apache.org/2.0/faq.html#text-extraction
}
}
}
/**
* This will print the usage for this document.
*/
private static void usage()
{
System.err.println("Usage: java " + ExtractTextSimple.class.getName() + " <input-pdf>");
System.exit(-1);
}
}

Extracting all keywords from PDF(from a web page) file on your local machine or Base64 encoded string:
import org.apache.commons.codec.binary.Base64;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
public class WebPagePdfExtractor {
public static void main(String arg[]) {
WebPagePdfExtractor webPagePdfExtractor = new WebPagePdfExtractor();
System.out.println("From file: " + webPagePdfExtractor.processRecord(createByteArray()).get("text"));
System.out.println("From string: " + webPagePdfExtractor.processRecord(getArrayFromBase64EncodedString()).get("text"));
}
public Map<String, Object> processRecord(byte[] byteArray) {
Map<String, Object> map = new HashMap<>();
try {
PDFTextStripper stripper = new PDFTextStripper();
stripper.setSortByPosition(false);
stripper.setShouldSeparateByBeads(true);
PDDocument document = PDDocument.load(byteArray);
String text = stripper.getText(document);
map.put("text", text.replaceAll("\n|\r|\t", " "));
} catch (Exception exception) {
exception.printStackTrace();
}
return map;
}
private static byte[] getArrayFromBase64EncodedString() {
String encodedContent = "data:application/pdf;base64,JVBERi0xLjMKJcTl8uXrp/Og0MTGCjQgMCBvYmoKPDwgL0xlbmd0aCA1IDAgUiAvRmlsdGVyIC9GbGF0ZURlY29kZSA+PgpzdHJlYW0KeAGF0E0OgjAQBeA9p3hL3UCHlha2Gg9A0sS1AepPxIDl/rFFErVESDddvPlm8nqU6EFpzARjBCVkLHNkipBzPBsc8UCyt4TKgmCr/9HI+GDqg2x8Luzk8UtfYwX5DVWLnQaLmd+qHTsF3V5QEekWidZuDNpgc7L1FvqGg35fOzPlqslFYJrzZdnkq6YI77TXtrs3GBo7oKvNss9mfhT0IAV+e6CUL5pSTWb0t1tVBKbI5McsXxNmciYKZW5kc3RyZWFtCmVuZG9iago1IDAgb2JqCjE4NQplbmRvYmoKMiAwIG9iago8PCAvVHlwZSAvUGFnZSAvUGFyZW50IDMgMCBSIC9SZXNvdXJjZXMgNiAwIFIgL0NvbnRlbnRzIDQgMCBSIC9NZWRpYUJveCBbMCAwIDU5NSA4NDJdC" +
"j4+CmVuZG9iago2IDAgb2JqCjw8IC9Qcm9jU2V0IFsgL1BERiAvVGV4dCBdIC9Db2xvclNwYWNlIDw8IC9DczEgNyAwIFIgL0NzMiA4IDAgUiA+PiAvRm9udCA8PAovVFQxIDkgMCBSID4+ID4+CmVuZG9iagoxMCAwIG9iago8PCAvTGVuZ3RoIDExIDAgUiAvTiAxIC9BbHRlcm5hdGUgL0RldmljZUdyYXkgL0ZpbHRlciAvRmxhdGVEZWNvZGUgPj4Kc3RyZWFtCngBhVVdaBxVFD67c2cDEgcftA0ttIM/bQnpMolWE4u12026SRO362ZTmyrKdHY2O81kZpyZ3SahT6XgmxYE6augPsaCCLYqNi/2paXFkko1DwoRWowgKH1S8Dsz22R2QTLDnfnuueeee8537rmXqOtv3fPstEo054R+oZybPjl9Su26TWlSqJvw6Ebg5UqlCcaO65j8b38e3qUUS+7sZ1vtY1v25KoZGNC6huZWA2OOKKURZWqG54dEXZcgHzwbeoxvAz85WynngdeAldZcQHqqYDqmbxlqwdcX1JLv1i" +
"w76etW42xjy2fObrCv/OxG6w5mJ8fx74XPF0xnahJ4H/CSoY8w7gO+27ROFGOcTnvhkXKsn842ZqdyLfnJmn90qiW/UG+MMs4SpZcW65U3gJ8AXnVOF4+39Ndn3XG200Mk9RhB/hTws8Ba3RzjPKnAFd8tsz7Lw6o5PAL8MvAlKxyrAMO+9EPQnGQ5sKDFep79xFoie0Y/VgLeBnzItAu8FuyIiheW2OYg8LxjF3ktxC4um0EUL2IXP4X1ymisL6dDv8JznyaS99Sso2PA4EQerfujLIc/cujZ0d56EXjJb5Q59j3Aa7o/UgCGzcxjVX2YeX4BeIBOpHQyyaXT+Brk0L+INyCLmhHyyMdYDX2bCtBw0Hz0DGgVgHRaAColtEz0WCeeo1IVPZVmollBhNjK/ahvUH7Xp9SAtE7rkNaBXqNfIsk8/Upz6OchbWBspsNuHl44tAgP2BO2+aBl0xXbhSaeRzsoJsQrYlAMkSpeFYfFITEM6ZA4GM2JvU/6zn4+2LD0LtZN+r4MDkKsZ8MzB6xwNAE8+AfrzkaaCbYu7mjs87yP3j/vv2MZtz74s429APoxJ7/BogtrJiXmXj/3TU/CQ3VFfPXWne7r5+h4MktR3qqdWZLX5PvyCr735NWkDflneRXvvbZcPcoL/5O5zSFGO5LNQc48m1G0ccYbwCG4qUVz9rdZTLLptmK0YMlClJ2ruP/LCfPDPLexUnMu7vC8tz9jNs33ig+LdL5Pu6y" +
"ta59oP2p/aCvax0C/Sx9KX0rfSlekq9INUqVr0rL0nfS99Ln0NXpfQLosXenYSXHsG7sHfsZ71mjtMGaGsxQQ88LazApLH/F3BmOb+TOh1V4Dnbt/Yy3liLJTeUYZVnYrzykTSq9yQDmsbFcG0PqVUWUvRnZusGRjPc6AhX+SZ4umI67iPLFXdbDnw0sd76ZfXMPWhjXYST0Ontnapg6vEVe/FVVjvDtdnAY6TSFii84ich86nB8nqv7O2VyTODVSb+KUsMQu0S/GWjWYEwdQheNt9TjIVZoZyQxncqRmejNDmf7MMcZRrNH5ktmL0SF8RxLeM8sx/5s1xGcY7x3mqAlso4dbKzTncd8R5V1vwbdm6qE6oGkvqTlcr6Y65hjZPlW3bTUaClTfDEy/aVazxHc3zyP66/XoTk5tu2E0/GYso1TqJtF/t4+TNAplbmRzdHJlYW0KZW5kb2JqCjExIDAgb2JqCjExMTYKZW5kb2JqCjcgMCBvYmoKWyAvSUNDQmFzZWQgMTAgMCBSIF0KZW5kb2JqCjEyIDAgb2JqCjw8IC9MZW5ndGgg" +
"MTMgMCBSIC9OIDMgL0FsdGVybmF0ZSAvRGV2aWNlUkdCIC9GaWx0ZXIgL0ZsYXRlRGVjb2RlID4+CnN0cmVhbQp4AYVVW4gbVRj+kznJCrvO09rVLaRDvXQpu0u2Fd2ltJpbk7RrGrLZ1RZBs5OTZMzsJM5M0gt9KoLii6u+SUG8vS0IgtJ6wdYH+1KpUFZ36yIoPrR4QSj0RbfxO5NkJllqm2XPfPP93/lv558ZooG1Qr2u+xWiJcM2c8mo8tzRY8rAOvnpIRqkURosqFY9ks3OEn5CK679v1s/kE8wVyfubO9Xb7kbLHJLJfLdB75WtNQl4BNEgbNq3bSJBobBTx+36wKLHIZNJAj8osDlNoaNhhfb+DVHk8/FoDkLLKuVQhF4BXh8sYcv9+B2DlDAT5Ib3NRURfQia9ZKms4dQ3u5h7lHeTe4pDdQs/PbgXXIqs4dxnUMtb9SLMQFngReUQuJOeBHgK81tYVMB9+u29Ec8GNE/p2N6nwEeDdwqmQenAeGH79ZaaS6+J1Tlfyz4LeB/8ZYzBzp7F1TrRh6STvB367wtOhviEhSN" +
"DudB4Yf6YBZywk9cpBKRR5PAI8Dv16tHRY5wKf0mdWcE7zIZ+1UJSbyFPzllwqHssCjwL9yPSn0iCX9W7eznRxYyNAzIi5isTi3nHrhh4XsSj4FHnGZbpv5zl62XNIOpjv6TypmSvBi77W67swocgv4zUZO1I5YgcmCmUgCw2cgy4150U+Bm7TgKxCnGi1iVcmgTVIoR0mK4lonE5YSaaSD4bByMBx3Xc2Es8+iKniNmo7Nwpp1lO2dXa1CZbAGXXe0KsVCH1EDnir0B9iK61OhGO4a4Mr/46edy42OnxobYWG2F//72Czbz6bZDCnsKfY0O8DiYGfYPtd3Fnu6FYl8biBK28/LiMgd3QJqv4gabSpg/QWKGlmuh76uLI82xjzLGfMFTb3yxt89vdKws+oqJvo6euRePQ/8FrgeWMW6HthwfSiBnwIb+FtHb7xaap6902VxUhpOtNan23oWXVUElerOziV0QUPNvKfmiV4fl05/+aAXbZWde/7q0KXTJWN51GNFF/irmVsZOjPuseEfw3+GV8PvhT8M/y69LX0qfSWdlz6XLpMiXZ" +
"AuSl9L30ofS1+4+rvNkHv2JDIXcyXyFtPVrbC315hYOSpvlx+W4/IO+VF51lUp8og8JafkXbBsd8/Nm2+lt3L05Siidftz51jiWdFcTzgD3/2YAM2L2DcD88hYo+PwaaLfYt4MOglt75PXqYiF2BRLb5nuaTHzXd/BRDAejJAS3B2cCU4FDwncfZaDu2CbwZrozQ3z4Sr6KuU2PyG+JxSr1U+aWrliK3vC4SeVCD59XEkb6uS4UtB1xTFZisktbjZ5cZLEd1PsI7qZc76Hvm1XPM5+hmj/X3j3fe9xxxpEKxbRyOMeN4Z35QPvEp17Qm2YzbY/8vm+I7JKe/c4976hKN5fP7daN/EeG3iLaPPNVuuf91utzQ/gf4Pogv4foJ98VQplbmRzdHJlYW0KZW5kb2JqCjEzIDAgb2JqCjEwNzkKZW5kb2JqCjggMCBvYmoKWyAvSUNDQmFzZWQgMTIgMCBSIF0KZW5kb2JqCjMgMCBvYmoKPDwgL1R5cGUgL1BhZ2VzIC9NZWRpYUJveCBbMCAwIDU5NSA4NDJdIC9Db3VudCAxIC9LaWR" +
"zIFsgMiAwIFIgXSA+PgplbmRvYmoKMTQgMCBvYmoKPDwgL1R5cGUgL0NhdGFsb2cgL1BhZ2VzIDMgMCBSID4+CmVuZG9iago5IDAgb2JqCjw8IC9UeXBlIC9Gb250IC9TdWJ0eXBlIC9UcnVlVHlwZSAvQmFzZUZvbnQgL0NOVFpYVStNZW5" +
"sby1SZWd1bGFyIC9Gb250RGVzY3JpcHRvcgoxNSAwIFIgL0VuY29kaW5nIC9NYWNSb21hbkVuY29kaW5nIC9GaXJzdENoYXIgMzIgL0xhc3RDaGFyIDExNiAvV2lkdGhzIFsgNjAyCjAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgNjAyIDYwMiA2MDIgNjAyIDYwMiA2MDIgMCAwIDAgMCAwIDAgMCAwIDAKMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgNjAyIDAgMAo2MDIgNjAyIDYwMiA2MDIgNjAyIDYwMiAwIDAgNjAyIDYwMiAwIDAgNjAyIDAgMCA2MDIgNjAyIF0gPj4KZW5kb2JqCjE1IDAgb2JqCjw8IC9UeXBlIC9Gb250RGVzY3JpcHRvciAvRm9udE5hbWUgL0NOVFpYVStNZW5sby1SZWd1bGFyIC9GbGFncyAzMyAvRm9udEJCb3gKWy01NTggLTM3NSA3MTggMTA0MV0g" +
"L0l0YWxpY0FuZ2xlIDAgL0FzY2VudCA5MjggL0Rlc2NlbnQgLTIzNiAvQ2FwSGVpZ2h0IDcyOQovU3RlbVYgOTkgL1hIZWlnaHQgNTQ3…/ZfICj5JcLdi/ATmQZKogDPg0lIDBunI0ZGOB1OB/Lpyce1TbJqCpBThycVs3GyQPZSLKexbMGyFss8LF4sNb2lElu5HPlJ2439G1jKsbRh6cTyPNpx8I6AFxa8P+xD2E4e/G+5PqJ/8aDzERFvGBJR/WLkfwcM3kRCiZpokDMdxhn5MeD9Rn5MSm0mYUpLSF98J5HXaQgtpJvoDWGesEe4C4NgK3woWsQ88RgzszXsMM4WyALeIC5gO5B/FYk/pNxVCJGoZT8NYc8LIknrONeVQYznus51pYeZHCaXw+RYIJLAEogJfMEbVPrvv31S6icvTMlp1EQhO41cOuXb0EEkSYkmGaMXSzuIfhCKAA4Y/YScTs9ASizblWVyWB1UT4fwNfSp9+mgwLFd4oI3D++9++kuheYWpOnEeBhLJrv7kVg" +
"Xk1hkVDRExLgkieUZTTt1jZYGkTTiXU8tULUtIsEIfeKMgY5AV3u7yZyTQdK6Mm923fwgHe1GZWTfmCJy5CYi05PgwqWzB5HBw2n2wL7OBEmVPZxmZYpWi6TSU7pM2BNY1kojs0sLN1bPOLZ4/nuzL1CNp/S+zt27dx+lA4Y/2/hA1fq8kR9kZF57u6R96YgvZRmsvfe5OBj5TSKjkN+wRqu6LrRF1yjF19lbYhudDVKTdVe/8DAClihbX6MNEuItofH9kF9k+FwXMofC7rqCDHcZu293G7tz0qmNWi2iM6FvYrYN2RuEvCbT7GDnZ0xDyMZt/Otb8z+aP+/dOS17927ZurVu24ZVnrYFz7w95jxlayE+8b3Nf/m6b5/j2QMb1v26qeXZ8iWVSUkH7fYLb1bKBw7aA57LYgVGYAGtLM8dT3WgIwC6PAIaVSOjsCaUatXEFiJKBm0fvTEQODe0K9Mki/mK3DPnBOUsHkchH/ckhFIHZJmyrE6T0+TIFi7xfvRjx/X33jves5rFBb6Gk4GsHXwbLX1Hlp0XZZeKa8eRYe4EURUX3agy" +
"1RnXWxp1QiNZo2tS7baBjUTYqDqBGONtspI7UEwosSsoMUVevAM5CEO9mmRVEquF/ExwsrxOCTd7OpKnpXxFjfzzO8uPTnz44Oydb7bunLy1kHXu5huMBt59vYvfsNtPZmb4tjfvdblQGjXI23jUayTpg9w5VfFRjer4RqP6DRGPs/ViY3iDscmVYCN9dQkqKZaGxbuMga6uwBXZeYLq/MKI6jShPq0DqDNBUBg0Wy2C0y6YjMSRGU4TJKslPKhYuJS7fkL7u+m7F33yzc3PeOBb6qSWsZv4Zys3bVq5as0atu+gK5Ff4ldLH+d3vvuW36bL6Ab6LF0X37Pw4I4dB//4+z0+RZ8y306xEuNGP7LI3V+tItF2baRBRfZHqurNjjr7O3H1fdrMTZE6GilG6dWSNt8uStbh/Y03u9AkM1G3snI7rtwMyCKWd2DKMeegZ6W749Lj0+3pjvSEZtJMm4VmdbNme3hzRHNkc1RztH4m7rJ3Q4OzB5uc2XpE9M0eOOh+mi1LoNfdwlGfQtuwV159duGWPfTAgfv/VP3GBz98d4eu2jirfca81uK6o8P62oWsJxaXLT57sN/4npUtpY/8eXvr4bhVzwwa6E9MnDIlc2PQditxr2bMIowYLdLd0YxYouv1lvqQJn0bfREiRCIJo0xmzeg43Ju8NTk2KIaDe0ynpqxeHlEd5ixUx790gazCdL9/QFPpiWvX3y/byg1ramr" +
"q6mpq1sAZYeQ/utYVTaP3Uys10cHTuOaj8xfPdV44L/uSzE8Jyt6K/GQFIyKGKSUIChgEY09jScPcUUJf02C4DCNRShuL32qS0zOo1WGVZIMYbEXZ2QmaSVamWRUUnlgS+DzknT3F7eWPHpnBf+Dnqf3GR3d82g1ran4XItRPl744dl/OW8nJNIeGUS118782Lt3lWyT72RH08USUUxgZiFIyUm3IfonWkxf10mG1EKYioUzSGTQW47mhHYGhHZmKAVzJDKD60b9lA0aJxFHZyWSndqBKs8TEM3Mn0JV8hZ930uRdf5IsTZPnz/UG0uCMd6JfTq9lefDRornXFke7E6O0tpjEUDDXhYWH1tvC6w2AlmgzHEk63D8xikjaUZLZ7BiNhtjRqy10846gERo7u9EC0RKRGyV0Bx0nDL3pxzg5TJANrleZEdlZMH31ytXrvWtWrPZ3Xx3fUjSneeTmNSlbyjuuX+9Y2JDmF3JOffzxqVOfnuefBXggNmb/gJTtvpCqWQ/TIVSFZ+mQh6ZvkPcRlF+MIr8Ud2SoHvC3OKne1KZ9UU0FiYzVhUqaQgvaGIoMTWwoxnKM67KFOU1BZrGTpfh/uBhz4LEnVtb5/RmvL3ljl7C/Z6ywv3H9W2/0rJYsPTtK5l6W5daN+qqWDHh+6oicYqKpyD9kyocpRTtSXcR8Em1J7muwlW1LexrtSs5JZLuScwa5pXgGyx/hdZxIeA" +
"JTPMvxBMSaYoSmQ+nHtDywiJbzyzTe7xcfDmR5vTBcyPsKv7yBPEyXopGDxCAHOoUoppRILPQiribnP/IqOjlLka3XEo5eIbu8bCTCqRmej6+99ib/lF6im3/13ItnD8OtF4LyxN+YxQq0iwTyqjsx0mwIFVUkLkZSWbX1dmiLORxlVBGTIWSC" +
"NNE0wTAxNnJCdIHTeHOcTzt1nM80dUbxARJ9r/0+T2BoAA0leOYPHXrlpnIwoYmg8NPdo9LFdJYupavSQ9JD09Xpmtzw3IjcyNyo3OjcmNzY3LhcWzVUi9WsWqpWVYdUh1arqzXecG+EN9Ib5Y32xnhjvXFem5POpPLJEh5Ff6LMf2vVqgwKOx" +
"IeHbu64vXswkn3v54zdkzOzp2Oubnjy6B7dMEZfqlnubDymyWVX/SsEFbeWCy3YknJ0NxCWddt/CFxKspCjmFZ7tgfY1ibvokegcNxGL9GKZGsUI5imYqJoV/8GMZcsm0pXJhNRtkbfuofdPmBA3IYu/rV+/Oa6I3VNatqa1fVrF7Xc1xSe4um" +
"8Xf5df53fnwavfXR+Qud5y5iFJPtvRPjmIQ8JZJlbrdOK+g1EfG2kFBBpY6wxdvy4myRao0tXrSSOtouWuqs7ZH1JrHe1WZqSopTa+JjVOSBGEk/RiVZEgqSgu58RXZfObDIh6OR3+o23uo2RyjHipKn6ZU8Tak9CcETUz5L4pVkSPq3k2cPTBMGYPo2CFUCJx9oLqqqfPitsWvXdX1YtP+x+YemPrvqVkjBy789//70FjFn34ABk4vGjXXqo7dVtbQ6nW3Z2XM91RmCPn7jilf+4FD2incAMYS9hLExwx2pZyEG2E9M9HDIfnWIJhTzYclo1v88MnbdHNohp0Cyg8sx8Wdmb8Lr7nY+a9ayU5dP7ZZDI3uJH/b2NP9qzsaWE0KJlw5HnSvPvefwlwRZ2r988CqMnmzBUyScRGD+EUXyySgymowhY8k4Mp48gLl+EXmITFM+pPjvhSANSb5Ej5w4dXrxg8kTyhYtrEidUjZ/2cLZTxLyT2S78dEKZW5kc3RyZWFtCmVuZG9iagoxNyAwIG9iago0ODAxCmVuZG9iagoxOCAwIG9iagooKQplbmRvYmoKMTkgMCBvYmoKKE1hYyBPUyBYIDEwLjEyLjYgUXVhcnR6IFBERkNvbnRleHQpCmVuZG9iagoyMCAwIG9iagooKQplbmRvYmoKMjEgMCBvYmoKKCkKZW5kb2JqCjIyIDAgb2JqCihUZXh0TWF0ZSkKZW5kb2JqCjIzIDAgb2JqCihEOjIwMTcxMjEyMTMwMzQ4WjAwJzAwJykKZW5kb2JqCjI0IDAgb2JqCigpCmVuZG9iagoyNSAwIG9iagpbICgpIF0KZW5kb2JqCjEgMCBvYmoKPDwgL1RpdGxlIDE4IDAgUiAvQXV0aG9yIDIwIDAgUiAvU3ViamVjdCAyMSAwIFIgL1Byb2R1Y2VyIDE5IDAgUiAvQ3JlYXRvcgoyMiAwIFIgL0NyZWF0aW9uRGF0ZSAyMyAwIFIgL01vZERhdGUgMjMgMCBSIC9LZXl3b3JkcyAyNCAwIFIgL0FBUEw6S2V5d29yZHMKMjUgMCBSID4+CmVuZG9iagp4cmVmCjAgMjYKMDAwMDAwMDAwMCA2NTUzNSBmIAowMDAwMDA4OTI5IDAwMDAwIG4gCjAwMDAwMDAzMDAgMDAwMDAgbiAKMDAwMDAwMzAyOCAwMDAwMCBuIAowMDAwMDAwMDIyIDAwMDAwIG4gCjAwMDAwMDAyODEgMDAwMDAgbiAKMDAwMDAwMDQwNCAwMDAwMCBuIAowMDAwMDAxNzUzIDAwMDAwIG4gCjAwMDAwMDI5OTIgMDAwMDAgbiAKMDAwMDAwMzE2MSAwMDAwMCBuIAowMDAwMDAwNTEyIDAwMDAwIG4gCjAwMDAwMDE3MzIgMDAwMDAgbiAKMDAwMDAwMTc4OSAwMDAwMCBuIAowMDAwMDAyOTcxIDAwMDAwIG4gCjAwMDAwMDMxMTEgMDAwMDAgbiAKMDAwMDAwMzU0NCAwMDAwMCB" +
"uIAowMDAwMDAzNzk2IDAwMDAwIG4gCjAwMDAwMDg2ODcgMDAwMDAgbiAKMDAwMDAwODcwOCAwMDAwMCBuIAowMDAwMDA4NzI3IDAwMDAwIG4gCjAwMDAwMDg3ODAgMDAwMDAgbiAKMDAwMDAwODc5OSAwMDAwMCBuIAowMDAwMDA4ODE4IDAwMDAwIG4gCjAwMDAwMDg4NDUgMDAwMDAgbiAKMDAwMDAwODg4NyAwMDAwMCBuIAowMDAwMDA4OTA2IDAwMDAwIG4gCnRyYWlsZXIKPDwgL1NpemUgMjYgL1Jvb3QgMTQgMCBSIC9JbmZvIDEgMCBSIC9JRCBbIDxkYjc4M2NhNDM2Mzg4YzI5ZDc5MDQ2NzY3NjUxNjE3OT4KPGRiNzgzY2E0MzYzODhjMjlkNzkwNDY3Njc2NTE2MTc5PiBdID4+CnN0YXJ0eHJlZgo5MTA0CiUlRU9GCg==";
String content = encodedContent.substring("data:application/pdf;base64," .length());
return Base64.decodeBase64(content);
}
public static byte[] createByteArray() {
String pathToBinaryData = "/bla-bla/src/main/resources/small.pdf";
File file = new File(pathToBinaryData);
if (!file.exists()) {
System.out.println(" could not be found in folder " + pathToBinaryData);
return null;
}
FileInputStream fin = null;
try {
fin = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
byte fileContent[] = new byte[(int) file.length()];
try {
fin.read(fileContent);
} catch (IOException e) {
e.printStackTrace();
}
return fileContent;
}
}

How to open and manipulate Word document/template in Java?

I need to open a .doc/.dot/.docx/.dotx (I'm not picky, I just want it to work) document,
parse it for placeholders (or something similar),
put my own data,
and then return generated .doc/.docx/.dotx/.pdf document.
And on top of all that, I need the tools to accomplish that to be free.
I've searched around for something that would suit my needs, but I can't find anything.
Tools like Docmosis, Javadocx, Aspose etc. are commercial.
From what I've read, Apache POI is nowhere near successfully implementing this (they currently don't have any official developer working on Word part of framework).
The only thing that looks that could do the trick is OpenOffice UNO API.
But that is a pretty big byte for someone that has never used this API (like me).
So if I am going to jump into this, I need to make sure that I am on the right path.
Can someone give me some advice on this?

I know it's been a long time since I've posted this question, and I said that I would post my solution when I'm finished.
So here it is.
I hope that it will help someone someday.
This is a full working class, and all you have to do is put it in your application, and place TEMPLATE_DIRECTORY_ROOT directory with .docx templates in your root directory.
Usage is very simple.
You put placeholders (key) in your .docx file, and then pass file name and Map containing corresponding key-value pairs for that file.
Enjoy!
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URI;
import java.util.Deque;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipOutputStream;
import javax.faces.context.ExternalContext;
import javax.faces.context.FacesContext;
import javax.servlet.http.HttpServletResponse;
public class DocxManipulator {
private static final String MAIN_DOCUMENT_PATH = "word/document.xml";
private static final String TEMPLATE_DIRECTORY_ROOT = "TEMPLATES_DIRECTORY/";
/* PUBLIC METHODS */
/**
* Generates .docx document from given template and the substitution data
*
* #param templateName
* Template data
* #param substitutionData
* Hash map with the set of key-value pairs that represent
* substitution data
* #return
*/
public static Boolean generateAndSendDocx(String templateName, Map<String,String> substitutionData) {
String templateLocation = TEMPLATE_DIRECTORY_ROOT + templateName;
String userTempDir = UUID.randomUUID().toString();
userTempDir = TEMPLATE_DIRECTORY_ROOT + userTempDir + "/";
try {
// Unzip .docx file
unzip(new File(templateLocation), new File(userTempDir));
// Change data
changeData(new File(userTempDir + MAIN_DOCUMENT_PATH), substitutionData);
// Rezip .docx file
zip(new File(userTempDir), new File(userTempDir + templateName));
// Send HTTP response
sendDOCXResponse(new File(userTempDir + templateName), templateName);
// Clean temp data
deleteTempData(new File(userTempDir));
}
catch (IOException ioe) {
System.out.println(ioe.getMessage());
return false;
}
return true;
}
/* PRIVATE METHODS */
/**
* Unzipps specified ZIP file to specified directory
*
* #param zipfile
* Source ZIP file
* #param directory
* Destination directory
* #throws IOException
*/
private static void unzip(File zipfile, File directory) throws IOException {
ZipFile zfile = new ZipFile(zipfile);
Enumeration<? extends ZipEntry> entries = zfile.entries();
while (entries.hasMoreElements()) {
ZipEntry entry = entries.nextElement();
File file = new File(directory, entry.getName());
if (entry.isDirectory()) {
file.mkdirs();
}
else {
file.getParentFile().mkdirs();
InputStream in = zfile.getInputStream(entry);
try {
copy(in, file);
}
finally {
in.close();
}
}
}
}
/**
* Substitutes keys found in target file with corresponding data
*
* #param targetFile
* Target file
* #param substitutionData
* Map of key-value pairs of data
* #throws IOException
*/
#SuppressWarnings({ "unchecked", "rawtypes" })
private static void changeData(File targetFile, Map<String,String> substitutionData) throws IOException{
BufferedReader br = null;
String docxTemplate = "";
try {
br = new BufferedReader(new InputStreamReader(new FileInputStream(targetFile), "UTF-8"));
String temp;
while( (temp = br.readLine()) != null)
docxTemplate = docxTemplate + temp;
br.close();
targetFile.delete();
}
catch (IOException e) {
br.close();
throw e;
}
Iterator substitutionDataIterator = substitutionData.entrySet().iterator();
while(substitutionDataIterator.hasNext()){
Map.Entry<String,String> pair = (Map.Entry<String,String>)substitutionDataIterator.next();
if(docxTemplate.contains(pair.getKey())){
if(pair.getValue() != null)
docxTemplate = docxTemplate.replace(pair.getKey(), pair.getValue());
else
docxTemplate = docxTemplate.replace(pair.getKey(), "NEDOSTAJE");
}
}
FileOutputStream fos = null;
try{
fos = new FileOutputStream(targetFile);
fos.write(docxTemplate.getBytes("UTF-8"));
fos.close();
}
catch (IOException e) {
fos.close();
throw e;
}
}
/**
* Zipps specified directory and all its subdirectories
*
* #param directory
* Specified directory
* #param zipfile
* Output ZIP file name
* #throws IOException
*/
private static void zip(File directory, File zipfile) throws IOException {
URI base = directory.toURI();
Deque<File> queue = new LinkedList<File>();
queue.push(directory);
OutputStream out = new FileOutputStream(zipfile);
Closeable res = out;
try {
ZipOutputStream zout = new ZipOutputStream(out);
res = zout;
while (!queue.isEmpty()) {
directory = queue.pop();
for (File kid : directory.listFiles()) {
String name = base.relativize(kid.toURI()).getPath();
if (kid.isDirectory()) {
queue.push(kid);
name = name.endsWith("/") ? name : name + "/";
zout.putNextEntry(new ZipEntry(name));
}
else {
if(kid.getName().contains(".docx"))
continue;
zout.putNextEntry(new ZipEntry(name));
copy(kid, zout);
zout.closeEntry();
}
}
}
}
finally {
res.close();
}
}
/**
* Sends HTTP Response containing .docx file to Client
*
* #param generatedFile
* Path to generated .docx file
* #param fileName
* File name of generated file that is being presented to user
* #throws IOException
*/
private static void sendDOCXResponse(File generatedFile, String fileName) throws IOException {
FacesContext facesContext = FacesContext.getCurrentInstance();
ExternalContext externalContext = facesContext.getExternalContext();
HttpServletResponse response = (HttpServletResponse) externalContext
.getResponse();
BufferedInputStream input = null;
BufferedOutputStream output = null;
response.reset();
response.setHeader("Content-Type", "application/msword");
response.setHeader("Content-Disposition", "attachment; filename=\"" + fileName + "\"");
response.setHeader("Content-Length",String.valueOf(generatedFile.length()));
input = new BufferedInputStream(new FileInputStream(generatedFile), 10240);
output = new BufferedOutputStream(response.getOutputStream(), 10240);
byte[] buffer = new byte[10240];
for (int length; (length = input.read(buffer)) > 0;) {
output.write(buffer, 0, length);
}
output.flush();
input.close();
output.close();
// Inform JSF not to proceed with rest of life cycle
facesContext.responseComplete();
}
/**
* Deletes directory and all its subdirectories
*
* #param file
* Specified directory
* #throws IOException
*/
public static void deleteTempData(File file) throws IOException {
if (file.isDirectory()) {
// directory is empty, then delete it
if (file.list().length == 0)
file.delete();
else {
// list all the directory contents
String files[] = file.list();
for (String temp : files) {
// construct the file structure
File fileDelete = new File(file, temp);
// recursive delete
deleteTempData(fileDelete);
}
// check the directory again, if empty then delete it
if (file.list().length == 0)
file.delete();
}
} else {
// if file, then delete it
file.delete();
}
}
private static void copy(InputStream in, OutputStream out) throws IOException {
byte[] buffer = new byte[1024];
while (true) {
int readCount = in.read(buffer);
if (readCount < 0) {
break;
}
out.write(buffer, 0, readCount);
}
}
private static void copy(File file, OutputStream out) throws IOException {
InputStream in = new FileInputStream(file);
try {
copy(in, out);
} finally {
in.close();
}
}
private static void copy(InputStream in, File file) throws IOException {
OutputStream out = new FileOutputStream(file);
try {
copy(in, out);
} finally {
out.close();
}
}
}

Since a docx file is merely a zip-archive of xml files (plus any binary files for embedded objects such as images), we met that requirement by unpacking the zip file, feeding the document.xml to a template engine (we used freemarker) that does the merging for us, and then zipping the output document to get the new docx file.
The template document then is simply an ordinary docx with embedded freemarker expressions / directives, and can be edited in Word.
Since (un)zipping can be done with the JDK, and Freemarker is open source, you don't incur any licence fees, not even for word itself.
The limitation is that this approach can only emit docx or rtf files, and the output document will have the same filetype as the template. If you need to convert the document to another format (such as pdf) you'll have to solve that problem separately.

I ended up relying on Apache Poi 3.12 and processing paragraphs (separately extracting paragraphs also from tables, headers/footers, and footnotes, as such paragraphs aren't returned by XWPFDocument.getParagraphs() ).
The processing code (~100 lines) and unit tests are here on github.

I've been in more or less the same situation as you, I had to modify a whole bunch of MS Word merge templates at once. After having googled a lot to try to find a Java solution I finally installed Visual Studio 2010 Express which is free and did the job in C#.

I have recently dealt with similar problem:
"A tool which accepts a template '.docx' file, processes the file by evaluation of passed parameter context and outputs a '.docx' file as the result of the process."
finally god brought us scriptlet4dox :).
the key features for this product is:
1. groovy code injection as scripts in template file (parameter injection, etc.)
2. loop over collection items in table
and so many other features.
but as I checked the last commit on the project is performed about a year ago, so there is a probability that the project is not supported for new features and new bug-fixes. this is your choice to use it or not.

I want to add a Line into a PDF document using java

I am currently using PDFBox and reading from within a.pdf which is found in folder 1
I first list all the Pdf files found within the folder.
Then I check the number of pages that each file has.
Now i want to go to the very end of the file below the footer to add an image that can be recognised by the printer to staple the pages since it will realise it has reached end of file.
I have arrived till getting list of files and the number of pages.
What command do i use to go to the end of the last page and write there.
Should i transform the .pdf file into text or
Should i be able to use PDPageContentStream
This is the code I am currently using I am trying to test and see if a AAA string will be insterted into my last page of the pdf file. the project is executing with no errors but for some reason it is not being inserted into the pdf.
package pdfviewer;
import java.io.*;
import java.util.*;
import java.util.List;
import java.io.IOException;
import org.apache.pdfbox.PDFReader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
public class Main {
/**
* #param args the command line arguments
*/
public static List flist()
{
List listfile = new ArrayList();
String path = "C:/1";
String files;
File folder = new File(path);
File[] listOfFiles = folder.listFiles();
for (int i = 0; i < listOfFiles.length; i++)
{
if (listOfFiles[i].isFile())
{
files = listOfFiles[i].getName();
if (files.endsWith(".pdf") || files.endsWith(".PDF"))
{
listfile.add(listOfFiles[i]);
}
}
}
System.out.println(listfile);
return listfile;
}
public static void CheckPages(List a)
{
String dir = null;
Object[] arraydir = a.toArray(new Object[0]);
for (int i=0; i< arraydir.length; i++)
{
int pages = 0;
PDFont font = PDType1Font.HELVETICA_BOLD;
float fontSize = 12.0f;
dir = arraydir[i].toString();
System.out.println(dir);
try {
PDDocument pdoc = PDDocument.load(dir);
List allPages = pdoc.getDocumentCatalog().getAllPages();
pages = pdoc.getNumberOfPages();
System.out.println(allPages);
int f = pages;
System.out.println(pages);
PDPage page = (PDPage) allPages.get(i);
//System.out.println(page);
PDRectangle pageSize = page.findMediaBox();
float stringWidth = font.getStringWidth( "AAA" );
float centeredPosition = (pageSize.getWidth() - (stringWidth*fontSize)/1000f)/2f;
PDPageContentStream contentStream = new PDPageContentStream(pdoc,page,true,true);
//System.out.println(contentStream);
contentStream.beginText();
contentStream.setFont( font, fontSize );
contentStream.moveTextPositionByAmount( centeredPosition, 30 );
contentStream.drawString( "AAA" );
contentStream.endText();
contentStream.close();
pdoc.close();
}
catch (Exception e)
{
System.err.println("An exception occured in parsing the PDF Document."+ e.getMessage());
}
}
}
public static void main(String[] args)
{
List l = new ArrayList();
l = pdfviewer.Main.flist();
pdfviewer.Main.CheckPages(l);
}
}
Thanks for your attention
The code I was using above is correct.
The problem is that the PDF files being generated are version 1.2, that is the reason why I am not being allowed to Edit the pdf document.
Does anyone know what I should do if i'm using a version 1.2, since I can't really upgrade it.

you can look at the examples supplied with the library.
there are two files that are of interest to you:
1- AddImageToPDF.java AddImageToPDF.java on google code search
2- AddMessageToEachPage.java AddMessageToEachPage.java on google code search
the second one adds a message to every page but you can modify it to work with the last page only. according to the PDFBox user guide document, they should be found under the folder: src/main/java/org/apache/pdfbox/examples
I have added links on google code search in case you have trouble locating the files.
I have not worked with the library or tried the examples and I am quite sure you will need to modify the code a little to suit your needs for the location of the added line/image.
In any case, if this helps you and you get a working solution, you can add the solution so that others can benefit from it.
EDIT:
After seeing the code posted by the question author, I add a modification to make it work.
I allowed myself also to make few changes for clarity.
import java.io.File;
import java.io.FileFilter;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
public class Main {
/**
* #param args the command line arguments
*/
public static final FileFilter pdfFileFilter = new FileFilter() {
public boolean accept(File file) {
return file.isFile() && file.getName().toLowerCase().endsWith(".pdf");
}
};
public static void closeQuietly(PDDocument doc) {
if (doc != null) {
try {
doc.close();
} catch (Exception exception) {
//do something here if you wish like logging
}
}
}
public static void CheckPages(File[] sourcePdfFiles,String textToInsert, String prefix) {
for (File sourcePdfFile : sourcePdfFiles) {
PDFont font = PDType1Font.HELVETICA_BOLD;
float fontSize = 12.0f;
PDDocument pdoc = null;
try {
pdoc = PDDocument.load(sourcePdfFile);
List allPages = pdoc.getDocumentCatalog().getAllPages();
PDPage lastPage = (PDPage) allPages.get(allPages.size() - 1);
PDRectangle pageSize = lastPage.findMediaBox();
float stringWidth = font.getStringWidth(textToInsert);
float centeredPosition = (pageSize.getWidth() - (stringWidth * fontSize) / 1000f) / 2f;
PDPageContentStream contentStream = new PDPageContentStream(pdoc, lastPage, true, true);
contentStream.beginText();
contentStream.setFont(font, fontSize);
contentStream.moveTextPositionByAmount(centeredPosition, 30);
contentStream.drawString(textToInsert);
contentStream.endText();
contentStream.close();
File resultFile = new File(sourcePdfFile.getParentFile(), prefix + sourcePdfFile.getName());
pdoc.save(resultFile.getAbsolutePath());
} catch (Exception e) {
System.err.println("An exception occured in parsing the PDF Document." + e.getMessage());
} finally {
closeQuietly(pdoc);
}
}
}
public static void main(String[] args) {
File pdfFilesFolder = new File("C:\\1");
File[] pdfFiles = pdfFilesFolder.listFiles(pdfFileFilter);
//when a file is processed, the result will be saved in a new file having the location of the source file
//and the same name of source file prefixed with this
String modifiedFilePrefix = "modified-";
CheckPages(pdfFiles,"AAA", modifiedFilePrefix);
}
}

We Keep Coding

Java is a programming language and computing platform first released by Sun Microsystems in 1995.