I need to open a .doc/.dot/.docx/.dotx (I'm not picky, I just want it to work) document,
parse it for placeholders (or something similar),
insert my own data,
and then return the generated .doc/.docx/.dotx/.pdf document.
And on top of all that, I need the tools to accomplish that to be free.
I've searched around for something that would suit my needs, but I can't find anything.
Tools like Docmosis, Javadocx, Aspose etc. are commercial.
From what I've read, Apache POI is nowhere near successfully implementing this (they currently don't have any official developer working on the Word part of the framework).
The only thing that looks that could do the trick is OpenOffice UNO API.
But that is a pretty big bite for someone who has never used this API (like me).
So if I am going to jump into this, I need to make sure that I am on the right path.
Can someone give me some advice on this?
I know it's been a long time since I posted this question, and I said that I would post my solution when I finished.
So here it is.
I hope that it will help someone someday.
This is a full working class; all you have to do is put it in your application and place a TEMPLATE_DIRECTORY_ROOT directory with your .docx templates in your root directory.
Usage is very simple.
You put placeholders (keys) in your .docx file and then pass the file name and a Map containing the corresponding key-value pairs for that file; there is a short usage sketch after the class below.
Enjoy!
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URI;
import java.util.Deque;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipOutputStream;
import javax.faces.context.ExternalContext;
import javax.faces.context.FacesContext;
import javax.servlet.http.HttpServletResponse;
public class DocxManipulator {
private static final String MAIN_DOCUMENT_PATH = "word/document.xml";
private static final String TEMPLATE_DIRECTORY_ROOT = "TEMPLATES_DIRECTORY/";
/* PUBLIC METHODS */
/**
* Generates .docx document from given template and the substitution data
*
* @param templateName
*            Name of the .docx template file
* @param substitutionData
*            Map with the set of key-value pairs that represent the
*            substitution data
* @return true if the document was generated and sent successfully, false otherwise
*/
public static Boolean generateAndSendDocx(String templateName, Map<String,String> substitutionData) {
String templateLocation = TEMPLATE_DIRECTORY_ROOT + templateName;
String userTempDir = UUID.randomUUID().toString();
userTempDir = TEMPLATE_DIRECTORY_ROOT + userTempDir + "/";
try {
// Unzip .docx file
unzip(new File(templateLocation), new File(userTempDir));
// Change data
changeData(new File(userTempDir + MAIN_DOCUMENT_PATH), substitutionData);
// Rezip .docx file
zip(new File(userTempDir), new File(userTempDir + templateName));
// Send HTTP response
sendDOCXResponse(new File(userTempDir + templateName), templateName);
// Clean temp data
deleteTempData(new File(userTempDir));
}
catch (IOException ioe) {
System.out.println(ioe.getMessage());
return false;
}
return true;
}
/* PRIVATE METHODS */
/**
* Unzips the specified ZIP file to the specified directory
*
* @param zipfile
*            Source ZIP file
* @param directory
*            Destination directory
* @throws IOException
*/
private static void unzip(File zipfile, File directory) throws IOException {
ZipFile zfile = new ZipFile(zipfile);
Enumeration<? extends ZipEntry> entries = zfile.entries();
while (entries.hasMoreElements()) {
ZipEntry entry = entries.nextElement();
File file = new File(directory, entry.getName());
if (entry.isDirectory()) {
file.mkdirs();
}
else {
file.getParentFile().mkdirs();
InputStream in = zfile.getInputStream(entry);
try {
copy(in, file);
}
finally {
in.close();
}
}
}
}
/**
* Substitutes keys found in target file with corresponding data
*
* @param targetFile
*            Target file
* @param substitutionData
*            Map of key-value pairs of data
* @throws IOException
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
private static void changeData(File targetFile, Map<String,String> substitutionData) throws IOException{
BufferedReader br = null;
String docxTemplate = "";
try {
br = new BufferedReader(new InputStreamReader(new FileInputStream(targetFile), "UTF-8"));
String temp;
while( (temp = br.readLine()) != null)
docxTemplate = docxTemplate + temp;
br.close();
targetFile.delete();
}
catch (IOException e) {
br.close();
throw e;
}
Iterator substitutionDataIterator = substitutionData.entrySet().iterator();
while(substitutionDataIterator.hasNext()){
Map.Entry<String,String> pair = (Map.Entry<String,String>)substitutionDataIterator.next();
if(docxTemplate.contains(pair.getKey())){
if(pair.getValue() != null)
docxTemplate = docxTemplate.replace(pair.getKey(), pair.getValue());
else
docxTemplate = docxTemplate.replace(pair.getKey(), "NEDOSTAJE");
}
}
FileOutputStream fos = null;
try{
fos = new FileOutputStream(targetFile);
fos.write(docxTemplate.getBytes("UTF-8"));
fos.close();
}
catch (IOException e) {
fos.close();
throw e;
}
}
/**
* Zips the specified directory and all its subdirectories
*
* @param directory
*            Specified directory
* @param zipfile
*            Output ZIP file name
* @throws IOException
*/
private static void zip(File directory, File zipfile) throws IOException {
URI base = directory.toURI();
Deque<File> queue = new LinkedList<File>();
queue.push(directory);
OutputStream out = new FileOutputStream(zipfile);
Closeable res = out;
try {
ZipOutputStream zout = new ZipOutputStream(out);
res = zout;
while (!queue.isEmpty()) {
directory = queue.pop();
for (File kid : directory.listFiles()) {
String name = base.relativize(kid.toURI()).getPath();
if (kid.isDirectory()) {
queue.push(kid);
name = name.endsWith("/") ? name : name + "/";
zout.putNextEntry(new ZipEntry(name));
}
else {
if(kid.getName().contains(".docx"))
continue;
zout.putNextEntry(new ZipEntry(name));
copy(kid, zout);
zout.closeEntry();
}
}
}
}
finally {
res.close();
}
}
/**
* Sends HTTP Response containing .docx file to Client
*
* @param generatedFile
*            Path to generated .docx file
* @param fileName
*            File name of generated file that is presented to the user
* @throws IOException
*/
private static void sendDOCXResponse(File generatedFile, String fileName) throws IOException {
FacesContext facesContext = FacesContext.getCurrentInstance();
ExternalContext externalContext = facesContext.getExternalContext();
HttpServletResponse response = (HttpServletResponse) externalContext
.getResponse();
BufferedInputStream input = null;
BufferedOutputStream output = null;
response.reset();
response.setHeader("Content-Type", "application/msword");
response.setHeader("Content-Disposition", "attachment; filename=\"" + fileName + "\"");
response.setHeader("Content-Length",String.valueOf(generatedFile.length()));
input = new BufferedInputStream(new FileInputStream(generatedFile), 10240);
output = new BufferedOutputStream(response.getOutputStream(), 10240);
byte[] buffer = new byte[10240];
for (int length; (length = input.read(buffer)) > 0;) {
output.write(buffer, 0, length);
}
output.flush();
input.close();
output.close();
// Inform JSF not to proceed with rest of life cycle
facesContext.responseComplete();
}
/**
* Deletes directory and all its subdirectories
*
* @param file
*            Specified directory
* @throws IOException
*/
public static void deleteTempData(File file) throws IOException {
if (file.isDirectory()) {
// directory is empty, then delete it
if (file.list().length == 0)
file.delete();
else {
// list all the directory contents
String files[] = file.list();
for (String temp : files) {
// construct the file structure
File fileDelete = new File(file, temp);
// recursive delete
deleteTempData(fileDelete);
}
// check the directory again, if empty then delete it
if (file.list().length == 0)
file.delete();
}
} else {
// if file, then delete it
file.delete();
}
}
private static void copy(InputStream in, OutputStream out) throws IOException {
byte[] buffer = new byte[1024];
while (true) {
int readCount = in.read(buffer);
if (readCount < 0) {
break;
}
out.write(buffer, 0, readCount);
}
}
private static void copy(File file, OutputStream out) throws IOException {
InputStream in = new FileInputStream(file);
try {
copy(in, out);
} finally {
in.close();
}
}
private static void copy(InputStream in, File file) throws IOException {
OutputStream out = new FileOutputStream(file);
try {
copy(in, out);
} finally {
out.close();
}
}
}
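For completeness, here is a minimal usage sketch. The template name and the placeholder keys/values are made-up examples, not part of the class itself; also note that Word sometimes splits a placeholder across several runs inside document.xml, in which case a plain string replace like this will not find it.
import java.util.HashMap;
import java.util.Map;

public class DocxManipulatorUsageExample {

    // Call this from a JSF action method so that FacesContext is available.
    public static void generateContract() {
        Map<String, String> data = new HashMap<String, String>();
        data.put("${firstName}", "John");   // hypothetical placeholder in the template
        data.put("${totalAmount}", "1500"); // hypothetical placeholder in the template

        // Expects TEMPLATES_DIRECTORY/contract.docx relative to the working directory
        // and streams the merged document back to the browser as an attachment.
        Boolean sent = DocxManipulator.generateAndSendDocx("contract.docx", data);
        System.out.println("Document sent: " + sent);
    }
}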
Since a docx file is merely a zip-archive of xml files (plus any binary files for embedded objects such as images), we met that requirement by unpacking the zip file, feeding the document.xml to a template engine (we used freemarker) that does the merging for us, and then zipping the output document to get the new docx file.
The template document then is simply an ordinary docx with embedded freemarker expressions / directives, and can be edited in Word.
Since (un)zipping can be done with the JDK, and FreeMarker is open source, you don't incur any licence fees, not even for Word itself.
The limitation is that this approach can only emit docx or rtf files, and the output document will have the same file type as the template. If you need to convert the document to another format (such as PDF) you'll have to solve that problem separately.
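To make that concrete, here is a rough sketch of the middle step (merging word/document.xml with FreeMarker). The directory layout, the data model and the FreeMarker version constant are assumptions for illustration, not the exact code we used:
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Map;
import freemarker.template.Configuration;
import freemarker.template.Template;

public class DocumentXmlMerger {

    // 'unzippedDocxDir' is the directory the .docx was unpacked into;
    // 'model' holds the values referenced by ${...} expressions in the template.
    public static void mergeDocumentXml(File unzippedDocxDir, Map<String, Object> model) throws Exception {
        Configuration cfg = new Configuration(Configuration.VERSION_2_3_21);
        cfg.setDirectoryForTemplateLoading(new File(unzippedDocxDir, "word"));
        cfg.setDefaultEncoding("UTF-8");

        // The template is fully parsed into memory here, so we can safely
        // overwrite word/document.xml with the merged output afterwards.
        Template template = cfg.getTemplate("document.xml");

        Writer out = new OutputStreamWriter(
                new FileOutputStream(new File(unzippedDocxDir, "word/document.xml")), "UTF-8");
        try {
            template.process(model, out);
        } finally {
            out.close();
        }
        // Re-zipping the unpacked directory afterwards yields the merged .docx.
    }
}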
I ended up relying on Apache POI 3.12 and processing paragraphs (separately extracting paragraphs also from tables, headers/footers, and footnotes, as such paragraphs aren't returned by XWPFDocument.getParagraphs()).
The processing code (~100 lines) and unit tests are here on github.
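To give an idea of what that looks like, here is a minimal sketch of the core idea (this is not the code from that repository; it assumes every placeholder fits inside a single run, which real documents often violate, and it only touches the body paragraphs):
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.Map;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;

public class PoiPlaceholderSketch {

    // Replaces placeholders in body paragraphs only; tables, headers/footers
    // and footnotes need the same treatment on their own paragraph collections.
    public static void replace(String inPath, String outPath, Map<String, String> values) throws Exception {
        XWPFDocument document = new XWPFDocument(new FileInputStream(inPath));
        for (XWPFParagraph paragraph : document.getParagraphs()) {
            for (XWPFRun run : paragraph.getRuns()) {
                String text = run.getText(0);
                if (text == null) {
                    continue;
                }
                for (Map.Entry<String, String> entry : values.entrySet()) {
                    text = text.replace(entry.getKey(), entry.getValue());
                }
                run.setText(text, 0);
            }
        }
        FileOutputStream out = new FileOutputStream(outPath);
        document.write(out);
        out.close();
    }
}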
I've been in more or less the same situation as you: I had to modify a whole bunch of MS Word merge templates at once. After having googled a lot to try to find a Java solution, I finally installed Visual Studio 2010 Express, which is free, and did the job in C#.
I have recently dealt with similar problem:
"A tool which accepts a template '.docx' file, processes the file by evaluation of passed parameter context and outputs a '.docx' file as the result of the process."
Finally, God brought us scriptlet4dox :).
The key features of this product are:
1. Groovy code injection as scripts in the template file (parameter injection, etc.)
2. Looping over collection items in a table
and so many other features.
But as of my last check, the most recent commit on the project was about a year ago, so there is a chance the project is no longer maintained for new features and bug fixes. It is your choice whether to use it or not.
Related
I am having an issue with extracting files into subfolders. The zip files I need to unzip contain multiple folders, and I am trying to keep the directory structure when unzipping.
package components;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
/**
* This utility extracts files and directories of a standard zip file to
* a destination directory.
*
*
*/
public class UnzipUtility {
/**
* Size of the buffer to read/write data
*/
private static final int BUFFER_SIZE = 4096;
/**
* Extracts a zip file specified by the zipFilePath to a directory specified by
* destDirectory (which will be created if it does not exist)
* @param zipFilePath
* @param destDirectory
* @throws IOException
*/
public void unzip(String zipFilePath, String destDirectory) throws IOException {
File destDir = new File(destDirectory);
if (!destDir.exists()) {
destDir.mkdir();
}
ZipInputStream zipIn = new ZipInputStream(new FileInputStream(zipFilePath));
ZipEntry entry = zipIn.getNextEntry();
// iterates over entries in the zip file
while (entry != null) {
String filePath = destDirectory + File.separator + entry.getName();
if (!entry.isDirectory()) {
// if the entry is a file, extracts it
extractFile(zipIn, filePath);
System.out.println(filePath);
} else {
// if the entry is a directory, make the directory
File dir = new File(filePath);
System.out.println(filePath);
dir.mkdir();
}
zipIn.closeEntry();
entry = zipIn.getNextEntry();
}
zipIn.close();
}
/**
* Extracts a zip entry (file entry)
* @param zipIn
* @param filePath
* @throws IOException
*/
private void extractFile(ZipInputStream zipIn, String filePath) throws IOException {
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(filePath));
byte[] bytesIn = new byte[BUFFER_SIZE];
int read = 0;
while ((read = zipIn.read(bytesIn)) != -1) {
bos.write(bytesIn, 0, read);
}
bos.close();
}
}
The output I get from the code is below. The issue is that the file DoD-DISA-logos-as-JPEG.jpg is inside a child folder of subfolder, so the path should be folder/subfolder/childfolder/DoD-DISA-logos-as-JPEG.jpg. The code does not include that childfolder, which I believe is why it throws the IOException.
java.io.FileNotFoundException: /Users/user/Desktop/folder/subfolder/DoD-DISA-logos-as-JPEG.jpg (No such file or directory)
at java.io.FileOutputStream.open0(Native Method)
at java.io.FileOutputStream.open(FileOutputStream.java:270)
at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
at java.io.FileOutputStream.<init>(FileOutputStream.java:101)
at components.UnzipUtility.extractFile(UnzipUtility.java:62)
at components.UnzipUtility.unzip(UnzipUtility.java:42)
at components.UnzipUtilityTest.main(UnzipUtilityTest.java:9)
You should not rely on entry.isDirectory() or on the fact that the ZipEntry iterator will list all the folder hierarchies.
A zip file's structure may contain only a file ZipEntry whose name includes a subfolder hierarchy. E.g. foo/bar/file.txt may appear as a ZipEntry while neither foo/ nor foo/bar/ does.
Thus for:
a file, you should get the parent directory and create the whole hierarchy, if it does not exist, with File#mkdirs
a directory, you should create the whole hierarchy, if it does not exist, with File#mkdirs
Your code should be transformed to something like this:
if (!entry.isDirectory()) {
// if the entry is a file, extracts it
new File(filePath).getParentFile().mkdirs();
extractFile(zipIn, filePath);
System.out.println(filePath);
} else {
// if the entry is a directory, make the directory
File dir = new File(filePath);
System.out.println(filePath);
dir.mkdirs();
}
I recommend to use an external library to zip/unzip like zt-zip.
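For instance, unpacking with zt-zip is essentially a one-liner (the paths below are placeholders):
import java.io.File;
import org.zeroturnaround.zip.ZipUtil;

public class ZtZipExample {
    public static void main(String[] args) {
        // Unpacks the whole archive, creating any intermediate directories itself.
        ZipUtil.unpack(new File("/path/to/archive.zip"), new File("/path/to/output-dir"));
    }
}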
I am trying to unzip a compressed folder using the java.util.zip package.
My compressed folder structure is as follows:
My compressed folder name is classes.zip.
Inside this zip folder I have a classes folder, and inside that I have subfolders as well as files.
If you go further inside the www folder, it again has subfolders which form a Java package, and inside the package-structured folders I have .class files.
Now I am trying to unzip this compressed folder and my code is:
package www.eor.com;
/**
* A console application that tests the UnzipUtility class
*
*/
public class UnzipUtilityTest {
public static void main(String[] args) {
String zipFilePath = "D:/classes.zip";
String destDirectory = "D:/Dojo";
UnzipUtility unzipper = new UnzipUtility();
try {
unzipper.unzip(zipFilePath, destDirectory);
} catch (Exception ex) {
// some errors occurred
ex.printStackTrace();
}
}
}
and the supporting class is:
package www.eor.com;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
/**
* This utility extracts files and directories of a standard zip file to
* a destination directory.
*/
public class UnzipUtility {
/**
* Size of the buffer to read/write data
*/
private static final int BUFFER_SIZE = 4096;
/**
* Extracts a zip file specified by the zipFilePath to a directory specified by
* destDirectory (which will be created if it does not exist)
* @param zipFilePath
* @param destDirectory
* @throws IOException
*/
public void unzip(String zipFilePath, String destDirectory) throws IOException {
File destDir = new File(destDirectory);
if (!destDir.exists()) {
destDir.mkdir();
}
ZipInputStream zipIn = new ZipInputStream(new FileInputStream(zipFilePath));
ZipEntry entry = zipIn.getNextEntry();
// iterates over entries in the zip file
while (entry != null) {
String filePath = destDirectory + File.separator + entry.getName();
if (!entry.isDirectory()) {
// if the entry is a file, extracts it
extractFile(zipIn, filePath);
} else {
// if the entry is a directory, make the directory
File dir = new File(filePath);
dir.mkdir();
}
zipIn.closeEntry();
entry = zipIn.getNextEntry();
}
zipIn.close();
}
/**
* Extracts a zip entry (file entry)
* @param zipIn
* @param filePath
* @throws IOException
*/
private void extractFile(ZipInputStream zipIn, String filePath) throws IOException {
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(filePath));
byte[] bytesIn = new byte[BUFFER_SIZE];
int read = 0;
while ((read = zipIn.read(bytesIn)) != -1) {
bos.write(bytesIn, 0, read);
}
bos.close();
}
}
Now when I run the UnzipUtilityTest class it gives me this exception:
java.io.FileNotFoundException: D:\Dojo\classes\camel-config-xml.xml (The system cannot find the path specified)
at java.io.FileOutputStream.open(Native Method)
at java.io.FileOutputStream.<init>(FileOutputStream.java:221)
at java.io.FileOutputStream.<init>(FileOutputStream.java:110)
at www.cognizant.com.UnzipUtility.extractFile(UnzipUtility.java:59)
at www.cognizant.com.UnzipUtility.unzip(UnzipUtility.java:41)
at www.cognizant.com.UnzipUtilityTest.main(UnzipUtilityTest.java:16)
Why is it giving this exception and how can I rectify this problem?
It's probably because the parent folder of the file (the classes/... directory) does not exist, so the file cannot be created in it.
When you extract entries of a zip, you have to create the parent folders for the file. A zip does not necessarily contain entries for every single folder. But every entry in a zip is of the form path/to/entry/filename.ext, so you can derive the parent path of the entry and create the parent folders accordingly.
So before extracting the file, do:
new File(filePath).getParentFile().mkdirs();
I am trying to export a whole repository in zip format. I am not able to find anything related to exporting a folder or repository in SVNKit.
I have never tried zipping the whole repository using org.tmatesoft.svnkit, but I have a utility that zips files from a local folder. So if you copy the remote repository locally (into a temp directory), you should be able to zip it using the following class. It needs two values: the starting folder path and the full zip file name.
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
public class ZipFolder {
private String startingFolderPath;
private String outputZipFilePath;
private List<String> fileList;
/**
* Zip files
*/
public void zipFiles(){
byte[] buffer = new byte[1024];
try{
if (fileList == null) {
fileList = new ArrayList<String>();
generateFileList(new File(startingFolderPath));
}
System.out.println("File Count:" + fileList.size());
FileOutputStream fos = new FileOutputStream(outputZipFilePath);
ZipOutputStream zos = new ZipOutputStream(fos);
System.out.println("Output to Zip : " + outputZipFilePath);
for(String file : fileList){
System.out.println("File Included : " + file);
ZipEntry ze= new ZipEntry(file);
zos.putNextEntry(ze);
FileInputStream in =
new FileInputStream(startingFolderPath + File.separator + file);
int len;
while ((len = in.read(buffer)) > 0) {
zos.write(buffer, 0, len);
}
in.close();
// close the current entry before moving on to the next file
zos.closeEntry();
}
//remember close it
zos.close();
System.out.println("Done");
}catch(IOException ex){
ex.printStackTrace();
}
}
/**
* Traverse a directory and get all files,
* and add the file into fileList
* @param node file or directory
*/
public void generateFileList(File node) throws IOException {
//add file only
if(node.isFile()){
fileList.add(generateZipEntry(node.getAbsoluteFile().toString()));
}
if(node.isDirectory()){
String[] subNote = node.list();
for(String filename : subNote){
generateFileList(new File(node, filename));
}
}
}
/**
* Format the file path for zip
* @param file file path
* @return Formatted file path
*/
private String generateZipEntry(String file){
return file.substring(startingFolderPath.length()+1, file.length());
}
public void setStartingFolderPath(String startingFolderPath) {
this.startingFolderPath = startingFolderPath;
}
public void setOutputZipFilePath(String outputZipFilePath) {
this.outputZipFilePath = outputZipFilePath;
}
}
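A minimal usage sketch (the paths are placeholders; it assumes the repository content has already been copied or exported to a local folder):
public class ZipFolderExample {
    public static void main(String[] args) {
        ZipFolder zipFolder = new ZipFolder();
        // Local copy of the repository content (e.g. a checkout or export).
        zipFolder.setStartingFolderPath("/tmp/repo-export");
        // Full path of the zip file to create.
        zipFolder.setOutputZipFilePath("/tmp/repository.zip");
        zipFolder.zipFiles();
    }
}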
I'm working on an application that needs to create a tar archive in order to calculate its hash.
But I encounter some problems:
the tar is not the same on different machines, so the calculated hash is different
I'm not able to add directories properly
If I add a zip file, at the end the tar contains the contents of my zip file :/
I have read different posts on SO and the dedicated tutorial on Apache, and also the test source code of the Apache Commons Compress jar, but I can't get to the right solution.
Can anybody find where my code is incorrect?
public static File createTarFile(File[] files, File repository) {
File tarFile = new File(TEMP_DIR + File.separator + repository.getName() + Constants.TAR_EXTENSION);
if (tarFile.exists()) {
tarFile.delete();
}
try {
OutputStream out = new FileOutputStream(tarFile);
TarArchiveOutputStream aos = (TarArchiveOutputStream) new ArchiveStreamFactory().createArchiveOutputStream("tar", out);
for(File file : files){
Utilities.addFileToTar(aos, file, "");
}
aos.finish();
aos.close();
out.close();
} catch (Exception e) {
e.printStackTrace();
}
return tarFile;
}
private static void addFileToTar(TarArchiveOutputStream tOut, File file, String base) throws IOException {
TarArchiveEntry entry = new TarArchiveEntry(file, base + file.getName());
entry.setModTime(0);
entry.setSize(file.length());
entry.setUserId(0);
entry.setGroupId(0);
entry.setUserName("avalon");
entry.setGroupName("excalibur");
entry.setMode(0100000);
entry.setSize(file.length());
tOut.putArchiveEntry(entry);
if (file.isFile()) {
IOUtils.copy(new FileInputStream(file), tOut);
tOut.closeArchiveEntry();
} else {
tOut.closeArchiveEntry();
File[] children = file.listFiles();
if (children != null) {
for (File child : children) {
addFileToTar(tOut, child, file.getName());
}
}
}
}
Thank you.
I finally found the solution after reading caarlos0's post: Encoding problem when compressing files with Apache Commons Compression on Linux
Using the Apache Commons Compress 1.8 library, I have made a utility class that does the job:
You can find this code here : GitHub repository of the library MakeTar
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Arrays;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.utils.IOUtils;
/**
* The Class TarArchive.
*/
public class TarArchive {
/**
* Creates the tar of files.
*
* @param files the files
* @param tarPath the tar path
* @throws IOException Signals that an I/O exception has occurred.
*/
public static void createTarOfFiles(String[] files, String tarPath) throws IOException
{
FileOutputStream fOut = null;
BufferedOutputStream bOut = null;
TarArchiveOutputStream tOut = null;
Arrays.sort(files);
try
{
fOut = new FileOutputStream(new File(tarPath));
bOut = new BufferedOutputStream(fOut);
tOut = new TarArchiveOutputStream(bOut);
for (String file : files) {
addFileToTar(tOut, file, "");
}
}
finally
{
tOut.finish();
tOut.close();
bOut.close();
fOut.close();
}
}
/**
* Creates the tar of directory.
*
* @param directoryPath the directory path
* @param tarPath the tar path
* @throws IOException Signals that an I/O exception has occurred.
*/
public static void createTarOfDirectory(String directoryPath, String tarPath) throws IOException
{
FileOutputStream fOut = null;
BufferedOutputStream bOut = null;
TarArchiveOutputStream tOut = null;
try
{
fOut = new FileOutputStream(new File(tarPath));
bOut = new BufferedOutputStream(fOut);
tOut = new TarArchiveOutputStream(bOut);
addFileToTar(tOut, directoryPath, "");
}
finally
{
tOut.finish();
tOut.close();
bOut.close();
fOut.close();
}
}
/**
* Adds the file to tar.
*
* @param tOut the tar output stream
* @param path the path of the file or directory to add
* @param base the base path inside the archive
* @throws IOException Signals that an I/O exception has occurred.
*/
private static void addFileToTar(TarArchiveOutputStream tOut, String path, String base) throws IOException
{
File f = new File(path);
String entryName = base + f.getName();
TarArchiveEntry tarEntry = new TarArchiveEntry(f, entryName);
tOut.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
if(f.isFile())
{
tarEntry.setModTime(0);
tOut.putArchiveEntry(tarEntry);
IOUtils.copy(new FileInputStream(f), tOut);
tOut.closeArchiveEntry();
}
else
{
File[] children = f.listFiles();
if(children != null)
{
    // sort for a deterministic entry order before recursing
    Arrays.sort(children);
for(File child : children)
{
addFileToTar(tOut, child.getAbsolutePath(), entryName + "/");
}
}
}
}
}
Thanks for reading.
EDIT: Small correction, I have added sorting of the arrays.
EDIT 2: I have corrected the code so that the same archive is produced on all machines. The hash calculated on the archive is now the same everywhere.
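For what it's worth, a minimal usage sketch of the class above (the paths are placeholders):
import java.io.IOException;

public class TarArchiveExample {
    public static void main(String[] args) throws IOException {
        // Tars a whole directory tree; the sorted entry order and zeroed
        // modification times are what keep the archive identical across machines.
        TarArchive.createTarOfDirectory("/tmp/my-project", "/tmp/my-project.tar");
    }
}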
I have some PDF files. Using PDFBox I have converted them into text and stored them in text files. Now from the text files I want to remove:
Hyperlinks
All special characters
Blank lines
Headers and footers of the PDF files
“1)”, “2)”, “a)”, bullets, etc.
I want to get valid text line by line like this:
We propose OntoGain, a method for ontology learning from multi-word concept terms extracted from plain text. OntoGain follows an ontology learning process defined by distinct processing layers. Building upon plain term extraction a concept hierarchy is formed by clustering the extracted concepts. The derived term taxonomy is then enriched with non-taxonomic relations. Several different state-of-the-art methods have been examined for implementing each layer. OntoGain is based upon multi-word term concepts, as multi-word or compound terms are vested with more solid and distinctive semantics than plain single word terms. We opted for a hierarchical clustering method and Formal Concept Analysis (FCA) algorithm for building the term taxonomy. Furthermore an association rule algorithm is applied for revealing non-taxonomic relations. A method which tries to carry out the most appropriate generalization level between a relation's concepts is also implemented. To show proof of concept, a system prototype is implemented. The OntoGain allows transformation of the derived ontology into OWL using Jena Semantic Web Framework. OntoGain is applied on two separate data sources a medical and computer corpus and its results are compared with similar results obtained by Text2Onto, a state-of-the-art-ontology learning method. The analysis of 11.5 CCD1.1 results indicates that OntoGain performs better than Text2Onto in terms of precision extracts more correct concepts while being more selective extracts fewer but more reasonable concepts.
How can I achieve this?
Using PDFBox we can achieve this.
Example:
public static void main(String args[]) {
PDFParser parser = null;
PDDocument pdDoc = null;
COSDocument cosDoc = null;
PDFTextStripper pdfStripper;
String parsedText;
String fileName = "E:\\Files\\Small Files\\PDF\\JDBC.pdf";
File file = new File(fileName);
try {
parser = new PDFParser(new FileInputStream(file));
parser.parse();
cosDoc = parser.getDocument();
pdfStripper = new PDFTextStripper();
pdDoc = new PDDocument(cosDoc);
parsedText = pdfStripper.getText(pdDoc);
System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", ""));
} catch (Exception e) {
    e.printStackTrace();
} finally {
    // close the documents whether or not the extraction succeeded
    try {
        if (pdDoc != null)
            pdDoc.close();
        if (cosDoc != null)
            cosDoc.close();
    } catch (Exception e1) {
        e1.printStackTrace();
    }
}
}
We can extract text from PDF files using Apache Tika.
Example:
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
public class WebPagePdfExtractor {
public Map<String, Object> processRecord(String url) {
DefaultHttpClient httpclient = new DefaultHttpClient();
Map<String, Object> map = new HashMap<String, Object>();
try {
HttpGet httpGet = new HttpGet(url);
HttpResponse response = httpclient.execute(httpGet);
HttpEntity entity = response.getEntity();
InputStream input = null;
if (entity != null) {
try {
input = entity.getContent();
BodyContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
AutoDetectParser parser = new AutoDetectParser();
ParseContext parseContext = new ParseContext();
parser.parse(input, handler, metadata, parseContext);
map.put("text", handler.toString().replaceAll("\n|\r|\t", " "));
map.put("title", metadata.get(TikaCoreProperties.TITLE));
map.put("pageCount", metadata.get("xmpTPg:NPages"));
map.put("status_code", response.getStatusLine().getStatusCode() + "");
} catch (Exception e) {
e.printStackTrace();
} finally {
if (input != null) {
try {
input.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
} catch (Exception exception) {
exception.printStackTrace();
}
return map;
}
public static void main(String arg[]) {
WebPagePdfExtractor webPagePdfExtractor = new WebPagePdfExtractor();
Map<String, Object> extractedMap = webPagePdfExtractor.processRecord("http://math.about.com/library/q20.pdf");
System.out.println(extractedMap.get("text"));
}
}
You can use iText to do such things.
//iText imports
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
for example:
try {
PdfReader reader = new PdfReader(INPUTFILE);
int n = reader.getNumberOfPages();
String str=PdfTextExtractor.getTextFromPage(reader, 2); //Extracting the content from a particular page.
System.out.println(str);
reader.close();
} catch (Exception e) {
System.out.println(e);
}
Another example:
try {
PdfReader reader = new PdfReader("c:/temp/test.pdf");
System.out.println("This PDF has "+reader.getNumberOfPages()+" pages.");
String page = PdfTextExtractor.getTextFromPage(reader, 2);
System.out.println("Page Content:\n\n"+page+"\n\n");
System.out.println("Is this document tampered: "+reader.isTampered());
System.out.println("Is this document encrypted: "+reader.isEncrypted());
} catch (IOException e) {
e.printStackTrace();
}
The above examples only extract the text; you will need to do some more work to remove hyperlinks, bullets, headings, and numbers.
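As a rough sketch of that extra cleanup step (the regular expressions below are illustrative assumptions, not a complete solution):
public class ExtractedTextCleaner {

    // Strips hyperlinks, list markers such as "1)" or "a)", bullets, remaining
    // special characters and blank lines from already-extracted text.
    public static String clean(String extractedText) {
        return extractedText
                .replaceAll("https?://\\S+", " ")
                .replaceAll("(?m)^\\s*(\\d+\\)|[a-zA-Z]\\)|[\\u2022\\u25AA-])\\s*", "")
                .replaceAll("[^\\p{L}\\p{N}.,;:()'\"\\s-]", " ")
                .replaceAll("(?m)^[ \\t]*\\r?\\n", "");
    }
}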
For newer versions of Apache PDFBox, here is the example from the original source:
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.examples.util;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.text.PDFTextStripper;
/**
* This is a simple text extraction example to get started. For more advance usage, see the
* ExtractTextByArea and the DrawPrintTextLocations examples in this subproject, as well as the
* ExtractText tool in the tools subproject.
*
* @author Tilman Hausherr
*/
public class ExtractTextSimple
{
private ExtractTextSimple()
{
// example class should not be instantiated
}
/**
* This will print the document's text page by page.
*
* @param args The command line arguments.
*
* @throws IOException If there is an error parsing or extracting the document.
*/
public static void main(String[] args) throws IOException
{
if (args.length != 1)
{
usage();
}
try (PDDocument document = PDDocument.load(new File(args[0])))
{
AccessPermission ap = document.getCurrentAccessPermission();
if (!ap.canExtractContent())
{
throw new IOException("You do not have permission to extract text");
}
PDFTextStripper stripper = new PDFTextStripper();
// This example uses sorting, but in some cases it is more useful to switch it off,
// e.g. in some files with columns where the PDF content stream respects the
// column order.
stripper.setSortByPosition(true);
for (int p = 1; p <= document.getNumberOfPages(); ++p)
{
// Set the page interval to extract. If you don't, then all pages would be extracted.
stripper.setStartPage(p);
stripper.setEndPage(p);
// let the magic happen
String text = stripper.getText(document);
// do some nice output with a header
String pageStr = String.format("page %d:", p);
System.out.println(pageStr);
for (int i = 0; i < pageStr.length(); ++i)
{
System.out.print("-");
}
System.out.println();
System.out.println(text.trim());
System.out.println();
// If the extracted text is empty or gibberish, please try extracting text
// with Adobe Reader first before asking for help. Also read the FAQ
// on the website:
// https://pdfbox.apache.org/2.0/faq.html#text-extraction
}
}
}
/**
* This will print the usage for this document.
*/
private static void usage()
{
System.err.println("Usage: java " + ExtractTextSimple.class.getName() + " <input-pdf>");
System.exit(-1);
}
}
Extracting all the text from a PDF file (from a web page or on your local machine) or from a Base64-encoded string:
import org.apache.commons.codec.binary.Base64;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
public class WebPagePdfExtractor {
public static void main(String arg[]) {
WebPagePdfExtractor webPagePdfExtractor = new WebPagePdfExtractor();
System.out.println("From file: " + webPagePdfExtractor.processRecord(createByteArray()).get("text"));
System.out.println("From string: " + webPagePdfExtractor.processRecord(getArrayFromBase64EncodedString()).get("text"));
}
public Map<String, Object> processRecord(byte[] byteArray) {
Map<String, Object> map = new HashMap<>();
try {
PDFTextStripper stripper = new PDFTextStripper();
stripper.setSortByPosition(false);
stripper.setShouldSeparateByBeads(true);
PDDocument document = PDDocument.load(byteArray);
String text = stripper.getText(document);
map.put("text", text.replaceAll("\n|\r|\t", " "));
} catch (Exception exception) {
exception.printStackTrace();
}
return map;
}
private static byte[] getArrayFromBase64EncodedString() {
String encodedContent = "data:application/pdf;base64,JVBERi0xLjMKJcTl8uXrp/Og0MTGCjQgMCBvYmoKPDwgL0xlbmd0aCA1IDAgUiAvRmlsdGVyIC9GbGF0ZURlY29kZSA+PgpzdHJlYW0KeAGF0E0OgjAQBeA9p3hL3UCHlha2Gg9A0sS1AepPxIDl/rFFErVESDddvPlm8nqU6EFpzARjBCVkLHNkipBzPBsc8UCyt4TKgmCr/9HI+GDqg2x8Luzk8UtfYwX5DVWLnQaLmd+qHTsF3V5QEekWidZuDNpgc7L1FvqGg35fOzPlqslFYJrzZdnkq6YI77TXtrs3GBo7oKvNss9mfhT0IAV+e6CUL5pSTWb0t1tVBKbI5McsXxNmciYKZW5kc3RyZWFtCmVuZG9iago1IDAgb2JqCjE4NQplbmRvYmoKMiAwIG9iago8PCAvVHlwZSAvUGFnZSAvUGFyZW50IDMgMCBSIC9SZXNvdXJjZXMgNiAwIFIgL0NvbnRlbnRzIDQgMCBSIC9NZWRpYUJveCBbMCAwIDU5NSA4NDJdC" +
"j4+CmVuZG9iago2IDAgb2JqCjw8IC9Qcm9jU2V0IFsgL1BERiAvVGV4dCBdIC9Db2xvclNwYWNlIDw8IC9DczEgNyAwIFIgL0NzMiA4IDAgUiA+PiAvRm9udCA8PAovVFQxIDkgMCBSID4+ID4+CmVuZG9iagoxMCAwIG9iago8PCAvTGVuZ3RoIDExIDAgUiAvTiAxIC9BbHRlcm5hdGUgL0RldmljZUdyYXkgL0ZpbHRlciAvRmxhdGVEZWNvZGUgPj4Kc3RyZWFtCngBhVVdaBxVFD67c2cDEgcftA0ttIM/bQnpMolWE4u12026SRO362ZTmyrKdHY2O81kZpyZ3SahT6XgmxYE6augPsaCCLYqNi/2paXFkko1DwoRWowgKH1S8Dsz22R2QTLDnfnuueeee8537rmXqOtv3fPstEo054R+oZybPjl9Su26TWlSqJvw6Ebg5UqlCcaO65j8b38e3qUUS+7sZ1vtY1v25KoZGNC6huZWA2OOKKURZWqG54dEXZcgHzwbeoxvAz85WynngdeAldZcQHqqYDqmbxlqwdcX1JLv1i" +
"w76etW42xjy2fObrCv/OxG6w5mJ8fx74XPF0xnahJ4H/CSoY8w7gO+27ROFGOcTnvhkXKsn842ZqdyLfnJmn90qiW/UG+MMs4SpZcW65U3gJ8AXnVOF4+39Ndn3XG200Mk9RhB/hTws8Ba3RzjPKnAFd8tsz7Lw6o5PAL8MvAlKxyrAMO+9EPQnGQ5sKDFep79xFoie0Y/VgLeBnzItAu8FuyIiheW2OYg8LxjF3ktxC4um0EUL2IXP4X1ymisL6dDv8JznyaS99Sso2PA4EQerfujLIc/cujZ0d56EXjJb5Q59j3Aa7o/UgCGzcxjVX2YeX4BeIBOpHQyyaXT+Brk0L+INyCLmhHyyMdYDX2bCtBw0Hz0DGgVgHRaAColtEz0WCeeo1IVPZVmollBhNjK/ahvUH7Xp9SAtE7rkNaBXqNfIsk8/Upz6OchbWBspsNuHl44tAgP2BO2+aBl0xXbhSaeRzsoJsQrYlAMkSpeFYfFITEM6ZA4GM2JvU/6zn4+2LD0LtZN+r4MDkKsZ8MzB6xwNAE8+AfrzkaaCbYu7mjs87yP3j/vv2MZtz74s429APoxJ7/BogtrJiXmXj/3TU/CQ3VFfPXWne7r5+h4MktR3qqdWZLX5PvyCr735NWkDflneRXvvbZcPcoL/5O5zSFGO5LNQc48m1G0ccYbwCG4qUVz9rdZTLLptmK0YMlClJ2ruP/LCfPDPLexUnMu7vC8tz9jNs33ig+LdL5Pu6y" +
"ta59oP2p/aCvax0C/Sx9KX0rfSlekq9INUqVr0rL0nfS99Ln0NXpfQLosXenYSXHsG7sHfsZ71mjtMGaGsxQQ88LazApLH/F3BmOb+TOh1V4Dnbt/Yy3liLJTeUYZVnYrzykTSq9yQDmsbFcG0PqVUWUvRnZusGRjPc6AhX+SZ4umI67iPLFXdbDnw0sd76ZfXMPWhjXYST0Ontnapg6vEVe/FVVjvDtdnAY6TSFii84ich86nB8nqv7O2VyTODVSb+KUsMQu0S/GWjWYEwdQheNt9TjIVZoZyQxncqRmejNDmf7MMcZRrNH5ktmL0SF8RxLeM8sx/5s1xGcY7x3mqAlso4dbKzTncd8R5V1vwbdm6qE6oGkvqTlcr6Y65hjZPlW3bTUaClTfDEy/aVazxHc3zyP66/XoTk5tu2E0/GYso1TqJtF/t4+TNAplbmRzdHJlYW0KZW5kb2JqCjExIDAgb2JqCjExMTYKZW5kb2JqCjcgMCBvYmoKWyAvSUNDQmFzZWQgMTAgMCBSIF0KZW5kb2JqCjEyIDAgb2JqCjw8IC9MZW5ndGgg" +
"MTMgMCBSIC9OIDMgL0FsdGVybmF0ZSAvRGV2aWNlUkdCIC9GaWx0ZXIgL0ZsYXRlRGVjb2RlID4+CnN0cmVhbQp4AYVVW4gbVRj+kznJCrvO09rVLaRDvXQpu0u2Fd2ltJpbk7RrGrLZ1RZBs5OTZMzsJM5M0gt9KoLii6u+SUG8vS0IgtJ6wdYH+1KpUFZ36yIoPrR4QSj0RbfxO5NkJllqm2XPfPP93/lv558ZooG1Qr2u+xWiJcM2c8mo8tzRY8rAOvnpIRqkURosqFY9ks3OEn5CK679v1s/kE8wVyfubO9Xb7kbLHJLJfLdB75WtNQl4BNEgbNq3bSJBobBTx+36wKLHIZNJAj8osDlNoaNhhfb+DVHk8/FoDkLLKuVQhF4BXh8sYcv9+B2DlDAT5Ib3NRURfQia9ZKms4dQ3u5h7lHeTe4pDdQs/PbgXXIqs4dxnUMtb9SLMQFngReUQuJOeBHgK81tYVMB9+u29Ec8GNE/p2N6nwEeDdwqmQenAeGH79ZaaS6+J1Tlfyz4LeB/8ZYzBzp7F1TrRh6STvB367wtOhviEhSN" +
"DudB4Yf6YBZywk9cpBKRR5PAI8Dv16tHRY5wKf0mdWcE7zIZ+1UJSbyFPzllwqHssCjwL9yPSn0iCX9W7eznRxYyNAzIi5isTi3nHrhh4XsSj4FHnGZbpv5zl62XNIOpjv6TypmSvBi77W67swocgv4zUZO1I5YgcmCmUgCw2cgy4150U+Bm7TgKxCnGi1iVcmgTVIoR0mK4lonE5YSaaSD4bByMBx3Xc2Es8+iKniNmo7Nwpp1lO2dXa1CZbAGXXe0KsVCH1EDnir0B9iK61OhGO4a4Mr/46edy42OnxobYWG2F//72Czbz6bZDCnsKfY0O8DiYGfYPtd3Fnu6FYl8biBK28/LiMgd3QJqv4gabSpg/QWKGlmuh76uLI82xjzLGfMFTb3yxt89vdKws+oqJvo6euRePQ/8FrgeWMW6HthwfSiBnwIb+FtHb7xaap6902VxUhpOtNan23oWXVUElerOziV0QUPNvKfmiV4fl05/+aAXbZWde/7q0KXTJWN51GNFF/irmVsZOjPuseEfw3+GV8PvhT8M/y69LX0qfSWdlz6XLpMiXZ" +
"AuSl9L30ofS1+4+rvNkHv2JDIXcyXyFtPVrbC315hYOSpvlx+W4/IO+VF51lUp8og8JafkXbBsd8/Nm2+lt3L05Siidftz51jiWdFcTzgD3/2YAM2L2DcD88hYo+PwaaLfYt4MOglt75PXqYiF2BRLb5nuaTHzXd/BRDAejJAS3B2cCU4FDwncfZaDu2CbwZrozQ3z4Sr6KuU2PyG+JxSr1U+aWrliK3vC4SeVCD59XEkb6uS4UtB1xTFZisktbjZ5cZLEd1PsI7qZc76Hvm1XPM5+hmj/X3j3fe9xxxpEKxbRyOMeN4Z35QPvEp17Qm2YzbY/8vm+I7JKe/c4976hKN5fP7daN/EeG3iLaPPNVuuf91utzQ/gf4Pogv4foJ98VQplbmRzdHJlYW0KZW5kb2JqCjEzIDAgb2JqCjEwNzkKZW5kb2JqCjggMCBvYmoKWyAvSUNDQmFzZWQgMTIgMCBSIF0KZW5kb2JqCjMgMCBvYmoKPDwgL1R5cGUgL1BhZ2VzIC9NZWRpYUJveCBbMCAwIDU5NSA4NDJdIC9Db3VudCAxIC9LaWR" +
"zIFsgMiAwIFIgXSA+PgplbmRvYmoKMTQgMCBvYmoKPDwgL1R5cGUgL0NhdGFsb2cgL1BhZ2VzIDMgMCBSID4+CmVuZG9iago5IDAgb2JqCjw8IC9UeXBlIC9Gb250IC9TdWJ0eXBlIC9UcnVlVHlwZSAvQmFzZUZvbnQgL0NOVFpYVStNZW5" +
"sby1SZWd1bGFyIC9Gb250RGVzY3JpcHRvcgoxNSAwIFIgL0VuY29kaW5nIC9NYWNSb21hbkVuY29kaW5nIC9GaXJzdENoYXIgMzIgL0xhc3RDaGFyIDExNiAvV2lkdGhzIFsgNjAyCjAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgNjAyIDYwMiA2MDIgNjAyIDYwMiA2MDIgMCAwIDAgMCAwIDAgMCAwIDAKMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgMCAwIDAgNjAyIDAgMAo2MDIgNjAyIDYwMiA2MDIgNjAyIDYwMiAwIDAgNjAyIDYwMiAwIDAgNjAyIDAgMCA2MDIgNjAyIF0gPj4KZW5kb2JqCjE1IDAgb2JqCjw8IC9UeXBlIC9Gb250RGVzY3JpcHRvciAvRm9udE5hbWUgL0NOVFpYVStNZW5sby1SZWd1bGFyIC9GbGFncyAzMyAvRm9udEJCb3gKWy01NTggLTM3NSA3MTggMTA0MV0g" +
"L0l0YWxpY0FuZ2xlIDAgL0FzY2VudCA5MjggL0Rlc2NlbnQgLTIzNiAvQ2FwSGVpZ2h0IDcyOQovU3RlbVYgOTkgL1hIZWlnaHQgNTQ3…/ZfICj5JcLdi/ATmQZKogDPg0lIDBunI0ZGOB1OB/Lpyce1TbJqCpBThycVs3GyQPZSLKexbMGyFss8LF4sNb2lElu5HPlJ2439G1jKsbRh6cTyPNpx8I6AFxa8P+xD2E4e/G+5PqJ/8aDzERFvGBJR/WLkfwcM3kRCiZpokDMdxhn5MeD9Rn5MSm0mYUpLSF98J5HXaQgtpJvoDWGesEe4C4NgK3woWsQ88RgzszXsMM4WyALeIC5gO5B/FYk/pNxVCJGoZT8NYc8LIknrONeVQYznus51pYeZHCaXw+RYIJLAEogJfMEbVPrvv31S6icvTMlp1EQhO41cOuXb0EEkSYkmGaMXSzuIfhCKAA4Y/YScTs9ASizblWVyWB1UT4fwNfSp9+mgwLFd4oI3D++9++kuheYWpOnEeBhLJrv7kVg" +
"Xk1hkVDRExLgkieUZTTt1jZYGkTTiXU8tULUtIsEIfeKMgY5AV3u7yZyTQdK6Mm923fwgHe1GZWTfmCJy5CYi05PgwqWzB5HBw2n2wL7OBEmVPZxmZYpWi6TSU7pM2BNY1kojs0sLN1bPOLZ4/nuzL1CNp/S+zt27dx+lA4Y/2/hA1fq8kR9kZF57u6R96YgvZRmsvfe5OBj5TSKjkN+wRqu6LrRF1yjF19lbYhudDVKTdVe/8DAClihbX6MNEuItofH9kF9k+FwXMofC7rqCDHcZu293G7tz0qmNWi2iM6FvYrYN2RuEvCbT7GDnZ0xDyMZt/Otb8z+aP+/dOS17927ZurVu24ZVnrYFz7w95jxlayE+8b3Nf/m6b5/j2QMb1v26qeXZ8iWVSUkH7fYLb1bKBw7aA57LYgVGYAGtLM8dT3WgIwC6PAIaVSOjsCaUatXEFiJKBm0fvTEQODe0K9Mki/mK3DPnBOUsHkchH/ckhFIHZJmyrE6T0+TIFi7xfvRjx/X33jves5rFBb6Gk4GsHXwbLX1Hlp0XZZeKa8eRYe4EURUX3agy" +
"1RnXWxp1QiNZo2tS7baBjUTYqDqBGONtspI7UEwosSsoMUVevAM5CEO9mmRVEquF/ExwsrxOCTd7OpKnpXxFjfzzO8uPTnz44Oydb7bunLy1kHXu5huMBt59vYvfsNtPZmb4tjfvdblQGjXI23jUayTpg9w5VfFRjer4RqP6DRGPs/ViY3iDscmVYCN9dQkqKZaGxbuMga6uwBXZeYLq/MKI6jShPq0DqDNBUBg0Wy2C0y6YjMSRGU4TJKslPKhYuJS7fkL7u+m7F33yzc3PeOBb6qSWsZv4Zys3bVq5as0atu+gK5Ff4ldLH+d3vvuW36bL6Ab6LF0X37Pw4I4dB//4+z0+RZ8y306xEuNGP7LI3V+tItF2baRBRfZHqurNjjr7O3H1fdrMTZE6GilG6dWSNt8uStbh/Y03u9AkM1G3snI7rtwMyCKWd2DKMeegZ6W749Lj0+3pjvSEZtJMm4VmdbNme3hzRHNkc1RztH4m7rJ3Q4OzB5uc2XpE9M0eOOh+mi1LoNfdwlGfQtuwV159duGWPfTAgfv/VP3GBz98d4eu2jirfca81uK6o8P62oWsJxaXLT57sN/4npUtpY/8eXvr4bhVzwwa6E9MnDIlc2PQditxr2bMIowYLdLd0YxYouv1lvqQJn0bfREiRCIJo0xmzeg43Ju8NTk2KIaDe0ynpqxeHlEd5ixUx790gazCdL9/QFPpiWvX3y/byg1ramr" +
"q6mpq1sAZYeQ/utYVTaP3Uys10cHTuOaj8xfPdV44L/uSzE8Jyt6K/GQFIyKGKSUIChgEY09jScPcUUJf02C4DCNRShuL32qS0zOo1WGVZIMYbEXZ2QmaSVamWRUUnlgS+DzknT3F7eWPHpnBf+Dnqf3GR3d82g1ran4XItRPl744dl/OW8nJNIeGUS118782Lt3lWyT72RH08USUUxgZiFIyUm3IfonWkxf10mG1EKYioUzSGTQW47mhHYGhHZmKAVzJDKD60b9lA0aJxFHZyWSndqBKs8TEM3Mn0JV8hZ930uRdf5IsTZPnz/UG0uCMd6JfTq9lefDRornXFke7E6O0tpjEUDDXhYWH1tvC6w2AlmgzHEk63D8xikjaUZLZ7BiNhtjRqy10846gERo7u9EC0RKRGyV0Bx0nDL3pxzg5TJANrleZEdlZMH31ytXrvWtWrPZ3Xx3fUjSneeTmNSlbyjuuX+9Y2JDmF3JOffzxqVOfnuefBXggNmb/gJTtvpCqWQ/TIVSFZ+mQh6ZvkPcRlF+MIr8Ud2SoHvC3OKne1KZ9UU0FiYzVhUqaQgvaGIoMTWwoxnKM67KFOU1BZrGTpfh/uBhz4LEnVtb5/RmvL3ljl7C/Z6ywv3H9W2/0rJYsPTtK5l6W5daN+qqWDHh+6oicYqKpyD9kyocpRTtSXcR8Em1J7muwlW1LexrtSs5JZLuScwa5pXgGyx/hdZxIeA" +
"JTPMvxBMSaYoSmQ+nHtDywiJbzyzTe7xcfDmR5vTBcyPsKv7yBPEyXopGDxCAHOoUoppRILPQiribnP/IqOjlLka3XEo5eIbu8bCTCqRmej6+99ib/lF6im3/13ItnD8OtF4LyxN+YxQq0iwTyqjsx0mwIFVUkLkZSWbX1dmiLORxlVBGTIWSC" +
"NNE0wTAxNnJCdIHTeHOcTzt1nM80dUbxARJ9r/0+T2BoAA0leOYPHXrlpnIwoYmg8NPdo9LFdJYupavSQ9JD09Xpmtzw3IjcyNyo3OjcmNzY3LhcWzVUi9WsWqpWVYdUh1arqzXecG+EN9Ib5Y32xnhjvXFem5POpPLJEh5Ff6LMf2vVqgwKOx" +
"IeHbu64vXswkn3v54zdkzOzp2Oubnjy6B7dMEZfqlnubDymyWVX/SsEFbeWCy3YknJ0NxCWddt/CFxKspCjmFZ7tgfY1ibvokegcNxGL9GKZGsUI5imYqJoV/8GMZcsm0pXJhNRtkbfuofdPmBA3IYu/rV+/Oa6I3VNatqa1fVrF7Xc1xSe4um" +
"8Xf5df53fnwavfXR+Qud5y5iFJPtvRPjmIQ8JZJlbrdOK+g1EfG2kFBBpY6wxdvy4myRao0tXrSSOtouWuqs7ZH1JrHe1WZqSopTa+JjVOSBGEk/RiVZEgqSgu58RXZfObDIh6OR3+o23uo2RyjHipKn6ZU8Tak9CcETUz5L4pVkSPq3k2cPTBMGYPo2CFUCJx9oLqqqfPitsWvXdX1YtP+x+YemPrvqVkjBy789//70FjFn34ABk4vGjXXqo7dVtbQ6nW3Z2XM91RmCPn7jilf+4FD2incAMYS9hLExwx2pZyEG2E9M9HDIfnWIJhTzYclo1v88MnbdHNohp0Cyg8sx8Wdmb8Lr7nY+a9ayU5dP7ZZDI3uJH/b2NP9qzsaWE0KJlw5HnSvPvefwlwRZ2r988CqMnmzBUyScRGD+EUXyySgymowhY8k4Mp48gLl+EXmITFM+pPjvhSANSb5Ej5w4dXrxg8kTyhYtrEidUjZ/2cLZTxLyT2S78dEKZW5kc3RyZWFtCmVuZG9iagoxNyAwIG9iago0ODAxCmVuZG9iagoxOCAwIG9iagooKQplbmRvYmoKMTkgMCBvYmoKKE1hYyBPUyBYIDEwLjEyLjYgUXVhcnR6IFBERkNvbnRleHQpCmVuZG9iagoyMCAwIG9iagooKQplbmRvYmoKMjEgMCBvYmoKKCkKZW5kb2JqCjIyIDAgb2JqCihUZXh0TWF0ZSkKZW5kb2JqCjIzIDAgb2JqCihEOjIwMTcxMjEyMTMwMzQ4WjAwJzAwJykKZW5kb2JqCjI0IDAgb2JqCigpCmVuZG9iagoyNSAwIG9iagpbICgpIF0KZW5kb2JqCjEgMCBvYmoKPDwgL1RpdGxlIDE4IDAgUiAvQXV0aG9yIDIwIDAgUiAvU3ViamVjdCAyMSAwIFIgL1Byb2R1Y2VyIDE5IDAgUiAvQ3JlYXRvcgoyMiAwIFIgL0NyZWF0aW9uRGF0ZSAyMyAwIFIgL01vZERhdGUgMjMgMCBSIC9LZXl3b3JkcyAyNCAwIFIgL0FBUEw6S2V5d29yZHMKMjUgMCBSID4+CmVuZG9iagp4cmVmCjAgMjYKMDAwMDAwMDAwMCA2NTUzNSBmIAowMDAwMDA4OTI5IDAwMDAwIG4gCjAwMDAwMDAzMDAgMDAwMDAgbiAKMDAwMDAwMzAyOCAwMDAwMCBuIAowMDAwMDAwMDIyIDAwMDAwIG4gCjAwMDAwMDAyODEgMDAwMDAgbiAKMDAwMDAwMDQwNCAwMDAwMCBuIAowMDAwMDAxNzUzIDAwMDAwIG4gCjAwMDAwMDI5OTIgMDAwMDAgbiAKMDAwMDAwMzE2MSAwMDAwMCBuIAowMDAwMDAwNTEyIDAwMDAwIG4gCjAwMDAwMDE3MzIgMDAwMDAgbiAKMDAwMDAwMTc4OSAwMDAwMCBuIAowMDAwMDAyOTcxIDAwMDAwIG4gCjAwMDAwMDMxMTEgMDAwMDAgbiAKMDAwMDAwMzU0NCAwMDAwMCB" +
"uIAowMDAwMDAzNzk2IDAwMDAwIG4gCjAwMDAwMDg2ODcgMDAwMDAgbiAKMDAwMDAwODcwOCAwMDAwMCBuIAowMDAwMDA4NzI3IDAwMDAwIG4gCjAwMDAwMDg3ODAgMDAwMDAgbiAKMDAwMDAwODc5OSAwMDAwMCBuIAowMDAwMDA4ODE4IDAwMDAwIG4gCjAwMDAwMDg4NDUgMDAwMDAgbiAKMDAwMDAwODg4NyAwMDAwMCBuIAowMDAwMDA4OTA2IDAwMDAwIG4gCnRyYWlsZXIKPDwgL1NpemUgMjYgL1Jvb3QgMTQgMCBSIC9JbmZvIDEgMCBSIC9JRCBbIDxkYjc4M2NhNDM2Mzg4YzI5ZDc5MDQ2NzY3NjUxNjE3OT4KPGRiNzgzY2E0MzYzODhjMjlkNzkwNDY3Njc2NTE2MTc5PiBdID4+CnN0YXJ0eHJlZgo5MTA0CiUlRU9GCg==";
String content = encodedContent.substring("data:application/pdf;base64," .length());
return Base64.decodeBase64(content);
}
public static byte[] createByteArray() {
String pathToBinaryData = "/bla-bla/src/main/resources/small.pdf";
File file = new File(pathToBinaryData);
if (!file.exists()) {
System.out.println(" could not be found in folder " + pathToBinaryData);
return null;
}
FileInputStream fin = null;
try {
fin = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
byte fileContent[] = new byte[(int) file.length()];
try {
fin.read(fileContent);
} catch (IOException e) {
e.printStackTrace();
}
return fileContent;
}
}