I want to convert a document .doc that contains some images. How to convert it to *.html, so that the images will remain same position? How to store those images in separate folder named image and use this folder as a source for image?
My code:
import java.io.BufferedWriter;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import javax.swing.JEditorPane;
import javax.swing.JFrame;
import javax.swing.JScrollPane;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.w3c.dom.Document;
public class TestWordToHtmlConverter {
private File docFile;
private File file;
public TestWordToHtmlConverter(File docFile) {
this.docFile = docFile;
}
public void convert(File file) {
this.file = file;
try {
FileInputStream finStream=new FileInputStream(docFile.getAbsolutePath());
HWPFDocument doc=new HWPFDocument(finStream);
WordExtractor wordExtract=new WordExtractor(doc);
Document newDocument = DocumentBuilderFactory.newInstance() .newDocumentBuilder().newDocument();
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(newDocument) ;
wordToHtmlConverter.processDocument(doc);
StringWriter stringWriter = new StringWriter();
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
transformer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
transformer.setOutputProperty(OutputKeys.METHOD, "html");
transformer.transform(new DOMSource( wordToHtmlConverter.getDocument()), new StreamResult( stringWriter ) );
String html = stringWriter.toString();
FileOutputStream fos=new FileOutputStream(new File("html/sample.html"));
DataOutputStream dos;
try {
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fos,"UTF-8"));
out.write(html);
out.close();
}
catch (IOException e) {
e.printStackTrace();
}
/*JEditorPane editorPane = new JEditorPane();
editorPane.setContentType("text/html");
editorPane.setEditable(false);
editorPane.setPage(file.toURI().toURL());
JScrollPane scrollPane = new JScrollPane(editorPane);
JFrame f = new JFrame("Display Html File");
f.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
f.getContentPane().add(scrollPane);
f.setSize(512, 342);
f.setVisible(true);*/
} catch(Exception e) {
e.printStackTrace();
}
}
public static void main(String args[]) {
TestWordToHtmlConverter TTC=new TestWordToHtmlConverter(new File("docx/sample.doc"));
TTC.convert(TTC.docFile);
}
}
This implementation doesn't create images or links to them. This can
be changed by overriding AbstractWordConverter.processImage(Element,
boolean, Picture) method
As said in API docs:
WordToHtmlConverter doesn't create images or links to them. This can
be changed by overriding AbstractWordConverter.processImage(Element, boolean, Picture) method.
How to override method you can found here:
I need Apache POI Pictures converted from a word document to a html file
You can try using DOCX 2 XHTML converter based on Apache POI XWPF:
XWPFDocument 2 XHTML
Also you can use Apache Tika, built on top of Apache POI. An example that included in Alfresco can be found here:
HTMLRenderingEngine
There are also many other converters.
Related
I am using PDFBOX and reading and saving the contents from PDF file . Requirement is text should be splitted to Header and Item in seperate array list .
PDF looks below.
Expected :
Following details PO,DeliveryDate,Vendor no should shown in arraylist 1 and other details like barcode,item number,description,quantity should shown in arraylist 2 .
Exisiting code for extracting data as txt from PDF.
PDFBoxReadFromFile.java
package pdfboxreadfromfile;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.io.*;
public class PDFBoxReadFromFile {
/**
* #param args the command line arguments
*/
public static void main(String[] args) {
PDFManager pdfManager = new PDFManager();
pdfManager.setFilePath("C:\\Users\\34\\Documents\\test.pdf");
try {
String text = pdfManager.toText();
System.out.println(text);
File file = new File("C:/Users/34/eclipse-workspace/pdfboxreadfromfile/file.txt");
FileWriter fw = new FileWriter(file);
PrintWriter pw = new PrintWriter(fw);
pw.println(text);
pw.close();
} catch (IOException ex) {
//System.err.println(ex.getMessage());
Logger.getLogger(PDFBoxReadFromFile.class.getName()).log(Level.SEVERE, null, ex);
}
}
}
PDFManager.Java
package pdfboxreadfromfile;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
public class PDFManager {
private PDFParser parser;
private PDFTextStripper pdfStripper;
private PDDocument pdDoc;
private COSDocument cosDoc;
private String Text;
private String filePath;
private File file;
public PDFManager() {
}
public String toText() throws IOException {
this.pdfStripper = null;
this.pdDoc = null;
this.cosDoc = null;
file = new File(filePath);
parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0
parser.parse();
cosDoc = parser.getDocument();
pdfStripper = new PDFTextStripper();
pdDoc = new PDDocument(cosDoc);
pdDoc.getNumberOfPages();
pdfStripper.setStartPage(0);
pdfStripper.setEndPage(pdDoc.getNumberOfPages());
Text = pdfStripper.getText(pdDoc);
return Text;
}
public void setFilePath(String filePath) {
this.filePath = filePath;
}
public PDDocument getPdDoc() {
return pdDoc;
}
}
java
I have gone through tika Documentation. I find a solution to extract text. but it does not print return image.
.java File
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
public class Imageextractor3 {
public static void main(String[] args)
throws IOException, TikaException, SAXException {
Parser parser = new AutoDetectParser();
BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
TesseractOCRConfig config = new TesseractOCRConfig();
PDFParserConfig pdfConfig = new PDFParserConfig();
pdfConfig.setExtractInlineImages(true);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, config);
parseContext.set(PDFParserConfig.class, pdfConfig);
//need to add this to make sure recursive parsing happens!
parseContext.set(Parser.class, parser);
File file=new File("C://Users//Vaibhav Shukla//Desktop//8577.00.pdf");
System.out.println(file);
FileInputStream stream = new FileInputStream(new File("C://Users//Vaibhav Shukla//Desktop//pdfs//hh.pdf"));
Metadata metadata = new Metadata();
parser.parse(stream, handler, metadata, parseContext);
System.out.println(metadata);
String content = handler.toString();
FileOutputStream fos=new FileOutputStream("C://Users//Vaibhav Shukla//Desktop//pdfs//hd.doc");
fos.write(content.getBytes());
System.out.println("===============");
System.out.println(content);
System.out.println("Done");
}
}
I need suggestion how to add functionality which can detect the image in pdf file
A quick solution to convert extract images which are embedded in pdf.
public void extract(File file) throws IOException {
PDDocument doc=new PDDocument().load(file);
Iterator<PDPage> itr=doc.getDocumentCatalog().getPages().iterator();
while(itr.hasNext())
{
PDResources res=itr.next().getResources();
Iterable<COSName> cName=res.getXObjectNames();
Iterator<COSName> citr=cName.iterator();
while(citr.hasNext())
{
String imageName= citr.next().getName();
System.out.println(imageName);
COSName cosName=COSName.getPDFName(imageName);
PDImageXObject im = (PDImageXObject) res.getXObject(cosName);
File ff = new
File("C://Users//workspace//Desktop//pdfs//"+imageName+".png");
BufferedImage bi=im.getImage();
ImageIO.write(bi,"png",ff);
}
}}
I am getting "Invalid nested tag font found, expected closing tag table" when trying to convert one HTML to PDF using iText. Following is the program I am using.
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Element;
import com.itextpdf.text.Font;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.Phrase;
import com.itextpdf.text.Font.FontFamily;
import com.itextpdf.text.pdf.ColumnText;
import com.itextpdf.text.pdf.GrayColor;
import com.itextpdf.text.pdf.PdfPageEventHelper;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorkerHelper;
public class HTML2PDF {
class Watermark extends PdfPageEventHelper{
//font for watermarking text
Font font = new Font(FontFamily.HELVETICA, 52, Font.ITALIC, new GrayColor(.75f));
public void onEndPage(PdfWriter writer, Document document){
ColumnText.showTextAligned(writer.getDirectContentUnder(),
Element.ALIGN_CENTER,
new Phrase("Watermarking Text", font),
297.5f, 421,-45);//-45 specifies the angle of Watermarking Text
}
}
public void convertHTML2PDF(){
//Create objects of Document and specify the Page size of a PDF
Document document = new Document(PageSize.A4);
try{
PdfWriter pdfWriter;
//get the instance of class PdfWriter with the document objects and output path
pdfWriter = PdfWriter.getInstance(document, new FileOutputStream("/opt/remedy/html2any-cmd-linux/bin/test.pdf"));
//setting the Watermarking in the PageEvent of PdfWriter
pdfWriter.setPageEvent(new Watermark());
document.open();
String htmlContent = "";
BufferedReader in = new BufferedReader(new FileReader("/opt/remedy/html2any-cmd-linux/bin/WO00000001004641434460592596_1.html"));
String temp;
//read the html files content and stores it in a String variable
while((temp = in.readLine())!=null){
htmlContent += temp;
}
in.close();
XMLWorkerHelper xmlWorker = XMLWorkerHelper.getInstance();
// converts the html into a PDF
xmlWorker.parseXHtml(pdfWriter, document, new StringReader(htmlContent));
document.close();
}
catch(Exception e)
{
e.printStackTrace();
}
}
//main method
public static void main(String[] args){
new HTML2PDF().convertHTML2PDF();
}
}
There are some tags in html which do not need to be closed. Could you suggest how this type of html can be parsed using XMLWorker?
I am triing to make an automatization program in JAVA.
I have a sample doc file. I need to fill the blank parts or the <> "signed" parts from database,
than create pdf files.
I tried to read the word :
import java.io.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
public class ReadDocFile {
public static void main(String[] args) {
File file = null;
WordExtractor extractor = null ;
try {
file = new File("c:\\New.doc");
FileInputStream fis=new FileInputStream(file.getAbsolutePath());
HWPFDocument document=new HWPFDocument(fis);
extractor = new WordExtractor(document);
String [] fileData = extractor.getParagraphText();
for(int i=0;i<fileData.length;i++){
if(fileData[i] != null)
System.out.println(fileData[i]);
}
}
catch(Exception exep){}
}
}
but this attemption is bad in many way cause i only need to write some of the parts, and this method make a single test from the doc.
So can you advice me some api that write in a word doc eg: after Name : or in the 5 row write this:
And when it finish with the word it should generate a pdf and do it again ...
I am looking a solution wich i found xssfworkbook with some extra function ( generate pdf of the doc ).
Or read the sample pdf and fill with datas and save to a new pdf.
Thx
Use Itext (http://sourceforge.net/projects/itext/)
and Apache POI Project http://poi.apache.org/index.html
A sample code :
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hwpf.extractor.WordExtractor;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Paragraph;
import com.itextpdf.text.pdf.PdfWriter;
public static void main(String[] args) {
String pdfPath = "C:/";
String pdfDocPath = null;
try {
InputStream is = new BufferedInputStream(new FileInputStream("C:/Test.doc"));
WordExtractor wd = new WordExtractor(is);
String text = wd.getText();
/* FOR DOCX
// IMPORT
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
// CODE
XWPFDocument hdoc = new XWPFDocument(is);
extractor = new XWPFWordExtractor(hdoc);
String text = extractor.getText();
*/
Document document = new Document();
PdfWriter.getInstance(document, new FileOutputStream(pdfPath + "viewDoc.pdf"));
document.open();
document.add(new Paragraph(text));
document.close();
pdfDocPath = pdfPath + "viewDoc.pdf";
System.out.println("Pdf document path is" + pdfDocPath);
}
catch (FileNotFoundException e1) {
System.out.println("File does not exist.");
}
catch (IOException ioe) {
System.out.println("IO Exception");
}
catch (DocumentException e) {
e.printStackTrace();
}
}
i want to convert html file with images to pdf using iText. I am providing my source here.
This is my HTML file...
<html>
<body>
<img src='' width='62' height='80' style='float: left; margin-right: 28px;' alt="" />
<!-- <img src="add.png" alt="" /> -->
</body>
</html>
I want to convert this html file to pdf...
Am using the following java code...
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.encoding.Encoding;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.w3c.tidy.Tidy;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.Pipeline;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerFontProvider;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.css.CssFilesImpl;
import com.itextpdf.tool.xml.css.StyleAttrCSSResolver;
import com.itextpdf.tool.xml.html.CssAppliersImpl;
import com.itextpdf.tool.xml.html.HTML;
import com.itextpdf.tool.xml.html.TagProcessor;
import com.itextpdf.tool.xml.html.TagProcessorFactory;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
import com.itextpdf.tool.xml.pipeline.html.ImageProvider;
import com.pdfcrowd.Client;
public class App
{
public static void main( String[] args ) throws DocumentException, IOException
{
// step 1
Document document = new Document();
document.newPage();
// step 2
PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream("pdf.pdf"));
// step 3
document.open();
// step 4
XMLWorkerHelper.getInstance().parseXHtml(writer, document,
new FileInputStream("index.html"));
//step 5
document.close();
System.out.println( "PDF Created!" );
}
}
Am getting the following error...
Exception in thread "main" ExceptionConverter: java.io.IOException: The document has no pages.
at com.itextpdf.text.pdf.PdfPages.writePageTree(PdfPages.java:113)
at com.itextpdf.text.pdf.PdfWriter.close(PdfWriter.java:1243)
at com.itextpdf.text.pdf.PdfDocument.close(PdfDocument.java:849)
at com.itextpdf.text.Document.close(Document.java:416)
at App.main(App.java:64)
Please help me out How can i convert html file with images to pdf using itext. I am able to convert that html file if i dont have images or if i hardcode the image path. Thanks in advance
You need to implement a custom image tag processor to process the images embedded inside your html:
package com.example.itext.processor;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.itextpdf.text.Chunk;
import com.itextpdf.text.Element;
import com.itextpdf.text.Image;
import com.itextpdf.text.log.Level;
import com.itextpdf.text.log.Logger;
import com.itextpdf.text.log.LoggerFactory;
import com.itextpdf.text.pdf.codec.Base64;
import com.itextpdf.tool.xml.NoCustomContextException;
import com.itextpdf.tool.xml.Tag;
import com.itextpdf.tool.xml.WorkerContext;
import com.itextpdf.tool.xml.exceptions.LocaleMessages;
import com.itextpdf.tool.xml.exceptions.RuntimeWorkerException;
import com.itextpdf.tool.xml.html.HTML;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
public class ImageTagProcessor extends com.itextpdf.tool.xml.html.Image {
private final Logger logger = LoggerFactory.getLogger(getClass());
/*
* (non-Javadoc)
*
* #see com.itextpdf.tool.xml.TagProcessor#endElement(com.itextpdf.tool.xml.Tag, java.util.List, com.itextpdf.text.Document)
*/
#Override
public List<Element> end(final WorkerContext ctx, final Tag tag, final List<Element> currentContent) {
final Map<String, String> attributes = tag.getAttributes();
String src = attributes.get(HTML.Attribute.SRC);
List<Element> elements = new ArrayList<Element>(1);
if (null != src && src.length() > 0) {
Image img = null;
if (src.startsWith("data:image/")) {
final String base64Data = src.substring(src.indexOf(",") + 1);
try {
img = Image.getInstance(Base64.decode(base64Data));
} catch (Exception e) {
if (logger.isLogging(Level.ERROR)) {
logger.error(String.format(LocaleMessages.getInstance().getMessage(LocaleMessages.HTML_IMG_RETRIEVE_FAIL), src), e);
}
}
if (img != null) {
try {
final HtmlPipelineContext htmlPipelineContext = getHtmlPipelineContext(ctx);
elements.add(getCssAppliers().apply(new Chunk((com.itextpdf.text.Image) getCssAppliers().apply(img, tag, htmlPipelineContext), 0, 0, true), tag,
htmlPipelineContext));
} catch (NoCustomContextException e) {
throw new RuntimeWorkerException(e);
}
}
}
if (img == null) {
elements = super.end(ctx, tag, currentContent);
}
}
return elements;
}
}
Following code snippet registers the custom image tag processor and coverts an HTML document to PDF
public static void main(String[] args) {
convertHtmlToPdf();
}
private static void convertHtmlToPdf() {
try {
final OutputStream file = new FileOutputStream(new File("C:\\Test.pdf"));
final Document document = new Document();
final PdfWriter writer = PdfWriter.getInstance(document, file);
document.open();
final TagProcessorFactory tagProcessorFactory = Tags.getHtmlTagProcessorFactory();
tagProcessorFactory.removeProcessor(HTML.Tag.IMG);
tagProcessorFactory.addProcessor(new ImageTagProcessor(), HTML.Tag.IMG);
final CssFilesImpl cssFiles = new CssFilesImpl();
cssFiles.add(XMLWorkerHelper.getInstance().getDefaultCSS());
final StyleAttrCSSResolver cssResolver = new StyleAttrCSSResolver(cssFiles);
final HtmlPipelineContext hpc = new HtmlPipelineContext(new CssAppliersImpl(new XMLWorkerFontProvider()));
hpc.setAcceptUnknown(true).autoBookmark(true).setTagFactory(tagProcessorFactory);
final HtmlPipeline htmlPipeline = new HtmlPipeline(hpc, new PdfWriterPipeline(document, writer));
final Pipeline<?> pipeline = new CssResolverPipeline(cssResolver, htmlPipeline);
final XMLWorker worker = new XMLWorker(pipeline, true);
final Charset charset = Charset.forName("UTF-8");
final XMLParser xmlParser = new XMLParser(true, worker, charset);
final InputStream is = new FileInputStream("C:\\test.html");
xmlParser.parse(is, charset);
is.close();
document.close();
file.close();
} catch (Exception e) {
e.printStackTrace();
// TODO
}
}
This Exception will occur if there is no content in your pdf page.
Try Passing your InputStream like this
String str="<html>
<body>
<img src='' width='62' height='80' style='float: left; margin-right: 28px;' alt="" />
<!-- <img src="add.png" alt="" /> -->
</body>
</html>"
InputStream is = new ByteArrayInputStream(str.getBytes());
XMLWorkerHelper.getInstance().parseXHtml(writer, document, is);