convert html to pdf using iText - java

I want to convert an HTML file that contains images to PDF using iText. My source is provided below.
This is my HTML file:
<html>
<body>
<img src='data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAD4AAABQCAMAAAB24TZcAAAABGdBTUEAANbY1E9YMgAAABl0RVh0U29mdHdhcmUAQWRvYmUgSW1hZ2VSZWFkeXHJZTwAAAGAUExURdSmeJp2SHlbQIRoSUg2J499a8KebqeHZuGufBEVJPz7+3NWPVxGMduwhPXEktnX1mtROLq7t5WDc2VMNv3LmKB8TMSidMbFxLGlmXlhSMSddpJUL+y8i3VlVqedlOzr6gUIF2lXRLCLY4ZyXLyYaYhtUYiJhJFyU1dBLLiVZnlwZrWRY/Hx8b+2rbySaJh9YqeooDw4NygnKvvJlpyblzksIUhGRryYckc7MPjGlKODX5x8VVA8K+azgM3FvDInHK2JW2ZbUOHh4Xt2cFpaWKeAUM6kel1RRJmUjo5vSrWzrJJ1WFhLQCQmMuK1iJiMgmthWPPCkOm3hEtBOunm5LCNXnJtZquEXmNkYvG+i7Ctq+y5hrWRbKqSeaN/WqmFVYFgQh8aGOa4isWkd8mcby4vONDNy0AwI5h2U19JMxkdLzIuL1JBMjQ3P5Z6Ve6/j93c2+Xi34KAfJ5/Xvj4+O/u7sSKVJd4Wo6QjXE+IeOwfQcNJoBeQ8Gdbf/Mmf///5GX6NEAAAcrSURBVHja3JbpX9pIGMchiWkgEaOBtaGinBLEyopFBeMqtYKI4kGt2lILFsUoXa3WdZcc/dd3JheHAvaz7/Z5Ec2Q7/yeaw7Lz/9klv8rfnM+Orz5cXLjZsL+67h9eCq9Vaxvzc6v3W6+/TX85kN6ixdokkQQCaE5vrg28Qv4a2yFQcpSi/HzH6efi+/UaEAwWAtepuvv3tw/B//hqZGQqDFSmyHC7v0z8EldlZQQEgTfMgF23h8/T+gEhQGrcQYrMBKVtvfDb4qU/j3DMK3SdIKWsNs++M1iS8R8W/gULyG1771w+/stQWpTpFpzByb09MRHEwaoxUxToGtaZiBrE72cXzMyhcDiIRgCHxJPIxKt5aF23gMf0iquz8BJmAAFpUStxvG0xIA3arcHPsvrJM1wvFTDeEGQeKCewCo1jgRDwKuJrrh9C3osIfyiz+NboZFKxU0xJEYmeJbBhPoKiKyMDXfHd0mJWSETnoKiKCmgSioFDKFr4T1lbn/fgkHf+PGu+A+A12imMqdAqzNUXlFCFP+gOD41CKJBcCB4bKSnOmitB5VWSgnMrSjhCnu8D1hoS1xP/KcH1BhZdGi4c4VNAh/I5PGyRjdQqje+A6YXPIpup/DhHlMUh44f1hAJ6x77z3OwVjG/0ml7Ot4gOWnxvkfbALw+2EnPGc43ojWk3qNt7hdpiSp0ajcMukHQPB/4o3vPf8TKQgc+pqXdkpEtgGewE7THel/j66dtdBLA1XAYRXK8AGbxC/6RHvjbCuOE0Kklk8lcg/+OicaJcOhfTflTVYCHuYvX3XH7QCxcUAol9i6VursLha+VfcLPHwamZjfSAgxi6QId6oFnC5awsjdoWYjFPrOlB3QONAtJjrwsetiq2jkzgfc9nPdklJBDyXvGj+Zf+jIKe7pPoNFoOHwyoyaQKFcD9z3wzbwSGnT6fCMB9u5UmWMLYwTJQo5QC2AB6r122ukBJeVWnA6HIwlLnp/bI/w5wI3tJR3LjcZMbvVzL/xHwOG+M6s2mFeSjRm0QRyDYnyCOEv/0fOYGM/vha4N3J1S5hoZhCAcYBro/AwV63NIjafuzL4rLSjOZYKeIT45j9XUnQTs/Y7Inbqp/pABeIPBqsTystr0/pd9T9jprZIGO9CHa4gTPHairxr/eP/rwai+YdzlWQfALSHu4qTxfHxiQKVTaBINvfCjDFo1Fmzjor/zP+0BNXdgxSTdqRe5w0bT2hq+293mdWDOSJ5DWbgwd4uGpSPxXW5WGzGddhYWHsDRguqpO5x9jjq4HY3BnjtcRRGGe/Xqn38YC6SraVt84jnX
wo0FgC8kOK7s+mv91St6RhVnZ72Vqeln4EM+cFY43SHgdj584c9ormdFbx3Jbk73v9PuvNCCvx67ntPzlmG2xUvUhQpZz9roxHdwXx4e7Yb/fdXc7o81PFcUxW2ry+Wy5miM4gQkEAh0uxKfXWbdLXs1XGxZURRnXZpZrVbXegT/rUvm571itnncQPctWZso2hAdd61GIzIuf32y5zduL0VxtwQPWG2vB7QP0OKKVaejOI7L8lP4+S3r+wY+zSZfGPvGPlFlt8FQ3BCPQPYpfOjWs3QHtMVLJqmU0NLe9XVhsBpOwyER0+D1oE534t8Hsn/KctwLokxUgeunD6FwCA2xMGtAPAdhjkr55afwoaksGpHlAKTnWUK9ZIAt15k/U+mK5voSuoI9Vre/fZPOBcFQKg4+PXsXg7urVra0Stvqmud4mTp4hN/s+lAIy8ErIC7Oz8aITzqegYkUL4tawQ+ivEvudP7Gt6SPpCpewJ8BfN+pb/aq71dG2kjayLuJ3/vC+gB+EBe9Xm/8KEQs67hShMmgIRsNylFuFe9UL1IGHXHNAtr77ZYN7htNB8LxJmCnyaBZULpJ6/g4ZZQCX83FAS1u3675xnTaX/GKFdLl+gIaDZeFpU78rS9oDnzZEmHstqPJKc9n90LJPThyBUZIVRtMv8Q1v9Xx8bzxigddWo1t7yZ//zgSCwRiK6CO0PUD2OR4hMnhHfiPtYiJr4a8Jj4MbHNe7UC4RtTfc5wsd+DD6RbxxTZ8chtkrcJGIlqX41GqTVzFp3wmfmCNi5rNT74Z3nwHi2BjZW11AtdzgvxIfSBl4l/Klzr+bfLvzSNYA1u9xTfmz8f4lLmA5HWfgV8eTa7BEohxox1xeZ1F5Ef4fTrYnL4oGjb7QZ3JVgk2W4KJPMZvmWbo9KWJ27QsXKHm3DkhJT/Gs6z55lo0abV5wCSL5txL/CMa4PYPUXN+5qwTj68aXwa5MP4Efj/VDA4TW3BV3PQMp7Wlgnfg555mcPFO8RbXMbXv8Oh6pG3J7IRM8bq3Q/zKLFqUQ3GteNYvbepG1XG57O0Qt9Hmd1bOKC1qbZH/zbK78FWzYMJ2aZoXPq7kr8ZvORr+iUSjJzQb/Gpa5l8BBgBZTppAyfsf0wAAAABJRU5ErkJggg==' width='62' height='80' style='float: left; margin-right: 28px;' alt="" />
<!-- <img src="add.png" alt="" /> -->
</body>
</html>
I want to convert this HTML file to PDF.
I am using the following Java code:
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.encoding.Encoding;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.w3c.tidy.Tidy;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.Pipeline;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerFontProvider;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.css.CssFilesImpl;
import com.itextpdf.tool.xml.css.StyleAttrCSSResolver;
import com.itextpdf.tool.xml.html.CssAppliersImpl;
import com.itextpdf.tool.xml.html.HTML;
import com.itextpdf.tool.xml.html.TagProcessor;
import com.itextpdf.tool.xml.html.TagProcessorFactory;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
import com.itextpdf.tool.xml.pipeline.html.ImageProvider;
import com.pdfcrowd.Client;
/**
 * Minimal command-line converter that renders {@code index.html} into {@code pdf.pdf}
 * through iText's {@code XMLWorkerHelper}.
 *
 * NOTE(review): per the surrounding discussion, an {@code <img>} whose {@code src} is a
 * {@code data:} URI is not rendered by the default pipeline, which leaves the document
 * without pages; a custom image tag processor is presumably required for that case.
 */
public class App
{
    /**
     * Converts the HTML file in the working directory to a PDF file.
     *
     * @param args unused
     * @throws DocumentException if iText cannot build the document
     * @throws IOException if either file cannot be opened, read or written
     */
    public static void main( String[] args ) throws DocumentException, IOException
    {
        // try-with-resources releases both file handles even when parsing or closing fails.
        try (FileOutputStream pdfOut = new FileOutputStream("pdf.pdf");
             FileInputStream htmlIn = new FileInputStream("index.html")) {
            // step 1
            // (the former document.newPage() call was dropped: it ran before open() and had no effect)
            Document document = new Document();
            // step 2
            PdfWriter writer = PdfWriter.getInstance(document, pdfOut);
            // step 3
            document.open();
            // step 4
            XMLWorkerHelper.getInstance().parseXHtml(writer, document, htmlIn);
            // step 5
            document.close();
        }
        System.out.println( "PDF Created!" );
    }
}
Am getting the following error...
Exception in thread "main" ExceptionConverter: java.io.IOException: The document has no pages.
at com.itextpdf.text.pdf.PdfPages.writePageTree(PdfPages.java:113)
at com.itextpdf.text.pdf.PdfWriter.close(PdfWriter.java:1243)
at com.itextpdf.text.pdf.PdfDocument.close(PdfDocument.java:849)
at com.itextpdf.text.Document.close(Document.java:416)
at App.main(App.java:64)
Please help me out: how can I convert an HTML file with images to PDF using iText? I am able to convert the HTML file if it does not contain images, or if I hard-code the image path. Thanks in advance.

You need to implement a custom image tag processor to process the images embedded inside your html:
package com.example.itext.processor;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.itextpdf.text.Chunk;
import com.itextpdf.text.Element;
import com.itextpdf.text.Image;
import com.itextpdf.text.log.Level;
import com.itextpdf.text.log.Logger;
import com.itextpdf.text.log.LoggerFactory;
import com.itextpdf.text.pdf.codec.Base64;
import com.itextpdf.tool.xml.NoCustomContextException;
import com.itextpdf.tool.xml.Tag;
import com.itextpdf.tool.xml.WorkerContext;
import com.itextpdf.tool.xml.exceptions.LocaleMessages;
import com.itextpdf.tool.xml.exceptions.RuntimeWorkerException;
import com.itextpdf.tool.xml.html.HTML;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
public class ImageTagProcessor extends com.itextpdf.tool.xml.html.Image {
private final Logger logger = LoggerFactory.getLogger(getClass());
/*
* (non-Javadoc)
*
* #see com.itextpdf.tool.xml.TagProcessor#endElement(com.itextpdf.tool.xml.Tag, java.util.List, com.itextpdf.text.Document)
*/
#Override
public List<Element> end(final WorkerContext ctx, final Tag tag, final List<Element> currentContent) {
final Map<String, String> attributes = tag.getAttributes();
String src = attributes.get(HTML.Attribute.SRC);
List<Element> elements = new ArrayList<Element>(1);
if (null != src && src.length() > 0) {
Image img = null;
if (src.startsWith("data:image/")) {
final String base64Data = src.substring(src.indexOf(",") + 1);
try {
img = Image.getInstance(Base64.decode(base64Data));
} catch (Exception e) {
if (logger.isLogging(Level.ERROR)) {
logger.error(String.format(LocaleMessages.getInstance().getMessage(LocaleMessages.HTML_IMG_RETRIEVE_FAIL), src), e);
}
}
if (img != null) {
try {
final HtmlPipelineContext htmlPipelineContext = getHtmlPipelineContext(ctx);
elements.add(getCssAppliers().apply(new Chunk((com.itextpdf.text.Image) getCssAppliers().apply(img, tag, htmlPipelineContext), 0, 0, true), tag,
htmlPipelineContext));
} catch (NoCustomContextException e) {
throw new RuntimeWorkerException(e);
}
}
}
if (img == null) {
elements = super.end(ctx, tag, currentContent);
}
}
return elements;
}
}
The following code snippet registers the custom image tag processor and converts an HTML document to PDF:
/**
 * Entry point: runs the HTML-to-PDF conversion.
 *
 * @param args unused
 */
public static void main(String[] args) {
    convertHtmlToPdf();
}

/**
 * Converts {@code C:\test.html} to {@code C:\Test.pdf} with an XMLWorker pipeline whose
 * {@code <img>} handling is replaced by {@code ImageTagProcessor}.
 * Any failure is printed to standard error; nothing is rethrown.
 */
private static void convertHtmlToPdf() {
    final Charset charset = Charset.forName("UTF-8");
    // try-with-resources closes both files even when the conversion fails midway.
    try (OutputStream file = new FileOutputStream(new File("C:\\Test.pdf"));
         InputStream is = new FileInputStream("C:\\test.html")) {
        final Document document = new Document();
        final PdfWriter writer = PdfWriter.getInstance(document, file);
        document.open();

        // Swap the stock <img> processor for the one registered below.
        final TagProcessorFactory tagProcessorFactory = Tags.getHtmlTagProcessorFactory();
        tagProcessorFactory.removeProcessor(HTML.Tag.IMG);
        tagProcessorFactory.addProcessor(new ImageTagProcessor(), HTML.Tag.IMG);

        // CSS resolution seeded with XMLWorker's default style sheet.
        final CssFilesImpl cssFiles = new CssFilesImpl();
        cssFiles.add(XMLWorkerHelper.getInstance().getDefaultCSS());
        final StyleAttrCSSResolver cssResolver = new StyleAttrCSSResolver(cssFiles);

        // Pipeline order: CSS resolver -> HTML -> PDF writer.
        final HtmlPipelineContext hpc = new HtmlPipelineContext(new CssAppliersImpl(new XMLWorkerFontProvider()));
        hpc.setAcceptUnknown(true).autoBookmark(true).setTagFactory(tagProcessorFactory);
        final HtmlPipeline htmlPipeline = new HtmlPipeline(hpc, new PdfWriterPipeline(document, writer));
        final Pipeline<?> pipeline = new CssResolverPipeline(cssResolver, htmlPipeline);

        final XMLWorker worker = new XMLWorker(pipeline, true);
        final XMLParser xmlParser = new XMLParser(true, worker, charset);
        xmlParser.parse(is, charset);

        // Close the document before the output stream so the PDF is completely written.
        document.close();
    } catch (Exception e) {
        e.printStackTrace();
        // TODO
    }
}

This exception occurs when there is no content in your PDF page.
Try passing your InputStream like this:
String str="<html>
<body>
<img src='data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAD4AAABQCAMAAAB24TZcAAAABGdBTUEAANbY1E9YMgAAABl0RVh0U29mdHdhcmUAQWRvYmUgSW1hZ2VSZWFkeXHJZTwAAAGAUExURdSmeJp2SHlbQIRoSUg2J499a8KebqeHZuGufBEVJPz7+3NWPVxGMduwhPXEktnX1mtROLq7t5WDc2VMNv3LmKB8TMSidMbFxLGlmXlhSMSddpJUL+y8i3VlVqedlOzr6gUIF2lXRLCLY4ZyXLyYaYhtUYiJhJFyU1dBLLiVZnlwZrWRY/Hx8b+2rbySaJh9YqeooDw4NygnKvvJlpyblzksIUhGRryYckc7MPjGlKODX5x8VVA8K+azgM3FvDInHK2JW2ZbUOHh4Xt2cFpaWKeAUM6kel1RRJmUjo5vSrWzrJJ1WFhLQCQmMuK1iJiMgmthWPPCkOm3hEtBOunm5LCNXnJtZquEXmNkYvG+i7Ctq+y5hrWRbKqSeaN/WqmFVYFgQh8aGOa4isWkd8mcby4vONDNy0AwI5h2U19JMxkdLzIuL1JBMjQ3P5Z6Ve6/j93c2+Xi34KAfJ5/Xvj4+O/u7sSKVJd4Wo6QjXE+IeOwfQcNJoBeQ8Gdbf/Mmf///5GX6NEAAAcrSURBVHja3JbpX9pIGMchiWkgEaOBtaGinBLEyopFBeMqtYKI4kGt2lILFsUoXa3WdZcc/dd3JheHAvaz7/Z5Ec2Q7/yeaw7Lz/9klv8rfnM+Orz5cXLjZsL+67h9eCq9Vaxvzc6v3W6+/TX85kN6ixdokkQQCaE5vrg28Qv4a2yFQcpSi/HzH6efi+/UaEAwWAtepuvv3tw/B//hqZGQqDFSmyHC7v0z8EldlZQQEgTfMgF23h8/T+gEhQGrcQYrMBKVtvfDb4qU/j3DMK3SdIKWsNs++M1iS8R8W/gULyG1771w+/stQWpTpFpzByb09MRHEwaoxUxToGtaZiBrE72cXzMyhcDiIRgCHxJPIxKt5aF23gMf0iquz8BJmAAFpUStxvG0xIA3arcHPsvrJM1wvFTDeEGQeKCewCo1jgRDwKuJrrh9C3osIfyiz+NboZFKxU0xJEYmeJbBhPoKiKyMDXfHd0mJWSETnoKiKCmgSioFDKFr4T1lbn/fgkHf+PGu+A+A12imMqdAqzNUXlFCFP+gOD41CKJBcCB4bKSnOmitB5VWSgnMrSjhCnu8D1hoS1xP/KcH1BhZdGi4c4VNAh/I5PGyRjdQqje+A6YXPIpup/DhHlMUh44f1hAJ6x77z3OwVjG/0ml7Ot4gOWnxvkfbALw+2EnPGc43ojWk3qNt7hdpiSp0ajcMukHQPB/4o3vPf8TKQgc+pqXdkpEtgGewE7THel/j66dtdBLA1XAYRXK8AGbxC/6RHvjbCuOE0Kklk8lcg/+OicaJcOhfTflTVYCHuYvX3XH7QCxcUAol9i6VursLha+VfcLPHwamZjfSAgxi6QId6oFnC5awsjdoWYjFPrOlB3QONAtJjrwsetiq2jkzgfc9nPdklJBDyXvGj+Zf+jIKe7pPoNFoOHwyoyaQKFcD9z3wzbwSGnT6fCMB9u5UmWMLYwTJQo5QC2AB6r122ukBJeVWnA6HIwlLnp/bI/w5wI3tJR3LjcZMbvVzL/xHwOG+M6s2mFeSjRm0QRyDYnyCOEv/0fOYGM/vha4N3J1S5hoZhCAcYBro/AwV63NIjafuzL4rLSjOZYKeIT45j9XUnQTs/Y7Inbqp/pABeIPBqsTystr0/pd9T9jprZIGO9CHa4gTPHairxr/eP/rwai+YdzlWQfALSHu4qTxfHxiQKVTaBINvfCjDFo1Fmzjor/zP+0BNXdgxSTdqRe5w0bT2hq+293mdWDOSJ5DWbgwd4uGpSPxXW5WGzGddhYWHsDRguqpO5x9jjq4HY3BnjtcRRGGe/Xqn38YC6SraVt84jnX
wo0FgC8kOK7s+mv91St6RhVnZ72Vqeln4EM+cFY43SHgdj584c9ormdFbx3Jbk73v9PuvNCCvx67ntPzlmG2xUvUhQpZz9roxHdwXx4e7Yb/fdXc7o81PFcUxW2ry+Wy5miM4gQkEAh0uxKfXWbdLXs1XGxZURRnXZpZrVbXegT/rUvm571itnncQPctWZso2hAdd61GIzIuf32y5zduL0VxtwQPWG2vB7QP0OKKVaejOI7L8lP4+S3r+wY+zSZfGPvGPlFlt8FQ3BCPQPYpfOjWs3QHtMVLJqmU0NLe9XVhsBpOwyER0+D1oE534t8Hsn/KctwLokxUgeunD6FwCA2xMGtAPAdhjkr55afwoaksGpHlAKTnWUK9ZIAt15k/U+mK5voSuoI9Vre/fZPOBcFQKg4+PXsXg7urVra0Stvqmud4mTp4hN/s+lAIy8ErIC7Oz8aITzqegYkUL4tawQ+ivEvudP7Gt6SPpCpewJ8BfN+pb/aq71dG2kjayLuJ3/vC+gB+EBe9Xm/8KEQs67hShMmgIRsNylFuFe9UL1IGHXHNAtr77ZYN7htNB8LxJmCnyaBZULpJ6/g4ZZQCX83FAS1u3675xnTaX/GKFdLl+gIaDZeFpU78rS9oDnzZEmHstqPJKc9n90LJPThyBUZIVRtMv8Q1v9Xx8bzxigddWo1t7yZ//zgSCwRiK6CO0PUD2OR4hMnhHfiPtYiJr4a8Jj4MbHNe7UC4RtTfc5wsd+DD6RbxxTZ8chtkrcJGIlqX41GqTVzFp3wmfmCNi5rNT74Z3nwHi2BjZW11AtdzgvxIfSBl4l/Klzr+bfLvzSNYA1u9xTfmz8f4lLmA5HWfgV8eTa7BEohxox1xeZ1F5Ef4fTrYnL4oGjb7QZ3JVgk2W4KJPMZvmWbo9KWJ27QsXKHm3DkhJT/Gs6z55lo0abV5wCSL5txL/CMa4PYPUXN+5qwTj68aXwa5MP4Efj/VDA4TW3BV3PQMp7Wlgnfg555mcPFO8RbXMbXv8Oh6pG3J7IRM8bq3Q/zKLFqUQ3GteNYvbepG1XG57O0Qt9Hmd1bOKC1qbZH/zbK78FWzYMJ2aZoXPq7kr8ZvORr+iUSjJzQb/Gpa5l8BBgBZTppAyfsf0wAAAABJRU5ErkJggg==' width='62' height='80' style='float: left; margin-right: 28px;' alt="" />
<!-- <img src="add.png" alt="" /> -->
</body>
</html>"
// Encode and parse with the same explicit charset so the result does not depend on the
// platform default encoding.
InputStream is = new ByteArrayInputStream(str.getBytes(java.nio.charset.StandardCharsets.UTF_8));
XMLWorkerHelper.getInstance().parseXHtml(writer, document, is, java.nio.charset.StandardCharsets.UTF_8);

Related

Java: Extract text from PDF and shows as header and items seperate array list

I am using PDFBox to read and save the contents of a PDF file. The requirement is that the text should be split into header and item data, stored in separate array lists.
The PDF looks like the one below.
Expected:
The following details (PO, DeliveryDate, Vendor no) should be stored in array list 1, and the other details, such as barcode, item number, description and quantity, should be stored in array list 2.
Existing code for extracting the data as text from the PDF:
PDFBoxReadFromFile.java
package pdfboxreadfromfile;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.io.*;
/**
 * Extracts the text of a fixed PDF file via {@code PDFManager}, prints it to standard
 * output and stores it in a fixed text file.
 */
public class PDFBoxReadFromFile {
    /**
     * Runs the extraction; I/O failures are logged at SEVERE level rather than propagated.
     *
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        PDFManager pdfManager = new PDFManager();
        pdfManager.setFilePath("C:\\Users\\34\\Documents\\test.pdf");
        try {
            String text = pdfManager.toText();
            System.out.println(text);
            File file = new File("C:/Users/34/eclipse-workspace/pdfboxreadfromfile/file.txt");
            // try-with-resources closes the writer (and the FileWriter beneath it) on every path.
            try (PrintWriter pw = new PrintWriter(new FileWriter(file))) {
                pw.println(text);
                // PrintWriter never throws on write errors; surface them explicitly.
                if (pw.checkError()) {
                    throw new IOException("Failed to write extracted text to " + file);
                }
            }
        } catch (IOException ex) {
            //System.err.println(ex.getMessage());
            Logger.getLogger(PDFBoxReadFromFile.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
}
PDFManager.Java
package pdfboxreadfromfile;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Small helper that loads a PDF from a configured path and extracts its text with PDFBox.
 * Call {@link #setFilePath(String)} before {@link #toText()}.
 */
public class PDFManager {
    private PDFParser parser;
    private PDFTextStripper pdfStripper;
    private PDDocument pdDoc;
    private COSDocument cosDoc;
    private String text;
    private String filePath;
    private File file;

    /** Creates a manager with no file path configured. */
    public PDFManager() {
    }

    /**
     * Parses the configured PDF and returns the text of all of its pages.
     * The parsed document stays available through {@link #getPdDoc()} afterwards.
     *
     * NOTE(review): the document opened here is not closed by this class; callers
     * presumably have to close it via {@code getPdDoc().close()} - confirm.
     *
     * @return the extracted text
     * @throws IOException if the file cannot be read or parsed
     * @throws NullPointerException if no file path has been set
     */
    public String toText() throws IOException {
        // Fail with an explanatory message instead of a bare NPE from new File(null).
        java.util.Objects.requireNonNull(filePath, "filePath must be set via setFilePath() before calling toText()");

        // Drop state from any earlier run so a failure below does not expose stale objects.
        this.pdfStripper = null;
        this.pdDoc = null;
        this.cosDoc = null;

        file = new File(filePath);
        parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        // Page numbers are 1-based for the stripper, so the first page is 1 rather than 0.
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        text = pdfStripper.getText(pdDoc);
        return text;
    }

    /**
     * Sets the path of the PDF file to read.
     *
     * @param filePath path of the PDF file
     */
    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }

    /**
     * Returns the document produced by the last {@link #toText()} call.
     *
     * @return the parsed document, or {@code null} if none has been parsed successfully
     */
    public PDDocument getPdDoc() {
        return pdDoc;
    }
}
java

how to reconstruct an org.archive.io.warc.WARCRecordInfo from an org.archive.io.ArchiveRecord?

Using Java, I need to read a WARC archive file, filter it depending on the content of each HTML page, and write a new archive file.
The following code reads the archive. How do I reconstruct an org.archive.io.warc.WARCRecordInfo from an org.archive.io.ArchiveRecord?
import org.apache.commons.io.IOUtils;
import org.archive.io.ArchiveRecord;
import org.archive.io.warc.*;
import org.archive.wayback.resourcestore.resourcefile.WarcResource;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;
public class Test126b {
public static void main() throws Exception {
File out = new java.io.File("out.warc.gz");
OutputStream bos = new BufferedOutputStream(new FileOutputStream(out));
WARCWriterPoolSettings settings = ...
WARCWriter writer = new WARCWriter(new AtomicInteger(), bos, out, settings);
File in = new java.io.File("in.warc.gz");
WARCReader reader = WARCReaderFactory.get(in);
Iterator<ArchiveRecord> it = reader.iterator();
while (it.hasNext()) {
ArchiveRecord archiveRecord = it.next();
if (archiveRecord.getHeader().getHeaderValue("WARC-Type") == "response") {
WARCRecord warcRecord = (WARCRecord) archiveRecord;
WarcResource warcResource = new WarcResource(warcRecord, reader);
warcResource.parseHeaders();
String url = warcResource.getWarcHeaders().getUrl();
System.out.println("+++ url: " + url);
byte[] content = IOUtils.toByteArray(warcResource);
String htmlPage = new String(content);
if (htmlPage.contains("hello world")) {
writer.writeRecord(warcRecordInfo) // how to reconstruct the WARCRecordInfo
}
}
}
reader.close();
writer.close();
}
}

Svg integration in pdf using flying saucer

I ran into a situation where I needed to convert HTML to PDF; thankfully, I was able to achieve this with the Flying Saucer API. However, my HTML contains svg tags, and after the conversion the SVG does not appear in the PDF. It can reportedly be achieved by following a Stack Overflow question
and tutorial.
What is meant by the replacedElementFactory?
// Build the chain: first the replacedElementFactory defined elsewhere, then the SVG factory.
final ChainingReplacedElementFactory factoryChain = new ChainingReplacedElementFactory();
factoryChain.addReplacedElementFactory(replacedElementFactory);
final SVGReplacedElementFactory svgFactory = new SVGReplacedElementFactory();
factoryChain.addReplacedElementFactory(svgFactory);
// Install the chain on the renderer's shared context.
renderer.getSharedContext().setReplacedElementFactory(factoryChain);
It's just an error in the tutorial, the line with replacedElementFactory is not needed.
Here is my working example.
Java:
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.OutputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.xhtmlrenderer.pdf.ITextRenderer;
/**
 * Renders {@code svg.html} to {@code svg.pdf} with Flying Saucer, using a replaced-element
 * factory chain that contains an {@code SVGReplacedElementFactory}.
 */
public class PdfSvg {
    /**
     * Parses the input document, lays it out and writes the resulting PDF to disk.
     *
     * @param args unused
     * @throws Exception if parsing, rendering or writing fails
     */
    public static void main(String[] args) throws Exception {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        // Namespace awareness is switched on before parsing the document that embeds the svg element.
        factory.setNamespaceAware(true);
        DocumentBuilder builder = factory.newDocumentBuilder();
        Document inputDoc = builder.parse("svg.html");

        // Render into memory first so the file is only created once the PDF has been produced.
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        ITextRenderer renderer = new ITextRenderer();
        ChainingReplacedElementFactory chainingReplacedElementFactory = new ChainingReplacedElementFactory();
        chainingReplacedElementFactory.addReplacedElementFactory(new SVGReplacedElementFactory());
        renderer.getSharedContext().setReplacedElementFactory(chainingReplacedElementFactory);
        renderer.setDocument(inputDoc, "");
        renderer.layout();
        renderer.createPDF(output);

        // try-with-resources flushes and closes the file handle, which was previously leaked.
        try (OutputStream fos = new FileOutputStream("svg.pdf")) {
            output.writeTo(fos);
        }
    }
}
HTML:
<html>
<head>
<style type="text/css">
svg {display: block;width:100mm;height:100mm}
</style>
</head>
<body>
<div>
<svg xmlns="http://www.w3.org/2000/svg">
<circle cx="50" cy="50" r="40" stroke="black" stroke-width="3"
fill="red" />
</svg>
</div>
</body>
</html>
The ChainingReplacedElementFactory, SVGReplacedElement and SVGReplacedElementFactory classes come from the tutorial.
If you want an in-page solution, here's an alternative using @cloudformatter, which is a remote formatting service. I added their JavaScript to your fiddle along with some text and your Highcharts chart.
http://jsfiddle.net/yk0Lxzg0/1/
// Inline onclick handler: asks the xepOnline formatter to render the element with id
// 'printme' and deliver the result as a download.
var click = "return xepOnline.Formatter.Format('printme', {render:'download'})";
// Assemble the button markup around the handler and attach it to the #buttons container.
jQuery('#buttons').append(['<button onclick="', click, '">PDF</button>'].join(''));
The above code placed in the fiddle will format the div with 'id' printme to PDF for download. That div includes your chart and some text.
http://www.cloudformatter.com/CSS2Pdf.APIDoc.Usage shows usage instructions and has many more samples of charts in SVG formatted to PDF either by themselves or as part of pages combined with text, tables and such.
@Rajesh, I hope you have already found a solution to your problem. If not (or if anyone else is having issues working with Flying Saucer, Batik and svg tags), then you might want to consider this:
Removing all clip-path="url(#highcharts-xxxxxxx-xx)" attributes from <g> tags did the trick for me.
My code is referring to the missing code part "SVGReplacedElementFactory".
And I use it like this:
renderer
.getSharedContext()
.setReplacedElementFactory( new B64ImgReplacedElementFactory() );
import com.itextpdf.text.BadElementException;
import com.itextpdf.text.Image;
import com.itextpdf.text.pdf.codec.Base64;
import org.apache.batik.transcoder.TranscoderException;
import org.apache.batik.transcoder.TranscoderInput;
import org.apache.batik.transcoder.TranscoderOutput;
import org.apache.batik.transcoder.image.JPEGTranscoder;
import org.apache.batik.transcoder.image.PNGTranscoder;
import org.w3c.dom.Element;
import org.xhtmlrenderer.extend.FSImage;
import org.xhtmlrenderer.extend.ReplacedElement;
import org.xhtmlrenderer.extend.ReplacedElementFactory;
import org.xhtmlrenderer.extend.UserAgentCallback;
import org.xhtmlrenderer.layout.LayoutContext;
import org.xhtmlrenderer.pdf.ITextFSImage;
import org.xhtmlrenderer.pdf.ITextImageElement;
import org.xhtmlrenderer.render.BlockBox;
import org.xhtmlrenderer.simple.extend.FormSubmissionListener;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
public class B64ImgReplacedElementFactory implements ReplacedElementFactory
{
public ReplacedElement createReplacedElement(LayoutContext c, BlockBox box, UserAgentCallback uac, int cssWidth, int cssHeight)
{
Element e = box.getElement();
if(e == null)
{
return null;
}
String nodeName = e.getNodeName();
if(nodeName.equals("img"))
{
String attribute = e.getAttribute("src");
FSImage fsImage;
try
{
fsImage = buildImage(attribute, uac);
}
catch(BadElementException e1)
{
fsImage = null;
}
catch(IOException e1)
{
fsImage = null;
}
if(fsImage != null)
{
if(cssWidth != -1 || cssHeight != -1)
{
fsImage.scale(cssWidth, cssHeight);
}
return new ITextImageElement(fsImage);
}
}
return null;
}
protected FSImage buildImage(String srcAttr, UserAgentCallback uac) throws IOException, BadElementException
{
if(srcAttr.startsWith("data:image/"))
{
// BASE64Decoder decoder = new BASE64Decoder();
// byte[] decodedBytes = decoder.decodeBuffer(b64encoded);
// byte[] decodedBytes = B64Decoder.decode(b64encoded);
byte[] decodedBytes = Base64.decode(srcAttr.substring(srcAttr.indexOf("base64,") + "base64,".length(), srcAttr.length()));
return new ITextFSImage(Image.getInstance(decodedBytes));
}
FSImage fsImage = uac.getImageResource(srcAttr).getImage();
if(fsImage == null)
{
return convertToPNG(srcAttr);
}
return null;
}
private FSImage convertToPNG(String srcAttr) throws IOException, BadElementException
{
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
PNGTranscoder t = new PNGTranscoder();
// t.addTranscodingHint(JPEGTranscoder.KEY_PIXEL_UNIT_TO_MILLIMETER, (25.4f / 72f));
t.addTranscodingHint(JPEGTranscoder.KEY_WIDTH, 4000.0F);
t.addTranscodingHint(JPEGTranscoder.KEY_HEIGHT, 4000.0F);
try
{
t.transcode(
new TranscoderInput(srcAttr),
new TranscoderOutput(byteArrayOutputStream)
);
}
catch(TranscoderException e)
{
e.printStackTrace();
}
byteArrayOutputStream.flush();
byteArrayOutputStream.close();
return new ITextFSImage(Image.getInstance(byteArrayOutputStream.toByteArray()));
}
public void remove(Element e)
{
}
#Override
public void setFormSubmissionListener(FormSubmissionListener formSubmissionListener)
{
}
public void reset()
{
}
}

Error in parsing the HTML using XMLworkerHelper

I am getting "Invalid nested tag font found, expected closing tag table" when trying to convert one HTML to PDF using iText. Following is the program I am using.
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Element;
import com.itextpdf.text.Font;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.Phrase;
import com.itextpdf.text.Font.FontFamily;
import com.itextpdf.text.pdf.ColumnText;
import com.itextpdf.text.pdf.GrayColor;
import com.itextpdf.text.pdf.PdfPageEventHelper;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorkerHelper;
public class HTML2PDF {
class Watermark extends PdfPageEventHelper{
//font for watermarking text
Font font = new Font(FontFamily.HELVETICA, 52, Font.ITALIC, new GrayColor(.75f));
public void onEndPage(PdfWriter writer, Document document){
ColumnText.showTextAligned(writer.getDirectContentUnder(),
Element.ALIGN_CENTER,
new Phrase("Watermarking Text", font),
297.5f, 421,-45);//-45 specifies the angle of Watermarking Text
}
}
public void convertHTML2PDF(){
//Create objects of Document and specify the Page size of a PDF
Document document = new Document(PageSize.A4);
try{
PdfWriter pdfWriter;
//get the instance of class PdfWriter with the document objects and output path
pdfWriter = PdfWriter.getInstance(document, new FileOutputStream("/opt/remedy/html2any-cmd-linux/bin/test.pdf"));
//setting the Watermarking in the PageEvent of PdfWriter
pdfWriter.setPageEvent(new Watermark());
document.open();
String htmlContent = "";
BufferedReader in = new BufferedReader(new FileReader("/opt/remedy/html2any-cmd-linux/bin/WO00000001004641434460592596_1.html"));
String temp;
//read the html files content and stores it in a String variable
while((temp = in.readLine())!=null){
htmlContent += temp;
}
in.close();
XMLWorkerHelper xmlWorker = XMLWorkerHelper.getInstance();
// converts the html into a PDF
xmlWorker.parseXHtml(pdfWriter, document, new StringReader(htmlContent));
document.close();
}
catch(Exception e)
{
e.printStackTrace();
}
}
//main method
public static void main(String[] args){
new HTML2PDF().convertHTML2PDF();
}
}
Some tags in HTML do not need to be closed. Could you suggest how this type of HTML can be parsed using XMLWorker?

how to add image and text in pdf for save html file

import com.itextpdf.text.Document;
import com.itextpdf.text.Image;
import com.itextpdf.text.html.simpleparser.HTMLWorker;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.text.pdf.codec.Base64.OutputStream;
import java.io.FileOutputStream;
import java.io.StringReader;
import java.net.URL;
public class myclass {
public static void main(String[] args) {
String result = "<html><body><div>(i) the recognised association shall have the approval of the Forward Markets Commission established under the Forward Contracts (Regulation) Act, 1952 (74 of 1952) in respect of trading in derivatives and shall function in accordance with the guidelines or conditions laid down by the Forward Markets Commission; </div> <body> </html>";
Document document = new Document();
OutputStream file = null;
try {
PdfWriter.getInstance(document, new FileOutputStream(
"E://Image.pdf"));
document.open();
PdfWriter.getInstance(document, file);
document.open();
#SuppressWarnings("deprecation")
HTMLWorker htmlWorker = new HTMLWorker(document);
htmlWorker.parse(new StringReader(result));
String imageUrl = "http://www.taxmann.com/emailer/demo/mobileAapp/newAppDesign.jpg";
Image image2 = Image.getInstance(new URL(imageUrl));
document.add(image2);
document.close();
file.flush();
document.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
I am trying to save an image and text in a PDF file. When I add either the text or the image alone it works fine, but I am not able to save both the image and the text in the PDF at the same time. How can I save both an image and text in a PDF? I am using iText.
The problem may be the wrong import for OutputStream, and that file is always null: you never assigned any output stream to it.
Try this:
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.StringReader;
import java.net.URL;
import com.itextpdf.text.Document;
import com.itextpdf.text.Image;
import com.itextpdf.text.html.simpleparser.HTMLWorker;
import com.itextpdf.text.pdf.PdfWriter;
public class ItextExample {
#SuppressWarnings("deprecation")
public static void main(String[] args) {
String result = "<html><body><div>(i) the recognised association shall have the approval of the Forward Markets Commission established under the Forward Contracts (Regulation) Act, 1952 (74 of 1952) in respect of trading in derivatives and shall function in accordance with the guidelines or conditions laid down by the Forward Markets Commission; </div> <body> </html>";
Document document = new Document();
OutputStream file = null;
try {
file = new FileOutputStream("E://Image1.pdf");
PdfWriter.getInstance(document,file);
document.open();
HTMLWorker htmlWorker = new HTMLWorker(document);
htmlWorker.parse(new StringReader(result));
String imageUrl = "http://www.taxmann.com/emailer/demo/mobileAapp/newAppDesign.jpg";
Image image2 = Image.getInstance(new URL(imageUrl));
document.add(image2);
} catch (Exception e) {
e.printStackTrace();
}finally {
try {
document.close();
file.flush();
}catch(Exception e) {
e.printStackTrace();
}
}
}
}

Categories