I am getting "Invalid nested tag font found, expected closing tag table" when trying to convert one HTML to PDF using iText. Following is the program I am using.
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Element;
import com.itextpdf.text.Font;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.Phrase;
import com.itextpdf.text.Font.FontFamily;
import com.itextpdf.text.pdf.ColumnText;
import com.itextpdf.text.pdf.GrayColor;
import com.itextpdf.text.pdf.PdfPageEventHelper;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorkerHelper;
public class HTML2PDF {
class Watermark extends PdfPageEventHelper{
//font for watermarking text
Font font = new Font(FontFamily.HELVETICA, 52, Font.ITALIC, new GrayColor(.75f));
public void onEndPage(PdfWriter writer, Document document){
ColumnText.showTextAligned(writer.getDirectContentUnder(),
Element.ALIGN_CENTER,
new Phrase("Watermarking Text", font),
297.5f, 421,-45);//-45 specifies the angle of Watermarking Text
}
}
public void convertHTML2PDF(){
//Create objects of Document and specify the Page size of a PDF
Document document = new Document(PageSize.A4);
try{
PdfWriter pdfWriter;
//get the instance of class PdfWriter with the document objects and output path
pdfWriter = PdfWriter.getInstance(document, new FileOutputStream("/opt/remedy/html2any-cmd-linux/bin/test.pdf"));
//setting the Watermarking in the PageEvent of PdfWriter
pdfWriter.setPageEvent(new Watermark());
document.open();
String htmlContent = "";
BufferedReader in = new BufferedReader(new FileReader("/opt/remedy/html2any-cmd-linux/bin/WO00000001004641434460592596_1.html"));
String temp;
//read the html files content and stores it in a String variable
while((temp = in.readLine())!=null){
htmlContent += temp;
}
in.close();
XMLWorkerHelper xmlWorker = XMLWorkerHelper.getInstance();
// converts the html into a PDF
xmlWorker.parseXHtml(pdfWriter, document, new StringReader(htmlContent));
document.close();
}
catch(Exception e)
{
e.printStackTrace();
}
}
//main method
public static void main(String[] args){
new HTML2PDF().convertHTML2PDF();
}
}
There are some tags in html which do not need to be closed. Could you suggest how this type of html can be parsed using XMLWorker?
Related
i have successfully generated barcode using itext5. but the problem i am having now is whenever i try to scan generated barcode via xlab scaner model YT-760 it fails to read it.
BarcodeInTable
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.Phrase;
import com.itextpdf.text.pdf.Barcode128;
import com.itextpdf.text.pdf.PdfContentByte;
import com.itextpdf.text.pdf.PdfPCell;
import com.itextpdf.text.pdf.PdfPTable;
import com.itextpdf.text.pdf.PdfWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
public class BarcodeInTable {
public static final String DEST = "D:/barcode_in_table.pdf";
public static void main(String[] args) throws IOException, DocumentException {
File file = new File(DEST);
file.getParentFile().mkdirs();
new BarcodeInTable().createPdf(DEST);
}
public void createPdf(String dest) throws IOException, DocumentException {
Document document = new Document();
PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(dest));
document.open();
String code = "675-FH-A12";
PdfContentByte cb = writer.getDirectContent();
PdfPTable table = new PdfPTable(1);
Barcode128 barcode128 = new Barcode128();
barcode128.setFont(null);
barcode128.setCode(code);
barcode128.setCodeType(Barcode128.CODE128);
Image barcode128Image = barcode128.createImageWithBarcode(cb,null,null);
PdfPCell cell = new PdfPCell();
cell.addElement(new Phrase("PO"+code));
cell.addElement(barcode128Image);
table.addCell(cell);
document.add(table);
document.close();
}
}
do i have to write codes to make printed barcode to make it readable ? or simply printing generated barcode will be scannable?
p.s i am printing barcode with sato SA408 printer to print barcode. even printing on plain paper and trying to scan won't work.
I am unable to parse the html input type value and convert it into pdf file.I am using pdfWriter to generate the pdf and using xmlworker-5.5.4.jar and itext.jar.It does not parse the input type value of html file and unable to convert into pdf file.This problem is generating when using htmlworker or xmlworker.
Code:
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Element;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.html.simpleparser.HTMLWorker;
import com.itextpdf.text.pdf.PdfWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.StringReader;
public class ParseHtml {
public static final String DEST = "D:/html_1.pdf";
public static void main(String[] args) throws IOException, DocumentException {
File file = new File(DEST);
file.getParentFile().mkdirs();
ParseHtml p=new ParseHtml();
p.createPdf(DEST);
}
#SuppressWarnings("deprecation")
public void createPdf(String file) throws IOException, DocumentException {
StringBuilder sb = new StringBuilder();
sb.append("<input type=\'text\' value=\"43645643634\"/>");
System.out.println("String------"+sb);
FileOutputStream outStream = new FileOutputStream(file);
Document document = new Document(PageSize.A4.rotate());
PdfWriter pdfWriter = PdfWriter.getInstance(document,outStream);
document.open();
document.newPage();
HTMLWorker htmlWorker = new HTMLWorker(document);
htmlWorker.parse(new StringReader(sb.toString()));
document.close();
outStream.close();
System.out.println("Document is created...");
}
}
I want to convert a document .doc that contains some images. How to convert it to *.html, so that the images will remain same position? How to store those images in separate folder named image and use this folder as a source for image?
My code:
import java.io.BufferedWriter;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import javax.swing.JEditorPane;
import javax.swing.JFrame;
import javax.swing.JScrollPane;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.w3c.dom.Document;
public class TestWordToHtmlConverter {
private File docFile;
private File file;
public TestWordToHtmlConverter(File docFile) {
this.docFile = docFile;
}
public void convert(File file) {
this.file = file;
try {
FileInputStream finStream=new FileInputStream(docFile.getAbsolutePath());
HWPFDocument doc=new HWPFDocument(finStream);
WordExtractor wordExtract=new WordExtractor(doc);
Document newDocument = DocumentBuilderFactory.newInstance() .newDocumentBuilder().newDocument();
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(newDocument) ;
wordToHtmlConverter.processDocument(doc);
StringWriter stringWriter = new StringWriter();
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
transformer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
transformer.setOutputProperty(OutputKeys.METHOD, "html");
transformer.transform(new DOMSource( wordToHtmlConverter.getDocument()), new StreamResult( stringWriter ) );
String html = stringWriter.toString();
FileOutputStream fos=new FileOutputStream(new File("html/sample.html"));
DataOutputStream dos;
try {
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fos,"UTF-8"));
out.write(html);
out.close();
}
catch (IOException e) {
e.printStackTrace();
}
/*JEditorPane editorPane = new JEditorPane();
editorPane.setContentType("text/html");
editorPane.setEditable(false);
editorPane.setPage(file.toURI().toURL());
JScrollPane scrollPane = new JScrollPane(editorPane);
JFrame f = new JFrame("Display Html File");
f.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
f.getContentPane().add(scrollPane);
f.setSize(512, 342);
f.setVisible(true);*/
} catch(Exception e) {
e.printStackTrace();
}
}
public static void main(String args[]) {
TestWordToHtmlConverter TTC=new TestWordToHtmlConverter(new File("docx/sample.doc"));
TTC.convert(TTC.docFile);
}
}
This implementation doesn't create images or links to them. This can
be changed by overriding AbstractWordConverter.processImage(Element,
boolean, Picture) method
As said in API docs:
WordToHtmlConverter doesn't create images or links to them. This can
be changed by overriding AbstractWordConverter.processImage(Element, boolean, Picture) method.
How to override method you can found here:
I need Apache POI Pictures converted from a word document to a html file
You can try using DOCX 2 XHTML converter based on Apache POI XWPF:
XWPFDocument 2 XHTML
Also you can use Apache Tika, built on top of Apache POI. An example that included in Alfresco can be found here:
HTMLRenderingEngine
There are also many other converters.
i want to convert html file with images to pdf using iText. I am providing my source here.
This is my HTML file...
<html>
<body>
<img src='' width='62' height='80' style='float: left; margin-right: 28px;' alt="" />
<!-- <img src="add.png" alt="" /> -->
</body>
</html>
I want to convert this html file to pdf...
Am using the following java code...
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.encoding.Encoding;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.w3c.tidy.Tidy;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.Pipeline;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerFontProvider;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.css.CssFilesImpl;
import com.itextpdf.tool.xml.css.StyleAttrCSSResolver;
import com.itextpdf.tool.xml.html.CssAppliersImpl;
import com.itextpdf.tool.xml.html.HTML;
import com.itextpdf.tool.xml.html.TagProcessor;
import com.itextpdf.tool.xml.html.TagProcessorFactory;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
import com.itextpdf.tool.xml.pipeline.html.ImageProvider;
import com.pdfcrowd.Client;
public class App
{
public static void main( String[] args ) throws DocumentException, IOException
{
// step 1
Document document = new Document();
document.newPage();
// step 2
PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream("pdf.pdf"));
// step 3
document.open();
// step 4
XMLWorkerHelper.getInstance().parseXHtml(writer, document,
new FileInputStream("index.html"));
//step 5
document.close();
System.out.println( "PDF Created!" );
}
}
Am getting the following error...
Exception in thread "main" ExceptionConverter: java.io.IOException: The document has no pages.
at com.itextpdf.text.pdf.PdfPages.writePageTree(PdfPages.java:113)
at com.itextpdf.text.pdf.PdfWriter.close(PdfWriter.java:1243)
at com.itextpdf.text.pdf.PdfDocument.close(PdfDocument.java:849)
at com.itextpdf.text.Document.close(Document.java:416)
at App.main(App.java:64)
Please help me out How can i convert html file with images to pdf using itext. I am able to convert that html file if i dont have images or if i hardcode the image path. Thanks in advance
You need to implement a custom image tag processor to process the images embedded inside your html:
package com.example.itext.processor;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.itextpdf.text.Chunk;
import com.itextpdf.text.Element;
import com.itextpdf.text.Image;
import com.itextpdf.text.log.Level;
import com.itextpdf.text.log.Logger;
import com.itextpdf.text.log.LoggerFactory;
import com.itextpdf.text.pdf.codec.Base64;
import com.itextpdf.tool.xml.NoCustomContextException;
import com.itextpdf.tool.xml.Tag;
import com.itextpdf.tool.xml.WorkerContext;
import com.itextpdf.tool.xml.exceptions.LocaleMessages;
import com.itextpdf.tool.xml.exceptions.RuntimeWorkerException;
import com.itextpdf.tool.xml.html.HTML;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;
public class ImageTagProcessor extends com.itextpdf.tool.xml.html.Image {
private final Logger logger = LoggerFactory.getLogger(getClass());
/*
* (non-Javadoc)
*
* #see com.itextpdf.tool.xml.TagProcessor#endElement(com.itextpdf.tool.xml.Tag, java.util.List, com.itextpdf.text.Document)
*/
#Override
public List<Element> end(final WorkerContext ctx, final Tag tag, final List<Element> currentContent) {
final Map<String, String> attributes = tag.getAttributes();
String src = attributes.get(HTML.Attribute.SRC);
List<Element> elements = new ArrayList<Element>(1);
if (null != src && src.length() > 0) {
Image img = null;
if (src.startsWith("data:image/")) {
final String base64Data = src.substring(src.indexOf(",") + 1);
try {
img = Image.getInstance(Base64.decode(base64Data));
} catch (Exception e) {
if (logger.isLogging(Level.ERROR)) {
logger.error(String.format(LocaleMessages.getInstance().getMessage(LocaleMessages.HTML_IMG_RETRIEVE_FAIL), src), e);
}
}
if (img != null) {
try {
final HtmlPipelineContext htmlPipelineContext = getHtmlPipelineContext(ctx);
elements.add(getCssAppliers().apply(new Chunk((com.itextpdf.text.Image) getCssAppliers().apply(img, tag, htmlPipelineContext), 0, 0, true), tag,
htmlPipelineContext));
} catch (NoCustomContextException e) {
throw new RuntimeWorkerException(e);
}
}
}
if (img == null) {
elements = super.end(ctx, tag, currentContent);
}
}
return elements;
}
}
Following code snippet registers the custom image tag processor and coverts an HTML document to PDF
public static void main(String[] args) {
convertHtmlToPdf();
}
private static void convertHtmlToPdf() {
try {
final OutputStream file = new FileOutputStream(new File("C:\\Test.pdf"));
final Document document = new Document();
final PdfWriter writer = PdfWriter.getInstance(document, file);
document.open();
final TagProcessorFactory tagProcessorFactory = Tags.getHtmlTagProcessorFactory();
tagProcessorFactory.removeProcessor(HTML.Tag.IMG);
tagProcessorFactory.addProcessor(new ImageTagProcessor(), HTML.Tag.IMG);
final CssFilesImpl cssFiles = new CssFilesImpl();
cssFiles.add(XMLWorkerHelper.getInstance().getDefaultCSS());
final StyleAttrCSSResolver cssResolver = new StyleAttrCSSResolver(cssFiles);
final HtmlPipelineContext hpc = new HtmlPipelineContext(new CssAppliersImpl(new XMLWorkerFontProvider()));
hpc.setAcceptUnknown(true).autoBookmark(true).setTagFactory(tagProcessorFactory);
final HtmlPipeline htmlPipeline = new HtmlPipeline(hpc, new PdfWriterPipeline(document, writer));
final Pipeline<?> pipeline = new CssResolverPipeline(cssResolver, htmlPipeline);
final XMLWorker worker = new XMLWorker(pipeline, true);
final Charset charset = Charset.forName("UTF-8");
final XMLParser xmlParser = new XMLParser(true, worker, charset);
final InputStream is = new FileInputStream("C:\\test.html");
xmlParser.parse(is, charset);
is.close();
document.close();
file.close();
} catch (Exception e) {
e.printStackTrace();
// TODO
}
}
This Exception will occur if there is no content in your pdf page.
Try Passing your InputStream like this
String str="<html>
<body>
<img src='' width='62' height='80' style='float: left; margin-right: 28px;' alt="" />
<!-- <img src="add.png" alt="" /> -->
</body>
</html>"
InputStream is = new ByteArrayInputStream(str.getBytes());
XMLWorkerHelper.getInstance().parseXHtml(writer, document, is);
import com.itextpdf.text.Document;
import com.itextpdf.text.Image;
import com.itextpdf.text.html.simpleparser.HTMLWorker;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.text.pdf.codec.Base64.OutputStream;
import java.io.FileOutputStream;
import java.io.StringReader;
import java.net.URL;
public class myclass {
public static void main(String[] args) {
String result = "<html><body><div>(i) the recognised association shall have the approval of the Forward Markets Commission established under the Forward Contracts (Regulation) Act, 1952 (74 of 1952) in respect of trading in derivatives and shall function in accordance with the guidelines or conditions laid down by the Forward Markets Commission; </div> <body> </html>";
Document document = new Document();
OutputStream file = null;
try {
PdfWriter.getInstance(document, new FileOutputStream(
"E://Image.pdf"));
document.open();
PdfWriter.getInstance(document, file);
document.open();
#SuppressWarnings("deprecation")
HTMLWorker htmlWorker = new HTMLWorker(document);
htmlWorker.parse(new StringReader(result));
String imageUrl = "http://www.taxmann.com/emailer/demo/mobileAapp/newAppDesign.jpg";
Image image2 = Image.getInstance(new URL(imageUrl));
document.add(image2);
document.close();
file.flush();
document.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
I am trying to save image and text in pdf file. When we set Either text or image then it's working fine, simultaneously am not able to save image and text Both in pdf. How will I save image and text both in Pdf? I am Using iText.
May the problem is Wrong import for Outputstream and file is Always null . You never assigned any O/P strream.
Try this
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.StringReader;
import java.net.URL;
import com.itextpdf.text.Document;
import com.itextpdf.text.Image;
import com.itextpdf.text.html.simpleparser.HTMLWorker;
import com.itextpdf.text.pdf.PdfWriter;
public class ItextExample {
#SuppressWarnings("deprecation")
public static void main(String[] args) {
String result = "<html><body><div>(i) the recognised association shall have the approval of the Forward Markets Commission established under the Forward Contracts (Regulation) Act, 1952 (74 of 1952) in respect of trading in derivatives and shall function in accordance with the guidelines or conditions laid down by the Forward Markets Commission; </div> <body> </html>";
Document document = new Document();
OutputStream file = null;
try {
file = new FileOutputStream("E://Image1.pdf");
PdfWriter.getInstance(document,file);
document.open();
HTMLWorker htmlWorker = new HTMLWorker(document);
htmlWorker.parse(new StringReader(result));
String imageUrl = "http://www.taxmann.com/emailer/demo/mobileAapp/newAppDesign.jpg";
Image image2 = Image.getInstance(new URL(imageUrl));
document.add(image2);
} catch (Exception e) {
e.printStackTrace();
}finally {
try {
document.close();
file.flush();
}catch(Exception e) {
e.printStackTrace();
}
}
}
}