Convert a Word document to HTML with embedded images using Tika (Java)

I'm new to Tika. I'm trying to convert Microsoft Word documents to HTML using Tika, via the TikaOnDotNet wrapper so that Tika can be used on the .NET Framework. My conversion code looks like this:
byte[] file = Files.toByteArray(new File(@"myPath\document.doc"));
AutoDetectParser tikaParser = new AutoDetectParser();
ByteArrayOutputStream output = new ByteArrayOutputStream();
SAXTransformerFactory factory = (SAXTransformerFactory)TransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
handler.setResult(new StreamResult(output));
ExpandedTitleContentHandler handler1 = new ExpandedTitleContentHandler(handler);
tikaParser.parse(new ByteArrayInputStream(file), handler1, new Metadata());
File ofile = new File(@"C:\toHtml\text.html");
ofile.createNewFile();
DataOutputStream stream = new DataOutputStream(new FileOutputStream(ofile));
output.writeTo(stream);
Everything works well except for the embedded images. The generated HTML contains image tags like:
<img src="embedded:image2.wmf" alt="image2.wmf"/>
but the image source does not exist. Please advise me.

Credit goes to @Gagravarr.
Please note that this is a simplified implementation; the original code is available in the comments on the question.
This implementation is based on the TikaOnDotNet wrapper:
public class DocToHtml
{
private TikaConfig config = TikaConfig.getDefaultConfig();
public void Convert()
{
byte[] file = Files.toByteArray(new File(@"filename.doc"));
AutoDetectParser tikaParser = new AutoDetectParser();
ByteArrayOutputStream output = new ByteArrayOutputStream();
SAXTransformerFactory factory = (SAXTransformerFactory)TransformerFactory.newInstance();
var inputStream = new ByteArrayInputStream(file);
// ToHTMLContentHandler handler = new ToHTMLContentHandler();
var metaData = new Metadata();
EncodingDetector encodingDetector = new UniversalEncodingDetector();
var encode = encodingDetector.detect(inputStream, metaData) ?? new UTF_32();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encode.toString());
handler.setResult(new StreamResult(output));
ContentHandler imageRewriting = new ImageRewritingContentHandler(handler);
// ExpandedTitleContentHandler handler1 = new ExpandedTitleContentHandler(handler);
ParseContext context = new ParseContext();
context.set(typeof(EmbeddedDocumentExtractor), new FileEmbeddedDocumentExtractor());
tikaParser.parse(inputStream, imageRewriting, new Metadata(), context);
byte[] array = output.toByteArray();
System.IO.File.WriteAllBytes(@"C:\toHtml\text.html", array);
}
private class ImageRewritingContentHandler : ContentHandlerDecorator
{
public ImageRewritingContentHandler(ContentHandler handler) : base(handler)
{
}
public override void startElement(string uri, string localName, string name, Attributes origAttrs)
{
if ("img".Equals(localName))
{
AttributesImpl attrs;
if (origAttrs is AttributesImpl)
attrs = (AttributesImpl)origAttrs;
else
attrs = new AttributesImpl(origAttrs);
for (int i = 0; i < attrs.getLength(); i++)
{
if ("src".Equals(attrs.getLocalName(i)))
{
String src = attrs.getValue(i);
if (src.StartsWith("embedded:"))
{
var newSrc = src.Replace("embedded:", @"images\");
attrs.setValue(i, newSrc);
}
}
}
attrs.addAttribute(null, "width", "width","width", "100px");
base.startElement(uri, localName, name, attrs);
}
else
base.startElement(uri, localName, name, origAttrs);
}
}
private class FileEmbeddedDocumentExtractor : EmbeddedDocumentExtractor
{
private int count = 0;
public bool shouldParseEmbedded(Metadata m)
{
return true;
}
public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, bool outputHtml)
{
Detector detector = new DefaultDetector();
string name = metadata.get("resourceName");
MediaType contentType = detector.detect(inputStream, metadata);
if (contentType.getType() != "image") return;
var embeddedFile = name;
File outputFile = new File(@"C:\toHtml\images", embeddedFile);
try
{
using (FileOutputStream os = new FileOutputStream(outputFile))
{
var tin = inputStream as TikaInputStream;
if (tin != null)
{
if (tin.getOpenContainer() != null && tin.getOpenContainer() is DirectoryEntry)
{
// NOTE: as written this writes out an empty OLE2 container; the entries of
// (DirectoryEntry) tin.getOpenContainer() still need to be copied into fs
// before calling writeFilesystem for the embedded object to survive.
POIFSFileSystem fs = new POIFSFileSystem();
fs.writeFilesystem(os);
}
else
{
IOUtils.copy(inputStream, os);
}
}
}
}
catch (Exception ex)
{
throw;
}
}
}
}
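After the rewrite, the generated markup points at the extracted files, for example <img src="images\image2.wmf" alt="image2.wmf" width="100px"/>, so the references resolve as long as the images folder (C:\toHtml\images in this sample) sits next to the generated text.html.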

Related

In Java, how do you deal with double quotes inside a CSV that you need to parse?

Here is what I want to do.
This is my spend.csv file:
"Date","Description","Detail","Amount"
"5/03/21","Cinema","Batman","7.90"
"15/02/20","Groceries","Potatoes","23.00"
"9/12/21","DIY","Wood Plates","33.99"
"9/07/22","Fuel","Shell","$56.00"
"23/08/19","Lamborghini","Aventador","800,000.00"
From a table view: (screenshot of the CSV rendered as a table)
And here is what I want as my output file, named spend.xml:
<?xml version="1.0" encoding="UTF-8"?>
<SPEND>
<RECORD DATE="5/03/21">
<DESC>Cinema</DESC>
<DETAIL>Batman</DETAIL>
<AMOUNT>7.90</AMOUNT>
</RECORD>
<RECORD DATE="15/02/20">
<DESC>Groceries</DESC>
<DETAIL>Potatoes</DETAIL>
<AMOUNT>23.00</AMOUNT>
</RECORD>
<RECORD DATE="9/12/21">
<DESC>DIY</DESC>
<DETAIL>Wood Plates</DETAIL>
<AMOUNT>33.99</AMOUNT>
</RECORD>
<RECORD DATE="9/07/22">
<DESC>Fuel</DESC>
<DETAIL>Shell</DETAIL>
<AMOUNT>$56.00</AMOUNT>
</RECORD>
<RECORD DATE="23/08/19">
<DESC>Lamborghini</DESC>
<DETAIL>Aventador</DETAIL>
<AMOUNT>800,000.00</AMOUNT>
</RECORD>
</SPEND>
In order to do that, I found some stuff here and there and managed to get this:
public class Main {
public static void main(String[] args) throws FileNotFoundException {
List<String> headers = new ArrayList<String>(5);
File file = new File("spend.csv");
BufferedReader reader = null;
try {
DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder domBuilder = domFactory.newDocumentBuilder();
Document newDoc = domBuilder.newDocument();
// Root element
Element rootElement = newDoc.createElement("XMLCreators");
newDoc.appendChild(rootElement);
reader = new BufferedReader(new FileReader(file));
int line = 0;
String text = null;
while ((text = reader.readLine()) != null) {
StringTokenizer st = new StringTokenizer(text, "", false);
int index = 0;
String[] rowValues = text.split(",");
if (line == 0) { // Header row
for (String col : rowValues) {
headers.add(col);
}
} else { // Data row
Element rowElement = newDoc.createElement("RECORDS");
rootElement.appendChild(rowElement);
for (int col = 0; col < headers.size(); col++) {
String header = headers.get(col);
String value = null;
if (col < rowValues.length) {
value = rowValues[col];
} else {
value = "";
}
Element curElement = newDoc.createElement(header);
curElement.appendChild(newDoc.createTextNode(value));
rowElement.appendChild(curElement);
}
}
line++;
}
ByteArrayOutputStream baos = null;
OutputStreamWriter osw = null;
try {
baos = new ByteArrayOutputStream();
osw = new OutputStreamWriter(baos);
TransformerFactory tranFactory = TransformerFactory.newInstance();
Transformer aTransformer = tranFactory.newTransformer();
aTransformer.setOutputProperty(OutputKeys.INDENT, "yes");
aTransformer.setOutputProperty(OutputKeys.METHOD, "xml");
aTransformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
Source src = new DOMSource(newDoc);
Result result = new StreamResult(osw);
aTransformer.transform(src, result);
osw.flush();
System.out.println(new String(baos.toByteArray()));
} catch (Exception exp) {
exp.printStackTrace();
} finally {
try {
osw.close();
} catch (Exception e) {
}
try {
baos.close();
} catch (Exception e) {
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
At this point the program should print the XML file to the terminal, but sadly, because of the double quotes around each value in my CSV file, I'm getting this error:
org.w3c.dom.DOMException: INVALID_CHARACTER_ERR: An invalid or illegal XML character is specified
I think I'm missing something around these lines:
StringTokenizer st = new StringTokenizer(text, "", false);
int index = 0;
String[] rowValues = text.split(",");
I would like to keep the double quotes in my CSV file. If anyone has an idea, please feel free to tell me!
Before you run your conversion, do a
String.replaceAll("\"", "####")
Then run the conversion and, when it is complete, reverse it by replacing all of the "####" placeholders in the string with double quotes.
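For what it's worth, here is a minimal alternative sketch (a different route than the placeholder trick above): strip the enclosing quotes from each field before using it as an element name or text value. It assumes no field contains an escaped quote or an embedded comma, and the helper name is mine, not from the code above.
private static String[] splitQuotedLine(String text) {
    // Split one quoted CSV line and drop the surrounding quotes from each field,
    // so "Date" becomes Date and can safely be used as an XML element name.
    String[] rowValues = text.split(",");
    for (int i = 0; i < rowValues.length; i++) {
        rowValues[i] = rowValues[i].replaceAll("^\"|\"$", "");
    }
    return rowValues;
}
Calling splitQuotedLine(text) in place of the plain text.split(",") in the loop above avoids feeding quoted names into createElement.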
Another possible approach using OpenCsv and Jackson:
public class FileProcessor {
public static void main(String[] args) throws IOException {
List<DataStructure> importList = new CsvToBeanBuilder<DataStructure>(
new FileReader("pathIn"))
.withIgnoreEmptyLine(true)
.withType(DataStructure.class)
.build()
.parse();
ListLoader exportList = new ListLoader(importList);
XmlMapper xmlMapper = new XmlMapper();
xmlMapper.configure(ToXmlGenerator.Feature.WRITE_XML_DECLARATION, true)
.enable(SerializationFeature.INDENT_OUTPUT)
.writeValue(new File("pathOut"), exportList);
}
}
Class to serialize each element:
@Data
public class DataStructure {
@CsvBindByName
@JacksonXmlProperty(isAttribute = true, localName = "DATE")
private String date;
@CsvBindByName
@JacksonXmlProperty(localName = "DESC")
private String description;
@CsvBindByName
@JacksonXmlProperty(localName = "DETAIL")
private String detail;
@CsvBindByName
@JacksonXmlProperty(localName = "AMOUNT")
private String amount;
}
Class to serialize full list:
@JacksonXmlRootElement(localName = "SPEND")
public class ListLoader {
@JacksonXmlElementWrapper(useWrapping = false)
@JacksonXmlProperty(localName = "RECORD")
private List<DataStructure> list;
public ListLoader(List<DataStructure> list){
this.list = list;
}
}
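For reference, this approach assumes OpenCSV (CsvToBeanBuilder, @CsvBindByName), Jackson's jackson-dataformat-xml module (XmlMapper and the @JacksonXml... annotations) and Lombok (@Data) are on the classpath. With no column name given, @CsvBindByName binds each field to the CSV header of the same name, ignoring case, so the headers in spend.csv map straight onto the fields above.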

iText: sign a PDF with a Base64 signature string from the client

I am trying to sign a PDF document with a signature that comes entirely from the client in Base64 format.
The service makes a request to calculate the hash of the document.
I take the content of the PDF document and calculate the hash from it according to the algorithm.
The service takes the received hash and signs it, then sends the resulting signature along with the bytes of the document to be signed.
I receive a Base64 string and the PDF bytes to be signed.
Is this case possible? Here is a code example:
public byte[] insertSignature(byte[] document, String signature) {
try (InputStream inputStream = new ByteArrayInputStream(document);
ByteArrayOutputStream os = new ByteArrayOutputStream();
ByteArrayOutputStream result = new ByteArrayOutputStream()) {
byte[] decodeSignature = Base64.decodeBase64(signature);
CAdESSignature cades = new CAdESSignature(decodeSignature, null, null);
var certificate = cades.getCAdESSignerInfo(0).getSignerCertificate();
var subject = new Subject(certificate.getSubjectX500Principal().getEncoded());
List<String> names = getSignaturesFields(document);
String sigFieldName = String.format("Signature %s", names.size() + 1);
PdfName filter = PdfName.Adobe_PPKLite;
PdfName subFilter = PdfName.ETSI_CAdES_DETACHED;
int estimatedSize = 8192;
PdfReader reader = new PdfReader(inputStream);
StampingProperties stampingProperties = new StampingProperties();
if (names.size() > 1) {
stampingProperties.useAppendMode();
}
PdfSigner signer = new PdfSigner(reader, os, stampingProperties);
signer.setCertificationLevel(PdfSigner.CERTIFIED_NO_CHANGES_ALLOWED);
PdfSignatureAppearance appearance = signer.getSignatureAppearance();
appearance
.setContact(subject.email().orElse(""))
.setSignatureCreator(subject.organizationName().orElse(""))
.setLocation(subject.country())
.setReuseAppearance(false)
.setPageNumber(1);
signer.setFieldName(sigFieldName);
ContainerForPrepareSignedDocument external = new ContainerForPrepareSignedDocument(filter, subFilter);
signer.signExternalContainer(external, estimatedSize);
byte[] preSignedBytes = os.toByteArray();
ContainerReadyToSignedDocument extSigContainer = new ContainerReadyToSignedDocument(decodeSignature);
PdfDocument docToSign = new PdfDocument(new PdfReader(new ByteArrayInputStream(preSignedBytes)));
PdfSigner.signDeferred(docToSign, sigFieldName, result, extSigContainer);
docToSign.close();
return result.toByteArray();
}
catch (IOException e) {
throw new InternalException("IO exception by insert signature to document:", e);
}
catch (GeneralSecurityException e) {
throw new InternalException("General security by insert signature to document:", e);
}
catch (CAdESException e) {
throw new InternalException("CAdESException by insert signature to document:", e);
}
}
private List<String> getSignaturesFields(byte[] document)
throws IOException {
try (InputStream inputStream = new ByteArrayInputStream(document);
PdfReader reader = new PdfReader(inputStream);
PdfDocument pdfDocument = new PdfDocument(reader)) {
SignatureUtil signUtil = new SignatureUtil(pdfDocument);
return signUtil.getSignatureNames();
}
}
static class ContainerForPrepareSignedDocument implements IExternalSignatureContainer {
private final PdfName filter;
private final PdfName subFilter;
public ContainerForPrepareSignedDocument(PdfName filter,
PdfName subFilter) {
this.filter = filter;
this.subFilter = subFilter;
}
public byte[] sign(InputStream docBytes) {
return new byte[0];
}
public void modifySigningDictionary(PdfDictionary signDic) {
signDic.put(PdfName.Filter, filter);
signDic.put(PdfName.SubFilter, subFilter);
}
}
static class ContainerReadyToSignedDocument implements IExternalSignatureContainer {
private byte[] cmsSignatureContents;
public ContainerReadyToSignedDocument(byte[] cmsSignatureContents) {
this.cmsSignatureContents = cmsSignatureContents;
}
public byte[] sign(InputStream docBytes) {
return cmsSignatureContents;
}
public void modifySigningDictionary(PdfDictionary signDic) {
}
}
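One detail the flow above depends on: the hash the client signs has to be computed over the byte range of the prepared document, not over the original PDF bytes. Here is a hedged sketch of how that digest could be captured during signExternalContainer; the class name, the SHA-256 choice and the getter are my assumptions, not part of the code above (it also needs java.security.MessageDigest in the imports).
static class DigestCalculatingContainer implements IExternalSignatureContainer {
    private final PdfName filter;
    private final PdfName subFilter;
    private byte[] documentDigest;
    DigestCalculatingContainer(PdfName filter, PdfName subFilter) {
        this.filter = filter;
        this.subFilter = subFilter;
    }
    public byte[] sign(InputStream docBytes) throws GeneralSecurityException {
        // iText hands over the content of the signed byte range here; hash it
        // and return an empty placeholder instead of a real signature.
        try {
            MessageDigest md = MessageDigest.getInstance("SHA-256");
            byte[] buffer = new byte[8192];
            for (int n; (n = docBytes.read(buffer)) > 0; ) {
                md.update(buffer, 0, n);
            }
            documentDigest = md.digest();
        } catch (IOException e) {
            throw new GeneralSecurityException(e);
        }
        return new byte[0]; // the real CMS container is injected later via signDeferred
    }
    public void modifySigningDictionary(PdfDictionary signDic) {
        signDic.put(PdfName.Filter, filter);
        signDic.put(PdfName.SubFilter, subFilter);
    }
    public byte[] getDocumentDigest() {
        return documentDigest;
    }
}
The digest from getDocumentDigest() is what would be sent to the client for signing; the Base64 CMS that comes back then goes through insertSignature as shown above.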

Avoid namespaces while parsing XML with Woodstox

I am trying to parse an XML file and remove namespaces and prefixes using the Woodstox parser (the XML contains nested elements, and each element carries a namespace at every level).
Below is the code I use to parse. I get the same output as the input I pass in. Please help me resolve the issue.
byte[] byteArray = null;
try {
File file = new File(xmlFileName);
byteArray = new byte[(int) file.length()];
byteArray = FileUtils.readFileToByteArray(file);
} catch (Exception e) {
e.printStackTrace();
}
InputStream articleStream = new ByteArrayInputStream(byteArray);
WstxInputFactory xmlInputFactory = (WstxInputFactory) XMLInputFactory.newInstance();
xmlInputFactory.configureForSpeed();
// xmlInputFactory.configureForXmlConformance();
XMLStreamReader2 xmlStreamReader = (XMLStreamReader2) xmlInputFactory.createXMLStreamReader(articleStream,
StandardCharsets.UTF_8.name());
xmlStreamReader.setProperty(XMLInputFactory.IS_COALESCING, true);
WstxOutputFactory xmloutFactory = (WstxOutputFactory) XMLOutputFactory2.newInstance();
StringWriter sw = new StringWriter();
XMLEventWriter xw = null;
XMLStreamWriter2 xmlwriter = (XMLStreamWriter2) xmloutFactory.createXMLStreamWriter(sw,
StandardCharsets.UTF_8.name());
xmlwriter.setNamespaceContext(new NamespaceContext() {
@Override
public String getNamespaceURI(String prefix) {
return "";
}
@Override
public String getPrefix(String namespaceURI) {
return "";
}
@Override
public Iterator getPrefixes(String namespaceURI) {
return null;
}
});
while (xmlStreamReader.hasNext()) {
xmlStreamReader.next();
xmlwriter.copyEventFromReader(xmlStreamReader, false);
}
System.out.println("str" + xmlwriter.getNamespaceContext().getPrefix(""));
xmlwriter.closeCompletely();
xmlwriter.flush();
xmlStreamReader.closeCompletely();
xmlStreamReader.close();
If you want to remove all namespace prefixes and bindings, you should NOT use the copy methods; they will literally copy those things. Instead, read the element and attribute names but write them out using only their local names, leaving namespaceURI and prefix as null (or use the writer methods that take only a local name).
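To make that concrete, here is a minimal sketch using plain StAX calls (so it works with the Woodstox reader and writer created above); comments, processing instructions and the like are simply skipped, which is a simplification of this sketch, not a requirement:
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;
// Copies elements, attributes and text from the reader to the writer using
// local names only, so no namespace declarations or prefixes are written.
static void copyWithoutNamespaces(XMLStreamReader reader, XMLStreamWriter writer)
        throws XMLStreamException {
    while (reader.hasNext()) {
        switch (reader.next()) {
        case XMLStreamConstants.START_ELEMENT:
            writer.writeStartElement(reader.getLocalName());
            for (int i = 0; i < reader.getAttributeCount(); i++) {
                writer.writeAttribute(reader.getAttributeLocalName(i),
                        reader.getAttributeValue(i));
            }
            break;
        case XMLStreamConstants.CHARACTERS:
            writer.writeCharacters(reader.getText());
            break;
        case XMLStreamConstants.END_ELEMENT:
            writer.writeEndElement();
            break;
        default:
            break; // comments, PIs, etc. are ignored in this sketch
        }
    }
    writer.writeEndDocument();
}
Calling copyWithoutNamespaces(xmlStreamReader, xmlwriter) in place of the copyEventFromReader loop above should produce output with no namespace declarations at all.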

Java integration with SSRS

Hi, I am looking into using Java (to be deployed as servlets in WebSphere 8.5) to integrate with SSRS. I have looked into some of the sample code out there and tried it out.
private static SoapHeader createExecutionIdSoapHeader(String executionId) {
Document doc = DOMUtils.createDocument();
Element executionHeaderElement = doc.createElement("ExecutionHeader");
executionHeaderElement.setAttribute("xmlns", XML_NAMESPACE);
Element executionIdElement = doc.createElement("ExecutionID");
executionIdElement.setTextContent(executionId);
executionHeaderElement.appendChild(executionIdElement);
SoapHeader soapH = new SoapHeader(new QName(XML_NAMESPACE, "ExecutionHeader"), executionHeaderElement);
return soapH;
}
public static Holder<byte[]> getReportResult(String output_type, String reportFolder, String reportName, ArrayOfParameterValue arrayOfParameterValue) {
Holder<byte[]> result = null;
try {
String historyID = null;
String executionID = null;
ReportExecutionServiceSoap service = getExecutionService();
BindingProvider bp = (BindingProvider) service;
bp.getRequestContext().put(BindingProvider.USERNAME_PROPERTY, authenticator.getUsername());
bp.getRequestContext().put(BindingProvider.PASSWORD_PROPERTY, authenticator.getPassword());
ExecutionInfo info = new ExecutionInfo();
info = service.loadReport(REPORT_PATH, historyID);
executionID = info.getExecutionID();
List<Header> headers = new ArrayList<Header>();
SoapHeader header = createExecutionIdSoapHeader(executionID);
headers.add(header);
bp.getRequestContext().put(Header.HEADER_LIST, headers);
if (!arrayOfParameterValue.getParameterValue().isEmpty()) {
service.setExecutionParameters(arrayOfParameterValue, "en-us");
}
// Default to return HTML4.0
String deviceInfo = "";
if (output_type == null || output_type.isEmpty()) {
output_type = "HTML4.0";
}
if ("IMAGE".equalsIgnoreCase(output_type)) {
deviceInfo = RENDER_DEVICE_INFO_IMAGE;
} else {
deviceInfo = RENDER_DEVICE_INFO_HTML;
}
result = new Holder<byte[]>();
Holder<String> extension = new Holder<String>();
Holder<String> mimeType = new Holder<String>();
Holder<String> encoding = new Holder<String>();
Holder<ArrayOfWarning> warnings = new Holder<ArrayOfWarning>();
Holder<ArrayOfString> streamIDs = new Holder<ArrayOfString>();
service.render(output_type, deviceInfo, result, extension, mimeType, encoding, warnings, streamIDs);
} catch (Throwable th) {
th.printStackTrace();
}
return result;
}
protected void doGet(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
try {
ArrayOfParameterValue arrayOfParameterValue = new ArrayOfParameterValue();
List<ParameterValue> parameters = arrayOfParameterValue.getParameterValue();
ParameterValue parameterValue = new ParameterValue();
parameterValue.setName(PARAMETER_NAME);
parameterValue.setValue(PARAMETER_VALUE);
parameters.add(parameterValue);
Holder<byte[]> result = GenerateReport.getReportResult(REPORT_FORMAT, REPORT_FOLDER, REPORT_NAME,
arrayOfParameterValue);
System.out.println("--------------------------------- Writing to Browser --------------------------------");
ServletOutputStream out = response.getOutputStream();
out.write(result.value);
out.flush();
out.close();
System.out.println("--------------------------------- Writing to File -----------------------------------");
DateFormat df = new SimpleDateFormat("dd_MM_yy_HH_mm_ss_");
Date date = new Date();
String filename = df.format(date) + "SSRS_Report.pdf";
FileOutputStream o = new FileOutputStream("C:\\Users\\keh\\Desktop\\Temp\\" + filename);
o.write(result.value);
o.flush();
o.close();
} catch (Exception e) {
e.printStackTrace();
}
}
When I run the code, I get this error:
[5/17/17 19:21:02:704 SGT] 000000c4 SystemErr R javax.xml.ws.soap.SOAPFaultException: The session identifier is missing. A session identifier is required for this operation. ---> Microsoft.ReportingServices.Diagnostics.Utilities.MissingSessionIdException: The session identifier is missing. A session identifier is required for this operation.
Can any expert out there point me to a solution, please?
P.S. I have tried to use WSBindingProvider as shown in Surendra Gurjar's blog and it ran beautifully on an Apache server, but I got a ClassCastException when I deployed it to WebSphere.

Training OpenNLP for the Thai language

I am experimenting with OpenNLP 1.7.2 and maxent-3.0.0.jar to train for the Thai language. Below is the code that reads the Thai training data and creates the .bin model.
public class TrainPerson {
public static void main(String[] args) throws IOException {
String trainFile = "/Documents/workspace/ThaiOpenNLP/bin/thaiPerson.train";
String modelFile = "/Documents/workspace/ThaiOpenNLP/bin/th-ner-person.bin";
writePersonModel(trainFile, modelFile);
}
private static void writePersonModel(String trainFile, String modelFile)
throws FileNotFoundException, IOException {
Charset charset = Charset.forName("UTF-8");
InputStreamFactory fileInputStream = new MarkableFileInputStreamFactory(new File(trainFile));
ObjectStream<String> lineStream = new PlainTextByLineStream(fileInputStream, charset);
ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
TokenNameFinderModel model;
try {
model = NameFinderME.train("th", "person", sampleStream , TrainingParameters.defaultParams(), new TokenNameFinderFactory());
} finally {
sampleStream.close();
}
BufferedOutputStream modelOut = null;
try {
modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
model.serialize(modelOut);
} finally {
if (modelOut != null) {
modelOut.close();
}
}
}}
The Thai data looks like what is attached in the trainingData file.
I am using the output model to detect person names, as shown in the program below. It fails to identify the name.
public class ThaiPersonNameFinder {
static String modelFile = "/Users/avinashpaula/Documents/workspace/ThaiOpenNLP/bin/th-ner-person.bin";
public static void main(String[] args) {
try {
InputStream modelIn = new FileInputStream(new File(modelFile));
TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
NameFinderME nameFinder = new NameFinderME(model);
String sentence[] = new String[]{
"จอห์น",
"30",
"ปี",
"จะ",
"เข้าร่วม",
"ก",
"เริ่มต้น",
"ขึ้น",
"บน",
"มกราคม",
"."
};
Span nameSpans[] = nameFinder.find(sentence);
for (int i = 0; i < nameSpans.length; i++) {
System.out.println(nameSpans[i]);
}
}
catch (IOException e) {
e.printStackTrace();
}
}
}
What am I doing wrong?
