Apache POI replace text in docx with Java - java

I can replace the text inside the table and the footer, but I can't replace the text outside the table, and I don't know why.
Does anyone have an idea how to replace a placeholder such as ${name} in a paragraph outside the table?
I want that in the Map.
public static boolean changWord(String inputUrl, String outputUrl, Map<String, String> textMap) {
// Template conversion default success
boolean changeFlag = true;
try {
File file = new File(outputUrl);
FileOutputStream stream = new FileOutputStream(file);
#SuppressWarnings("resource")
XWPFDocument document = new XWPFDocument(POIXMLDocument.openPackage(inputUrl));
WorderToNewWordUtils.changeText(document, textMap);
document.write(stream);
stream.close();
} catch (IOException e) {
e.printStackTrace();
changeFlag = false;
}
return changeFlag;
}
/**
 * Replaces placeholders in the body paragraphs, in all table cells and in all footers
 * of the document.
 *
 * @param document document whose runs are rewritten in place
 * @param textMap  placeholder names mapped to their replacement text
 */
public static void changeText(XWPFDocument document, Map<String, String> textMap) {
    // Paragraphs directly in the document body
    replaceInParagraphs(document.getParagraphs(), textMap);
    // Paragraphs inside table cells
    for (XWPFTable tbl : document.getTables()) {
        for (XWPFTableRow row : tbl.getRows()) {
            for (XWPFTableCell cell : row.getTableCells()) {
                replaceInParagraphs(cell.getParagraphs(), textMap);
            }
        }
    }
    // Paragraphs inside footers
    for (XWPFFooter footer : document.getFooterList()) {
        replaceInParagraphs(footer.getParagraphs(), textMap);
    }
}

/** Rewrites every run of the given paragraphs whose first text element contains a placeholder marker. */
private static void replaceInParagraphs(java.util.List<XWPFParagraph> paragraphs, Map<String, String> textMap) {
    for (XWPFParagraph p : paragraphs) {
        for (XWPFRun r : p.getRuns()) {
            String text = r.getText(0);
            // getText(0) is null when the run has no text element; skip such runs
            // instead of passing null on to checkText.
            if (text != null && checkText(text)) {
                r.setText(changeValue(r.toString(), textMap), 0);
            }
        }
    }
}
/**
 * Tells whether the text may contain a placeholder, i.e. whether it contains a '$' character.
 *
 * @param text text to inspect; may be {@code null}
 * @return {@code true} if {@code text} is non-null and contains '$'
 */
public static boolean checkText(String text) {
    // Null-safe: callers may hand over the text of a run that has none.
    return text != null && text.indexOf('$') != -1;
}
/**
 * Substitutes every {@code ${key}} placeholder found in {@code value} with the mapped text.
 * Text surrounding a placeholder is kept, and several placeholders in one value are all replaced.
 *
 * @param value   text that may contain placeholders; returned unchanged if {@code null}
 * @param textMap placeholder names (without the ${...} wrapper) mapped to replacement text;
 *                a {@code null} replacement is treated as the empty string
 * @return the text with all known placeholders substituted
 */
public static String changeValue(String value, Map<String, String> textMap) {
    if (value == null || textMap == null) {
        return value;
    }
    String result = value;
    for (Map.Entry<String, String> textSet : textMap.entrySet()) {
        // match template and replacement value format ${key}
        String placeholder = "${" + textSet.getKey() + "}";
        if (result.contains(placeholder)) {
            String replacement = textSet.getValue() == null ? "" : textSet.getValue();
            // String.replace is literal, so '$' inside the replacement needs no escaping.
            result = result.replace(placeholder, replacement);
        }
    }
    return result;
}
/**
 * Demo entry point: fills a sample template with a fixed set of replacements.
 */
public static void main(String[] args) {
    // Placeholder name -> replacement text
    Map<String, String> replacements = new HashMap<>();
    replacements.put("ja", "Nein");
    replacements.put("red", "Blue");
    replacements.put("No", "yes");
    replacements.put("Preis", "999$");
    replacements.put("Something", "Nothing");
    replacements.put("nein", "Ja");
    replacements.put("antwort", "Schöne");
    replacements.put("name", "Sayer");
    replacements.put("Test", "Email");

    // Template to read and document to produce
    String templatePath = "D:\\Test.docx";
    String resultPath = "D:\\New-Test.docx";
    WorderToNewWordUtils.changWord(templatePath, resultPath, replacements);
}
}

I found a solution; if you want it, send me your email.

Related

How to replace figure with placeholder or certain image in word document using apache poi,?

Let's assume i have a word document, with this body.
Word document before replacing images
/**
 * Overwrites the binary data of every picture embedded in the paragraph with a QR code image.
 * The QR code is encoded once, in memory, and only if the paragraph actually holds a picture.
 *
 * @param p paragraph whose embedded pictures are replaced
 */
private void findImages(XWPFParagraph p) {
    byte[] qrBytes = null;
    for (XWPFRun r : p.getRuns()) {
        for (XWPFPicture pic : r.getEmbeddedPictures()) {
            if (qrBytes == null) {
                qrBytes = encodeQrCodeAsJpeg();
                if (qrBytes == null) {
                    // Encoding failed: leave the pictures untouched rather than
                    // overwriting them with stale or empty data.
                    return;
                }
            }
            XWPFPictureData source = pic.getPictureData();
            try (OutputStream out = source.getPackagePart().getOutputStream()) {
                out.write(qrBytes);
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }
    }
}

/** Renders the QR code and encodes it as JPEG bytes; returns null if encoding is not possible. */
private byte[] encodeQrCodeAsJpeg() {
    BufferedImage qrCodeImage = printVersionService.generateQRCodeImage("JASAW EMA WWS");
    java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
    try {
        // ImageIO.write reports a missing writer through its return value, not an exception.
        if (!ImageIO.write(qrCodeImage, "jpg", buffer)) {
            System.err.println("No image writer found for format jpg");
            return null;
        }
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
    return buffer.toByteArray();
}
So this code replaces any image with QR code.
But I have one trouble.
Word Document after replacing
So my question is:
How can I replace only the image I chose, or how can I replace an inserted figure containing text with an image generated by my own function?
Detecting the picture and replacing the picture data will be the simplest. In following answer I have shown how to detect and replace pictures by name: Java Apache POI: insert an image "infront the text". If you do not know the name of the embedded picture, a picture also can be detected by alt text. To edit the alt text of a picture, open the context menu by right mouse click on the picture and choose Edit A̲lt Text from that context menu.
In How to read alt text of image in word document apache.poi I have shown already how to read alt text of image.
So code could look like:
import java.io.FileInputStream;
import java.io.OutputStream;
import java.io.FileOutputStream;
import org.apache.poi.xwpf.usermodel.*;
public class WordReplacePictureData {
static org.apache.xmlbeans.XmlObject getInlineOrAnchor(org.openxmlformats.schemas.drawingml.x2006.picture.CTPicture ctPictureToFind, org.apache.xmlbeans.XmlObject inlineOrAnchor) {
String declareNameSpaces = "declare namespace pic='http://schemas.openxmlformats.org/drawingml/2006/picture'; ";
org.apache.xmlbeans.XmlObject[] selectedObjects = inlineOrAnchor.selectPath(
declareNameSpaces
+ "$this//pic:pic");
for (org.apache.xmlbeans.XmlObject selectedObject : selectedObjects) {
if (selectedObject instanceof org.openxmlformats.schemas.drawingml.x2006.picture.CTPicture) {
org.openxmlformats.schemas.drawingml.x2006.picture.CTPicture ctPicture = (org.openxmlformats.schemas.drawingml.x2006.picture.CTPicture)selectedObject;
if (ctPictureToFind.equals(ctPicture)) {
// this is the inlineOrAnchor for that picture
return inlineOrAnchor;
}
}
}
return null;
}
static org.apache.xmlbeans.XmlObject getInlineOrAnchor(XWPFRun run, XWPFPicture picture) {
org.openxmlformats.schemas.drawingml.x2006.picture.CTPicture ctPictureToFind = picture.getCTPicture();
for (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDrawing drawing : run.getCTR().getDrawingList()) {
for (org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline inline : drawing.getInlineList()) {
org.apache.xmlbeans.XmlObject inlineOrAnchor = getInlineOrAnchor(ctPictureToFind, inline);
// if inlineOrAnchor is not null, then this is the inline for that picture
if (inlineOrAnchor != null) return inlineOrAnchor;
}
for (org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTAnchor anchor : drawing.getAnchorList()) {
org.apache.xmlbeans.XmlObject inlineOrAnchor = getInlineOrAnchor(ctPictureToFind, anchor);
// if inlineOrAnchor is not null, then this is the anchor for that picture
if (inlineOrAnchor != null) return inlineOrAnchor;
}
}
return null;
}
static org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps getNonVisualDrawingProps(org.apache.xmlbeans.XmlObject inlineOrAnchor) {
if (inlineOrAnchor == null) return null;
if (inlineOrAnchor instanceof org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline) {
org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline inline = (org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline)inlineOrAnchor;
return inline.getDocPr();
} else if (inlineOrAnchor instanceof org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTAnchor) {
org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTAnchor anchor = (org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTAnchor)inlineOrAnchor;
return anchor.getDocPr();
}
return null;
}
static String getSummary(org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps nonVisualDrawingProps) {
if (nonVisualDrawingProps == null) return "";
String summary = "Id:=" + nonVisualDrawingProps.getId();
summary += " Name:=" + nonVisualDrawingProps.getName();
summary += " Title:=" + nonVisualDrawingProps.getTitle();
summary += " Descr:=" + nonVisualDrawingProps.getDescr();
return summary;
}
static XWPFPicture getPictureByAltText(XWPFRun run, String altText) {
if (altText == null) return null;
for (XWPFPicture picture : run.getEmbeddedPictures()) {
String altTextSummary = getSummary(getNonVisualDrawingProps(getInlineOrAnchor(run, picture)));
System.out.println(altTextSummary);
if (altTextSummary.contains(altText)) {
return picture;
}
}
return null;
}
static void replacePictureData(XWPFPictureData source, String pictureResultPath) {
try ( FileInputStream in = new FileInputStream(pictureResultPath);
OutputStream out = source.getPackagePart().getOutputStream();
) {
byte[] buffer = new byte[2048];
int length;
while ((length = in.read(buffer)) > 0) {
out.write(buffer, 0, length);
}
} catch (Exception ex) {
ex.printStackTrace();
}
}
static void replacePicture(XWPFRun run, String altText, String pictureResultPath) {
XWPFPicture picture = getPictureByAltText(run, altText);
if (picture != null) {
XWPFPictureData source = picture.getPictureData();
replacePictureData(source, pictureResultPath);
}
}
public static void main(String[] args) throws Exception {
String templatePath = "./source.docx";
String resultPath = "./result.docx";
String altText = "Placeholder QR-Code";
String pictureResultPath = "./QR.jpg";
try ( XWPFDocument document = new XWPFDocument(new FileInputStream(templatePath));
FileOutputStream out = new FileOutputStream(resultPath);
) {
for (IBodyElement bodyElement : document.getBodyElements()) {
if (bodyElement instanceof XWPFParagraph) {
XWPFParagraph paragraph = (XWPFParagraph)bodyElement;
for (XWPFRun run : paragraph.getRuns()) {
replacePicture(run, altText, pictureResultPath);
}
}
}
document.write(out);
}
}
}
This replaces the picture or pictures having alt text "Placeholder QR-Code". All other pictures remain as they are.
Replacing shapes with pictures is very laborious, as shapes are stored in alternate content elements (a choice containing the shape plus a fallback), so the shape needs to be changed as well as the fallback. If one leaves the fallback untouched, then applications which rely on that fallback will continue to show the old shape. Furthermore, detecting shapes by text box content is not really much simpler than detecting pictures by alt text content.

sheet.getDataValidations() returns an empty list when a cell is validated by a sequence on another sheet

I have a workbook with two sheets:
Sheet1
Sheet2
And there is a sequence on Sheet2 at the range of A1 to A5:
aa
bb
cc
dd
ee
And in Sheet1, the cell A1 is validated by the sequence in Sheet2.
Excel screenshot
However, sheet.getDataValidations() returns an empty list for this case.
Did I miss something?
/**
 * Prints the data validations of the first sheet of the workbook, followed by their count.
 */
public static void main(String[] args) throws Exception {
    String filePath = "/Users/fujiexiang/ExcelWorkbook.xlsx";
    // Stream and workbook are closed even if reading fails.
    try (FileInputStream in = new FileInputStream(filePath);
         Workbook workbook = WorkbookFactory.create(in)) {
        Sheet sheet = workbook.getSheetAt(0);
        // This declaration was missing, so the println below could not compile.
        java.util.List<?> dataValidations = sheet.getDataValidations();
        System.out.println("" + dataValidations + " " + dataValidations.size());
    }
}
"[] 0" was printed out.
Apache POI is based on Office Open XML as published for Excel 2007. Excel 2007 did not support a data validation list constraint coming directly from another worksheet; a named range had to be created for the data validation list constraint. Current Excel versions do support a data validation list constraint coming directly from another worksheet, but of course this is not backwards compatible. That's why Apache POI cannot read those constraints: it only reads CTDataValidations, which come from Office Open XML as published for Excel 2007.
In the XML the difference looks like so in /xl/worksheets/sheet1.xml:
Excel 2007:
<dataValidation type="list" allowBlank="1" showInputMessage="1" showErrorMessage="1" sqref="A1">
<formula1>Sheet2_A1_A5</formula1>
</dataValidation>
There "Sheet2_A1_A5" is a named range in the workbook that points to Sheet2!A1:A5.
Excel 365:
<x14:dataValidation type="list" allowBlank="1" showInputMessage="1" showErrorMessage="1">
<x14:formula1>
<xm:f>Sheet2!A1:A5</xm:f>
</x14:formula1>
<xm:sqref>A1</xm:sqref>
</x14:dataValidation>
There "Sheet2!A1:A5" is a direct reference to the other worksheet.
As you see, the new x14:dataValidation is in a separate name space. This is not covered by apache poi until now.
What one could do is using low level XML parsing methods to get the new XSSFX14DataValidations additional to the XSSFDataValidations. The following example shows a working draft for how to do this.
import java.io.FileInputStream;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.*;
import org.apache.poi.ss.util.CellRangeAddressList;
import org.apache.poi.ss.util.CellRangeAddress;
import org.apache.xmlbeans.XmlObject;
import org.apache.xmlbeans.XmlCursor;
import javax.xml.namespace.QName;
class ReadExcelDataValidaton {
static java.util.List<XmlObject> getX14DataValidations(XSSFSheet sheet) {
java.util.List<XmlObject> x14DataValidations = new java.util.ArrayList<XmlObject>();
XmlCursor cursor = sheet.getCTWorksheet().newCursor();
cursor.selectPath(
"declare namespace x14='http://schemas.microsoft.com/office/spreadsheetml/2009/9/main' .//x14:dataValidation");
while(cursor.hasNextSelection()) {
cursor.toNextSelection();
XmlObject obj = cursor.getObject();
x14DataValidations.add(obj);
}
return x14DataValidations;
}
static void addXSSFX14DataValidations(XSSFSheet sheet, java.util.List<DataValidation> dataValidations) {
java.util.List<XmlObject> x14DataValidations = getX14DataValidations(sheet);
for (XmlObject x14DataValidation : x14DataValidations) {
XSSFX14DataValidation xssfX14DataValidation = new XSSFX14DataValidation(x14DataValidation);
dataValidations.add(xssfX14DataValidation);
}
}
static java.util.List<DataValidation> getDataValidations(Sheet sheet) {
#SuppressWarnings("unchecked")
java.util.List<DataValidation> dataValidations = (java.util.List<DataValidation>)sheet.getDataValidations();
if (sheet instanceof XSSFSheet) {
addXSSFX14DataValidations((XSSFSheet)sheet, dataValidations);
}
return dataValidations;
}
public static void main(String[] args) throws Exception {
Workbook workbook = WorkbookFactory.create(new FileInputStream("./Excel.xlsx"));
Sheet sheet = workbook.getSheetAt(0);
java.util.List<DataValidation> dataValidations = getDataValidations(sheet);
for (DataValidation dataValidation : dataValidations) {
System.out.println(dataValidation);
System.out.println(dataValidation.getValidationConstraint().getFormula1());
}
workbook.close();
}
}
class XSSFX14DataValidation implements DataValidation {
private DataValidationConstraint validationConstraint;
private int errorStyle;
private boolean emptyCellAllowed;
private boolean suppressDropDownArrow;
private boolean showPromptBox;
private boolean showErrorBox;
private String promptBoxTitle;
private String promptBoxText;
private String errorBoxTitle;
private String errorBoxText;
private CellRangeAddressList regions;
public XSSFX14DataValidation(XmlObject x14DataValidation) {
String type = "";
XmlObject typeAttribute = x14DataValidation.selectAttribute(new QName("type"));
if (typeAttribute != null) type = typeAttribute.newCursor().getTextValue();
Integer validationType = DataValidationConstraint.ValidationType.ANY;
if ("CUSTOM".equalsIgnoreCase(type)) {
validationType = DataValidationConstraint.ValidationType.FORMULA;
} else if ("DATE".equalsIgnoreCase(type)) {
validationType = DataValidationConstraint.ValidationType.DATE;
} else if ("DECIMAL".equalsIgnoreCase(type)) {
validationType = DataValidationConstraint.ValidationType.DECIMAL;
} else if ("LIST".equalsIgnoreCase(type)) {
validationType = DataValidationConstraint.ValidationType.LIST;
} else if ("NONE".equalsIgnoreCase(type)) {
validationType = DataValidationConstraint.ValidationType.ANY;
} else if ("TEXT_LENGTH".equalsIgnoreCase(type)) {
validationType = DataValidationConstraint.ValidationType.TEXT_LENGTH;
} else if ("TIME".equalsIgnoreCase(type)) {
validationType = DataValidationConstraint.ValidationType.TIME;
} else if ("WHOLE".equalsIgnoreCase(type)) {
validationType = DataValidationConstraint.ValidationType.INTEGER;
}
String operator = "";
XmlObject operatorAttribute = x14DataValidation.selectAttribute(new QName("operator"));
if (operatorAttribute != null) operator = operatorAttribute.newCursor().getTextValue();
Integer operatorType = DataValidationConstraint.OperatorType.IGNORED;
if ("BETWEEN".equalsIgnoreCase(operator)) {
operatorType = DataValidationConstraint.OperatorType.BETWEEN;
} else if ("NOT_BETWEEN".equalsIgnoreCase(operator)) {
operatorType = DataValidationConstraint.OperatorType.NOT_BETWEEN;
} else if ("EQUAL".equalsIgnoreCase(operator)) {
operatorType = DataValidationConstraint.OperatorType.EQUAL;
} else if ("NOT_EQUAL".equalsIgnoreCase(operator)) {
operatorType = DataValidationConstraint.OperatorType.NOT_EQUAL;
} else if ("GREATER_THAN".equalsIgnoreCase(operator)) {
operatorType = DataValidationConstraint.OperatorType.GREATER_THAN;
} else if ("GREATER_OR_EQUAL".equalsIgnoreCase(operator)) {
operatorType = DataValidationConstraint.OperatorType.GREATER_OR_EQUAL;
} else if ("LESS_THAN".equalsIgnoreCase(operator)) {
operatorType = DataValidationConstraint.OperatorType.LESS_THAN;
} else if ("LESS_OR_EQUAL".equalsIgnoreCase(operator)) {
operatorType = DataValidationConstraint.OperatorType.LESS_OR_EQUAL;
}
String formula1 = null;
XmlObject[] xmlObjects = x14DataValidation.selectChildren(
new QName("http://schemas.microsoft.com/office/spreadsheetml/2009/9/main", "formula1"));
if (xmlObjects.length > 0) {
XmlObject formula1Element = xmlObjects[0];
formula1 = formula1Element.newCursor().getTextValue();
}
String formula2 = null;
xmlObjects = x14DataValidation.selectChildren(
new QName("http://schemas.microsoft.com/office/spreadsheetml/2009/9/main", "formula2"));
if (xmlObjects.length > 0) {
XmlObject formula2Element = xmlObjects[0];
formula2 = formula2Element.newCursor().getTextValue();
}
this.validationConstraint = new XSSFDataValidationConstraint(validationType, operatorType, formula1, formula2);
this.regions = new CellRangeAddressList();
String sqref = "";
xmlObjects = x14DataValidation.selectChildren(
new QName("http://schemas.microsoft.com/office/excel/2006/main", "sqref"));
if (xmlObjects.length > 0) {
XmlObject sqrefElement = xmlObjects[0];
sqref = sqrefElement.newCursor().getTextValue();
}
String [] refs = sqref.split(" ");
for (String ref : refs) {
CellRangeAddress address = CellRangeAddress.valueOf(ref);
this.regions.addCellRangeAddress(address);
}
String allowBlank = "";
XmlObject allowBlankAttribute = x14DataValidation.selectAttribute(new QName("allowBlank"));
if (allowBlankAttribute != null) allowBlank = allowBlankAttribute.newCursor().getTextValue();
this.emptyCellAllowed = ("1".equals(allowBlank) || "TRUE".equalsIgnoreCase(allowBlank));
String showInputMessage = "";
XmlObject showInputMessageAttribute = x14DataValidation.selectAttribute(new QName("showInputMessage"));
if (showInputMessageAttribute != null) showInputMessage = showInputMessageAttribute.newCursor().getTextValue();
this.showPromptBox = ("1".equals(showInputMessage) || "TRUE".equalsIgnoreCase(showInputMessage));
String showErrorMessage = "";
XmlObject showErrorMessageAttribute = x14DataValidation.selectAttribute(new QName("showErrorMessage"));
if (showErrorMessageAttribute != null) showErrorMessage = showErrorMessageAttribute.newCursor().getTextValue();
this.showErrorBox = ("1".equals(showErrorMessage) || "TRUE".equalsIgnoreCase(showErrorMessage));
//TODO: complete
}
public DataValidationConstraint getValidationConstraint() {
return this.validationConstraint;
}
public void setErrorStyle(int errorStyle) {
this.errorStyle = errorStyle;
}
public int getErrorStyle() {
return this.errorStyle;
}
public void setEmptyCellAllowed(boolean allowed) {
this.emptyCellAllowed = allowed;
}
public boolean getEmptyCellAllowed() {
return this.emptyCellAllowed;
}
public void setSuppressDropDownArrow(boolean suppress) {
this.suppressDropDownArrow = suppress;
}
public boolean getSuppressDropDownArrow() {
return this.suppressDropDownArrow;
}
public void setShowPromptBox(boolean show) {
this.showPromptBox = show;
}
public boolean getShowPromptBox() {
return this.showPromptBox;
}
public void setShowErrorBox(boolean show) {
this.showErrorBox = show;
}
public boolean getShowErrorBox() {
return this.showErrorBox;
}
public void createPromptBox(String title, String text) {
this.promptBoxTitle = title;
this.promptBoxText = text;
}
public String getPromptBoxTitle() {
return this.promptBoxTitle;
}
public String getPromptBoxText() {
return this.promptBoxText;
}
public void createErrorBox(String title, String text) {
this.errorBoxTitle = title;
this.errorBoxText = text;
}
public String getErrorBoxTitle() {
return this.errorBoxTitle;
}
public String getErrorBoxText() {
return this.errorBoxText;
}
public CellRangeAddressList getRegions() {
return this.regions;
}
}

AWS-Textract-Key-Value-Pair Java - thread "main" java.lang.NullPointerException

I am using AWS Textract in a Java Spring boot project. I have set up AWS CLI and have the SDK as a maven dependency.
I have written Java code, converted from C# in order to extract the Key and Value pairs and I am receiving the following error after successfully extracting some words
"AGENCYCUSTOMERID:FEIN(ifapplicable)MARITALSTATUS/CIVILUNION(ifapplicable)INSUREDLOCATIONCODEBUSPRIMARYE-MAILADDRESS:FEIN(ifapplicable)LINEOFBUSINESSCELLMARITALSTATUScivilUNION(ifapplicable)CELLCELLHOME":
AGENCYCUSTOMERID:FEIN(ifapplicable)MARITALSTATUS/CIVILUNION(ifapplicable)INSUREDLOCATIONCODEBUSPRIMARYE-MAILADDRESS:FEIN(ifapplicable)LINEOFBUSINESSCELLMARITALSTATUScivilUNION(ifapplicable)CELLCELLHOMEException in thread "main" java.lang.NullPointerException
at ai.tautona.lloyds.mailboxprocessor.service.AWSTextractService.Get_text(AWSTextractService.java:112)
at ai.tautona.lloyds.mailboxprocessor.service.AWSTextractService.getKVMapRelationship(AWSTextractService.java:74)
at ai.tautona.lloyds.mailboxprocessor.service.AWSTextractService.getKVMap(AWSTextractService.java:57)
at ai.tautona.lloyds.mailboxprocessor.service.AWSTextractService.main(AWSTextractService.java:148)
Through debugging I found the line that is causing the error to be :
text += "X ";
It appears that after finding a SELECTION ELEMENT / CHECKBOX it fails?
My code :
public class AWSTextractService {
public static void getKVMap(String localFile) throws IOException {
File file = new File(localFile);
byte[] fileContent = Files.readAllBytes(file.toPath());
AmazonTextract client = AmazonTextractClientBuilder.defaultClient();
AnalyzeDocumentRequest request = new AnalyzeDocumentRequest()
.withDocument(new Document()
.withBytes(ByteBuffer.wrap(fileContent))).withFeatureTypes(FeatureType.FORMS);
AnalyzeDocumentResult result = client.analyzeDocument(request);
//Get the text blocks
List<Block> blocks = result.getBlocks();
//get key and value maps
List<Block> key_map = new ArrayList<>();
List<Block> value_map = new ArrayList<>();
List<Block> block_map = new ArrayList<>();
for (Block block : blocks) {
block_map.add(block);
if (block.getBlockType().equals("KEY_VALUE_SET")) {
if (block.getEntityTypes().contains("KEY")) {
key_map.add(block);
} else {
value_map.add(block);
}
}
}
//Get Key Value relationship
getKVMapRelationship(key_map, value_map, block_map).forEach((k, v) -> System.out.println("key: " + k + " value:" + v));
getKeyValueRelationship.forEach((k,v)-> System.out.println("key: "+k+" value:"+v));
}
#NotNull
public static HashMap<String, String> getKVMapRelationship(List<Block> key_map, List<Block> value_map, List<Block> block_map) throws IOException {
HashMap<String, String> kvs = new HashMap<>();
;
Block value_block;
String key, val = "";
for (Block key_block : key_map) {
value_block = Find_value_block(key_block, value_map);
key = Get_text(key_block, block_map);
val = Get_text(value_block, block_map);
System.out.printf(key, val);
kvs.put("1", "2");
}
return kvs;
}
#NotNull
public static Block Find_value_block(Block block, List<Block> value_map) {
Block value_block = new Block();
for (Relationship relationship : block.getRelationships()) {
if (relationship.getType().equals("VALUE")) {
for (String value_id : relationship.getIds()) {
for (Block value : value_map) {
if (value.getId().equals(value_id)) {
value_block = value;
}
}
}
}
}
return value_block;
}
//null
#NotNull
public static String Get_text(Block result, List<Block> block_map) throws IOException {
String text = "";
Block word = new Block();
Block word2 = null;
if (result.getRelationships().stream().count() > 0) {
for (Relationship relationship : result.getRelationships()) {
if (relationship.getType().equals("CHILD")) {
for (String child_id : relationship.getIds()) {
word = block_map.stream()
.filter((x)-> x.getId().equals(child_id)).findFirst().orElse(word2);
if (word.getBlockType().equals("WORD"))
{
text += (word.getText() ==null ? "" : word.getText()) + "";
}
if (word.getBlockType().equals("SELECTION_ELEMENT"))
{
if(word.getSelectionStatus().equals("SELECTED"))
{
text += "X ";
}
}
}
}
}
}
return text;
}
public static void main (String[]args) throws IOException {
String fileStr = "/home/daniel/Documents/atrium_sources/accordImage-1.png";
AWSTextractService.getKVMap(fileStr);
System.out.println("Done!");
}
}
I'm not sure what the issue is.
I am very sure other Java Devs are going to appreciate this Code. I answered my question with the help of Rikus.
package ai.tautona.lloyds.mailboxprocessor.service;
import com.amazonaws.services.textract.AmazonTextract;
import com.amazonaws.services.textract.AmazonTextractClientBuilder;
import com.amazonaws.services.textract.model.Document;
import java.nio.file.Files;
import com.amazonaws.services.textract.model.*;
import org.apache.commons.collections.CollectionUtils;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import javax.validation.constraints.NotNull;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
#Service
#Transactional
public class AWSTextractService {
public static void getKVMap(String localFile) throws IOException {
File file = new File(localFile);
byte[] fileContent = Files.readAllBytes(file.toPath());
AmazonTextract client = AmazonTextractClientBuilder.defaultClient();
AnalyzeDocumentRequest request = new AnalyzeDocumentRequest()
.withDocument(new Document()
.withBytes(ByteBuffer.wrap(fileContent))).withFeatureTypes(FeatureType.FORMS);
AnalyzeDocumentResult result = client.analyzeDocument(request);
//Get the text blocks
List<Block> blocks = result.getBlocks();
//get key and value maps
List<Block> key_map = new ArrayList<>();
List<Block> value_map = new ArrayList<>();
List<Block> block_map = new ArrayList<>();
for (Block block : blocks) {
block_map.add(block);
if (block.getBlockType().equals("KEY_VALUE_SET")) {
if (block.getEntityTypes().contains("KEY")) {
key_map.add(block);
} else {
value_map.add(block);
}
}
}
//Get Key Value relationship
getKVMapRelationship(key_map, value_map, block_map).forEach((k, v) -> System.out.println("key: " + k + " value:" + v));
}
#NotNull
public static HashMap<String, String> getKVMapRelationship(List<Block> key_map, List<Block> value_map, List<Block> block_map) throws IOException {
HashMap<String, String> kvs = new HashMap<>();
;
Block value_block;
String key, val = "";
for (Block key_block : key_map) {
value_block = Find_value_block(key_block, value_map);
key = Get_text(key_block, block_map);
val = Get_text(value_block, block_map);
kvs.put(key, val);
}
return kvs;
}
#NotNull
public static Block Find_value_block(Block block, List<Block> value_map) {
Block value_block = new Block();
for (Relationship relationship : block.getRelationships()) {
if (relationship.getType().equals("VALUE")) {
for (String value_id : relationship.getIds()) {
for (Block value : value_map) {
if (value.getId().equals(value_id)) {
value_block = value;
}
}
}
}
}
return value_block;
}
//null
#NotNull
public static String Get_text(Block result, List<Block> block_map) throws IOException {
String text = "";
Block word2= new Block();
try {
if (result != null
&& CollectionUtils.isNotEmpty(result.getRelationships())) {
for (Relationship relationship : result.getRelationships()) {
if (relationship.getType().equals("CHILD")) {
for (String id : relationship.getIds()) {
Block word= (block_map.stream().filter(x-> x.getId().equals(id)).findFirst().orElse(word2));
if (word.getBlockType().equals("WORD")) {
text += word.getText() + " ";
} else if (word.getBlockType().equals("SELECTION_ELEMENT")) {
if (word.getSelectionStatus().equals("SELECTED")) {
text += "X ";
}
}
}
}
}
}
} catch (Exception e) {
System.out.println(e);
}
return text;
}
public static void main (String[]args) throws IOException {
String fileStr = "/home/daniel/Documents/atrium_sources/accordImage-1.png";
AWSTextractService.getKVMap(fileStr);
System.out.println("Done!");
}
}

How to retrieve tables which exists in a pdf using AWS Textract in java

I found article below to do in python.
https://docs.aws.amazon.com/textract/latest/dg/examples-export-table-csv.html
also I used article below to extract text.
https://docs.aws.amazon.com/textract/latest/dg/detecting-document-text.html
but the above article only helped to get the text. I also used the function "block.getBlockType()"
of Block, but none of the blocks returned its type as "CELL" even though there are tables in the image/PDF.
Please help me find a Java library similar to "boto3" to extract all tables.
What I did, I created models of each dataset in the json response and can use this models to build a table view in jsf.
/**
 * Builds one {@code TableModel} per TABLE block of the Textract result. Each table maps
 * row index to (column index to cell text).
 *
 * @param textractModel parsed Textract response; may be {@code null}
 * @return the tables found so far, or {@code null} if {@code textractModel} is {@code null}
 */
public static List<TableModel> getTablesFromTextract(TextractModel textractModel) {
    List<TableModel> tables = null;
    try {
        if (textractModel != null) {
            tables = new ArrayList<>();
            List<BlockModel> tableBlocks = new ArrayList<>();
            Map<String, BlockModel> blockMap = new HashMap<>();
            for (BlockModel block : textractModel.getBlocks()) {
                if ("TABLE".equals(block.getBlockType())) {
                    tableBlocks.add(block);
                }
                blockMap.put(block.getId(), block);
            }
            for (BlockModel blockModel : tableBlocks) {
                Map<Long, Map<Long, String>> rowMap = new HashMap<>();
                if (blockModel.getRelationships() != null) {
                    for (RelationshipModel relationship : blockModel.getRelationships()) {
                        if (!"CHILD".equals(relationship.getType())) {
                            continue;
                        }
                        for (String id : relationship.getIds()) {
                            BlockModel cell = blockMap.get(id);
                            // An id that is not in the map must not abort the whole extraction.
                            if (cell == null || !"CELL".equals(cell.getBlockType())) {
                                continue;
                            }
                            long rowIndex = cell.getRowIndex();
                            long columnIndex = cell.getColumnIndex();
                            rowMap.computeIfAbsent(rowIndex, key -> new HashMap<>())
                                .put(columnIndex, getCellText(cell, blockMap));
                        }
                    }
                }
                tables.add(new TableModel(blockModel, rowMap));
            }
            System.out.println("row Map " + tables.toString());
        }
    } catch (Exception e) {
        LOG.error("Could not get table from textract model", e);
    }
    return tables;
}
/**
 * Concatenates the text of the CHILD blocks of a cell: every WORD contributes its text
 * followed by a space, every selected SELECTION_ELEMENT contributes "X ".
 *
 * @param cell     the cell block; may be {@code null}
 * @param blockMap all blocks by id
 * @return the collected text, empty if the cell has no resolvable children
 */
private static String getCellText(BlockModel cell, Map<String, BlockModel> blockMap) {
    StringBuilder text = new StringBuilder();
    try {
        if (cell != null
                && CollectionUtils.isNotEmpty(cell.getRelationships())) {
            for (RelationshipModel relationship : cell.getRelationships()) {
                if (!"CHILD".equals(relationship.getType())) {
                    continue;
                }
                for (String id : relationship.getIds()) {
                    BlockModel word = blockMap.get(id);
                    if (word == null) {
                        // Unknown id: skip it instead of losing the rest of the cell text.
                        continue;
                    }
                    if ("WORD".equals(word.getBlockType())) {
                        if (word.getText() != null) {
                            text.append(word.getText()).append(' ');
                        }
                    } else if ("SELECTION_ELEMENT".equals(word.getBlockType())
                            && "SELECTED".equals(word.getSelectionStatus())) {
                        text.append("X ");
                    }
                }
            }
        }
    } catch (Exception e) {
        LOG.error("Could not get cell text of table", e);
    }
    return text.toString();
}
TableModel to create the view from:
public class TableModel {
private BlockModel table;
private Map<Long, Map<Long, String>> rowMap;
public TableModel(BlockModel table, Map<Long, Map<Long, String>> rowMap) {
this.table = table;
this.rowMap = rowMap;
}
public BlockModel getTable() {
return table;
}
public void setTable(BlockModel table) {
this.table = table;
}
public Map<Long, Map<Long, String>> getRowMap() {
return rowMap;
}
public void setRowMap(Map<Long, Map<Long, String>> rowMap) {
this.rowMap = rowMap;
}
#Override
public String toString() {
return table.getId() + " - " + rowMap.toString();
}
I have something similar:
public class AnalyzeDocument {
/**
 * Creates a Textract client for the EU_WEST_2 region, using credentials taken from environment
 * variables, and analyzes the given document bytes with it.
 *
 * <p>The client is closed before this method returns, so its resources are not leaked on
 * repeated calls.
 *
 * @param content raw bytes of the document to analyze
 * @return the result of {@link #analyzeDoc(TextractClient, byte[])}, which may be {@code null}
 */
public DocumentModel startProcess(byte[] content) {
    Region region = Region.EU_WEST_2;
    try (TextractClient textractClient = TextractClient.builder()
            .region(region)
            .credentialsProvider(EnvironmentVariableCredentialsProvider.create())
            .build()) {
        return analyzeDoc(textractClient, content);
    }
}
/**
 * Sends the document bytes to Textract with the FORMS and TABLES features enabled and converts
 * the returned blocks into a {@link DocumentModel}.
 *
 * @param textractClient client used for the {@code analyzeDocument} call
 * @param content        raw bytes of the document to analyze
 * @return a document model holding the page built from the detected tables (no page is added
 *         when no page could be built), or {@code null} when Textract reports an error
 */
public DocumentModel analyzeDoc(TextractClient textractClient, byte[] content) {
    try {
        SdkBytes sourceBytes = SdkBytes.fromByteArray(content);
        Document myDoc = Document.builder().bytes(sourceBytes).build();

        List<FeatureType> featureTypes = new ArrayList<>();
        featureTypes.add(FeatureType.FORMS);
        featureTypes.add(FeatureType.TABLES);

        AnalyzeDocumentRequest analyzeDocumentRequest = AnalyzeDocumentRequest.builder()
                .featureTypes(featureTypes)
                .document(myDoc)
                .build();
        AnalyzeDocumentResponse analyzeDocument = textractClient.analyzeDocument(analyzeDocumentRequest);
        List<Block> docInfo = analyzeDocument.blocks();

        Util util = new Util();
        PageModel pageModel = util.getTableResults(docInfo);
        DocumentModel documentModel = new DocumentModel();
        // A null page would only surface later as a NullPointerException in consumers of the
        // page list, so it is not added.
        if (pageModel != null) {
            documentModel.getPages().add(pageModel);
        }

        for (Block block : docInfo) {
            log.debug("The block type is " + block.blockType().toString());
        }
        return documentModel;
    } catch (TextractException e) {
        // Log through the logger (with stack trace) instead of printing only the message.
        log.error("Textract analyzeDocument call failed", e);
    }
    return null;
}
and this is the util file:
/**
 * Indexes all blocks by id, collects the TABLE blocks and turns each of them into a table of
 * the returned page.
 *
 * @param blocks all blocks of the analyzed document
 * @return a page holding one table model per TABLE block, or {@code null} when there is none
 */
public PageModel getTableResults(List<Block> blocks) {
    Map<String, Block> blocksById = new HashMap<>();
    List<Block> foundTables = new ArrayList<>();
    for (Block current : blocks) {
        blocksById.put(current.id(), current);
        if (!current.blockType().equals(BlockType.TABLE)) {
            continue;
        }
        foundTables.add(current);
        log.debug("added table: " + current.text());
    }

    PageModel result = new PageModel();
    if (foundTables.isEmpty()) {
        return null;
    }

    // The position of the table in the list is passed on as its index.
    for (int position = 0; position < foundTables.size(); position++) {
        result.getTables().add(generateTable(foundTables.get(position), blocksById, position));
    }
    return result;
}
/**
 * Converts a TABLE block into a {@link TableModel} whose rows and cells are in ascending
 * row/column index order.
 *
 * <p>Cells are read in key order rather than by probing keys {@code 0..size-1}: Textract row and
 * column indices start at 1, so the old lookup produced a {@code null} first cell and dropped the
 * last column. Sorting the keys also makes the row order independent of hash-map iteration order.
 *
 * @param table    the TABLE block
 * @param blockMap lookup of all blocks by id
 * @param index    position of the table, used to build the id {@code "Table_" + index}
 * @return the populated table model
 */
private TableModel generateTable(Block table, Map<String, Block> blockMap, int index) {
    TableModel model = new TableModel();
    Map<Integer, Map<Integer, String>> rows = new java.util.TreeMap<>(getRowsColumnsMap(table, blockMap));
    model.setTableId("Table_" + index);
    for (Map<Integer, String> columns : rows.values()) {
        RowModel rowModel = new RowModel();
        Map<Integer, String> orderedColumns = new java.util.TreeMap<>(columns);
        for (String cellText : orderedColumns.values()) {
            rowModel.getCells().add(cellText);
        }
        model.getRows().add(rowModel);
    }
    return model;
}
/**
 * Collects the text of every child block of {@code block}, keyed first by the child's row index
 * and then by its column index.
 *
 * @param block    the block whose CHILD relationships are walked
 * @param blockMap lookup of all blocks by id; ids without an entry are ignored
 * @return row index to (column index to text) mapping; empty when there are no children
 */
private Map<Integer, Map<Integer, String>> getRowsColumnsMap(Block block, Map<String, Block> blockMap) {
    Map<Integer, Map<Integer, String>> grid = new HashMap<>();
    for (Relationship link : block.relationships()) {
        if (!link.type().equals(RelationshipType.CHILD)) {
            continue;
        }
        for (String childId : link.ids()) {
            Block child = blockMap.get(childId);
            if (child == null) {
                continue;
            }
            int row = child.rowIndex();
            int column = child.columnIndex();
            // Create the row on first use, then store the cell text under its column.
            grid.computeIfAbsent(row, key -> new HashMap<>()).put(column, getText(child, blockMap));
        }
    }
    return grid;
}
public String getText(Block block, Map<String, Block> blockMap) {
String text = "";
if (block.relationships() != null && block.relationships().size() > 0) {
for (Relationship relationship : block.relationships()) {
if (relationship.type().equals(RelationshipType.CHILD)) {
for (String childId : relationship.ids()) {
Block wordBlock = blockMap.get(childId);
if (wordBlock != null && wordBlock.blockType() != null) {
if (wordBlock.blockType().equals(BlockType.WORD))) {
text += wordBlock.text() + " ";
}
}
}
}
}
}
return text;
}

Replace text in an MS Word template (docx) using Java

I am trying to search for a string in a docx file and replace it with some other text using Java and Apache POI, but the replacement happens only in some places.
I also get an ArrayIndexOutOfBoundsException on this line:
"declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' .//w:ffData/w:name/#w:val")[0];
/**
 * Replaces the displayed text of a named legacy form field in the body paragraphs of a Word
 * document.
 */
public class WordReplaceTextInFormFields {

    /** XPath prologue binding the {@code w} prefix to the WordprocessingML main namespace. */
    private static final String W_NAMESPACE =
            "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' ";

    /**
     * Sets the text of the runs that belong to the form field named {@code ffname}.
     *
     * <p>Runs are scanned in document order. A {@code fldChar} of type "begin" whose
     * {@code ffData} carries the requested name switches replacement on; the matching "end"
     * stops the scan. While replacement is on, the first text element of each run is overwritten.
     * Only paragraphs returned by {@code document.getParagraphs()} are visited.
     *
     * @param document the document to modify in place
     * @param ffname   name of the form field to look for
     * @param text     replacement text
     */
    private static void replaceFormFieldText(XWPFDocument document, String ffname, String text) {
        boolean foundformfield = false;
        for (XWPFParagraph paragraph : document.getParagraphs()) {
            for (XWPFRun run : paragraph.getRuns()) {
                XmlCursor cursor = run.getCTR().newCursor();
                try {
                    // Attribute steps need '@'; with '#' the expression is not valid XPath.
                    cursor.selectPath(W_NAMESPACE + ".//w:fldChar/@w:fldCharType");
                    while (cursor.hasNextSelection()) {
                        cursor.toNextSelection();
                        String fldCharType = ((SimpleValue) cursor.getObject()).getStringValue();
                        if ("begin".equals(fldCharType)) {
                            cursor.toParent();
                            XmlObject[] names = cursor.getObject()
                                    .selectPath(W_NAMESPACE + ".//w:ffData/w:name/@w:val");
                            // Fields without ffData (e.g. PAGE, hyperlinks) yield an empty array;
                            // indexing it blindly caused the ArrayIndexOutOfBoundsException.
                            foundformfield = names.length > 0
                                    && ffname.equals(((SimpleValue) names[0]).getStringValue());
                        } else if ("end".equals(fldCharType)) {
                            if (foundformfield) {
                                return;
                            }
                        }
                    }
                } finally {
                    // Cursors should be released explicitly once they are no longer needed.
                    cursor.dispose();
                }
                if (foundformfield && run.getCTR().getTList().size() > 0) {
                    run.getCTR().getTList().get(0).setStringValue(text);
                }
            }
        }
    }

    /**
     * Loads {@code WordTemplate.docx}, fills the fields Text1..Text3 and writes the result to
     * {@code WordReplaceTextInFormFields.docx}.
     */
    public static void main(String[] args) throws Exception {
        try (FileInputStream in = new FileInputStream("WordTemplate.docx");
                XWPFDocument document = new XWPFDocument(in)) {
            replaceFormFieldText(document, "Text1", "Моя Компания");
            replaceFormFieldText(document, "Text2", "Аксель Джоачимович Рихтер");
            replaceFormFieldText(document, "Text3", "Доверенность");
            try (FileOutputStream out = new FileOutputStream("WordReplaceTextInFormFields.docx")) {
                document.write(out);
            }
        }
    }
}
It misses some strings and does not replace them throughout the entire document. Please help with sample code.
I do something similar in my project at https://github.com/centic9/poi-mail-merge which provides a general mail-merge functionality based on POI. It is using a bit different functionality from XmlBeans which replaces strings in the full XML-content of the document instead of each paragraph separately.
/**
 * Appends the inner XML of {@code append} to the body {@code src}, replacing {@code src} with
 * the re-parsed result.
 *
 * <p>The opening and closing tags of {@code src} are kept. Its existing content is kept as well,
 * unless {@code first} is set, in which case it is discarded so that the template itself does not
 * appear in the output.
 *
 * @param src    body that receives the content; modified in place
 * @param append XML text of a body element; only what lies between its first {@code >} and its
 *               last {@code <} is used
 * @param first  {@code true} for the first append, which drops the current content of {@code src}
 * @throws XmlException if the combined XML cannot be parsed
 */
private static void appendBody(CTBody src, String append, boolean first) throws XmlException {
    String srcString = src.xmlText();
    int openingTagEnd = srcString.indexOf(">") + 1;
    int closingTagStart = srcString.lastIndexOf("<");

    String prefix = srcString.substring(0, openingTagEnd);
    // exclude template itself in first appending
    String mainPart = first ? "" : srcString.substring(openingTagEnd, closingTagStart);
    String suffix = srcString.substring(closingTagStart);
    String addPart = append.substring(append.indexOf(">") + 1, append.lastIndexOf("<"));

    CTBody makeBody = CTBody.Factory.parse(prefix + mainPart + addPart + suffix);
    src.set(makeBody);
}
}
See line 132 in MailMerge.java

Categories