Docx4j - How to replace placeholder with value - java

I've been trying to work through the examples FieldMailMerge and VariableReplace but can't seem to get a local test case running. I'm basically trying to start with one docx template document and have it create x docx documents from that one template with the variables replaced.
In the code below, docx4jReplaceSimpleTest() tries to replace a single variable but fails to do so. The ${} values in the template file are removed as part of the processing, so I believe it finds them but, for some reason, does not replace them. I understand this could be due to formatting, as explained in the comments of the sample code, but for troubleshooting I'm trying it anyway just to get something working.
In docx4jReplaceTwoPeopleTest(), the one I actually want to get working, I'm doing it in what I believe is the proper way, but it doesn't find or replace anything. It doesn't even remove the ${} from the docx file.
public static void main(String[] args) throws Exception
{
docx4jReplaceTwoPeopleTest();
docx4jReplaceSimpleTest();
}
private static void docx4jReplaceTwoPeopleTest() throws Exception
{
String docxFile = "C:/temp/template.docx";
WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.load(new java.io.File(docxFile));
List<Map<DataFieldName, String>> data = new ArrayList<Map<DataFieldName, String>>();
Map<DataFieldName, String> map1 = new HashMap<DataFieldName, String>();
map1.put(new DataFieldName("Person.Firstname"), "myFirstname");
map1.put(new DataFieldName("Person.Lastname"), "myLastname");
data.add(map1);
Map<DataFieldName, String> map2 = new HashMap<DataFieldName, String>();
map2.put(new DataFieldName("Person.Firstname"), "myFriendsFirstname");
map2.put(new DataFieldName("Person.Lastname"), "myFriendsLastname");
data.add(map2);
org.docx4j.model.fields.merge.MailMerger.setMERGEFIELDInOutput(OutputField.KEEP_MERGEFIELD);
int x=0;
for(Map<DataFieldName, String> docMapping: data)
{
org.docx4j.model.fields.merge.MailMerger.performMerge(wordMLPackage, docMapping, true);
wordMLPackage.save(new java.io.File("C:/temp/OUT__MAIL_MERGE_" + x++ + ".docx") );
}
}
private static void docx4jReplaceSimpleTest() throws Exception
{
String docxFile = "C:/temp/template.docx";
WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.load(new java.io.File(docxFile));
HashMap<String, String> mappings = new HashMap<String, String>();
mappings.put("Person.Firstname", "myFirstname");
mappings.put("Person.Lastname", "myLastname");
MainDocumentPart documentPart = wordMLPackage.getMainDocumentPart();
documentPart.variableReplace(mappings);
wordMLPackage.save(new java.io.File("C:/temp/OUT_SIMPLE.docx") );
}
The docx file consists of the following text (no formatting is done):
This is a letter to someone
Hi ${Person.Firstname} ${Person.Lastname},
How are you?
Thank you again. I wish to see you soon ${Person.Firstname}
Regards,
Someone
Notice that I'm also trying to replace Person.Firstname more than once. Since even the last name isn't replaced, I don't think that has anything to do with it, but I'm mentioning it just in case.
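Note for the plain-text ${...} route: the docx4j VariableReplace sample warns that each placeholder must sit inside a single run, and Word frequently splits typed text across several runs. docx4j ships a VariablePrepare class that tries to merge such runs before replacement. The following is an untested sketch, assuming docx4j 3.x and the same template path as above:
// Untested sketch (docx4j 3.x assumed): merge runs that Word split apart,
// so each ${...} placeholder ends up in a single w:t element, then replace.
WordprocessingMLPackage pkg = WordprocessingMLPackage.load(new java.io.File("C:/temp/template.docx"));
org.docx4j.model.datastorage.migration.VariablePrepare.prepare(pkg);
HashMap<String, String> mappings = new HashMap<String, String>();
mappings.put("Person.Firstname", "myFirstname");
mappings.put("Person.Lastname", "myLastname");
pkg.getMainDocumentPart().variableReplace(mappings);
pkg.save(new java.io.File("C:/temp/OUT_SIMPLE_PREPARED.docx"));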

I had the same issue, and of course I couldn't force users to do extra work when composing their Word documents, so I decided to write an algorithm that scans the whole document for expressions by appending run after run, inserts the replacement values, and then removes the original expressions in a second pass. In case other people need it, below is what I did. I got the class from somewhere else so it may look familiar; I just added the searchAndReplace() method.
package com.my.docx4j;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.bind.JAXBElement;
import javax.xml.bind.JAXBException;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.wml.ContentAccessor;
import org.docx4j.wml.Text;
public class Docx4j {
public static void main(String[] args) throws Docx4JException, IOException, JAXBException {
String filePath = "C:\\Users\\markamm\\Documents\\tmp\\";
String file = "Hello.docx";
Docx4j docx4j = new Docx4j();
WordprocessingMLPackage template = docx4j.getTemplate(filePath+file);
// MainDocumentPart documentPart = template.getMainDocumentPart();
List<Object> texts = getAllElementFromObject(
template.getMainDocumentPart(), Text.class);
searchAndReplace(texts, new HashMap<String, String>(){
{
this.put("${abcd_efg.soanother_hello_broken_shit}", "Company Name here...");
this.put("${I_dont_know}", "Hmmm lemme see");
this.put("${${damn.right_lol}", "Gotcha!!!");
this.put("${one_here_and}", "Firstname");
this.put("${one}", "ChildA");
this.put("${two}", "ChildB");
this.put("${three}", "ChildC");
}
@Override
public String get(Object key) {
// TODO Auto-generated method stub
return super.get(key);
}
});
docx4j.writeDocxToStream(template, filePath+"Hello2.docx");
}
public static void searchAndReplace(List<Object> texts, Map<String, String> values){
// -- scan all expressions
// Will later contain all the expressions used though not used at the moment
List<String> els = new ArrayList<String>();
StringBuilder sb = new StringBuilder();
int PASS = 0;
int PREPARE = 1;
int READ = 2;
int mode = PASS;
// to nullify
List<int[]> toNullify = new ArrayList<int[]>();
int[] currentNullifyProps = new int[4];
// Do scan of els and immediately insert value
for(int i = 0; i<texts.size(); i++){
Object text = texts.get(i);
Text textElement = (Text) text;
String newVal = "";
String v = textElement.getValue();
// System.out.println("text: "+v);
StringBuilder textSofar = new StringBuilder();
int extra = 0;
char[] vchars = v.toCharArray();
for(int col = 0; col<vchars.length; col++){
char c = vchars[col];
textSofar.append(c);
switch(c){
case '$': {
mode=PREPARE;
sb.append(c);
// extra = 0;
} break;
case '{': {
if(mode==PREPARE){
sb.append(c);
mode=READ;
currentNullifyProps[0]=i;
currentNullifyProps[1]=col+extra-1;
System.out.println("extra-- "+extra);
} else {
if(mode==READ){
// consecutive opening curl found. just read it
// but supposedly throw error
sb = new StringBuilder();
mode=PASS;
}
}
} break;
case '}': {
if(mode==READ){
mode=PASS;
sb.append(c);
els.add(sb.toString());
newVal +=textSofar.toString()
+(null==values.get(sb.toString())?sb.toString():values.get(sb.toString()));
textSofar = new StringBuilder();
currentNullifyProps[2]=i;
currentNullifyProps[3]=col+extra;
toNullify.add(currentNullifyProps);
currentNullifyProps = new int[4];
extra += sb.toString().length();
sb = new StringBuilder();
} else if(mode==PREPARE){
mode = PASS;
sb = new StringBuilder();
}
} break;
default: {
if(mode==READ) sb.append(c);
else if(mode==PREPARE){
mode=PASS;
sb = new StringBuilder();
}
}
}
}
newVal +=textSofar.toString();
textElement.setValue(newVal);
}
// remove original expressions
if(toNullify.size()>0)
for(int i = 0; i<texts.size(); i++){
if(toNullify.size()==0) break;
currentNullifyProps = toNullify.get(0);
Object text = texts.get(i);
Text textElement = (Text) text;
String v = textElement.getValue();
StringBuilder nvalSB = new StringBuilder();
char[] textChars = v.toCharArray();
for(int j = 0; j<textChars.length; j++){
char c = textChars[j];
if(null==currentNullifyProps) {
nvalSB.append(c);
continue;
}
// I know 100000 is too much!!! And so what???
int floor = currentNullifyProps[0]*100000+currentNullifyProps[1];
int ceil = currentNullifyProps[2]*100000+currentNullifyProps[3];
int head = i*100000+j;
if(!(head>=floor && head<=ceil)){
nvalSB.append(c);
}
if(j>currentNullifyProps[3] && i>=currentNullifyProps[2]){
toNullify.remove(0);
if(toNullify.size()==0) {
currentNullifyProps = null;
continue;
}
currentNullifyProps = toNullify.get(0);
}
}
textElement.setValue(nvalSB.toString());
}
}
private WordprocessingMLPackage getTemplate(String name)
throws Docx4JException, FileNotFoundException {
WordprocessingMLPackage template = WordprocessingMLPackage
.load(new FileInputStream(new File(name)));
return template;
}
private static List<Object> getAllElementFromObject(Object obj,
Class<?> toSearch) {
List<Object> result = new ArrayList<Object>();
if (obj instanceof JAXBElement)
obj = ((JAXBElement<?>) obj).getValue();
if (obj.getClass().equals(toSearch))
result.add(obj);
else if (obj instanceof ContentAccessor) {
List<?> children = ((ContentAccessor) obj).getContent();
for (Object child : children) {
result.addAll(getAllElementFromObject(child, toSearch));
}
}
return result;
}
private void replacePlaceholder(WordprocessingMLPackage template,
String name, String placeholder) {
List<Object> texts = getAllElementFromObject(
template.getMainDocumentPart(), Text.class);
for (Object text : texts) {
Text textElement = (Text) text;
if (textElement.getValue().equals(placeholder)) {
textElement.setValue(name);
}
}
}
private void writeDocxToStream(WordprocessingMLPackage template,
String target) throws IOException, Docx4JException {
File f = new File(target);
template.save(f);
}
}

The issue is that I was trying to create the placeholders as plain text within the docx file. What I should have been doing instead is using the MergeField functionality within Word, which I didn't fully understand or appreciate, hence the confusion. Basically, I didn't know that this is what the documentation meant because I'd never used it; I just assumed it was some kind of XML text replacement.
That said, it's still fairly difficult to find a good explanation of this Word feature; after looking at a few dozen write-ups I still couldn't find a clean one. The best explanation I was able to find can be found here. Basically you want to do Step 3.
Once I created MergeFields in Word and ran the code, it worked perfectly. The method to use is docx4jReplaceTwoPeopleTest(). The problem wasn't in the code but in my understanding of how the feature works within Word.
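For reference, a MERGEFIELD is an actual Word field rather than literal text: insert one via Insert > Quick Parts > Field > MergeField (or press Ctrl+F9 and type the field code by hand). With field codes toggled on via Alt+F9, each placeholder should look roughly like
{ MERGEFIELD Person.Firstname \* MERGEFORMAT }
where the field name matches the DataFieldName keys passed to performMerge.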

Related

How to replace text in a pdf with correct encoding using Itext

I created a Java program for translating PDFs, using the Google API for the translation. I get the correct translation on my Eclipse IDE console, but when I check the newly created PDF, either it is not translated and is copied as-is, only a few words are translated, or the new PDF comes out empty and sometimes corrupted.
I suppose it has something to do with encoding & font types.
I have already gone through the iText page and all the related questions, but none worked for my case. I am trying to translate Portuguese, Spanish, Finnish, French, Hungarian, etc. into English.
Here is my code:
public static final String SRC = "5587309Finnish.pdf";
public static final String DEST = "changed.pdf";
public static void main(String[] args) throws java.io.IOException, DocumentException {
Translate translate = TranslateOptions.getDefaultInstance().getService();
PdfReader reader = new PdfReader(SRC);
int pages = reader.getNumberOfPages();
PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(DEST));
for(int i=1;i<=pages;i++) {
PdfDictionary dict = reader.getPageN(i);
PdfObject object = dict.getDirectObject(PdfName.CONTENTS);
if (object instanceof PRStream) {
String pageContent =
PdfTextExtractor.getTextFromPage(reader, i);
String[] word = pageContent.split(" ");
PRStream stream = (PRStream) object;
byte[] data = PdfReader.getStreamBytes(stream);
String dd = new String(data, BaseFont.CP1252);
for (int j=0; j < word.length; j++)
{
Translation translation = translate.translate(word[j],Translate.TranslateOption.sourceLanguage("fi"),
Translate.TranslateOption.targetLanguage("en"));
System.out.println(word[j]+"-->>"+translation.getTranslatedText());//here i can check the translation is correct.
dd = dd.replace(word[j],translation.getTranslatedText());
}
stream.setData(dd.getBytes());
}
}
stamper.close();
reader.close();
}
Please help.
According to a comment, you have improved your code and are now
"getting the updated dd (i.e. the content stream which I am printing) correctly with the replaced text. I don't know why I am getting a blank pdf"
Thus, I assume that your (hopefully representative) test PDFs have all their fonts of interest encoded in ANSI'ish encodings, and that the text arguments of the text-drawing instructions contain whole words or even phrases which can properly be processed; otherwise, text replacement would not have been possible in the first place.
Thus, here is an example of how one can replace text pieces with similarly long ones under such benign circumstances without breaking the content stream syntax. In this example I simply use a Map containing replacement strings; you can do your translation there.
First a frame loading the source, creating a stamper, iterating over the pages, and calling a helper to create a content stream replacement:
Map<String, String> replacements = new HashMap<>();
replacements.put("Förfallodatum", "Ablaufdatum");
try ( InputStream resource = SOURCE_INPUTSTREAM;
OutputStream result = new FileOutputStream(RESULT_FILE) ) {
PdfReader pdfReader = new PdfReader(resource);
PdfStamper pdfStamper = new PdfStamper(pdfReader, result);
for (int pageNum = 1; pageNum <= pdfReader.getNumberOfPages(); pageNum++) {
PdfDictionary page = pdfReader.getPageN(pageNum);
byte[] pageContentInput = ContentByteUtils.getContentBytesForPage(pdfReader, pageNum);
page.remove(PdfName.CONTENTS);
replaceInStringArguments(pageContentInput, pdfStamper.getUnderContent(pageNum), replacements);
}
pdfStamper.close();
}
(EditPageContentSimple test testReplaceInStringArgumentsForklaringAvFakturan)
The method replaceInStringArguments now parses the instructions in the given content stream, isolates string arguments, and calls another helper for each string argument doing the replacement.
void replaceInStringArguments(byte[] contentBytesBefore, PdfContentByte canvas, Map<String, String> replacements) throws IOException {
PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().createSource(contentBytesBefore)));
PdfContentParser ps = new PdfContentParser(tokeniser);
ArrayList<PdfObject> operands = new ArrayList<PdfObject>();
while (ps.parse(operands).size() > 0){
for (int i = 0; i < operands.size(); i++) {
PdfObject pdfObject = operands.get(i);
if (pdfObject instanceof PdfString) {
operands.set(i, replaceInString((PdfString)pdfObject, replacements));
} else if (pdfObject instanceof PdfArray) {
PdfArray pdfArray = (PdfArray) pdfObject;
for (int j = 0; j < pdfArray.size(); j++) {
PdfObject arrayObject = pdfArray.getPdfObject(j);
if (arrayObject instanceof PdfString) {
pdfArray.set(j, replaceInString((PdfString)arrayObject, replacements));
}
}
}
}
for (PdfObject object : operands)
{
object.toPdf(canvas.getPdfWriter(), canvas.getInternalBuffer());
canvas.getInternalBuffer().append((byte) ' ');
}
canvas.getInternalBuffer().append((byte) '\n');
}
}
(EditPageContentSimple helper method)
The method replaceInString in turn retrieves a single string operand (a PdfString instance), manipulates it, and returns the manipulated string version:
PdfString replaceInString(PdfString string, Map<String, String> replacements) {
String value = PdfEncodings.convertToString(string.getBytes(), PdfObject.TEXT_PDFDOCENCODING);
for (Map.Entry<String, String> entry : replacements.entrySet()) {
value = value.replace(entry.getKey(), entry.getValue());
}
return new PdfString(PdfEncodings.convertToBytes(value, PdfObject.TEXT_PDFDOCENCODING));
}
(EditPageContentSimple helper method)
Instead of that for loop here you would call your translation routine and translate value.
As has been mentioned before, this code only works under certain benign circumstances. Don't expect it to work for arbitrary documents from the wild, in particular not for documents with other than Western European glyphs.
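If you do wire the translation into this approach, here is a sketch of replaceInString with the map lookup swapped for the Google Translate call from the question; the translate client and the language options are taken from the question code and are assumptions on my part, and the answer's caveat about keeping replacements similarly long still applies:
PdfString translateInString(PdfString string, Translate translate) {
    // Decode the string argument, translate the whole piece, and re-encode it.
    String value = PdfEncodings.convertToString(string.getBytes(), PdfObject.TEXT_PDFDOCENCODING);
    Translation translation = translate.translate(value,
            Translate.TranslateOption.sourceLanguage("fi"),
            Translate.TranslateOption.targetLanguage("en"));
    return new PdfString(PdfEncodings.convertToBytes(translation.getTranslatedText(), PdfObject.TEXT_PDFDOCENCODING));
}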

Implementing save/open with RichTextFX?

Here is my code:
private void save(File file) {
StyledDocument<ParStyle, Either<StyledText<TextStyle>, LinkedImage<TextStyle>>, TextStyle> doc = textarea.getDocument();
// Use the Codec to save the document in a binary format
textarea.getStyleCodecs().ifPresent(codecs -> {
Codec<StyledDocument<ParStyle, Either<StyledText<TextStyle>, LinkedImage<TextStyle>>, TextStyle>> codec
= ReadOnlyStyledDocument.codec(codecs._1, codecs._2, textarea.getSegOps());
try {
FileOutputStream fos = new FileOutputStream(file);
DataOutputStream dos = new DataOutputStream(fos);
codec.encode(dos, doc);
fos.close();
} catch (IOException fnfe) {
fnfe.printStackTrace();
}
});
}
I am trying to implement the save/loading from the demo from here on the RichTextFX GitHub.
I am getting errors in the following lines:
StyledDocument<ParStyle, Either<StyledText<TextStyle>, LinkedImage<TextStyle>>, TextStyle> doc = textarea.getDocument();
error: incompatible types:
StyledDocument<Collection<String>,StyledText<Collection<String>>,Collection<String>>
cannot be converted to
StyledDocument<ParStyle,Either<StyledText<TextStyle>,LinkedImage<TextStyle>>,TextStyle>
and
= ReadOnlyStyledDocument.codec(codecs._1, codecs._2, textarea.getSegOps());
error: incompatible types: inferred type does not conform to equality
constraint(s) inferred: ParStyle
equality constraints(s): ParStyle,Collection<String>
I have added all the required .java files and imported them into my main code. I thought it would be relatively trivial to implement this demo but it has been nothing but headaches.
If this cannot be resolved, does anyone know an alternative way to save the text with formatting from RichTextFX?
Thank you
This question is quite old, but since I ran into the same problem I figured a solution might be useful to others as well.
In the demo whose code you are using, ParStyle and TextStyle (custom types) are used to define how the style information is stored.
The error messages you get pretty much just tell you that your way of storing the style information (in your case in a String) is not compatible with the way it is done in the demo.
If you want to store the style in a String, which I did as well, you need to implement some way of serializing and deserializing the information yourself.
You can do that, for example (I used an InlineCssTextArea), in the following way:
public class SerializeManager {
public static final String PAR_REGEX = "#!par!#";
public static final String PAR_CONTENT_REGEX = "#!pcr!#";
public static final String SEG_REGEX = "#!seg!#";
public static final String SEG_CONTENT_REGEX = "#!scr!#";
public static String serialized(InlineCssTextArea textArea) {
StringBuilder builder = new StringBuilder();
textArea.getDocument().getParagraphs().forEach(par -> {
builder.append(par.getParagraphStyle());
builder.append(PAR_CONTENT_REGEX);
par.getStyledSegments().forEach(seg -> builder
.append(
seg.getSegment()
.replaceAll(PAR_REGEX, "")
.replaceAll(PAR_CONTENT_REGEX, "")
.replaceAll(SEG_REGEX, "")
.replaceAll(SEG_CONTENT_REGEX, "")
)
.append(SEG_CONTENT_REGEX)
.append(seg.getStyle())
.append(SEG_REGEX)
);
builder.append(PAR_REGEX);
});
String textAreaSerialized = builder.toString();
return textAreaSerialized;
}
public static InlineCssTextArea fromSerialized(String string) {
InlineCssTextArea textArea = new InlineCssTextArea();
ReadOnlyStyledDocumentBuilder<String, String, String> builder = new ReadOnlyStyledDocumentBuilder<>(
SegmentOps.styledTextOps(),
""
);
if (string.contains(PAR_REGEX)) {
String[] parsSerialized = string.split(PAR_REGEX);
for (int i = 0; i < parsSerialized.length; i++) {
String par = parsSerialized[i];
String[] parContent = par.split(PAR_CONTENT_REGEX);
String parStyle = parContent[0];
List<String> segments = new ArrayList<>();
StyleSpansBuilder<String> spansBuilder = new StyleSpansBuilder<>();
String styleSegments = parContent[1];
Arrays.stream(styleSegments.split(SEG_REGEX)).forEach(seg -> {
String[] segContent = seg.split(SEG_CONTENT_REGEX);
segments.add(segContent[0]);
if (segContent.length > 1) {
spansBuilder.add(segContent[1], segContent[0].length());
} else {
spansBuilder.add("", segContent[0].length());
}
});
StyleSpans<String> spans = spansBuilder.create();
builder.addParagraph(segments, spans, parStyle);
}
textArea.append(builder.build());
}
return textArea;
}
}
You can then take the serialized InlineCssTextArea, write the resulting String to a file, and load and deserialize it.
As you can see in the code, I made up some marker strings (used as regexes) which are stripped from the text during serialization (we don't want our serializer to be injectable, do we ;)).
You can change these to whatever you like; just note that they will be removed if they appear in the text of the TextArea, so they should be something users won't miss.
Also note that this solution serializes the style of the text, the text itself and the paragraph style, BUT not inserted images or parameters of the TextArea (such as width and height), just the text content of the TextArea with its style.
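For completeness, a minimal sketch of saving and restoring through a file; the file name and the UTF-8 charset are my own choices, textArea is assumed to be an existing InlineCssTextArea inside a running JavaFX application, and Files/Paths/StandardCharsets come from java.nio:
// Save: serialize the text area and write the result out as UTF-8.
String serialized = SerializeManager.serialized(textArea);
Files.write(Paths.get("document.rtfx"), serialized.getBytes(StandardCharsets.UTF_8));
// Load: read the file back and rebuild a text area from it.
String loaded = new String(Files.readAllBytes(Paths.get("document.rtfx")), StandardCharsets.UTF_8);
InlineCssTextArea restored = SerializeManager.fromSerialized(loaded);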
This issue on github really helped me btw.

Replacing a text in Apache POI XWPF not working

I'm currently trying to work on the code mentioned on a previous post called Replacing a text in Apache POI XWPF.
I have tried the code below and it runs, but I don't know if I am missing anything. When I run it, the text is not replaced but appended onto the end of what was searched. For example, I created a basic Word document containing the text "test"; when I run the code I end up with a new document containing the text "testDOG".
I had to change the original code from String text = r.getText(0) to String text = r.toString() because I kept getting a NullPointerException while running the code.
import java.io.*;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
public class testPOI {
public static void main(String[] args) throws Exception{
String filepath = "F:\\MASTER_DOC.docx";
String outpath = "F:\\Test.docx";
XWPFDocument doc = new XWPFDocument(OPCPackage.open(filepath));
for (XWPFParagraph p : doc.getParagraphs()){
for (XWPFRun r : p.getRuns()){
String text = r.toString();
if(text.contains("test")) {
text = text.replace("test", "DOG");
r.setText(text);
}
}
}
doc.write(new FileOutputStream(outpath));
}
}
EDIT: Thanks for your help everyone. I browsed around and found a solution on Replace table column value in Apache POI
This method replaces search strings in paragraphs and is able to handle strings that span more than one run.
private long replaceInParagraphs(Map<String, String> replacements, List<XWPFParagraph> xwpfParagraphs) {
long count = 0;
for (XWPFParagraph paragraph : xwpfParagraphs) {
List<XWPFRun> runs = paragraph.getRuns();
for (Map.Entry<String, String> replPair : replacements.entrySet()) {
String find = replPair.getKey();
String repl = replPair.getValue();
TextSegement found = paragraph.searchText(find, new PositionInParagraph());
if ( found != null ) {
count++;
if ( found.getBeginRun() == found.getEndRun() ) {
// whole search string is in one Run
XWPFRun run = runs.get(found.getBeginRun());
String runText = run.getText(run.getTextPosition());
String replaced = runText.replace(find, repl);
run.setText(replaced, 0);
} else {
// The search string spans over more than one Run
// Put the Strings together
StringBuilder b = new StringBuilder();
for (int runPos = found.getBeginRun(); runPos <= found.getEndRun(); runPos++) {
XWPFRun run = runs.get(runPos);
b.append(run.getText(run.getTextPosition()));
}
String connectedRuns = b.toString();
String replaced = connectedRuns.replace(find, repl);
// The first Run receives the replaced String of all connected Runs
XWPFRun partOne = runs.get(found.getBeginRun());
partOne.setText(replaced, 0);
// Removing the text in the other Runs.
for (int runPos = found.getBeginRun()+1; runPos <= found.getEndRun(); runPos++) {
XWPFRun partNext = runs.get(runPos);
partNext.setText("", 0);
}
}
}
}
}
return count;
}
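A minimal, hypothetical way to drive this method (the file paths and the replacement map are placeholders of mine, and the method is assumed to live in the class that calls it):
// Replace "test" with "DOG" in all body paragraphs, then save a copy.
XWPFDocument doc = new XWPFDocument(new FileInputStream("F:\\MASTER_DOC.docx"));
Map<String, String> replacements = new HashMap<String, String>();
replacements.put("test", "DOG");
long replacedCount = replaceInParagraphs(replacements, doc.getParagraphs());
doc.write(new FileOutputStream("F:\\Test.docx"));
doc.close();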
Your logic is not quite right. You need to collate all the text in the runs first and then do the replace. You also need to remove all runs for the paragraph and add a new single run if a match on "test" is found.
Try this instead:
public class testPOI {
public static void main(String[] args) throws Exception{
String filepath = "F:\\MASTER_DOC.docx";
String outpath = "F:\\Test.docx";
XWPFDocument doc = new XWPFDocument(new FileInputStream(filepath));
for (XWPFParagraph p : doc.getParagraphs()){
int numberOfRuns = p.getRuns().size();
// Collate text of all runs
StringBuilder sb = new StringBuilder();
for (XWPFRun r : p.getRuns()){
int pos = r.getTextPosition();
if(r.getText(pos) != null) {
sb.append(r.getText(pos));
}
}
// Continue if there is text and contains "test"
if(sb.length() > 0 && sb.toString().contains("test")) {
// Remove all existing runs (iterate backwards so the indexes stay valid while removing)
for(int i = numberOfRuns - 1; i >= 0; i--) {
p.removeRun(i);
}
String text = sb.toString().replace("test", "DOG");
// Add new run with updated text
XWPFRun run = p.createRun();
run.setText(text);
p.addRun(run);
}
}
doc.write(new FileOutputStream(outpath));
}
}
Worth noticing that run.getTextPosition() returns -1 in most cases. That does not matter when there is only one text position per run, but technically a run can have any number of text positions, and I have run into such cases. So the best way is to call getCTR() on the run and iterate through its text positions; the number of text positions equals ctrRun.sizeOfTArray().
A sample code
for (XWPFRun run : p.getRuns()){
CTR ctrRun = run.getCTR();
int sizeOfCtr = ctrRun.sizeOfTArray();
for(int textPosition = 0; textPosition < sizeOfCtr; textPosition++){
String text = run.getText(textPosition);
if(text != null && text.contains("test")) {
text = text.replace("test", "DOG");
run.setText(text, textPosition);
}
}
}
Just change the text for every run in your paragraph, and then save the file.
This code worked for me:
XWPFDocument doc = new XWPFDocument(new FileInputStream(filepath));
for (XWPFParagraph p : doc.getParagraphs()) {
StringBuilder sb = new StringBuilder();
for (XWPFRun r : p.getRuns()) {
String text = r.getText(0);
if (text != null && text.contains("variable1")) {
text = text.replace("variable1", "valeur1");
r.setText(text, 0);
}
if (text != null && text.contains("variable2")) {
text = text.replace("variable2", "valeur2");
r.setText(text, 0);
}
if (text != null && text.contains("variable3")) {
text = text.replace("variable3", "valeur3");
r.setText(text, 0);
}
}
}
doc.write(new FileOutputStream(outpath));

Reading text from swf with StuartMacKay's transform-swf library

I need to extract all the texts from some swf files. I'm using Java since I have a lot of modules developed with this language.
So I searched the Web for free Java libraries devoted to handling SWF files.
Finally, I found the library developed by StuartMacKay. The library, named transform-swf, may be found on GitHub by clicking here.
The question is: once I extract the GlyphIndexes from a TextSpan, how can I convert the glyphs into characters?
Please, provide a complete working and tested example. No theoretical answer will be accepted nor answers like "it cannot be done", "it ain't possible", etc.
What I know and what I did
I know that the GlyphIndexes are built using a TextTable, which is constructed from an integer representing the font size and a font description provided by a DefineFont2 object, but when I decode all the DefineFont2 objects, they all have a zero-length advances list.
Here follows what I did.
//Creating a Movie object from an swf file.
Movie movie = new Movie();
movie.decodeFromFile(new File(out));
// Assumption: the tag list iterated below comes from movie.getObjects() (omitted from the original snippet).
List<MovieTag> list = movie.getObjects();
//Saving all the decoded DefineFont2 objects.
Map<Integer,DefineFont2> fonts = new HashMap<>();
for (MovieTag object : list) {
if (object instanceof DefineFont2) {
DefineFont2 df2 = (DefineFont2) object;
fonts.put(df2.getIdentifier(), df2);
}
}
//Now I retrieve all the texts
for (MovieTag object : list) {
if (object instanceof DefineText2) {
DefineText2 dt2 = (DefineText2) object;
for (TextSpan ts : dt2.getSpans()) {
Integer fontIdentifier = ts.getIdentifier();
if (fontIdentifier != null) {
int fontSize = ts.getHeight();
// Here I try to create an object that should
// reverse the process done by a TextTable
ReverseTextTable rtt =
new ReverseTextTable(fonts.get(fontIdentifier), fontSize);
System.out.println(rtt.charactersForText(ts.getCharacters()));
}
}
}
}
The class ReverseTextTable follows here:
public final class ReverseTextTable {
private final transient Map<Character, GlyphIndex> characters;
private final transient Map<GlyphIndex, Character> glyphs;
// Assumed value, omitted from the original snippet: DefineFont2 glyph metrics use a 1024-unit em square.
private static final float EMSQUARE = 1024.0f;
public ReverseTextTable(final DefineFont2 font, final int fontSize) {
characters = new LinkedHashMap<>();
glyphs = new LinkedHashMap<>();
final List<Integer> codes = font.getCodes();
final List<Integer> advances = font.getAdvances();
final float scale = fontSize / EMSQUARE;
final int count = codes.size();
for (int i = 0; i < count; i++) {
characters.put((char) codes.get(i).intValue(), new GlyphIndex(i,
(int) (advances.get(i) * scale)));
glyphs.put(new GlyphIndex(i,
(int) (advances.get(i) * scale)), (char) codes.get(i).intValue());
}
}
//This method should reverse from a list of GlyphIndexes to a String
public String charactersForText(final List<GlyphIndex> list) {
String text="";
for(GlyphIndex gi: list){
text+=glyphs.get(gi);
}
return text;
}
}
Unfortunately, the list of advances from DefineFont2 is empty, so the constructor of ReverseTextTable throws an ArrayIndexOutOfBoundsException.
Honestly, I don't know how to do that in Java. I'm not claiming it is impossible; I also believe there is a way to do it. However, you said that there are a lot of libraries that can do this, and you also suggested one, i.e. swftools. So I suggest falling back on that library to extract the text from a Flash file. To do that, you can use Runtime.exec() to run it from the command line.
Personally, I prefer Apache Commons Exec over the standard facilities shipped with the JDK. Let me show you what to do. The executable to use is "swfstrings.exe". Suppose it is placed in "C:\", and that the same folder contains a Flash file, e.g. page.swf. Then I tried the following code (it works fine):
Path pathToSwfFile = Paths.get("C:" + File.separator + "page.swf");
CommandLine commandLine = CommandLine.parse("C:" + File.separator + "swfstrings.exe");
commandLine.addArgument("\"" + pathToSwfFile.toString() + "\"");
DefaultExecutor executor = new DefaultExecutor();
executor.setExitValues(new int[]{0, 1}); //Notice that swfstrings.exe returns 1 for success,
//0 for file not found, -1 for error
ByteArrayOutputStream stdout = new ByteArrayOutputStream();
PumpStreamHandler psh = new PumpStreamHandler(stdout);
executor.setStreamHandler(psh);
int exitValue = -1; // initialized so the value is defined even if execute() throws
try{
exitValue = executor.execute(commandLine);
}catch(org.apache.commons.exec.ExecuteException ex){
psh.stop();
}
if(!executor.isFailure(exitValue)){
String out = stdout.toString("UTF-8"); // here you have the extracted text
}
I know this is not exactly the answer that you requested, but it works fine.
I happen to be working on decompiling an SWF in Java right now, and I came across this question while figuring out how to reverse engineer the original text.
After looking at the source code, I realised it is really straightforward. Each font has an assigned sequence of characters that can be retrieved by calling DefineFont2.getCodes(), and the glyphIndex is the index of the matching character in DefineFont2.getCodes().
However, when there are multiple fonts in use in a single SWF file, it is difficult to match each DefineText to the corresponding DefineFont2 because there is no attribute identifying the DefineFont2 used by each DefineText.
To work around this issue, I came up with a self-learning algorithm which will attempt to guess the right DefineFont2 for each DefineText and hence derive the original text correctly.
To reverse engineer the original text back, I created a class called FontLearner:
public class FontLearner {
private final ArrayList<DefineFont2> fonts = new ArrayList<DefineFont2>();
private final HashMap<Integer, HashMap<Character, Integer>> advancesMap = new HashMap<Integer, HashMap<Character, Integer>>();
/**
* The same characters from the same font will have similar advance values.
* This constant defines the allowed difference between two advance values
* before they are treated as the same character
*/
private static final int ADVANCE_THRESHOLD = 10;
/**
* Some characters have outlier advance values despite being compared
* to the same character
* This constant defines the minimum accuracy level for each String
* before it is associated with the given font
*/
private static final double ACCURACY_THRESHOLD = 0.9;
/**
* This method adds a DefineFont2 to the learner, and a DefineText
* associated with the font to teach the learner about the given font.
*
* @param font The font to add to the learner
* @param text The text associated with the font
*/
private void addFont(DefineFont2 font, DefineText text) {
fonts.add(font);
HashMap<Character, Integer> advances = new HashMap<Character, Integer>();
advancesMap.put(font.getIdentifier(), advances);
List<Integer> codes = font.getCodes();
List<TextSpan> spans = text.getSpans();
for (TextSpan span : spans) {
List<GlyphIndex> characters = span.getCharacters();
for (GlyphIndex character : characters) {
int glyphIndex = character.getGlyphIndex();
char c = (char) (int) codes.get(glyphIndex);
int advance = character.getAdvance();
advances.put(c, advance);
}
}
}
/**
*
* @param text The DefineText to retrieve the original String from
* @return The String retrieved from the given DefineText
*/
public String getString(DefineText text) {
StringBuilder sb = new StringBuilder();
List<TextSpan> spans = text.getSpans();
DefineFont2 font = null;
for (DefineFont2 getFont : fonts) {
List<Integer> codes = getFont.getCodes();
HashMap<Character, Integer> advances = advancesMap.get(getFont.getIdentifier());
if (advances == null) {
advances = new HashMap<Character, Integer>();
advancesMap.put(getFont.getIdentifier(), advances);
}
boolean notFound = true;
int totalMisses = 0;
int totalCount = 0;
for (TextSpan span : spans) {
List<GlyphIndex> characters = span.getCharacters();
totalCount += characters.size();
int misses = 0;
for (GlyphIndex character : characters) {
int glyphIndex = character.getGlyphIndex();
if (codes.size() > glyphIndex) {
char c = (char) (int) codes.get(glyphIndex);
Integer getAdvance = advances.get(c);
if (getAdvance != null) {
notFound = false;
if (Math.abs(character.getAdvance() - getAdvance) > ADVANCE_THRESHOLD) {
misses += 1;
}
}
} else {
notFound = false;
misses = characters.size();
break;
}
}
totalMisses += misses;
}
double accuracy = (totalCount - totalMisses) * 1.0 / totalCount;
if (accuracy > ACCURACY_THRESHOLD && !notFound) {
font = getFont;
// teach this DefineText to the FontLearner if there are
// any new characters
for (TextSpan span : spans) {
List<GlyphIndex> characters = span.getCharacters();
for (GlyphIndex character : characters) {
int glyphIndex = character.getGlyphIndex();
char c = (char) (int) codes.get(glyphIndex);
int advance = character.getAdvance();
if (advances.get(c) == null) {
advances.put(c, advance);
}
}
}
break;
}
}
if (font != null) {
List<Integer> codes = font.getCodes();
for (TextSpan span : spans) {
List<GlyphIndex> characters = span.getCharacters();
for (GlyphIndex character : characters) {
int glyphIndex = character.getGlyphIndex();
char c = (char) (int) codes.get(glyphIndex);
sb.append(c);
}
sb = new StringBuilder(sb.toString().trim());
sb.append(" ");
}
}
return sb.toString().trim();
}
}
Usage:
Movie movie = new Movie();
movie.decodeFromStream(response.getEntity().getContent());
FontLearner learner = new FontLearner();
DefineFont2 font = null;
List<MovieTag> objects = movie.getObjects();
for (MovieTag object : objects) {
if (object instanceof DefineFont2) {
font = (DefineFont2) object;
} else if (object instanceof DefineText) {
DefineText text = (DefineText) object;
if (font != null) {
learner.addFont(font, text);
font = null;
}
String line = learner.getString(text); // reverse engineers the line
}
}
I am happy to say that this method has given me a 100% accuracy in reverse engineering the original String using StuartMacKay's transform-swf library.
What you are trying to achieve seems difficult: you are essentially trying to decompile the file, and I am sorry to say that is not really possible. What I would suggest is converting it into a bitmap (if possible) or, by some other method, trying to read the characters using OCR.
There is software that does this, and you can also check some forums about it, because working with an already compiled SWF is very difficult (and not possible as far as I know). You can check this decompiler if you want, or try using some other languages, like the project here.
I had a similar problem with long strings using the transform-swf library.
Got the source code and debugged it.
I believe there was a small bug in class com.flagstone.transform.coder.SWFDecoder.
At line 540 (applicable to version 3.0.2), replace
dest += length;
with
dest += count;
That should do it for you (it's about extracting strings).
I notified Stuart as well. The problem appears only if your strings are very large.
I know this isn't what you asked, but I needed to pull text from an SWF recently using Java and found the ffdec library much better than transform-swf.
Comment if anyone needs sample code.

How can I remove all images/drawings from a PDF file and leave text only in Java?

I have a PDF file that is the output of an OCR processor; the processor recognizes the image, adds the text to the PDF, but in the end places a low-quality image instead of the original one (I have no idea why anyone would do that, but they do).
So I would like to take this PDF, remove the image stream and leave only the text, so that I can import it (using iText's page-importing feature) into a PDF I'm creating myself with the real image.
And before someone asks, I have already tried to use another tool to extract text coordinates (JPedal) but when I draw the text on my PDF it isn't at the same position as the original one.
I'd rather have this done in Java, but if another tool can do it better, just let me know. And it could be image removal only, I can live with a PDF with the drawings in there.
I used Apache PDFBox in a similar situation.
To be a bit more specific, try something like this:
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import java.io.IOException;
public class Main {
public static void main(String[] argv) throws COSVisitorException, InvalidPasswordException, CryptographyException, IOException {
PDDocument document = PDDocument.load("input.pdf");
if (document.isEncrypted()) {
document.decrypt("");
}
PDDocumentCatalog catalog = document.getDocumentCatalog();
for (Object pageObj : catalog.getAllPages()) {
PDPage page = (PDPage) pageObj;
PDResources resources = page.findResources();
resources.getImages().clear();
}
document.save("strippedOfImages.pdf");
}
}
It is supposed to remove all types of images (PNG, JPEG, ...).
You need to parse the document as follows:
public static void strip(String pdfFile, String pdfFileOut) throws Exception {
PDDocument doc = PDDocument.load(pdfFile);
List pages = doc.getDocumentCatalog().getAllPages();
for( int i=0; i<pages.size(); i++ ) {
PDPage page = (PDPage)pages.get( i );
// added
COSDictionary newDictionary = new COSDictionary(page.getCOSDictionary());
PDFStreamParser parser = new PDFStreamParser(page.getContents());
parser.parse();
List tokens = parser.getTokens();
List newTokens = new ArrayList();
for(int j=0; j<tokens.size(); j++) {
Object token = tokens.get( j );
if( token instanceof PDFOperator ) {
PDFOperator op = (PDFOperator)token;
if( op.getOperation().equals( "Do") ) {
//remove the one argument to this operator
// added
COSName name = (COSName)newTokens.remove( newTokens.size() -1 );
// added
deleteObject(newDictionary, name);
continue;
}
}
newTokens.add( token );
}
PDStream newContents = new PDStream( doc );
ContentStreamWriter writer = new ContentStreamWriter( newContents.createOutputStream() );
writer.writeTokens( newTokens );
newContents.addCompression();
page.setContents( newContents );
// added
PDResources newResources = new PDResources(newDictionary);
page.setResources(newResources);
}
doc.save(pdfFileOut);
doc.close();
}
// added
public static boolean deleteObject(COSDictionary d, COSName name) {
for(COSName key : d.keySet()) {
if( name.equals(key) ) {
d.removeItem(key);
return true;
}
COSBase object = d.getDictionaryObject(key);
if(object instanceof COSDictionary) {
if( deleteObject((COSDictionary)object, name) ) {
return true;
}
}
}
return false;
}
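For completeness, a hypothetical call to the helper above; the file names are placeholders, and the code targets the PDFBox 1.x API already used in this answer:
// Strip the image-drawing operators (and their resources) from the OCR output, keeping the text.
strip("ocr-output.pdf", "ocr-output-text-only.pdf");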
