I'm using PDFBox to replace text, strange characters - java

I'm using PDFBox to replace text in a template and there are characters (e.g. a simple J) that the tool recognizes as a special character. Any help to solve this problem?
public static PDDocument replaceText(PDDocument document, Map<String, String> mapVars) throws IOException {
for (PDPage page : document.getPages()) {
PDFStreamParser parser = new PDFStreamParser(page);
parser.parse();
List<?> tokens = parser.getTokens();
for (int j = 0; j < tokens.size(); j++) {
Object next = tokens.get(j);
if (next instanceof Operator) {
Operator op = (Operator) next;
String pstring = "";
int prej = 0;
if (op.getName().equals("TJ")) {
COSArray previous = (COSArray) tokens.get(j - 1);
for (int k = 0; k < previous.size(); k++) {
Object arrElement = previous.getObject(k);
if (arrElement instanceof COSString) {
COSString cosString = (COSString) arrElement;
String string = cosString.getString();
if (j == prej) {
pstring += string;
} else {
prej = j;
pstring = string;
}
}
}
if (mapVars.containsKey(pstring.trim())) {
String replacement = mapVars.get(pstring.trim());
COSString sx = new COSString(replacement);
sx.setValue(replacement.getBytes());
previous.clear();
previous.add(0, sx);
}
}
}
}
PDStream updatedStream = new PDStream(document);
OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.writeTokens(tokens);
out.close();
page.setContents(updatedStream);
}
return document;
}

Related

How to speed up reading in input in Java

I am attempting to read in info from files to implement Dijkstra's algorithm. I believe that the double for loop is causing this to drastically slow down, is there anyway around this?
Edge[] edge = new Edge[127807];
int indexEdge = 0;
String line2 = "";
BufferedReader fileReader2 = new BufferedReader(new FileReader("Road.txt"));
String valueString = null;
String vertex1IDName = null;
String vertex2IDName = null;
String extra = null;
float value = 0;
int vertex1ID = 0;
int vertex2ID = 0;
//Read the file line by line
while ((line2 = fileReader2.readLine()) != null)
{
//Get all tokens available in line
String[] tokens2 = line2.split(DELIMITER);
for(String token1 : tokens2)
{
vertex1IDName = tokens2[0];
vertex2IDName = tokens2[1];
valueString = tokens2[2];
if(tokens2.length - 1 == 3) {
extra = tokens2[tokens2.length - 1];
}
else {
extra = "";
}
vertex1ID = Integer.parseInt(vertex1IDName);
vertex2ID = Integer.parseInt(vertex2IDName);
value = Float.parseFloat(valueString);
}
System.out.println("value: "+ value + " vertex1ID:"+ vertex1ID +" vertex2ID:"+ vertex2ID+ " extra:" + extra);
//if vertex 1 name or vertex 2 name in vertex.getID()
for(int i = 0; i< indexVertex; i++) {
for(int j = 0; j< indexVertex; j++) {
if(vertex1ID == vertex[i].getID() && vertex2ID == vertex[j].getID()) {
vertex[i].addNeighbour(edge[indexEdge] = new Edge(value,vertex[i],vertex[j],extra));
indexEdge++;
}
}
}
}

How to get line number in file from the character position using java

I have one JSON file and having some issue in it. When parsing the json file I will get the ParserException. From parser exception I have extracted the position where the is problem.
Now I want the line number of the that particular position in file.
JSONObject json;
try {
if (!file.exists()) {
throw new ExceptionDoesNotExist(file);
}
scanner = new Scanner(file, Charset.defaultCharset().toString());
String data = scanner.useDelimiter("\\Z").next();
json = (JSONObject) new JSONParser().parse(data);
return json;
} catch (ParseException e) {
this.log.logException(e);
int position = e.getPosition();
String reason = e.getUnexpectedObject().toString();
return new JSONObject();
}
if (!file.exists()) {
throw new ExceptionDoesNotExist(file);
}
scanner = new Scanner(file, Charset.defaultCharset().toString());
String data = scanner.useDelimiter("\\Z").next();
try {
return new JSONParser().parse(data);
} catch (ParseException e) {
String lineAndColumn = lineAndColumn(data, e, 4);
...;
return new JSONObject();
}
public static String lineAndColumn(String text, ParseException e, int tabSize) {
int position = e.getPosition();
int lineNo = 1 + (int) text.substring(0, position).codePoints()
.filter(cp -> cp == '\n')
.count();
int columnNo = 1 + text.substring(0, position).lastIndexOf('\n') + 1; // no \n okay too.
// Tabs
int cI = 0;
for (int i = 0; i < columnNo - 1; ++i) {
if (text.charAt(posion - (columnNo - 1) + i) == '\t') {
cI += tabSize;
cI %= tabSize;
} else {
++cI;
}
}
columnNo = cI + 1;
return String.format("%d:%d"), lineNo, ColumnNo);
}

replace a text using pdfbox for PDF file

I have 4 pdf files that came from one .doc file and I use 4 methods to convert my doc to a pdf (foxite reader, nitro, webservice and Word).
Then I used pdfbox to search and replace some words. The problem is, for some reason it only works for the file from foxite reader and Word, but not for the files created by nitro and the webservice.
Can any one have a clue?
This is the code I used:
public static void replace(String s) {
PDDocument doc = null;
int occurrences = 0;
try {
doc = PDDocument.load(s); // Input PDF File Name
System.out.println("+e" + doc);
List pages = doc.getDocumentCatalog()
.getAllPages();
for (int i = 0; i < pages.size(); i++) {
PDPage page = (PDPage) pages.get(i);
// System.out.println("ddd");
PDStream contents = page.getContents();
PDFStreamParser parser = new PDFStreamParser(contents.getStream());
parser.parse();
List tokens = parser.getTokens();
for (int j = 0; j < tokens.size(); j++) {
// System.out.println("jjjj");
Object next = tokens.get(j);
if (next instanceof PDFOperator) {
PDFOperator op = (PDFOperator) next;
// Tj and TJ are the two operators that display strings in a PDF
if (op.getOperation()
.equals("Tj")) {
// Tj takes one operator and that is the string
// to display so lets update that operator
COSString previous = (COSString) tokens.get(j - 1);
String string = previous.getString();
if (string.contains("#signature#")) {
string = string.replace("#signature#", "sam");
occurrences++;
}
// Word you want to change.
// Currently this code changes word "Good" to "Bad"
previous.reset();
previous.append(string.getBytes("ISO-8859-1"));
} else if (op.getOperation()
.equals("TJ")) {
COSArray previous = (COSArray) tokens.get(j - 1);
COSString temp = new COSString();
String tempString = "";
for (int t = 0; t < previous.size(); t++) {
if (previous.get(t) instanceof COSString) {
tempString += ((COSString) previous.get(t)).getString();
}
}
temp.append(tempString.getBytes("ISO-8859-1"));
tempString = "";
tempString = temp.getString();
if (tempString.contains("#signature#")) {
tempString = tempString.replace("#signature#", "sam");
occurrences++;
}
previous.clear();
String[] stringArray = tempString.split(" ");
for (String string : stringArray) {
COSString cosString = new COSString();
string = string + " ";
cosString.append(string.getBytes("ISO-8859-1"));
previous.add(cosString);
}
}
}
}
// now that the tokens are updated we will replace the page content stream.
PDStream updatedStream = new PDStream(doc);
OutputStream out = updatedStream.createOutputStream();
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.writeTokens(tokens);
page.setContents(updatedStream);
}
System.out.println("number of matches found: " + occurrences);
doc.save(s + "_convert.pdf"); // Output file name
} catch (Exception ex) {
System.out.println("eee+" + ex.getMessage());
} finally {
if (doc != null) {
try {
doc.close();
} catch (IOException ex) {
ex.getStackTrace();
}
}
}
}

Bad characters when replacing text in pdf using pdfbox

I'm trying to replace text in pdf and it's kind of replaced, this is my code
PDDocument doc = null;
int occurrences = 0;
try {
doc = PDDocument.load("test.pdf"); //Input PDF File Name
List pages = doc.getDocumentCatalog().getAllPages();
for (int i = 0; i < pages.size(); i++) {
PDPage page = (PDPage) pages.get(i);
PDStream contents = page.getContents();
PDFStreamParser parser = new PDFStreamParser(contents.getStream());
parser.parse();
List tokens = parser.getTokens();
for (int j = 0; j < tokens.size(); j++) {
Object next = tokens.get(j);
if (next instanceof PDFOperator) {
PDFOperator op = (PDFOperator) next;
// Tj and TJ are the two operators that display strings in a PDF
if (op.getOperation().equals("Tj")) {
// Tj takes one operator and that is the string
// to display so lets update that operator
COSString previous = (COSString) tokens.get(j - 1);
String string = previous.getString();
if (string.contains("Good")) {
string = string.replace("Good", "Bad");
occurrences++;
}
//Word you want to change. Currently this code changes word "Good" to "Bad"
previous.reset();
previous.append(string.getBytes("ISO-8859-1"));
} else if (op.getOperation().equals("TJ")) {
COSArray previous = (COSArray) tokens.get(j - 1);
COSString temp = new COSString();
String tempString = "";
for (int t = 0; t < previous.size(); t++) {
if (previous.get(t) instanceof COSString) {
tempString += ((COSString) previous.get(t)).getString();
}
}
temp.append(tempString.getBytes("ISO-8859-1"));
tempString = "";
tempString = temp.getString();
if (tempString.contains("Good")) {
tempString = tempString.replace("Good", "Bad");
occurrences++;
}
previous.clear();
String[] stringArray = tempString.split(" ");
for (String string : stringArray) {
COSString cosString = new COSString();
string = string + " ";
cosString.append(string.getBytes("ISO-8859-1"));
previous.add(cosString);
}
}
}
}
// now that the tokens are updated we will replace the page content stream.
PDStream updatedStream = new PDStream(doc);
OutputStream out = updatedStream.createOutputStream();
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.writeTokens(tokens);
page.setContents(updatedStream);
}
System.out.println("number of matches found: " + occurrences);
doc.save("a.pdf"); //Output file name
} catch (IOException ex) {
Logger.getLogger(ReplaceTextInPDF.class.getName()).log(Level.SEVERE, null, ex);
} catch (COSVisitorException ex) {
Logger.getLogger(ReplaceTextInPDF.class.getName()).log(Level.SEVERE, null, ex);
} finally {
if (doc != null) {
try {
doc.close();
} catch (IOException ex) {
Logger.getLogger(ReplaceTextInPDF.class.getName()).log(Level.SEVERE, null, ex);
}
}
}
the issue that it's replaced in a bad characters or hidden shape ( as example the bad word becomes only d character), but if i copy and paste it in another place it paste the expected word correctly,
also when i search the generated pdf for the new word it doesn't find it, but when i search with the old word it finds it in the replaced places
I found aspose, this link shows how to use it to replace text in pdfs, it's easy and works perfect except that it's not free, so the free version is printing copyrights line on the head of pdf file pages
http://www.aspose.com/docs/display/pdfjava/Replace+Text+in+Pages+of+a+PDF+Document

How to replace centered text in a PDF with PDFBox

I use the PDFTextReplacement example.
It does the replacement as expected, In case my text is left aligned.
But if my input pdf has a text centered, it replaces the text as a left aligned.
Ok, so I have to recalculate the right starting point.
For that reason I have two targets or questions:
How to determine the alignment?
How to calculate the right starting point?
Here is my code:
public PDDocument doIt(String inputFile, Map<String, String> text)
throws IOException, COSVisitorException {
// the document
PDDocument doc = null;
doc = PDDocument.load(inputFile);
List pages = doc.getDocumentCatalog().getAllPages();
for (int i = 0; i < pages.size(); i++) {
PDPage page = (PDPage) pages.get(i);
PDStream contents = page.getContents();
PDFStreamParser parser = new PDFStreamParser(contents.getStream());
parser.parse();
List tokens = parser.getTokens();
for (int j = 0; j < tokens.size(); j++) {
Object next = tokens.get(j);
if (next instanceof PDFOperator) {
PDFOperator op = (PDFOperator) next;
// Tj and TJ are the two operators that display
// strings in a PDF
String pstring = "";
int prej = 0;
if (op.getOperation().equals("Tj")) {
// Tj takes one operator and that is the string
// to display so lets update that operator
COSString previous = (COSString) tokens.get(j - 1);
String string = previous.getString();
// System.out.println(j + " " + string);
if (j == prej) {
pstring += string;
} else {
prej = j;
pstring = string;
}
previous.reset();
previous.append(string.getBytes("ISO-8859-1"));
} else if (op.getOperation().equals("TJ")) {
COSArray previous = (COSArray) tokens.get(j - 1);
for (int k = 0; k < previous.size(); k++) {
Object arrElement = previous.getObject(k);
if (arrElement instanceof COSString) {
COSString cosString = (COSString) arrElement;
String string = cosString.getString();
if (j == prej) {
pstring += string;
} else {
prej = j;
pstring = string;
}
cosString.reset();
// cosString.append(string
// .getBytes("ISO-8859-1"));
}
}
COSString cosString2 = (COSString) previous
.getObject(0);
for (int t = 1; t < previous.size(); t++)
previous.remove(t);
// cosString2.setNeedToBeUpdate(true);
if (text.containsKey(pstring.trim())) {
String textValue = text.get(pstring.trim());
cosString2.append(textValue.getBytes("ISO-8859-1"));
for (int k = 1; k < previous.size(); k++) {
previous.remove(k);
}
}
}
}
}
// now that the tokens are updated we will replace the
// page content stream.
PDStream updatedStream = new PDStream(doc);
OutputStream out = updatedStream.createOutputStream();
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.writeTokens(tokens);
page.setContents(updatedStream);
}
return doc;
}
you can use this function:
public void doIt( String inputFile, String outputFile, String strToFind, String message)
throws IOException, COSVisitorException
{
// the document
PDDocument doc = null;
try
{
doc = PDDocument.load( inputFile );
List pages = doc.getDocumentCatalog().getAllPages();
for( int i=0; i<pages.size(); i++ )
{
PDPage page = (PDPage)pages.get( i );
PDStream contents = page.getContents();
PDFStreamParser parser = new PDFStreamParser(contents.getStream() );
parser.parse();
List tokens = parser.getTokens();
for( int j=0; j<tokens.size(); j++ )
{
Object next = tokens.get( j );
if( next instanceof PDFOperator )
{
PDFOperator op = (PDFOperator)next;
//Tj and TJ are the two operators that display
//strings in a PDF
if( op.getOperation().equals( "Tj" ) )
{
//Tj takes one operator and that is the string
//to display so lets update that operator
COSString previous = (COSString)tokens.get( j-1 );
String string = previous.getString();
string = string.replaceFirst( strToFind, message );
previous.reset();
previous.append( string.getBytes() );
}
else if( op.getOperation().equals( "TJ" ) )
{
COSArray previous = (COSArray)tokens.get( j-1 );
for( int k=0; k<previous.size(); k++ )
{
Object arrElement = previous.getObject( k );
if( arrElement instanceof COSString )
{
COSString cosString = (COSString)arrElement;
String string = cosString.getString();
string = string.replaceFirst( strToFind, message );
cosString.reset();
cosString.append( string.getBytes() );
}
}
}
}
}
//now that the tokens are updated we will replace the
//page content stream.
PDStream updatedStream = new PDStream(doc);
OutputStream out = updatedStream.createOutputStream();
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.writeTokens( tokens );
page.setContents( updatedStream );
}
doc.save( outputFile );
}
finally
{
if( doc != null )
{
doc.close();
}
}
}

Categories