Add FormXobject content from resources to content stream using PDFBox?

Add FormXobject content from resources to content stream using PDFBox? - java

I have FormXobject under my page1->Resource -> Xobjects-> Fm0, Fm1, Fm2..
These are not part of the page's direct content stream (page1->Contents->content stream). I want to move the content stream of each form XObject (Fm0->content stream) into page1->Contents->content stream.
When a content stream is moved like this, the resources that Fm0 refers to must also be transferred or copied to the page-level resources.
1. The content stream needs to be copied into the page-level contents.
2. Color space objects need to be copied to page1->Resources->ColorSpace.
3. ExtGState objects need to be copied to page1->Resources->ExtGState.
4. Properties need to be copied to page1->Resources->Properties (this entry has to be created from scratch).
I tried some code
/**
 * Inlines the text-bearing form XObjects of one page into that page's content stream.
 *
 * <p>For every form XObject in the page resources that is referenced by the page tokens and
 * whose own content contains a {@code BT} operator, the first {@code /Name Do} invocation is
 * replaced by {@code q [a b c d e f cm] <form content> Q}, and the form's colour spaces, fonts,
 * extended graphics states, nested XObjects and patterns are registered in the page resources.
 *
 * <p>Known limitations (not addressed here): only the first invocation of a form is replaced;
 * the form's BBox, OC, Group and StructParent(s) entries are ignored; resource names are not
 * renamed, so when a form and the page use the same resource name the page-level entry wins.
 *
 * @param document the document to modify in place
 * @param pg_ind   zero-based index of the page to process
 * @return the same {@code document} instance
 * @throws IOException if a content stream or resource cannot be read or written
 */
private static PDDocument parseFormXobject(PDDocument document, Integer pg_ind) throws IOException {
    @SuppressWarnings("unchecked") // getTokens stores the token list under the page index
    List<Object> tokens1 = (List<Object>) (getTokens(document, pg_ind)).get(pg_ind);
    PDPage pageinner = document.getPage(pg_ind);
    PDResources resources = pageinner.getResources();
    if (resources == null) {
        // A page without resources has nothing to inline, but must not crash the loop below.
        resources = new PDResources();
    }
    COSDictionary fntdict = new COSDictionary();
    COSDictionary imgdict = new COSDictionary();
    COSDictionary extgsdict = new COSDictionary();
    COSDictionary colordict = new COSDictionary();
    COSDictionary pattern = new COSDictionary();

    for (COSName xObjectName : resources.getXObjectNames()) {
        PDXObject xObject = resources.getXObject(xObjectName);
        // Every XObject stays registered under its own name, whether it gets inlined or not.
        imgdict.setItem(xObjectName, xObject);
        if (!(xObject instanceof PDFormXObject)
                || !tokens1.toString().contains(xObjectName.toString())) {
            continue;
        }
        PDFormXObject form = (PDFormXObject) xObject;
        PDFStreamParser parser = new PDFStreamParser(form.getContentStream());
        parser.parse();
        List<Object> tokens3 = parser.getTokens();
        // isTextContains reports whether the form content has a BT (begin text) operator.
        if (!isTextContains(tokens3)) {
            continue;
        }
        // Open the q ... Q envelope first: a form that is never invoked with Do must not have
        // its content (or its resources) pushed into the page.
        int ind = openFormEnvelope(tokens1, form, xObjectName);
        if (ind < 0) {
            continue;
        }
        PDResources formResources = form.getResources();
        if (formResources != null) {
            copyFormResources(formResources, colordict, fntdict, extgsdict, imgdict, pattern);
        }
        // Resource names are kept as they are, so the form tokens can be copied verbatim
        // between the envelope's opening instructions and its closing Q.
        tokens1.addAll(ind, tokens3);
    }

    // Page-level resources are merged last so that they win over same-named form resources.
    for (COSName fontName : resources.getFontNames()) {
        fntdict.setItem(fontName, resources.getFont(fontName));
    }
    for (COSName extGsName : resources.getExtGStateNames()) {
        extgsdict.setItem(extGsName, resources.getExtGState(extGsName));
    }
    for (COSName colorName : resources.getColorSpaceNames()) {
        colordict.setItem(colorName, resources.getColorSpace(colorName));
    }
    for (COSName patternName : resources.getPatternNames()) {
        pattern.setItem(patternName, resources.getPattern(patternName));
    }
    resources.getCOSObject().setItem(COSName.EXT_G_STATE, extgsdict);
    resources.getCOSObject().setItem(COSName.FONT, fntdict);
    resources.getCOSObject().setItem(COSName.XOBJECT, imgdict);
    resources.getCOSObject().setItem(COSName.COLORSPACE, colordict);
    resources.getCOSObject().setItem(COSName.PATTERN, pattern);

    // The stream is created only once all tokens are final and is closed even on failure.
    PDStream newContents = new PDStream(document);
    try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
        new ContentStreamWriter(out).writeTokens(tokens1);
    }
    pageinner.setContents(newContents);
    pageinner.setResources(resources);
    return document;
}

/**
 * Replaces the first {@code /formName Do} in {@code pageTokens} by {@code q [matrix cm] Q}.
 *
 * @return the index of the inserted {@code Q}, i.e. the position at which the form content
 *         has to be inserted, or {@code -1} if the form is not invoked in {@code pageTokens}
 */
private static int openFormEnvelope(List<Object> pageTokens, PDFormXObject form, COSName formName) {
    // Start at 1: a Do operator needs a preceding name operand.
    for (int k = 1; k < pageTokens.size(); k++) {
        Object token = pageTokens.get(k);
        Object operand = pageTokens.get(k - 1);
        boolean invokesForm = token instanceof Operator
                && ((Operator) token).getName().equals("Do")
                && operand instanceof COSName
                && ((COSName) operand).getName().equals(formName.getName());
        if (!invokesForm) {
            continue;
        }
        // Drop the name operand and the Do operator.
        pageTokens.remove(k - 1);
        pageTokens.remove(k - 1);
        int at = k - 1;
        pageTokens.add(at++, Operator.getOperator("q"));
        if (form.getMatrix() != null) {
            // Operands of cm in the order a b c d e f.
            float[] cmOperands = {
                form.getMatrix().getScaleX(),
                form.getMatrix().getShearY(),
                form.getMatrix().getShearX(),
                form.getMatrix().getScaleY(),
                form.getMatrix().getTranslateX(),
                form.getMatrix().getTranslateY()
            };
            for (float cmOperand : cmOperands) {
                pageTokens.add(at++, new COSFloat(cmOperand));
            }
            pageTokens.add(at++, Operator.getOperator("cm"));
        }
        pageTokens.add(at, Operator.getOperator("Q"));
        return at;
    }
    return -1;
}

/**
 * Registers the resources of an inlined form in the dictionaries that later become the
 * page-level ColorSpace, Font, ExtGState, XObject and Pattern resource dictionaries.
 * As before, the resource name is also mirrored into each copied object's Name entry.
 */
private static void copyFormResources(PDResources formResources, COSDictionary colordict,
        COSDictionary fntdict, COSDictionary extgsdict, COSDictionary imgdict,
        COSDictionary pattern) throws IOException {
    for (COSName colorName : formResources.getColorSpaceNames()) {
        colordict.setItem(colorName, formResources.getColorSpace(colorName));
    }
    for (COSName fontName : formResources.getFontNames()) {
        PDFont font = formResources.getFont(fontName);
        if (font != null) {
            font.getCOSObject().setItem(COSName.NAME, fontName);
        }
        fntdict.setItem(fontName, font);
    }
    for (COSName extGsName : formResources.getExtGStateNames()) {
        PDExtendedGraphicsState extGState = formResources.getExtGState(extGsName);
        if (extGState != null) {
            extGState.getCOSObject().setItem(COSName.NAME, extGsName);
        }
        extgsdict.setItem(extGsName, extGState);
    }
    for (COSName nestedName : formResources.getXObjectNames()) {
        PDXObject nested = formResources.getXObject(nestedName);
        if (nested != null) {
            // The name belongs on the nested XObject itself, not on the enclosing form.
            nested.getCOSObject().setItem(COSName.NAME, nestedName);
        }
        imgdict.setItem(nestedName, nested);
    }
    for (COSName patternName : formResources.getPatternNames()) {
        PDAbstractPattern pat = formResources.getPattern(patternName);
        if (pat != null) {
            pat.getCOSObject().setItem(COSName.NAME, patternName);
        }
        pattern.setItem(patternName, pat);
    }
}
/**
 * Parses the content of one page and returns its tokens in a {@link JSONObject},
 * stored under the page index as key.
 *
 * <p>The parsed tokens are passed through {@code PDFUtils.removeTokens} before being stored;
 * what that helper filters out is not visible here.
 *
 * @param oldDocument the document to read from
 * @param pageIndex   zero-based index of the page to parse
 * @return an object mapping {@code pageIndex} to the page's token list
 * @throws IOException if the page content cannot be parsed
 */
private static JSONObject getTokens(PDDocument oldDocument, Integer pageIndex) throws IOException {
    PDFStreamParser pageParser = new PDFStreamParser(oldDocument.getPage(pageIndex));
    pageParser.parse();
    List<Object> pageTokens = PDFUtils.removeTokens(pageParser.getTokens());

    JSONObject tokensByPage = new JSONObject();
    tokensByPage.put(pageIndex, pageTokens);
    return tokensByPage;
}
/**
 * Tells whether a token list contains a {@code BT} operator.
 *
 * @param tokens3 parsed content stream tokens
 * @return {@code true} as soon as an {@link Operator} named {@code BT} is found,
 *         {@code false} if there is none
 */
private static boolean isTextContains(List<Object> tokens3) {
    for (Object token : tokens3) {
        if (!(token instanceof Operator)) {
            continue;
        }
        if (((Operator) token).getName().equals("BT")) {
            return true;
        }
    }
    return false;
}
But I am unable to reproduce the exact page graphics; something is getting lost.
input pdf
output pdf

There are multiple issues, some in details, some in the concept.
Wrapping in a save-graphics-state/restore-graphics-state envelope
When you draw an XObject, graphics state changes in that XObject don't change your current graphics state. To make sure this still is true after you copied the XObject instructions into your page content stream, you have to wrap that block into a save-graphics-state/restore-graphics-state envelope (q ... Q). You can do that by adding these two lines
tokens1.add(ind++, Operator.getOperator("q"));
tokens1.add(ind, Operator.getOperator("Q"));
right before your instruction copying loop
for (int k=0; k< tokens3.size(); k++) {
...
}
Coordinate system
You assume the coordinate system in the XObject equals that of the page. It doesn't necessarily. XObjects may have a Matrix entry denoting the transformation to apply.
Boundary box
You don't limit the area of what is drawn by the XObject instructions. But XObjects have a BBox entry denoting the box to clip the outputs to.
Optional content
XObjects may also have an OC entry denoting their optional content membership. Such a membership needs to be transformed into an equivalent optional content tagging.
Marked content, structure tree
XObjects can also refer to the structural parent tree via their StructParent or StructParents entry. To keep structural integrity of the document, you may have to considerably update the structure tree.
Grouping
XObjects may contain a Group entry indicating that its content shall be treated as a group. In particular in case of Transparency Groups this results in a different behavior of transparency related features than for the same instructions copied into the page content.
Unless you completely analyze the effects of each bit of content drawn with some transparency and from case to case rewrite the instructions drawing it, copying the instructions from the XObject to the page content stream will result in substantial differences in the displayed content.
Usage
Your code assumes that a XObject is used exactly once in the page content streams. This need not be the case, it can also be used more often or not at all.
References
In a comment you asked for references. Actually it's all in the PDF specification ISO 32000, already in the publicly available ISO 32000-1:
8.10 Form XObjects
A form XObject is a PDF content stream that is a self-contained description of any sequence of graphics objects (including path objects, text objects, and sampled images). A form XObject may be painted multiple times—either on several pages or at several locations on the same page—and produces the same results each time, subject only to the graphics state at the time it is invoked.
Thus, any number of usages on a given page is possible
When the Do operator is applied to a form XObject, a conforming reader shall perform the following tasks:
a) Saves the current graphics state, as if by invoking the q operator (see 8.4.4, "Graphics State Operators")
b) Concatenates the matrix from the form dictionary’s Matrix entry with the current transformation matrix (CTM)
c) Clips according to the form dictionary’s BBox entry
d) Paints the graphics objects specified in the form’s content stream
e) Restores the saved graphics state, as if by invoking the Q operator (see 8.4.4, "Graphics State Operators")
When copying into the page content stream, therefore, you should equivalently use a q/Q envelope and respect the Matrix and BBox entries.
8.11.3.3 Optional Content in XObjects and Annotations
In addition to marked content within content streams, form XObjects and image XObjects (see 8.8, "External Objects") and annotations (see 12.5, "Annotations") may contain an OC entry, which shall be an optional content group or an optional content membership dictionary.
A form or image XObject's visibility shall be determined by the state of the group or those of the groups referenced by the membership dictionary in conjunction with its P (or VE) entry, along with the current visibility state in the context in which the XObject is invoked (that is, whether objects are visible in the contents stream at the place where the Do operation occurred).
Thus, respect this optional content information when copying to the page content.
11.6.6 Transparency Group XObjects
A transparency group is represented in PDF as a special type of group XObject (see “Group XObjects”) called a transparency group XObject. A group XObject is in turn a type of form XObject, distinguished by the presence of a Group entry in its form dictionary (see “Form Dictionaries”). The value of this entry is a subsidiary group attributes dictionary defining the properties of the group. The format and meaning of the dictionary’s contents shall be determined by its group subtype, which is specified by the dictionary’s S entry. The entries for a transparency group (subtype Transparency) are shown in Table 147.
...
Annex L
So copying from transparency groups may change the appearance substantially.
14.7.4.3 PDF Objects as Content Items
When a structure element’s content includes an entire PDF object, such as an XObject or an annotation, that is associated with a page but not directly included in the page’s content stream, the object shall be identified in the structure element’s K entry by an object reference dictionary (see Table 325).
...
14.7.4.4 Finding Structure Elements from Content Items
...
To locate the relevant parent tree entry, each object or content stream that is represented in the tree shall contain a special dictionary entry, StructParent or StructParents (see Table 326). Depending on the type of content item, this entry may appear in the page object of a page containing marked-content sequences, in the stream dictionary of a form or image XObject, in an annotation dictionary, or in any other type of object dictionary that is included as a content item in a structure element.
This and more information from the same chapter should indicate clearly that structure information after copying from XObject to page content must be overhauled.

This is just snippet code for the above answer by mkl; I am trying to provide a code snippet for each of its points.
Wrapping in a save-graphics-state/restore-graphics-state envelope
When you draw an XObject, graphics state changes in that XObject don't change your current graphics state. To make sure this still is true after you copied the XObject instructions into your page content stream, you have to wrap that block into a save-graphics-state/restore-graphics-state envelope (q ... Q). You can do that by adding these two lines
tokens1.add(ind++, Operator.getOperator("q"));
tokens1.add(ind, Operator.getOperator("Q"));
right before your instruction copying loop
for (int k=0; k< tokens3.size(); k++) {
...
}
Coordinate system:
You assume the coordinate system in the XObject equals that of the page. It doesn't necessarily. XObjects may have a Matrix entry denoting the transformation to apply.
tokens1.add(k-1, Operator.getOperator("q"));
if(((PDFormXObject) xObject).getMatrix() != null) {
tokens1.add(k, new COSFloat(((PDFormXObject) xObject).getMatrix().getScaleX()));
tokens1.add(k + 1, new COSFloat(((PDFormXObject) xObject).getMatrix().getShearY()));
tokens1.add(k + 2, new COSFloat(((PDFormXObject) xObject).getMatrix().getShearX()));
tokens1.add(k + 3, new COSFloat(((PDFormXObject) xObject).getMatrix().getScaleY()));
tokens1.add(k + 4, new COSFloat(((PDFormXObject) xObject).getMatrix().getTranslateX()));
tokens1.add(k + 5, new COSFloat(((PDFormXObject) xObject).getMatrix().getTranslateY()));
tokens1.add(k + 6, Operator.getOperator("cm"));
tokens1.add(k+7, Operator.getOperator("Q"));
ind =k+7;
}
Boundary box:
You don't limit the area of what is drawn by the XObject instructions. But XObjects have a BBox entry denoting the box to clip the outputs to.
if (((PDFormXObject) xObject).getBBox() != null) {
    // "x y w h re W n" appends the BBox rectangle as a path, makes it the clipping path (W)
    // and ends the path without painting it (n).
    // NOTE(review): the BBox is given in the form's coordinate system, so in the final token
    // list these instructions must come after the form's "cm" and before the form content;
    // check the value of k against the matrix insertion when combining the snippets.
    tokens1.add(k, new COSFloat(((PDFormXObject) xObject).getBBox().getLowerLeftX()));
    tokens1.add(k + 1, new COSFloat(((PDFormXObject) xObject).getBBox().getLowerLeftY()));
    tokens1.add(k + 2, new COSFloat(((PDFormXObject) xObject).getBBox().getWidth()));
    tokens1.add(k + 3, new COSFloat(((PDFormXObject) xObject).getBBox().getHeight()));
    tokens1.add(k + 4, Operator.getOperator("re"));
    tokens1.add(k + 5, Operator.getOperator("W"));
    tokens1.add(k + 6, Operator.getOperator("n"));
}
Optional content
XObjects may also have an OC entry denoting their optional content membership. Such a membership needs to be transformed into an equivalent optional content tagging.
//How can I get this oc property from xobject and how can I use it?
Marked content, structure tree
//For now there is no any marked content. Assume every pdf is not Tagged.
Grouping
XObjects may contain a Group entry indicating that its content shall be treated as a group. In particular in case of Transparency Groups this results in a different behavior of transparency related features than for the same instructions copied into the page content.
Unless you completely analyze the effects of each bit of content drawn with some transparency and from case to case rewrite the instructions drawing it, copying the instructions from the XObject to the page content stream will result in substantial differences in the displayed content.
if ((PDFormXObject) xObject).getGroup() != null
//if this is not null how to use this?
Usage
Your code assumes that a XObject is used exactly once in the page content streams. This need not be the case, it can also be used more often or not at all.
//For this I will iterate my main content stream and replace all formxobject's.
References
I am adding snippet to copy all references.
Reading references
// Collect the resources of the inlined form XObject.
// NOTE(review): properties and shadings are registered under a new name ("<name>_Fm<n>");
// the operands that refer to them in the copied form content must be renamed the same way.
PDResources formResources = ((PDFormXObject) xObject).getResources();
if (formResources != null) {
    for (COSName colorname : formResources.getColorSpaceNames()) {
        colordict.setItem(colorname, formResources.getColorSpace(colorname));
    }
    for (COSName propertyName : formResources.getPropertiesNames()) {
        COSName new_name = COSName.getPDFName(propertyName.getName() + "_Fm" + img_count);
        PDPropertyList property = formResources.getProperties(propertyName);
        property.getCOSObject().setItem(COSName.NAME, new_name);
        propertiesdict.setItem(new_name, property);
    }
    for (COSName shadeName : formResources.getShadingNames()) {
        COSName new_name = COSName.getPDFName(shadeName.getName() + "_Fm" + img_count);
        PDShading shade = formResources.getShading(shadeName);
        // Shadings belong into the page's Shading resources, not into the font dictionary;
        // put() adds the entry to the existing page-level Shading dictionary.
        resources.put(new_name, shade);
    }
    for (COSName fontName : formResources.getFontNames()) {
        PDFont font = formResources.getFont(fontName);
        font.getCOSObject().setItem(COSName.NAME, fontName);
        fntdict.setItem(fontName, font);
    }
    for (COSName ExtGSName : formResources.getExtGStateNames()) {
        PDExtendedGraphicsState ExtGState = formResources.getExtGState(ExtGSName);
        ExtGState.getCOSObject().setItem(COSName.NAME, ExtGSName);
        extgsdict.setItem(ExtGSName, ExtGState);
    }
}
imgdict.setItem(xObjectName, xObject);
if (formResources != null) {
    for (COSName Imgname : formResources.getXObjectNames()) {
        PDXObject img = formResources.getXObject(Imgname);
        if (img != null) {
            // The name belongs on the nested XObject itself, not on the enclosing form.
            img.getCOSObject().setItem(COSName.NAME, Imgname);
        }
        imgdict.setItem(Imgname, img);
    }
    for (COSName paternname : formResources.getPatternNames()) {
        PDAbstractPattern pat = formResources.getPattern(paternname);
        pat.getCOSObject().setItem(COSName.NAME, paternname);
        pattern.setItem(paternname, pat);
    }
}
// Later: merge the existing page-level resources (they win over same-named form resources)
// and store everything on the page.
for (COSName fontName : new_resources.getFontNames()) {
    fntdict.setItem(fontName, new_resources.getFont(fontName));
}
for (COSName ExtGSName : new_resources.getExtGStateNames()) {
    extgsdict.setItem(ExtGSName, new_resources.getExtGState(ExtGSName));
}
for (COSName colorname : new_resources.getColorSpaceNames()) {
    colordict.setItem(colorname, new_resources.getColorSpace(colorname));
}
for (COSName patern : new_resources.getPatternNames()) {
    pattern.setItem(patern, new_resources.getPattern(patern));
}
// Keep the page's own property lists; otherwise replacing /Properties below would drop them.
for (COSName propertyName : new_resources.getPropertiesNames()) {
    propertiesdict.setItem(propertyName, new_resources.getProperties(propertyName));
}
resources.getCOSObject().setItem(COSName.EXT_G_STATE, extgsdict);
resources.getCOSObject().setItem(COSName.FONT, fntdict);
resources.getCOSObject().setItem(COSName.XOBJECT, imgdict);
resources.getCOSObject().setItem(COSName.COLORSPACE, colordict);
resources.getCOSObject().setItem(COSName.PATTERN, pattern);
resources.getCOSObject().setItem(COSName.PROPERTIES, propertiesdict);
// Shadings were already added to the page resources with resources.put(...) above.
writer.writeTokens(tokens1);
out.close();
document.getPage(pg_ind).setContents(newContents);
document.getPage(pg_ind).setResources(resources);
#mkl please correct me here to give complete solution. I will try hard to make this work Thanks in advance.

Related

PDFBox in Java trying to Edit Pdf with existing acroForms but values are hidden untill i press on them

I am using PDFBox to process a document that was generated by a NestJS service using pdf-lib (the fields are created with form.createTextField(field.id)). The document is then sent to Java so that I can put a signature box on top of it and fill in the form fields. The fields get filled, and everything works in the PDF.js viewer:
I can see the fields and their values there. But when I open the PDF file in Google Chrome I don't see the values at all, and in Adobe Reader I don't see a value until I click on its field.
here is my java code
/**
 * Loads the document, fills the given form fields, adds a signature dictionary (optionally
 * with a visible signature) and computes the Base64-encoded hash that has to be signed
 * externally.
 *
 * <p>Form filling is best effort: a failure there is logged and signing preparation continues.
 *
 * @param digestAlgorithm algorithm used to hash the content to sign
 * @param signatureType   kind of signature to prepare (timestamp or regular signature)
 * @param userData        name, reason, location and contact info for the signature
 * @param formFields      field ids and the values to set on them
 * @throws IOException              if the document cannot be read or saved
 * @throws NoSuchAlgorithmException if the digest algorithm is not available
 */
public void prepareForSigning(DigestAlgorithm digestAlgorithm,
                              SignatureType signatureType,
                              UserData userData, List<FieldInput> formFields) throws IOException, NoSuchAlgorithmException {
    this.digestAlgorithm = digestAlgorithm;
    id = Utils.generateDocumentId();
    pdDocument = PDDocument.load(contentIn);
    int accessPermissions = getDocumentPermissions();
    if (accessPermissions == 1) {
        throw new AisClientException("Cannot sign document [" + name + "]. Document contains a certification " +
                                     "that does not allow any changes.");
    }
    // add fields
    // get the document catalog
    try {
        PDAcroForm acroForm = pdDocument.getDocumentCatalog().getAcroForm();
        acroForm.setSignaturesExist(true);
        acroForm.setAppendOnly(true);
        acroForm.getCOSObject().setDirect(true);
        acroForm.getCOSObject().setNeedToBeUpdated(true);
        // acroForm.setNeedAppearances(true);
        COSObject pdfFields = acroForm.getCOSObject().getCOSObject(COSName.FIELDS);
        if (pdfFields != null) {
            pdfFields.setNeedToBeUpdated(true);
        }
        for (FieldInput input : formFields) {
            PDField field = acroForm.getField(input.id);
            if (field == null) {
                continue;
            }
            // A button (checkbox) is switched on with "Yes" when the submitted value is "true".
            // Strings are compared with equals(); the constant comes first so that a null
            // value does not throw.
            if ("Btn".equals(field.getFieldType()) && "true".equals(input.value)) {
                field.setValue("Yes");
            } else {
                field.setValue(input.value);
            }
            field.setReadOnly(true);
            field.getCOSObject().setNeedToBeUpdated(true);
            // Mark the appearance of every widget; a field without a widget or a widget
            // without an appearance must not abort the filling of the remaining fields.
            for (int w = 0; w < field.getWidgets().size(); w++) {
                if (field.getWidgets().get(w).getAppearance() != null) {
                    field.getWidgets().get(w).getAppearance().getCOSObject().setNeedToBeUpdated(true);
                }
            }
            Log.info("set field: " + field.getFullyQualifiedName() + " to " + input.value);
        }
        pdDocument.getDocumentCatalog().getCOSObject().setNeedToBeUpdated(true);
    } catch (Exception e) {
        // Deliberately best effort: the document is still prepared for signing.
        Log.warn(e);
    }
    PDSignature pdSignature = new PDSignature();
    Calendar signDate = Calendar.getInstance();
    if (signatureType == SignatureType.TIMESTAMP) {
        // Now, according to ETSI TS 102 778-4, annex A.2, the type of a Dictionary that
        // holds document timestamp should be DocTimeStamp
        // However, adding this (as of Feb/17/2021), it trips the ETSI Conformance
        // Checked online tool, making it say
        // "There is no signature dictionary in the document". So, for now (Feb/17/2021)
        // this has been removed. This makes the
        // ETSI Conformance Checker happy.
        // pdSignature.setType(COSName.DOC_TIME_STAMP);
        pdSignature.setFilter(PDSignature.FILTER_ADOBE_PPKLITE);
        pdSignature.setSubFilter(COSName.getPDFName("ETSI.RFC3161"));
    } else {
        pdSignature.setFilter(PDSignature.FILTER_ADOBE_PPKLITE);
        pdSignature.setSubFilter(PDSignature.SUBFILTER_ETSI_CADES_DETACHED);
        // Add 3 Minutes to move signing time within the OnDemand Certificate Validity
        // This is only relevant in case the signature does not include a timestamp
        // See section 5.8.5.1 of the Reference Guide
        signDate.add(Calendar.MINUTE, 3);
    }
    pdSignature.setSignDate(signDate);
    pdSignature.setName(userData.getSignatureName());
    pdSignature.setReason(userData.getSignatureReason());
    pdSignature.setLocation(userData.getSignatureLocation());
    pdSignature.setContactInfo(userData.getSignatureContactInfo());
    // The options are released even when adding the signature or saving fails.
    try (SignatureOptions options = new SignatureOptions()) {
        options.setPreferredSignatureSize(signatureType.getEstimatedSignatureSizeInBytes());
        // create a visible signature at the specified coordinates
        if (signatureDefinition != null) {
            Rectangle2D humanRect = new Rectangle2D.Float(signatureDefinition.getX(),
                                                          signatureDefinition.getY(),
                                                          signatureDefinition.getWidth(),
                                                          signatureDefinition.getHeight());
            PDRectangle rect = createSignatureRectangle(pdDocument, humanRect);
            options.setVisualSignature(
                createVisualSignatureTemplate(pdDocument, signatureDefinition.getPage(),
                                              signatureDefinition.getImage(), rect, pdSignature));
            options.setPage(signatureDefinition.getPage());
        }
        pdDocument.addSignature(pdSignature, options);
        // Set this signature's access permissions level to 0, to ensure we just sign
        // the PDF, not certify it
        // for more details:
        // https://wwwimages2.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
        // see section 12.7.4.5
        setPermissionsForSignatureOnly();
        pbSigningSupport = pdDocument.saveIncrementalForExternalSigning(inMemoryStream);
        MessageDigest digest = MessageDigest.getInstance(digestAlgorithm.getDigestAlgorithm());
        byte[] contentToSign = IOUtils.toByteArray(pbSigningSupport.getContent());
        byte[] hashToSign = digest.digest(contentToSign);
        base64HashToSign = Base64.getEncoder().encodeToString(hashToSign);
    }
}
now the field with value 5 is appearing because i already clicked on it which is on focus() mode
adobe reader
When I set acroForm.setNeedAppearances(true) I can see the values, but then the signature field is not there. Am I missing something in my code?
I expect the values to appear in Google Chrome and Adobe Reader without having to click on the fields.
Picture of the pdf fields without values with one field being focused on
PDF SAMLE FILE

How do I ADD bullet points to a word document using Apache POI in Java

I have a word document which is used as a template. Inside this template I have some tables that contain predefined bullet points. Now I'm trying to replace the placeholder string with a set of strings.
I'm totally stuck on this. My simplified methods looks like this.
replaceKeyValue.put("[DescriptionOfItem]", new HashSet<>(Collections.singletonList("This is the description")));
replaceKeyValue.put("[AllowedEntities]", new HashSet<>(Arrays.asList("a", "b")));
replaceKeyValue.put("[OptionalEntities]", new HashSet<>(Arrays.asList("c", "d")));
replaceKeyValue.put("[NotAllowedEntities]", new HashSet<>(Arrays.asList("e", "f")));
try (XWPFDocument template = new XWPFDocument(OPCPackage.open(file))) {
template.getTables().forEach(
xwpfTable -> xwpfTable.getRows().forEach(
xwpfTableRow -> xwpfTableRow.getTableCells().forEach(
xwpfTableCell -> replaceInCell(replaceKeyValue, xwpfTableCell)
)
));
ByteArrayOutputStream baos = new ByteArrayOutputStream();
template.write(baos);
return new ByteArrayResource(baos.toByteArray());
} finally {
if (file.exists()) {
file.delete();
}
}
private void replaceInCell(Map<String, Set<String>> replacementsKeyValuePairs, XWPFTableCell xwpfTableCell) {
for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) {
for (Map.Entry<String, Set<String>> replPair : replacementsKeyValuePairs.entrySet()) {
String keyToFind = replPair.getKey();
Set<String> replacementStrings = replacementsKeyValuePairs.get(keyToFind);
if (xwpfParagraph.getText().contains(keyToFind)) {
replacementStrings.forEach(replacementString -> {
XWPFParagraph paragraph = xwpfTableCell.addParagraph();
XWPFRun run = paragraph.createRun();
run.setText(replacementString);
});
}
}
}
I was expecting that some more bullet points will be added to the current cell. Am I missing something? The paragraph is the one containing the placeholder string and format.
Thanks for any help!
UPDATE: This is how part of the template looks like. I would like to automatically search for the terms and replace them. Searching works so far. But trying to replace the bullet points ends in an unlocatable NullPointer.
Would it be easier to use fields? I need to keep the bullet point style though.
UPDATE 2: added download link and updated the code. Seems I can't alter the paragraphs if I'm iterating through them. I get a null-pointer.
Download link: WordTemplate

Since Microsoft Word is very, very "strange" in how it divides text into different runs in its storage, such questions cannot be answered without a complete example including all the code and the Word documents in question. Generally usable code for adding content to Word documents does not seem to be possible, unless all the adding or replacing happens only in fields (form fields, content controls or mail merge fields).
So I downloaded your WordTemplate.docx which looks like so:
Then I ran the following code:
import java.io.*;
import org.apache.poi.xwpf.usermodel.*;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.apache.xmlbeans.XmlCursor;
import java.util.*;
import java.math.BigInteger;
/**
 * Demo: replaces placeholder paragraphs in the table cells of {@code WordTemplate.docx} by
 * one bullet paragraph per replacement item and writes the result to {@code Result.docx}.
 */
public class WordReadAndRewrite {

    /**
     * Replaces the text of {@code paragraph} by the first item and inserts one further
     * paragraph per remaining item directly after it, copying the numbering id, the
     * indentation and the run formatting of the template paragraph.
     *
     * @param cell      the table cell that contains {@code paragraph}
     * @param paragraph the placeholder paragraph, used as template
     * @param items     the texts to place, in the iteration order of the set
     */
    static void addItems(XWPFTableCell cell, XWPFParagraph paragraph, Set<String> items) {
        XmlCursor cursor = null;
        XWPFRun run = null;
        CTR cTR = null; // for a deep copy of the run's low level object
        BigInteger numID = paragraph.getNumID();
        int indentationLeft = paragraph.getIndentationLeft();
        int indentationHanging = paragraph.getIndentationHanging();
        boolean first = true;
        for (String item : items) {
            if (first) {
                // keep only the first run of the template paragraph and put the item there
                for (int r = paragraph.getRuns().size() - 1; r > 0; r--) {
                    paragraph.removeRun(r);
                }
                run = (paragraph.getRuns().size() > 0) ? paragraph.getRuns().get(0) : null;
                if (run == null) {
                    run = paragraph.createRun();
                }
                run.setText(item, 0);
                cTR = (CTR) run.getCTR().copy(); // take a deep copy of the run's low level object
                first = false;
            } else {
                // NOTE(review): the cursor is never released; consider disposing/closing it,
                // depending on the XMLBeans version in use.
                cursor = paragraph.getCTP().newCursor();
                // move cursor to next paragraph because the new paragraph shall be
                // **after** the current one; false if there is no next paragraph
                boolean thereWasParagraphAfter = cursor.toNextSibling();
                if (thereWasParagraphAfter) {
                    paragraph = cell.insertNewParagraph(cursor); // insert before the next paragraph
                } else {
                    paragraph = cell.addParagraph(); // no paragraph follows: append to the cell
                }
                paragraph.setNumID(numID); // set template paragraph's numbering Id
                // -1 means "not set" on the template; do not copy it as a real indentation
                if (indentationLeft != -1) {
                    paragraph.setIndentationLeft(indentationLeft);
                }
                if (indentationHanging != -1) {
                    paragraph.setIndentationHanging(indentationHanging);
                }
                run = paragraph.createRun();
                if (cTR != null) {
                    run.getCTR().set(cTR); // set template paragraph's run formatting
                }
                run.setText(item, 0);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        // LinkedHashSet keeps the items in the order given, so the bullets come out in a
        // predictable order.
        Map<String, Set<String>> replaceKeyValue = new HashMap<String, Set<String>>();
        replaceKeyValue.put("[AllowedEntities]", new LinkedHashSet<>(Arrays.asList("allowed 1", "allowed 2", "allowed 3")));
        replaceKeyValue.put("[OptionalEntities]", new LinkedHashSet<>(Arrays.asList("optional 1", "optional 2", "optional 3")));
        replaceKeyValue.put("[NotAllowedEntities]", new LinkedHashSet<>(Arrays.asList("not allowed 1", "not allowed 2", "not allowed 3")));

        // Input stream, document and output stream are closed even if processing fails.
        try (FileInputStream in = new FileInputStream("WordTemplate.docx");
             XWPFDocument document = new XWPFDocument(in)) {
            for (XWPFTable table : document.getTables()) {
                for (XWPFTableRow row : table.getRows()) {
                    for (XWPFTableCell cell : row.getTableCells()) {
                        // Iterate over a snapshot: addItems inserts paragraphs into the cell,
                        // which would otherwise shift later placeholder paragraphs out of reach.
                        List<XWPFParagraph> originalParagraphs = new ArrayList<>(cell.getParagraphs());
                        for (XWPFParagraph paragraph : originalParagraphs) {
                            // this is the tricky part to get really the correct placeholder
                            String placeholder = paragraph.getText().trim();
                            Set<String> items = replaceKeyValue.get(placeholder);
                            if (items != null) {
                                addItems(cell, paragraph, items);
                            }
                        }
                    }
                }
            }
            try (FileOutputStream out = new FileOutputStream("Result.docx")) {
                document.write(out);
            }
        }
    }
}
The Result.docx looks like so:
The code loops through the table cells in the Word document and looks for a paragraph which contains exactly the placeholder. This might even be the tricky part, since that placeholder might be split into different text runs by Word. If found, it runs a method addItems which takes the found paragraph as a template for numbering and indentation (this might be incomplete, though). Then it sets the first new item in the first text run of the found paragraph and removes all other text runs which are possibly there. Then it determines whether new paragraphs must be inserted or added to the cell. For this an XmlCursor is used. In the newly inserted or added paragraphs the other items are placed, and the numbering and indentation settings are taken from the placeholder's paragraph.
As said, this is code for showing the principles of how to do it. It would have to be extended considerably to be generally usable. In my opinion, those attempts at using text placeholders in Word documents for text replacements are not really good. Placeholders for variable text in Word documents should be fields. These could be form fields, content controls or mail merge fields. The advantage of fields over text placeholders is that Word knows that fields are entities for variable text. It will not split them into multiple text runs for multiple strange reasons, as it often does with normal text.

Why is a PDF that contains only one field around 500 KB?

Here you can download a PDF with a single AcroForm field; its size is exactly 427 KB.
If I remove this single field, the file is only 3 KB. Why does this happen?
I tried analysing it using PDF Debugger and nothing seems weird to me.

There's an embedded "Arial" font in the acroform default resources, see Root/AcroForm/DR/Font/Arial/FontDescriptor/FontFile2.
Either you or whoever created the pdf added it for no reason. The font is not used / referenced. For the acroform default resources you could check the /DA entry (default appearance) of each field whether it contains the font name.
When you removed the field somehow you also removed the font from the acroForm default resources. (You didn't write how you removed it)
Here's some code to do it (null checks mostly missing):
// Purge fonts from the AcroForm default resources (/DR) that no field's default
// appearance (/DA) string refers to.
PDAcroForm acroForm = doc.getDocumentCatalog().getAcroForm();
// Guard every lookup: the document may have no AcroForm, no default resources,
// or no /Font dictionary inside them.
PDResources defaultResources = (acroForm != null) ? acroForm.getDefaultResources() : null;
Object fontEntry = (defaultResources != null)
        ? defaultResources.getCOSObject().getDictionaryObject(COSName.FONT)
        : null;
COSDictionary fontDict = (fontEntry instanceof COSDictionary) ? (COSDictionary) fontEntry : null;
List<String> defaultAppearances = new ArrayList<>();
List<COSName> fontDeletionList = new ArrayList<>();
if (acroForm != null)
{
    // Collect the /DA string of every variable-text field (entries may be null).
    for (PDField field : acroForm.getFieldTree())
    {
        if (field instanceof PDVariableText)
        {
            PDVariableText vtField = (PDVariableText) field;
            defaultAppearances.add(vtField.getDefaultAppearance());
        }
    }
}
if (fontDict != null)
{
    for (COSName fontName : defaultResources.getFontNames())
    {
        if (COSName.HELV.equals(fontName) || COSName.ZA_DB.equals(fontName))
        {
            // Adobe default, always keep
            continue;
        }
        // A font counts as used when any /DA string contains "/<fontName>".
        // This is a substring match, so it errs on the side of keeping a font.
        boolean found = false;
        for (String da : defaultAppearances)
        {
            if (da != null && da.contains("/" + fontName.getName()))
            {
                found = true;
                break;
            }
        }
        System.out.println(fontName + ": " + found);
        if (!found)
        {
            fontDeletionList.add(fontName);
        }
    }
}
System.out.println("deletion list: " + fontDeletionList);
// The list can only be non-empty when fontDict is non-null, so this loop is safe.
for (COSName fontName : fontDeletionList)
{
    fontDict.removeItem(fontName);
}
The resulting file has 5KB size now.
I haven't checked the annotations. Some of them have also a /DA string but it is unclear if the acroform default resources fonts are to be used when reconstructing a missing appearance stream.
Update:
Here's some additional code to replace Arial with Helv:
// Switch every variable-text field whose default appearance (/DA) selects the font
// "Arial" over to "Helv", then regenerate the field's appearance.
for (PDField field : acroForm.getFieldTree())
{
    if (field instanceof PDVariableText)
    {
        PDVariableText vtField = (PDVariableText) field;
        String defaultAppearance = vtField.getDefaultAppearance();
        // The /DA string may be absent (null). Also require a whitespace delimiter right
        // after "/Arial": a bare prefix test would match other names such as "/ArialMT",
        // and substring(7) would then cut into the font name and corrupt the string.
        if (defaultAppearance != null
                && defaultAppearance.startsWith("/Arial")
                && defaultAppearance.length() > 6
                && Character.isWhitespace(defaultAppearance.charAt(6)))
        {
            // substring(7) drops "/Arial" plus the single delimiter character after it
            vtField.setDefaultAppearance("/Helv " + defaultAppearance.substring(7));
            vtField.getWidgets().get(0).setAppearance(null); // this removes the font usage
            // re-setting the current value makes the appearance get rebuilt with the new /DA
            vtField.setValue(vtField.getValueAsString());
        }
        defaultAppearances.add(vtField.getDefaultAppearance());
    }
}
Note that this may not be a good idea, because the standard 14 fonts have only limited characters. Try
vtField.setValue("Ayşe");
and you'll get an exception.
More general code to replace font can be found in this answer.

Itext 7 Split Paragraph

How can I split a given paragraph into two paragraphs when it fits only partially into a canvas? After the split, I would like to add the first part to the canvas and the second part to a new canvas.
// Attempt from the question: lay out paragraph p inside rectangle on page pageNum and, when it
// does not fit completely, add the fitting part to the canvas and return the remainder.
// Returns null when the layout status is FULL or when no two-part split was produced.
// NOTE(review): `result` is used without a visible declaration and the split step is still a
// `?????` placeholder, so this method does not compile as shown.
public Paragraph addParagraphToPage(PdfDocument pdfDocument, int pageNum, Rectangle rectangle, Paragraph p)
{
PdfPage page = pdfDocument.getPage(pageNum);
// Canvas bound to a new content stream appended after the page's existing content
PdfCanvas pdfCanvas = new PdfCanvas(page.newContentStreamAfter(), page.getResources(), pdfDocument);
Canvas canvas = new Canvas(pdfCanvas, pdfDocument, rectangle);
// Lay the paragraph's renderer out against the target rectangle
ParagraphRenderer currentRenderer = (ParagraphRenderer) p.createRendererSubTree();
currentRenderer.setParent(canvas.getRenderer());
result = currentRenderer.layout(new LayoutContext(new LayoutArea(pageNum, rectangle)));
ArrayList<Paragraph> paragraphs = new ArrayList<Paragraph>();
if (result.getStatus() != LayoutResult.FULL)
{
// Open point of the question: how to obtain the two partial paragraphs
paragraphs = ????? // getNextParagraph(paragraphs, result, pageNum, rectangle, canvas);
if(paragraphs.size() == 2)
{
// first part goes onto this canvas, second part is handed back to the caller
canvas.add( paragraphs.get(0));
return paragraphs.get(1);
}
}
return null;
}

Your approach is correct in general and layout in iText7 is flexible enough to allow you to do required thing in an easy manner. The only thing I see that is not very clear is that Paragraph is actually an element that cannot split itself and no classes in layout framework facilitate element splitting. You could do it manually, but there is no need to. Instead you should work with IRenderer, and ParagraphRenderer in particular, directly.
IRenderer can split itself as a result of layout operation and represents the necessary portion of data only compared to the Paragraph which contains full data.
You can add an IRenderer to the CanvasRenderer:
canvas.getRenderer().addChild(rendererToAdd.setParent(canvas.getRenderer()));
And you can access the partial renderers (the portion that fit the passed area and overflow part) from LayoutResult#getSplitRenderer() and LayoutResult#getOverflowRenderer().
In general, your code can be adapted as follows:
/**
 * Lays {@code renderer} out inside {@code rectangle} on page {@code pageNum} and adds the part
 * that fits to a canvas created over a new content stream appended to that page.
 *
 * @param pdfDocument document that owns the page
 * @param pageNum     number of the page to draw on
 * @param rectangle   area the content has to fit into
 * @param renderer    renderer holding the content that is still to be placed
 * @return the overflow renderer with the content that did not fit, or {@code null} when the
 *         layout status was {@code LayoutResult.FULL}
 */
public ParagraphRenderer addParagraphToPage(PdfDocument pdfDocument, int pageNum, Rectangle rectangle, ParagraphRenderer renderer) {
    PdfPage page = pdfDocument.getPage(pageNum);
    PdfCanvas pdfCanvas = new PdfCanvas(page.newContentStreamAfter(), page.getResources(), pdfDocument);
    Canvas canvas = new Canvas(pdfCanvas, pdfDocument, rectangle);
    renderer.setParent(canvas.getRenderer());
    LayoutResult result = renderer.layout(new LayoutContext(new LayoutArea(pageNum, rectangle)));
    boolean placedCompletely = result.getStatus() == LayoutResult.FULL;
    IRenderer rendererToAdd = placedCompletely ? renderer : result.getSplitRenderer();
    // The split renderer may be null when nothing at all fit into the area
    // (layout status NOTHING); in that case there is nothing to draw on this page.
    if (rendererToAdd != null) {
        canvas.getRenderer().addChild(rendererToAdd.setParent(canvas.getRenderer()));
    }
    return placedCompletely ? null : (ParagraphRenderer) result.getOverflowRenderer();
}
And then for adding paragraph to sequential pages until all the content is placed you basically need only two lines of code:
// Start with a renderer created from the whole paragraph p.
ParagraphRenderer renderer = (ParagraphRenderer) p.createRendererSubTree();
// Intentionally empty loop body: each call processes one page (pageNum is post-incremented) and
// its return value becomes the renderer for the next call; the loop ends once null is returned.
while ((renderer = addParagraphToPage(pdfDocument, pageNum++, rectangle, renderer)) != null);

Discover titles/paragraphs in word docs

I'm trying to discover paragraphs/titles in word documents.
I use Apache POI to do this.
An example that I use is:
// Scan the document character by character and collect every stretch of text that is
// bold, italic or underlined as a candidate title.
fs = new POIFSFileSystem(new FileInputStream(filesname));
HWPFDocument doc = new HWPFDocument(fs);
WordExtractor we = new WordExtractor(doc);
ArrayList<String> titles = new ArrayList<String>();
try {
    // Extract the text once; the loop bound does not change while scanning.
    int lastIndex = we.getText().length() - 1;
    for (int i = 0; i < lastIndex; i++) {
        int startIndex = i;
        int endIndex = i + 1;
        Range range = new Range(startIndex, endIndex, doc);
        CharacterRun cr = range.getCharacterRun(0);
        if (cr.isBold() || cr.isItalic() || cr.getUnderlineCode() != 0) {
            // Advance until the formatting ends; i moves along so the outer loop
            // continues after the styled stretch.
            while (cr.isBold() || cr.isItalic() || cr.getUnderlineCode() != 0) {
                i++;
                endIndex += 1;
                range = new Range(endIndex, endIndex + 1, doc);
                cr = range.getCharacterRun(0);
            }
            range = new Range(startIndex, endIndex - 1, doc);
            titles.add(range.text());
        }
    }
}
catch (IndexOutOfBoundsException iobe) {
    //sometimes this happens have to find out why.
    // NOTE(review): the inner while loop has no upper bound, so it presumably runs past the
    // end of the document when styled text reaches the end - confirm. A stretch that is
    // still being scanned at that point is not added to titles.
}
This works for all bold, italic or underlined text.
But what I want is to discover the font that is used most often, and then to discover variations compared to that font style. Does anyone have an idea?

Well, some thoughts would be to try some of the following:
cr.getFontSize() could be used at the beginning of a paragraph to see if the range changes font size. That in conjunction with bold, italic or underlined would be a good identifier.
cr.getFontName() could also be used to determine when and where the font changes in a given range.
cr.getColor() would be another possibility to help identify if the user is using different colors for a font.
I guess I would iterate over the range and create multiple CharacterRun items each time the text characteristics change. Then evaluate each item based on position in the paragraph as well as all of the afore-mentioned characteristics (size, color, name, bold, italics, etc.). Perhaps create some sort of weighting scale based on the most common values.
It might also be of value to create a Title object and store the values for each set of characteristics to help optimize searches in later character runs in the same document.

You might want to take a look at the buildParagraphTagAndStyle method in Tika's WordExtractor:
https://svn.apache.org/repos/asf/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
For HWPF (.doc), to call it you'd do:
// Resolve the paragraph's style description, then map its name to a tag/style pair.
StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
// second argument: true when the paragraph sits at a table nesting level above zero
boolean insideTable = parentTableLevel > 0;
TagAndStyle tas = buildParagraphTagAndStyle(style.getName(), insideTable);
For XWPF (.docx) you'd do:
// Resolve the paragraph's style by its ID, then map the style name to a tag/style pair.
XWPFStyle style = styles.getStyle(paragraph.getStyleID());
// second argument: true when the paragraph's containing part is a table cell
boolean insideTableCell = paragraph.getPartType() == BodyType.TABLECELL;
TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(style.getName(), insideTableCell);

It will be easier if you process the data by converting it into paragraphs.
// Create a text extractor for doc and fetch its text as a String array
// (per the surrounding explanation, one entry per paragraph).
WordExtractor we = new WordExtractor(doc);
String[] para = we.getParagraphText();
Then work paragraph wise. If your code already couldn't figure out the titles, then you can check for bold and underlines in each paragraph.
The paragraphs function as follows:
for(int i=0;i<para.length;i++)
{
System.out.println("Length of paragraph "+(i+1)+": "+ para[i].length());
System.out.println(para[i].toString());
}
A working example can be found here:
http://sanjaal.com/java/120/java-file/how-to-read-doc-file-using-java-and-apache-poi/#comments

We Keep Coding

Java is a programming language and computing platform first released by Sun Microsystems in 1995.