Read acrofields with variable page size within document with iText - java

I am using iText to add and read AcroFields, but I run into an issue when the page size varies within a document.
For example, a PDF document with 3 pages: letter, legal, letter.
In that case it is unable to get all the AcroFields. If all pages are legal or all pages are letter, it works perfectly.
Here is the code I use to read the AcroFields:
// Collects one AcrofieldModel per form field: name, current value, the lower-left
// corner of the field's first rectangle, and that rectangle's width.
String pdf = "D:\\1350211.pdf";
PdfReader reader = new PdfReader(pdf);
List<AcrofieldModel> lists = new ArrayList<>();
try {
    AcroFields fields = reader.getAcroFields();
    for (String fldName : fields.getFields().keySet()) {
        List<FieldPosition> positions = fields.getFieldPositions(fldName);
        // The original called get(0) unconditionally; a field for which no position
        // is reported (null or empty list) would abort the whole scan. Skip it instead.
        if (positions == null || positions.isEmpty()) {
            continue;
        }
        // Only the first reported rectangle of the field is used.
        FieldPosition first = positions.get(0);
        float lowerLeftX = first.position.getLeft();
        float lowerLeftY = first.position.getBottom();
        float upperRightX = first.position.getRight();
        float fieldLength = Math.abs(upperRightX - lowerLeftX);
        lists.add(new AcrofieldModel(fldName, fields.getField(fldName),
                "(X:" + lowerLeftX + " , Y:" + lowerLeftY + ") ", fieldLength));
    }
} finally {
    // Release the reader even if a field lookup throws.
    reader.close();
}
return lists;

Related

itext 7 pdf how to prevent text overflow on right side of the page

I am using itextpdf 7 (7.2.0) to create a pdf file. However even though the TOC part is rendered very well, in the content part the text overflows. Here is my code that generates the pdf:
public class Main {
/**
 * Writes "fiftyfourthPdf.pdf": 5000 generated paragraphs (each with a named
 * destination and an outline entry), a page-number footer on every content page,
 * a table of contents appended at the end, and finally moves the table-of-contents
 * pages to the front and assigns page labels.
 *
 * @param args unused
 * @throws IOException declared by the method; raised by the iText calls below
 */
public static void main(String[] args) throws IOException {
PdfWriter writer = new PdfWriter("fiftyfourthPdf.pdf");
PdfDocument pdf = new PdfDocument(writer);
// Third argument is the immediateFlush flag; false keeps added elements unflushed
// until document.flush()/close() is called further down.
Document document = new Document(pdf, PageSize.A4,false);
//document.setMargins(30,10,36,10);
// Create a PdfFont
PdfFont font = PdfFontFactory.createFont(StandardFonts.TIMES_ROMAN,"Cp1254");
// Document-wide defaults; individual paragraphs override them below.
document
.setTextAlignment(TextAlignment.JUSTIFIED)
.setFont(font)
.setFontSize(11);
PdfOutline outline = null;
// One entry per paragraph: destination name -> (paragraph text, page count when it was added).
java.util.List<AbstractMap.SimpleEntry<String, AbstractMap.SimpleEntry<String, Integer>>> toc = new ArrayList<>();
for(int i=0;i<5000;i++){
String line = "This is paragraph " + String.valueOf(i+1)+ " ";
// Repeat the sentence six times so the paragraph spans more than one line.
line = line.concat(line).concat(line).concat(line).concat(line).concat(line);
Paragraph p = new Paragraph(line);
p.setKeepTogether(true);
// The paragraph is added with font size 10.
document.add(p.setFont(font).setFontSize(10).setHorizontalAlignment(HorizontalAlignment.CENTER).setTextAlignment(TextAlignment.LEFT));
//PROCESS FOR TOC
String name = "para " + String.valueOf(i+1);
outline = createOutline(outline,pdf,line ,name );
// Page number recorded for the TOC is the document's page count at this moment.
AbstractMap.SimpleEntry<String, Integer> titlePage = new AbstractMap.SimpleEntry(line, pdf.getNumberOfPages());
// NOTE(review): p was already added above at font size 10; setting 12 here, after add(),
// looks like the source of a layout/draw mismatch (text overflowing the line) - confirm.
p
.setFont(font)
.setFontSize(12)
//.setKeepWithNext(true)
.setDestination(name)
// Install the custom renderer defined in this class
.setNextRenderer(new UpdatePageRenderer(p));
toc.add(new AbstractMap.SimpleEntry(name, titlePage));
}
// Stamp "Sayfa i / n" on every content page, right-aligned at (559, 26).
int contentPageNumber = pdf.getNumberOfPages();
for (int i = 1; i <= contentPageNumber; i++) {
// Write aligned text to the specified by parameters point
document.showTextAligned(new Paragraph(String.format("Sayfa %s / %s", i, contentPageNumber)).setFontSize(10),
559, 26, i, TextAlignment.RIGHT, VerticalAlignment.MIDDLE, 0);
}
//BEGINNING OF TOC
document.add(new AreaBreak());
Paragraph p = new Paragraph("Table of Contents")
.setFont(font)
.setDestination("toc");
document.add(p);
// Single right-aligned tab stop at 580 with a dotted leader between title and page number.
java.util.List<TabStop> tabStops = new ArrayList<>();
tabStops.add(new TabStop(580, TabAlignment.RIGHT, new DottedLine()));
for (AbstractMap.SimpleEntry<String, AbstractMap.SimpleEntry<String, Integer>> entry : toc) {
AbstractMap.SimpleEntry<String, Integer> text = entry.getValue();
// TOC line: paragraph text, tab, page number; clicking it jumps to the named destination.
p = new Paragraph()
.addTabStops(tabStops)
.add(text.getKey())
.add(new Tab())
.add(String.valueOf(text.getValue()))
.setAction(PdfAction.createGoTo(entry.getKey()));
document.add(p);
}
// Stamp a red multi-line footer on every page (content pages and TOC pages alike).
int tocPageNumber = pdf.getNumberOfPages();
for (int i = 1; i <= tocPageNumber; i++) {
// Write aligned text to the specified by parameters point
document.showTextAligned(new Paragraph("\n footer text\n second line\nthird line").setFontColor(ColorConstants.RED).setFontSize(8),
300, 26, i, TextAlignment.CENTER, VerticalAlignment.MIDDLE, 0);
}
document.flush();
// Move the table of contents to the front: once per TOC page, the current last page
// is moved to position 1 and page 1 is given an upper-case-letter label starting at 1.
for(int z = 0; z< (tocPageNumber - contentPageNumber ); z++){
pdf.movePage(tocPageNumber,1);
pdf.getPage(1).setPageLabel(PageLabelNumberingStyle.UPPERCASE_LETTERS,
null, 1);
}
//pdf.movePage(tocPageNumber, 1);
// Add page labels
/*pdf.getPage(1).setPageLabel(PageLabelNumberingStyle.UPPERCASE_LETTERS,
null, 1);*/
// The first page after the moved TOC pages restarts decimal numbering at 1.
pdf.getPage(tocPageNumber - contentPageNumber + 1).setPageLabel(PageLabelNumberingStyle.DECIMAL_ARABIC_NUMERALS,
null, 1);
document.close();
}
/**
 * Adds an outline (bookmark) entry titled {@code title} that points at the named
 * destination {@code name}.
 *
 * @param outline the outline to add to, or {@code null} on the first call
 * @param pdf     document whose root outline is used when {@code outline} is {@code null}
 * @param title   text shown for the bookmark
 * @param name    named destination the bookmark jumps to
 * @return the newly created entry when {@code outline} was {@code null}; otherwise
 *         {@code outline} itself, with the new entry added as its child
 */
private static PdfOutline createOutline(PdfOutline outline, PdfDocument pdf, String title, String name) {
    final boolean firstCall = (outline == null);
    // First call hangs the entry off the document root; later calls off the given outline.
    PdfOutline parent = firstCall ? pdf.getOutlines(false) : outline;
    PdfOutline added = parent.addOutline(title);
    added.addDestination(PdfDestination.makeDestination(new PdfString(name)));
    return firstCall ? added : outline;
}
private static class UpdatePageRenderer extends ParagraphRenderer {
protected AbstractMap.SimpleEntry<String, Integer> entry;
public UpdatePageRenderer(Paragraph modelElement, AbstractMap.SimpleEntry<String, Integer> entry) {
super(modelElement);
this.entry = entry;
}
public UpdatePageRenderer(Paragraph modelElement) {
super(modelElement);
}
#Override
public LayoutResult layout(LayoutContext layoutContext) {
LayoutResult result = super.layout(layoutContext);
//entry.setValue(layoutContext.getArea().getPageNumber());
if (result.getStatus() != LayoutResult.FULL) {
if (null != result.getOverflowRenderer()) {
result.getOverflowRenderer().setProperty(
Property.LEADING,
result.getOverflowRenderer().getModelElement().getDefaultProperty(Property.LEADING));
} else {
// if overflow renderer is null, that could mean that the whole renderer will overflow
setProperty(
Property.LEADING,
result.getOverflowRenderer().getModelElement().getDefaultProperty(Property.LEADING));
}
}
return result;
}
#Override
// If not overriden, the default renderer will be used for the overflown part of the corresponding paragraph
public IRenderer getNextRenderer() {
return new UpdatePageRenderer((Paragraph) this.getModelElement());
}
}
}
Here are the screen shots of TOC part and content part :
TOC :
Content :
What am I missing? Thank you all for your help.
UPDATE
When I add the line below it renders with no overflow but the page margins of TOC and content part differ (the TOC margin is way more than the content margin). See the picture attached please :
document.setMargins(30,60,36,20);
Right Margin difference between TOC and content:
UPDATE 2 :
When I comment the line
document.setMargins(30,60,36,20);
and set the font size on line :
document.add(p.setFont(font).setFontSize(10).setHorizontalAlignment(HorizontalAlignment.CENTER).setTextAlignment(TextAlignment.LEFT));
to 12 then it renders fine. What difference should possibly the font size cause for the page content and margins? Are not there standard page margins and page setups? Am I unknowingly (I am newbie to itextpdf) messing some standard implementations?
TL; DR: either remove setFontSize in
p
.setFont(font)
.setFontSize(12)
//.setKeepWithNext(true)
.setDestination(name)
or change setFontSize(10) -> setFontSize(12) in
document.add(p.setFont(font).setFontSize(10).setHorizontalAlignment(HorizontalAlignment.CENTER).setTextAlignment(TextAlignment.LEFT));
Explanation: You are setting the Document to not immediately flush elements added to that document with the following line:
Document document = new Document(pdf, PageSize.A4,false);
Then you add a paragraph element with font size equal to 10 to the document with the following line:
document.add(p.setFont(font).setFontSize(10).setHorizontalAlignment(HorizontalAlignment.CENTER).setTextAlignment(TextAlignment.LEFT));
What happens is that the element is laid out (split into lines etc.), but not yet drawn on the page. Then you call .setFontSize(12), and this new font size is applied for drawing only. So iText calculated that X characters would fit into one line assuming the font size is 10, while in reality the font size is 12 and obviously fewer characters can fit into one line.
There is no sense in setting the font size two times to different values - just pick one value you want to see in the resultant document and set it once.

Accessing a COSArray for PDF fields with Apache PDFBox

I'm trying to access all form fields in a PDF file - so I can use code to fill them in - and this is as far as I've gotten:
// Walk from the document catalog to the first top-level form field and print
// every entry of that field's underlying COS dictionary.
PDDocumentCatalog pdCatalog = pdf.getDocumentCatalog();
PDAcroForm pdAcroForm = pdCatalog.getAcroForm();
List<PDField> fieldList = pdAcroForm.getFields(); // fieldList.size() = 1
PDField field = fieldList.get(0);
COSDictionary dictionary = field.getCOSObject();
System.out.println("dictionary size = " + dictionary.size());
// my attempt to iterate through fields
for (Map.Entry<COSName, COSBase> item : dictionary.entrySet()) {
    COSBase val = item.getValue();
    // Array values additionally report their length before the key/value pair.
    if (val instanceof COSArray) {
        COSArray array = (COSArray) val;
        System.out.println("COSArray size = " + array.size());
    }
    System.out.println("key = " + item.getKey());
    System.out.println("val = " + val);
}
which gives an output of:
dictionary size = 3
COSArray size = 2
key = COSName{Kids}
val = COSArray{[COSObject{110, 0}, COSObject{108, 0}]}
key = COSName{T}
val = COSString{form1[0]}
key = COSName{V}
val = COSString{}
Does anyone know how I can access the two COSObjects in the COSArray? I also don't know what the notation COSObject{x, y} means, and can't find any documentation on this. If those are dictionary or array values elements, I also want to know how to access those.
You get the object with get(index) to get the COSObject (an indirect reference) or getObject(index) to get the dereferenced object referenced by the COSObject.
COSObject{110, 0} is the object number and the generation number (usually 0). Open your PDF file with NOTEPAD++ and look for "110 0 obj" to find it, or "110 0 R" to see who references this object.

Save custom page number labels in bookmarks

In the screenshot you can see custom page number labels (i, ii, iii, vii).
How can I save bookmarks with custom page number labels using PDFBox 2.0?
My code actually looks like this:
// Create a fresh outline tree and attach it to the document catalog.
PDDocumentOutline documentOutline = new PDDocumentOutline();
document.getDocumentCatalog().setDocumentOutline(documentOutline);
// Top-level item titled after the root toc entry; it only groups the bookmarks
// that addToc places underneath it.
PDOutlineItem outline = new PDOutlineItem();
outline.setTitle(toc.getName());
documentOutline.addLast(outline);
// Populate the bookmarks before opening the nodes.
addToc(toc, outline);
// Mark the top-level item and then the outline root as open.
outline.openNode();
documentOutline.openNode();
/**
 * Appends a bookmark for {@code toc} below {@code outlineItem} and then does the
 * same, recursively, for each child of {@code toc} below the new bookmark.
 *
 * @param toc         entry supplying the bookmark title, page number and children
 * @param outlineItem parent outline item that receives the new bookmark
 */
private void addToc(Toc toc, PDOutlineItem outlineItem) {
    // The page index passed to getPage is the toc page number minus one.
    PDPageFitWidthDestination target = new PDPageFitWidthDestination();
    target.setPage(document.getPage(toc.getPageNumber() - 1));

    PDOutlineItem node = new PDOutlineItem();
    node.setDestination(target);
    node.setTitle(toc.getName());
    outlineItem.addLast(node);

    if (toc.getChildren() == null) {
        return;
    }
    for (Toc child : toc.getChildren()) {
        addToc(child, node);
    }
}
You can only label pages, not bookmarks. In the example below (with 3 empty pages), roman numbers start at 3, and then decimal at 1. The prefix for the romans is "RO ". So the pages are "RO III", "RO IV", "1".
// Three empty pages to label.
PDDocument doc = new PDDocument();
for (int i = 0; i < 3; i++) {
    doc.addPage(new PDPage());
}

// Range starting at page index 0: upper-case roman numerals, prefix "RO ", counting from 3.
PDPageLabelRange romanRange = new PDPageLabelRange();
romanRange.setStyle(PDPageLabelRange.STYLE_ROMAN_UPPER);
romanRange.setStart(3);
romanRange.setPrefix("RO ");

// Range starting at page index 2: decimal numbers, counting from 1.
PDPageLabelRange decimalRange = new PDPageLabelRange();
decimalRange.setStyle(PDPageLabelRange.STYLE_DECIMAL);
decimalRange.setStart(1);

PDPageLabels pageLabels = new PDPageLabels(doc);
pageLabels.setLabelItem(0, romanRange);
pageLabels.setLabelItem(2, decimalRange);
doc.getDocumentCatalog().setPageLabels(pageLabels);

doc.save("labels.pdf");
doc.close();

How to read word document and get parts of it with all styles using docx4j

I am using docx4j to deal with word document formatting. I have one word document which is divided in number of tables. I want to read all the tables and if I find some keywords then I want to take those contents to another word document with all the formatting. My word document is as follow.
Like from above I want to take content which is below Some Title. Here my keyword is Sample Text. So whenever Sample Text gets repeated, content needs to be fetched to new word document.
I am using following code.
// Loads the .docx named by fileName, finds every table (Tbl) in the main document
// part and walks table -> row -> cell -> paragraph -> run -> text, printing counts.
// NOTE: none of the counters is ever reset, so every printed figure is a running
// total over everything visited so far, not a per-element count.
MainDocumentPart mainDocumentPart = null;
WordprocessingMLPackage docxFile = WordprocessingMLPackage.load(new File(fileName));
mainDocumentPart = docxFile.getMainDocumentPart();
// Target package for the copied content; not used anywhere in the code shown here.
WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
// Traverse the document content with the finder; matches are read from finder.results below.
ClassFinder finder = new ClassFinder(Tbl.class);
new TraversalUtil(mainDocumentPart.getContent(), finder);
Tbl tbl = null;
int noTbls = 0;
int noRows = 0;
int noCells = 0;
int noParas = 0;
int noTexts = 0;
for (Object table : finder.results) {
noTbls++;
tbl = (Tbl) table;
// Get all the Rows in the table
// (DocxUtility is a project helper; presumably returns all descendants of the given class - confirm)
List<Object> allRows = DocxUtility.getDocxUtility()
.getAllElementFromObject(tbl, Tr.class);
for (Object row : allRows) {
Tr tr = (Tr) row;
noRows++;
// Get all the Cells in the Row
List<Object> allCells = DocxUtility.getDocxUtility()
.getAllElementFromObject(tr, Tc.class);
// Label is declared but no break/continue in the code shown refers to it.
toCell:
for (Object cell : allCells) {
Tc tc = (Tc) cell;
noCells++;
// Get all the Paragraph's in the Cell
List<Object> allParas = DocxUtility.getDocxUtility()
.getAllElementFromObject(tc, P.class);
for (Object para : allParas) {
P p = (P) para;
noParas++;
// Get all the Run's in the Paragraph
List<Object> allRuns = DocxUtility.getDocxUtility()
.getAllElementFromObject(p, R.class);
for (Object run : allRuns) {
R r = (R) run;
// Get the Text in the Run
List<Object> allText = DocxUtility.getDocxUtility()
.getAllElementFromObject(r, Text.class);
for (Object text : allText) {
noTexts++;
// txt is only cast here; its value is never read, so the keyword is not checked yet.
Text txt = (Text) text;
}
// Printed once per run (not once per paragraph), with the running text total.
System.out.println("No of Text in Para No: " + noParas + "are: " + noTexts);
}
}
System.out.println("No of Paras in Cell No: " + noCells + "are: " + noParas);
}
System.out.println("No of Cells in Row No: " + noRows + "are: " + noCells);
}
System.out.println("No of Rows in Table No: " + noTbls + "are: " + noRows);
}
System.out.println("Total no of Tables: " + noTbls );
Assuming your text is in a single run (ie not split across runs), then you can search for it via XPath. Or you can manually traverse using TraversalUtil. See docx4j's Getting Started for more info.
So finding your stuff is pretty easy. Copying the formatting it uses, and any rels in it, is in the general case, complicated. See my post http://www.docx4java.org/blog/2010/11/merging-word-documents/ for more on the issues involved.

Fetch Searched Data/Metadata In Lucene

Hi, I am a Java developer learning Lucene. I have a Java class that indexes a PDF file (lucene_in_action_2nd_edition.pdf) and a search class that searches the index for some text. IndexSearcher returns a Document, which shows whether the string exists in the index (lucene_in_action_2nd_edition.pdf) or not.
But now I want to get the matched data or metadata, i.e. I want to know on which page the string was matched, a few words of text around the matched string, etc. How can I do that?
Here is my LuceneSearcher.java class:
/**
 * Runs a fixed query against the "contents" field of the index in D:\index and
 * prints, for each hit, its rank, stored "filename" and stored "contents".
 *
 * @param args unused
 * @throws Exception if parsing the query or reading the index fails
 */
public static void main(String[] args) throws Exception {
    String querystr = "Advantages of FastVectorHighlighter";
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
    Query q = new QueryParser(Version.LUCENE_40, "contents", analyzer).parse(querystr);

    File indexDir = new File("D:\\index");
    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir));
    IndexSearcher searcher = new IndexSearcher(reader);

    // Collect at most hitsPerPage top-scoring documents.
    int hitsPerPage = 100;
    TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
    searcher.search(q, collector);

    ScoreDoc[] hits = collector.topDocs().scoreDocs;
    System.out.println("Found " + hits.length + " hits.");
    int rank = 0;
    for (ScoreDoc hit : hits) {
        rank++;
        Document d = searcher.doc(hit.doc);
        System.out.println(rank + "... " + d.get("filename"));
        System.out.println("=====================================================");
        System.out.println(d.get("contents"));
    }
    // reader can only be closed when there
    // is no need to access the documents any more.
    reader.close();
}
Here d.get("contents") give full text(generated by Tika) of .pdf file, that was stored at time of indexing.
I want some information about searched text, so that I can show that on my web page or highlight searched text properly(like google search output). How to achieve that? Do we need to write some logic or Lucene does it internally?
Any type of help would be appreciated. Thanks in advance.
The org.apache.lucene.search.highlight package provides this functionality.
Such as:
// Builds a highlighter scored against the search query and extracts, for each hit,
// the best-matching fragment of the stored "contents" text.
// query and analyzer are expected to be the Query and Analyzer used for the search.
SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
for (int i = 0; i < hits.length; i++) {
    int docId = hits[i].doc;
    Document d = searcher.doc(docId);
    // Read the stored text from d, the document fetched above (the original
    // referred to an undefined variable named doc).
    String text = d.get("contents");
    String bestFrag = highlighter.getBestFragment(analyzer, "contents", text);
    //output, however you like.
}
You can also get a list of best Fragments from the highlighter, instead of just a single one, if you prefer, see the Highlighter API

Categories