How to write a placeholder in a PDF using iText - Java

I am using iText to convert HTML to PDF, and for that I am using this code:
import java.io.FileOutputStream;
import java.io.StringReader;

import com.itextpdf.text.Document;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.html.simpleparser.HTMLWorker;
import com.itextpdf.text.pdf.PdfWriter;

public class HtmlToPDF2 {
    // itextpdf-5.4.1.jar http://sourceforge.net/projects/itext/files/iText/
    public static void main(String... args) {
        try {
            Document document = new Document(PageSize.LETTER);
            PdfWriter.getInstance(document, new FileOutputStream("testpdf1.pdf"));
            document.open();
            HTMLWorker htmlWorker = new HTMLWorker(document);
            String firstName = "<name>";
            String sign = "<sign>";
            String str = "<html>" +
                    "<body>" +
                    "<form>" +
                    "<div><strong>Dear</strong> " + firstName + ",</div><br/>" +
                    "<div>" +
                    "<p>It is informed that you are selected in your interview<br/>" +
                    "and please report on the <b>20 May</b> with all your original<br/>" +
                    "documents at our head office in Jaipur." +
                    "</p>" +
                    "</div><br/>" +
                    "<div>" +
                    "<p>Yours sincerely</p><br/>" + sign + "</div>" +
                    "</form>" +
                    "</body>" +
                    "</html>";
            htmlWorker.parse(new StringReader(str));
            document.close();
            System.out.println("Done");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
But this gives me output in which the placeholders <name> and <sign> do not show up; the desired output would display them as literal text. Is this the correct way to create a placeholder, or do I need to do anything else? If yes, then please suggest.

The < and > signs are treated as HTML tags, which is why they don't show up in your PDF.
You can define firstName and sign with escaped entities, as below:
public class HtmlToPDF2 {
    public static void main(String... args) {
        ....
        ....
        String firstName = "&lt;name&gt;";
        String sign = "&lt;sign&gt;";
        ....
        ....
    }
}
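If you have more than a couple of placeholders to protect, a small helper that escapes the angle brackets before the string is embedded keeps the markers visible. A minimal sketch; the escapePlaceholder name is mine, not part of iText or the answer above:

static String escapePlaceholder(String s) {
    // HTMLWorker parses < and > as tag delimiters, so turn them into
    // entities that render as literal angle brackets in the PDF.
    return s.replace("<", "&lt;").replace(">", "&gt;");
}

// usage inside main():
String firstName = escapePlaceholder("<name>"); // becomes "&lt;name&gt;"
String sign = escapePlaceholder("<sign>");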

Related

Java Exception not understood

I am writing search engine code in Java, and I'm getting this error without knowing the cause:
Exception in thread "main" java.lang.NullPointerException
at WriteToFile.fileWriter(WriteToFile.java:29)
at Main.main(Main.java:14)
Process finished with exit code 1
This is my code:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Scanner;

public class Search {
    private static String URL = "https://www.google.com/search?q=";
    private Document doc;
    private Elements links;
    private String html;

    public Search() throws IOException {}

    public void SearchWeb() throws IOException {
        // to get the keywords from the user
        Scanner sc = new Scanner(System.in);
        System.out.println("Please enter the keyword you want to search for: ");
        String word = sc.nextLine();
        // Search for the keyword over the net
        String url = URL + word;
        doc = Jsoup.connect(url).get();
        html = doc.html();
        Files.write(Paths.get("D:\\OOP\\OOPproj\\data.txt"), html.getBytes());
        links = doc.select("cite");
    }

    public Document getDoc() {
        return doc;
    }

    public String getHtml() {
        return html;
    }

    public Elements getLinks() {
        return links;
    }
}
And this is the WriteToFile class:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;

public class WriteToFile extends Search {
    public WriteToFile() throws IOException {}

    String description = "<!> Could not fetch description <!>";
    String keywords = "<!> Could not fetch keywords <!>";
    private ArrayList<String> detail = new ArrayList<String>();
    BufferedWriter bw = null;

    public void fileWriter() throws IOException {
        for (Element link : super.getLinks()) {
            String text = link.text();
            if (text.contains("›")) {
                text = text.replaceAll(" › ", "/");
            }
            detail.add(text);
            System.out.println(text);
        }
        System.out.println("***************************************************");
        for (int i = 0; i < detail.size(); i++)
            System.out.println("detail [" + (i + 1) + "]" + detail.get(i));
        System.out.println("###################################################################");
        for (int j = 0; j < detail.size(); j++) {
            Document document = Jsoup.connect(detail.get(j)).get();
            String web = document.html();
            Document d = Jsoup.parse(web);
            Elements metaTags = d.getElementsByTag("meta");
            for (Element metaTag : metaTags) {
                String content = metaTag.attr("content");
                String name = metaTag.attr("name");
                if ("description".equals(name)) {
                    description = content;
                }
                if ("keywords".equals(name)) {
                    keywords = content;
                }
            }
            String title = d.title();
            Files.write(Paths.get("D:\\OOP\\OOPproj\\search.txt"), (detail.get(j) + "\t" + "|" + "\t" + title + "\t" + "|" + "\t" + description + "\t" + "|" + "\t" + keywords + System.lineSeparator()).getBytes(), StandardOpenOption.APPEND);
        }
    }
}
This is the Main class:
import java.io.IOException;

public class Main {
    public static void main(String[] args) throws IOException {
        Search a = new Search();
        a.SearchWeb();
        WriteToFile b = new WriteToFile();
        b.fileWriter();
    }
}
I tried printing the result of getLinks() in main to check whether it was null, but it wasn't; the links were there.
I would be really grateful if someone could help me out.
You are calling SearchWeb() on object a, but you're calling fileWriter() on object b. This means the links are set in a, but not in b, so getLinks() returns null there and the loop in fileWriter() throws the NullPointerException.
Since WriteToFile extends Search, you just need a single instance of that:
WriteToFile a = new WriteToFile();
a.SearchWeb();
a.fileWriter();
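If you prefer not to rely on inheritance at all, a hypothetical alternative is composition: pass the Search instance into WriteToFile, so fileWriter() can only read links from the object that actually fetched them. A minimal sketch, with the loop body abbreviated:

public class WriteToFile {
    private final Search search;

    // The Search object is supplied up front, so the links it holds
    // are the ones fileWriter() will read.
    public WriteToFile(Search search) {
        this.search = search;
    }

    public void fileWriter() throws java.io.IOException {
        for (org.jsoup.nodes.Element link : search.getLinks()) {
            // ... same processing as in the original fileWriter() ...
        }
    }
}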

Java crawler to get all first- and third-party cookies

I would like to build a crawler in Java that gives me all cookies from a website. The crawler is supposed to crawl a list of websites (and obviously their subpages) automatically.
I have used jsoup and Selenium for this.
package com.mycompany.app;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;

public class BasicWebCrawler {
    private static Set<String> uniqueURL = new HashSet<String>();
    private static List<String> link_list = new ArrayList<String>();
    private static Set<String> uniqueCookies = new HashSet<String>();

    private static void get_links(String url) {
        Connection connection = null;
        Connection.Response response = null;
        String this_link = null;
        try {
            connection = Jsoup.connect(url);
            response = connection.execute();
            //cookies_http = response.cookies();
            // fetch the document over HTTP
            Document doc = response.parse();
            // get all links in page
            Elements links = doc.select("a[href]");
            if (links.isEmpty()) {
                return;
            }
            for (Element link : links) {
                this_link = link.attr("href");
                boolean add = uniqueURL.add(this_link);
                System.out.println("\n" + this_link + "\n" + "title: " + doc.title());
                if (add && (this_link.contains(url))) {
                    System.out.println("\n" + this_link + "\n" + "title: " + doc.title());
                    link_list.add(this_link);
                    get_links(this_link);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        get_links("https://de.wikipedia.org/wiki/Wikipedia");

        /**
         * This is where Selenium comes into play.
         */
        WebDriver driver;
        System.setProperty("webdriver.chrome.driver", "D:\\crawler\\driver\\chromedriver.exe");
        driver = new ChromeDriver();

        // create file named Cookies to store login information
        File file = new File("Cookies.data");
        FileWriter fileWrite = null;
        BufferedWriter Bwrite = null;
        try {
            // delete old file if it exists
            file.delete();
            file.createNewFile();
            fileWrite = new FileWriter(file);
            Bwrite = new BufferedWriter(fileWrite);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        for (String link : link_list) {
            System.out.println("Open Link: " + link);
            driver.get(link);
            try {
                // loop for getting the cookie information
                for (Cookie ck : driver.manage().getCookies()) {
                    String tmp = (ck.getName() + ";" + ck.getValue() + ";" + ck.getDomain() + ";" + ck.getPath() + ";" + ck.getExpiry() + ";" + ck.isSecure());
                    if (uniqueCookies.add(tmp)) {
                        Bwrite.write("Link: " + link + "\n" + tmp + "\n\n");
                        Bwrite.newLine();
                    }
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }
        try {
            Bwrite.close();
            fileWrite.close();
            driver.close();
            driver.quit();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
I tested this code on a Wikipedia page and compared the result with a cookie scanner called CookieMetrix.
My code shows only four cookies:
Link: https://de.wikipedia.org/wiki/Wikipedia:Lizenzbestimmungen_Commons_Attribution-ShareAlike_3.0_Unported
GeoIP;DE:NW:M__nster:51.95:7.54:v4;.wikipedia.org;/;null;true
Link: https://de.wikipedia.org/wiki/Wikipedia:Lizenzbestimmungen_Commons_Attribution-ShareAlike_3.0_Unported
WMF-Last-Access-Global;13-May-2019;.wikipedia.org;/;Mon Jan 19 02:28:33 CET 1970;true
Link: https://de.wikipedia.org/wiki/Wikipedia:Lizenzbestimmungen_Commons_Attribution-ShareAlike_3.0_Unported
WMF-Last-Access;13-May-2019;de.wikipedia.org;/;Mon Jan 19 02:28:33 CET 1970;true
Link: https://de.wikipedia.org/wiki/Wikipedia:Lizenzbestimmungen_Commons_Attribution-ShareAlike_3.0_Unported
mwPhp7Seed;55e;de.wikipedia.org;/;Mon Jan 19 03:09:08 CET 1970;false
But the cookie scanner shows seven. I don't know why my code shows fewer than CookieMetrix. Can you help me?
JavaDoc for java.util.Set<Cookie> getCookies():
Get all the cookies for the current domain. This is the equivalent of calling "document.cookie" and parsing the result
document.cookie will not return HttpOnly cookies, simply because JavaScript does not allow it.
Also notice that CookieMetrix seems to list cookies from different domains.
Solutions:
To get a listing such as CookieMetrix's (covering both HttpOnly cookies and cookies from other domains), you could add a proxy behind your browser and sniff the requests.
In case you want to get all cookies for the current domain, including HttpOnly ones, you could try accessing Chrome's DevTools API directly (as far as I remember, it also returns HttpOnly cookies).
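For the DevTools route, Selenium 4 exposes the Chrome DevTools Protocol directly. A minimal sketch, assuming Selenium 4 and its versioned devtools package (v85 here; match the version to your Chrome build):

import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.devtools.DevTools;
import org.openqa.selenium.devtools.v85.network.Network;
import org.openqa.selenium.devtools.v85.network.model.Cookie;

import java.util.List;

public class DevToolsCookies {
    public static void main(String[] args) {
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get("https://de.wikipedia.org/wiki/Wikipedia");
            DevTools devTools = driver.getDevTools();
            devTools.createSession();
            // Network.getAllCookies returns cookies for all domains of the
            // browsing context, including HttpOnly ones.
            List<Cookie> cookies = devTools.send(Network.getAllCookies());
            for (Cookie ck : cookies) {
                System.out.println(ck.getName() + ";" + ck.getValue() + ";" + ck.getDomain());
            }
        } finally {
            driver.quit();
        }
    }
}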

BioNLP Stanford - tokenization

I am trying to tokenize biomedical text, so I decided to use the Stanford event parser (http://nlp.stanford.edu/software/eventparser.shtml). The stand-alone program RunBioNLPTokenizer does what I want.
Now I want to create my own program that uses the Stanford libraries, so I read the code of RunBioNLPTokenizer, shown below.
package edu.stanford.nlp.ie.machinereading.domains.bionlp;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Collection;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ie.machinereading.GenericDataSetReader;
import edu.stanford.nlp.ie.machinereading.msteventextractor.DataSet;
import edu.stanford.nlp.ie.machinereading.msteventextractor.EpigeneticsDataSet;
import edu.stanford.nlp.ie.machinereading.msteventextractor.GENIA11DataSet;
import edu.stanford.nlp.ie.machinereading.msteventextractor.InfectiousDiseasesDataSet;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.StringUtils;

/**
 * Standalone program to run our BioNLP tokenizer and save its output
 */
public class RunBioNLPTokenizer extends GenericDataSetReader {
    public static void main(String... args) throws IOException {
        Properties props = StringUtils.argsToProperties(args);
        String basePath = props.getProperty("base.directory", "/u/nlp/data/bioNLP/2011/originals/");

        DataSet dataset = new GENIA11DataSet();
        dataset.getFilesystemInformation().setTokenizer("stanford");
        runTokenizerForDirectory(dataset, basePath + "genia/training");
        runTokenizerForDirectory(dataset, basePath + "genia/development");
        runTokenizerForDirectory(dataset, basePath + "genia/testing");

        dataset = new EpigeneticsDataSet();
        dataset.getFilesystemInformation().setTokenizer("stanford");
        runTokenizerForDirectory(dataset, basePath + "epi/training");
        runTokenizerForDirectory(dataset, basePath + "epi/development");
        runTokenizerForDirectory(dataset, basePath + "epi/testing");

        dataset = new InfectiousDiseasesDataSet();
        dataset.getFilesystemInformation().setTokenizer("stanford");
        runTokenizerForDirectory(dataset, basePath + "infect/training");
        runTokenizerForDirectory(dataset, basePath + "infect/development");
        runTokenizerForDirectory(dataset, basePath + "infect/testing");
    }

    private static void runTokenizerForDirectory(DataSet dataset, String path) throws IOException {
        System.out.println("Input directory: " + path);
        BioNLPFormatReader reader = new BioNLPFormatReader();
        for (File rawFile : reader.getRawFiles(path)) {
            System.out.println("Input filename: " + rawFile.getName());
            String rawText = IOUtils.slurpFile(rawFile);
            String docId = rawFile.getName().replace("." + BioNLPFormatReader.TEXT_EXTENSION, "");
            String parentPath = rawFile.getParent();
            runTokenizer(dataset.getFilesystemInformation().getTokenizedFilename(parentPath, docId), rawText);
        }
    }

    private static void runTokenizer(String tokenizedFilename, String text) {
        System.out.println("Tokenized filename: " + tokenizedFilename);
        Collection<String> sentences = BioNLPFormatReader.splitSentences(text);
        PrintStream os = null;
        try {
            os = new PrintStream(new FileOutputStream(tokenizedFilename));
        } catch (IOException e) {
            System.err.println("ERROR: cannot save online tokenization to " + tokenizedFilename);
            e.printStackTrace();
            System.exit(1);
        }
        for (String sentence : sentences) {
            BioNLPFormatReader.BioNLPTokenizer tokenizer = new BioNLPFormatReader.BioNLPTokenizer(sentence);
            List<CoreLabel> tokens = tokenizer.tokenize();
            for (CoreLabel l : tokens) {
                os.print(l.word() + " ");
            }
            os.println();
        }
        os.close();
    }
}
I wrote the code below. I managed to split the text into sentences, but I can't use BioNLPTokenizer the way it is used in RunBioNLPTokenizer.
public static void main(String[] args) throws Exception {
    Collection<String> c = BioNLPFormatReader.splitSentences("..");
    for (String sentence : c) {
        System.out.println(sentence);
        BioNLPFormatReader.BioNLPTokenizer x = new BioNLPFormatReader.BioNLPTokenizer(sentence);
    }
}
I got this error:
Exception in thread "main" java.lang.RuntimeException: Uncompilable source code - edu.stanford.nlp.ie.machinereading.domains.bionlp.BioNLPFormatReader.BioNLPTokenizer has protected access in edu.stanford.nlp.ie.machinereading.domains.bionlp.BioNLPFormatReader
My question is: how can I tokenize a biomedical sentence with the Stanford libraries without using RunBioNLPTokenizer?
Unfortunately, we made BioNLPTokenizer a protected inner class, so you'd need to edit the source and change its access to public.
Note that BioNLPTokenizer may not be the most general-purpose biomedical sentence tokenizer; I would spot-check the output to make sure it is reasonable. We developed it heavily against the BioNLP 2009/2011 shared tasks.
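If editing the source is not an option, one substitute (a different tokenizer, not the BioNLP-specific one the answer describes) is CoreNLP's general-purpose PTBTokenizer, whose output you would likewise want to spot-check on biomedical text. A minimal sketch; the sample sentence is made up:

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;

import java.io.StringReader;

public class TokenizeSentence {
    public static void main(String[] args) {
        String sentence = "MEK1/2 phosphorylates ERK in response to EGF.";
        // Third argument is an options string; empty means default behavior.
        PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<>(
                new StringReader(sentence), new CoreLabelTokenFactory(), "");
        while (tokenizer.hasNext()) {
            System.out.print(tokenizer.next().word() + " ");
        }
        System.out.println();
    }
}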

How to include links on PDF/A-1B with iText and XMLWorker (HTML to PDF/A)

I've tested the code from this comment, and I can't add a link because an exception is thrown.
To reproduce the problem, just add a link to the HTML. I only added this line in Main.java:
buf.append("<a href='http://google.com'>link to google</a>");
style.css
* {
font-family: "Arial";
font-style: normal;
}
Main.java
package com.itextpdf;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;

import com.itextpdf.text.Document;
import com.itextpdf.text.pdf.ICC_Profile;
import com.itextpdf.text.pdf.PdfAConformanceLevel;
import com.itextpdf.text.pdf.PdfAWriter;
import com.itextpdf.tool.xml.XMLWorker;
import com.itextpdf.tool.xml.XMLWorkerHelper;
import com.itextpdf.tool.xml.css.CssFile;
import com.itextpdf.tool.xml.css.StyleAttrCSSResolver;
import com.itextpdf.tool.xml.html.CssAppliers;
import com.itextpdf.tool.xml.html.CssAppliersImpl;
import com.itextpdf.tool.xml.html.Tags;
import com.itextpdf.tool.xml.parser.XMLParser;
import com.itextpdf.tool.xml.pipeline.css.CSSResolver;
import com.itextpdf.tool.xml.pipeline.css.CssResolverPipeline;
import com.itextpdf.tool.xml.pipeline.end.PdfWriterPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipeline;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;

public class Main {
    public static void main(String[] args) {
        StringBuffer buf = new StringBuffer();
        String title = "Test";

        // Sample HTML content.
        buf.append("<!DOCTYPE html>");
        buf.append("<html>");
        buf.append("<head>");
        buf.append("<title>" + title + "</title>");
        buf.append("</head>");
        buf.append("<body>");
        buf.append("<p>This is a test</p>");
        buf.append("<a href='http://google.com'>link to google</a>"); // <----- Only added this line
        buf.append("</body>");
        buf.append("</html>");

        OutputStream file = null;
        Document document = null;
        PdfAWriter writer = null;
        try {
            file = new FileOutputStream(new File("C:\\Users\\amartin\\Desktop\\Test.pdf"));
            document = new Document();
            writer = PdfAWriter.getInstance(document, file, PdfAConformanceLevel.PDF_A_1B);
            // Avoid discrepancies between document title and XMP metadata information.
            document.addTitle(title);
            // Create XMP metadata. It's a PDF/A requirement.
            writer.createXmpMetadata();
            document.open();
            // Set output intent. PDF/A requirement.
            ICC_Profile icc = ICC_Profile.getInstance(new FileInputStream("./src/main/resources/com/itextpdf/sRGB Color Space Profile.icm"));
            writer.setOutputIntents("Custom", "", "http://www.color.org", "sRGB IEC61966-2.1", icc);
            // CSS stylesheet.
            CSSResolver cssResolver = new StyleAttrCSSResolver();
            CssFile cssFile = XMLWorkerHelper.getCSS(new FileInputStream("./css/style.css"));
            cssResolver.addCss(cssFile);
            MyFontProvider fontProvider = new MyFontProvider();
            fontProvider.register("./fonts/arial.ttf");
            /* DEBUG
            System.out.println("Fonts present in " + fontProvider.getClass().getName());
            Set<String> registeredFonts = fontProvider.getRegisteredFonts();
            for (String font : registeredFonts)
                System.out.println(font);
            */
            CssAppliers cssAppliers = new CssAppliersImpl(fontProvider);
            HtmlPipelineContext htmlContext = new HtmlPipelineContext(cssAppliers);
            htmlContext.setTagFactory(Tags.getHtmlTagProcessorFactory());
            // Pipelines.
            PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
            HtmlPipeline html = new HtmlPipeline(htmlContext, pdf);
            CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);
            XMLWorker worker = new XMLWorker(css, true);
            XMLParser p = new XMLParser(worker);
            Reader reader = new StringReader(buf.toString());
            p.parse(reader);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (document != null && document.isOpen())
                document.close();
            try {
                if (file != null)
                    file.close();
            } catch (IOException e) {}
            if (writer != null && !writer.isCloseStream())
                writer.close();
        }
    }
}
MyFontProvider.java
package com.itextpdf;

import com.itextpdf.text.BaseColor;
import com.itextpdf.text.Font;
import com.itextpdf.text.FontFactoryImp;

public class MyFontProvider extends FontFactoryImp {
    @Override
    public Font getFont(String fontname, String encoding, boolean embedded,
                        float size, int style, BaseColor color) {
        System.out.println("=fontname: " + fontname + " =encoding: " + encoding + " =embedded : " + embedded + " =size: " + size + " =style: " + style + " =BaseColor: " + color);
        return super.getFont(fontname, encoding, embedded, size, style, color);
    }
}
Exception
Exception in thread "main" com.itextpdf.text.pdf.PdfAConformanceException: An annotation dictionary shall contain the F key.
at com.itextpdf.text.pdf.internal.PdfA1Checker.checkAnnotation(PdfA1Checker.java:422)
at com.itextpdf.text.pdf.internal.PdfAChecker.checkPdfAConformance(PdfAChecker.java:219)
at com.itextpdf.text.pdf.internal.PdfAConformanceImp.checkPdfIsoConformance(PdfAConformanceImp.java:71)
at com.itextpdf.text.pdf.PdfWriter.checkPdfIsoConformance(PdfWriter.java:3426)
at com.itextpdf.text.pdf.PdfWriter.checkPdfIsoConformance(PdfWriter.java:3422)
at com.itextpdf.text.pdf.PdfAnnotation.toPdf(PdfAnnotation.java:999)
at com.itextpdf.text.pdf.PdfIndirectObject.writeTo(PdfIndirectObject.java:158)
at com.itextpdf.text.pdf.PdfWriter$PdfBody.write(PdfWriter.java:420)
at com.itextpdf.text.pdf.PdfWriter$PdfBody.add(PdfWriter.java:398)
at com.itextpdf.text.pdf.PdfWriter$PdfBody.add(PdfWriter.java:373)
at com.itextpdf.text.pdf.PdfWriter$PdfBody.add(PdfWriter.java:369)
at com.itextpdf.text.pdf.PdfWriter.addToBody(PdfWriter.java:843)
at com.itextpdf.text.pdf.internal.PdfAnnotationsImp.rotateAnnotations(PdfAnnotationsImp.java:209)
at com.itextpdf.text.pdf.PdfDocument.newPage(PdfDocument.java:990)
at com.itextpdf.text.pdf.PdfDocument.close(PdfDocument.java:865)
at com.itextpdf.text.Document.close(Document.java:416)
at com.itextpdf.Main.main(Main.java:113)
How to fix the problem:
As Bruno said, updating to version 5.5.7 solves the problem.
If updating the library is not possible, try wrapping the link text in <span></span>.
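For the workaround, that would mean changing the added line in Main.java to something like:

buf.append("<span><a href='http://google.com'>link to google</a></span>");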

Extract all text with string positions from a PDF

This may seem like an old question, but I didn't find an exhaustive answer after spending half an hour searching all over SO.
I am using PDFBox and I would like to extract all of the text from a PDF file along with the coordinates of each string. I am using their PrintTextLocations example (http://pdfbox.apache.org/apidocs/org/apache/pdfbox/examples/util/PrintTextLocations.html), but with the kind of PDF I am using (e-tickets) the program fails to recognize strings, printing each character separately. The output is a list of strings (each representing a TextPosition object) like this:
String[414.93896,637.2442 fs=1.0 xscale=8.0 height=4.94 space=2.2240002 width=4.0] s
String[418.93896,637.2442 fs=1.0 xscale=8.0 height=4.94 space=2.2240002 width=4.447998] a
String[423.38696,637.2442 fs=1.0 xscale=8.0 height=4.94 space=2.2240002 width=1.776001] l
String[425.16296,637.2442 fs=1.0 xscale=8.0 height=4.94 space=2.2240002 width=4.447998] e
I would like the program to recognize the string "sale" as a single TextPosition and give me its position.
I also tried playing with the setSpacingTolerance() and setAverageCharacterTolerance() methods of PDFTextStripper, setting different values above and below the defaults (which FYI are 0.5 and 0.3 respectively), but the output didn't change at all. Where am I going wrong? Thanks in advance.
As Joey mentioned, a PDF is just a collection of instructions telling the viewer where a certain character should be printed.
In order to extract words or lines, you will have to perform some data segmentation: studying the bounding boxes of the characters should let you recognize those that are on the same line, and then which ones form words.
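A minimal sketch of that segmentation idea, assuming the TextPosition objects for one line have already been collected and sorted by x; the gap threshold is an assumption you would tune against your documents:

import org.apache.pdfbox.text.TextPosition;

import java.util.ArrayList;
import java.util.List;

public class WordGrouper {
    // Merge consecutive glyphs into one word until the horizontal gap to the
    // next glyph exceeds a fraction of the current width of a space.
    public static List<String> groupIntoWords(List<TextPosition> line) {
        List<String> words = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        TextPosition prev = null;
        for (TextPosition tp : line) {
            if (prev != null) {
                float gap = tp.getXDirAdj() - (prev.getXDirAdj() + prev.getWidthDirAdj());
                if (gap > 0.3f * tp.getWidthOfSpace()) { // 0.3 is a guess; tune it
                    words.add(current.toString());
                    current.setLength(0);
                }
            }
            current.append(tp.getUnicode());
            prev = tp;
        }
        if (current.length() > 0) {
            words.add(current.toString());
        }
        return words;
    }
}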
Here is a solution:
1. Read the file.
2. Fetch each page's text using PDFParserTextStripper.
3. The position of each character of the text is printed.
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

class PDFParserTextStripper extends PDFTextStripper {

    public PDFParserTextStripper(PDDocument pdd) throws IOException {
        super();
        document = pdd;
    }

    public void stripPage(int pageNr) throws IOException {
        this.setStartPage(pageNr + 1);
        this.setEndPage(pageNr + 1);
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        writeText(document, dummy); // This call starts the parsing process and calls writeString repeatedly.
    }

    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        for (TextPosition text : textPositions) {
            System.out.println("String[" + text.getXDirAdj() + "," + text.getYDirAdj() + " fs=" + text.getFontSizeInPt()
                    + " xscale=" + text.getXScale() + " height=" + text.getHeightDir() + " space="
                    + text.getWidthOfSpace() + " width=" + text.getWidthDirAdj() + " ] " + text.getUnicode());
        }
    }

    public static void extractText(InputStream inputStream) {
        PDDocument pdd = null;
        try {
            pdd = PDDocument.load(inputStream);
            PDFParserTextStripper stripper = new PDFParserTextStripper(pdd);
            stripper.setSortByPosition(true);
            for (int i = 0; i < pdd.getNumberOfPages(); i++) {
                stripper.stripPage(i);
            }
        } catch (IOException e) {
            // throw error
        } finally {
            if (pdd != null) {
                try {
                    pdd.close();
                } catch (IOException e) {
                }
            }
        }
    }

    public static void main(String[] args) throws IOException {
        File f = new File("C://PDFLOCATION//target.pdf");
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(f);
            extractText(fis);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (fis != null)
                    fis.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
}
