I am facing an issue reading text from a PDF.
public class GetLinesFromPDF extends PDFTextStripper {
static List<String> lines = new ArrayList<String>();
Map<String, String> auMap = new HashMap();
boolean objFlag = false;
public GetLinesFromPDF() throws IOException {
}
/**
* #throws IOException If there is an error parsing the document.
*/
public static void main(String[] args) throws IOException {
PDDocument document = null;
String fileName = "E:\\sample.pdf";
try {
int i;
document = PDDocument.load(new File(fileName));
PDFTextStripper stripper = new GetLinesFromPDF();
stripper.setSortByPosition(true);
stripper.setStartPage(0);
stripper.setEndPage(document.getNumberOfPages());
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
// print lines
for (String line : lines) {
//System.out.println("line = " + line);
if (line.matches("(.*)Objection(.*)")) {
System.out.println(line);
withObjection(lines);
//System.out.println("iiiiiiiiiiii");
break;
}
//System.out.println("uuuuuuuuuuuuuu");
}
} finally {
if (document != null) {
document.close();
}
}
}
/**
* Override the default functionality of PDFTextStripper.writeString()
*/
#Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
System.out.println("textPositions = " + string);
// System.out.println("tex "+textPositions.get(0).getFont()+ getArticleEnd());
// you may process the line here itself, as and when it is obtained
}
}
I need output like the following.
My PDF has a title, which we need to skip.
The PDF file content is:
How can I extract the text in separate formats as specified?
Thanks in advance.
Here is the code to read a PDF with iText5, and it works :
public class CreateTOC {
public static final String SRC = "file.pdf";
class FontRenderFilter extends RenderFilter {
public boolean allowText(TextRenderInfo renderInfo) {
String font = renderInfo.getFont().getPostscriptFontName();
return font.endsWith("Bold") || font.endsWith("Oblique");
}
}
public static void main(String[] args) throws IOException, DocumentException {
new CreateTOC().parse(SRC);
}
public void parse(String filename) throws IOException {
PdfReader reader = new PdfReader(filename);
Rectangle rect = new Rectangle(1000, 1000);
RenderFilter regionFilter = new RegionTextRenderFilter(rect);
FontRenderFilter fontFilter = new FontRenderFilter();
TextExtractionStrategy strategy = new FilteredTextRenderListener(
new LocationTextExtractionStrategy(), regionFilter, fontFilter);
System.out.println(PdfTextExtractor.getTextFromPage(reader, 56, strategy));
reader.close();
}
}
Can someone help me get this working in iText7? There are problems with the Rectangle and the TextExtractionStrategy (the constructors are not the same as in iText5).
Edit: RenderFilter isn't available in iText7...
Here is the initialization code:
public class Main {
public void index(String input_path, String index_dir, String separator, String extension, String field, DataHandler handler) {
Index index = new Index(handler);
index.initWriter(index_dir, new StandardAnalyzer());
index.run(input_path, field, extension, separator);
}
public List<?> search(String input_path, String index_dir, String separator, String extension, String field, DataHandler handler) {
Search search = new Search(handler);
search.initSearcher(index_dir, new StandardAnalyzer());
return search.runUsingFiles(input_path, field, extension, separator);
}
#SuppressWarnings("unchecked")
public static void main(String[] args) {
String lang = "en-US";
String dType = "data";
String train = "res/input/" +lang+ "/" +dType +"/train/";
String test = "res/input/"+ lang+ "/" +dType+ "/test/";
String separator = "\\|";
String extension = "csv";
String index_dir = "res/index/" +lang+ "." +dType+ ".index";
String output_file = "res/result/" +lang+ "." +dType+ ".output.json";
String searched_field = "utterance";
Main main = new Main();
DataHandler handler = new DataHandler();
main.index(train, index_dir, separator, extension, searched_field, handler);
//List<JSONObject> result = (List<JSONObject>) main.search(test, index_dir, separator, extension, searched_field, handler);
//handler.writeOutputJson(result, output_file);
}
}
This is my code:
public class Index {
private IndexWriter writer;
private DataHandler handler;
public Index(DataHandler handler) {
this.handler = handler;
}
public Index() {
this(new DataHandler());
}
public void initWriter(String index_path, Directory store, Analyzer analyzer) {
IndexWriterConfig config = new IndexWriterConfig(analyzer);
try {
this.writer = new IndexWriter(store, config);
} catch (IOException e) {
e.printStackTrace();
}
}
public void initWriter(String index_path, Analyzer analyzer) {
try {
initWriter(index_path, FSDirectory.open(Paths.get(index_path)), analyzer);
} catch (IOException e) {
e.printStackTrace();
}
}
public void initWriter(String index_path) {
List<String> stopWords = Arrays.asList();
CharArraySet stopSet = new CharArraySet(stopWords, false);
initWriter(index_path, new StandardAnalyzer(stopSet));
}
#SuppressWarnings("unchecked")
public void indexDocs(List<?> datas, String field) throws IOException {
FieldType fieldType = new FieldType();
FieldType fieldType2 = new FieldType();
fieldType.setStored(true);
fieldType.setTokenized(true);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
fieldType2.setStored(true);
fieldType2.setTokenized(false);
fieldType2.setIndexOptions(IndexOptions.DOCS);
for(int i = 0 ; i < datas.size() ; i++) {
Map<String,String> temp = (Map<String,String>) datas.get(i);
Document doc = new Document();
for(String key : temp.keySet()) {
if(key.equals(field))
continue;
doc.add(new Field(key, temp.get(key), fieldType2));
}
doc.add(new Field(field, temp.get(field), fieldType));
this.writer.addDocument(doc);
}
}
public void run(String path, String field, String extension, String separator) {
List<File> files = this.handler.getInputFiles(path, extension);
List<?> data = this.handler.readDocs(files, separator);
try {
System.out.println("start index");
indexDocs(data, field);
this.writer.commit();
this.writer.close();
System.out.println("done");
} catch (IOException e) {
e.printStackTrace();
}
}
public void run(String path) {
run(path, "search_field", "csv", "\t");
}
I made a simple search module using Java and Lucene.
This module consists of two phases: index and search.
In the index phase, it reads CSV files, converts each row to a Document, and adds it to the IndexWriter object using the IndexWriter.addDocument() method.
Finally, it calls the IndexWriter.commit() method.
It works well on my local PC (Windows),
but on an Ubuntu PC the IndexWriter.commit() method never finishes.
Of course, the IndexWriter.flush() method doesn't work either.
What is the problem?
I created 500 copies of a file and made Lucene Apache index them all. I named them like "0.txt", "1.txt", "2.txt"..."499.txt". When I search for a specific word it returns files from 0 to 9 while it was supposed to return 0 to 499.
This is my indexer:
/**
 * Writes one Lucene document per file into an index directory.
 */
public class Indexer {
    private IndexWriter writer;

    /**
     * Creates the index directory if necessary and opens a writer on it.
     *
     * @param indexDirectoryPath path of the index directory
     * @throws IOException if the directory or writer cannot be opened
     */
    public Indexer(String indexDirectoryPath) throws IOException {
        new File(indexDirectoryPath).mkdirs();
        Directory dir = FSDirectory.open(new File(indexDirectoryPath));
        IndexWriterConfig cfg =
                new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
        writer = new IndexWriter(dir, cfg);
    }

    /** Closes the underlying writer. */
    public void close() throws CorruptIndexException, IOException {
        writer.close();
    }

    /**
     * Builds the document for a file: contents from a reader, plus the stored,
     * non-analyzed file name and canonical path.
     */
    private Document getDocument(File file) throws IOException {
        Field contents = new Field(LuceneConstants.CONTENTS, new FileReader(file));
        Field name = new Field(
                LuceneConstants.FILE_NAME, file.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED);
        Field path = new Field(
                LuceneConstants.FILE_PATH, file.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED);

        Document result = new Document();
        result.add(contents);
        result.add(name);
        result.add(path);
        return result;
    }

    /**
     * Adds the document built from {@code file} to the index.
     *
     * @param file file to index
     * @throws IOException if the file cannot be read or the writer fails
     */
    public void indexFile(File file) throws IOException {
        writer.addDocument(getDocument(file));
    }
}
This is my Searcher:
public class Searcher {
IndexSearcher indexSearcher;
QueryParser queryParser;
Query query;
#SuppressWarnings("deprecation")
public Searcher(String indexDirectoryPath) throws IOException {
Directory indexDirectory = FSDirectory
.open(new File(indexDirectoryPath));
indexSearcher = new IndexSearcher(indexDirectory);
queryParser = new QueryParser(Version.LUCENE_36,
LuceneConstants.CONTENTS, new StandardAnalyzer(
Version.LUCENE_36));
}
public TopDocs search(String searchQuery) throws IOException,
ParseException {
query = queryParser.parse(QueryParser.escape(searchQuery));
return indexSearcher.search(query, LuceneConstants.MAX_SEARCH);
}
public Document getDocument(ScoreDoc scoreDoc)
throws CorruptIndexException, IOException {
return indexSearcher.doc(scoreDoc.doc);
}
public void close() throws IOException {
indexSearcher.close();
}}
This is how I'm calling it:
// Index phase: add files/0.txt through files/499.txt to the index at DIR_INDEX, then close the writer.
Indexer indexer = new Indexer(DIR_INDEX);
for (int i = 0; i < 500; i++) {
indexer.indexFile(new File("files/" + i + ".txt"));
}
indexer.close();
// Search phase: query the same index for "Saude" and print the stored path of every returned hit.
Searcher searcher = new Searcher(DIR_INDEX);
TopDocs hits = searcher.search("Saude");
for (ScoreDoc scoreDoc : hits.scoreDocs) {
org.apache.lucene.document.Document doc = searcher.getDocument(scoreDoc);
System.out.println(doc.get(LuceneConstants.FILE_PATH));
}
searcher.close();
// NOTE(review): the matching "try {" is not part of this snippet.
} catch (Exception e) {
e.printStackTrace();
}
When you call
indexSearcher.search(query, LuceneConstants.MAX_SEARCH);
the second parameter is the maximum number of returned hits; check that value, it is probably 10.
I am messing around with Lucene to see how it can help me and I am unable to get a very simple example working. I am using Lucene 5.1
My expectation is that when I search, the document ID of the document I added to the index is printed to the console. Instead I get nothing and no errors (just "Done" printed to the console at the end).
Here is my code:
public static void main(String[] args) throws Exception {
// create structure on file system
IndexWriter writer = createOrGetIndexWriter(LocalDate.now());
writer.close();
// open for writing
writer = createOrGetIndexWriter(LocalDate.now());
Document document = new Document();
document.add(new IntField("test_field", 1, Field.Store.YES));
// write document and close.
writer.addDocument(document);
writer.commit();
writer.close();
// open reader
IndexReader reader = getIndexReader(LocalDate.now());
IndexSearcher indexSearcher = new IndexSearcher(reader);
Query q = new TermQuery(new Term("test_field", "1"));
// callback should be synchronous
indexSearcher.search(q, new SimpleCollector() {
#Override
public void collect(int i) throws IOException {
System.out.println(i);
}
#Override
public boolean needsScores() {
return false;
}
});
System.out.println("Done");
}
/**
 * Opens a writer on the directory named after the given date, creating the
 * index if it does not exist and appending to it otherwise.
 *
 * @param date date whose {@code toString()} is used as the directory path
 * @return a writer on that directory
 * @throws Exception if the directory or writer cannot be opened
 */
public static IndexWriter createOrGetIndexWriter(LocalDate date) throws Exception {
    Directory indexDir = FSDirectory.open(Paths.get(date.toString()));
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    IndexWriter indexWriter = new IndexWriter(indexDir, config);
    return indexWriter;
}
/**
 * Opens a reader on the directory named after the given date.
 *
 * @param date date whose {@code toString()} is used as the directory path
 * @return a reader on that directory
 * @throws Exception if the directory or reader cannot be opened
 */
public static IndexReader getIndexReader(LocalDate date) throws Exception {
    Directory indexDir = FSDirectory.open(Paths.get(date.toString()));
    IndexReader indexReader = DirectoryReader.open(indexDir);
    return indexReader;
}