Tika in Action book examples Lucene StandardAnalyzer does not work - java

First of all I am a total noob when it comes to Tika and Lucene. I am working through the Tika in Action book trying out the examples. In chapter 5 this example is given:
package tikatest01;

import java.io.File;

import org.apache.tika.Tika;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;

public class LuceneIndexer {
    private final Tika tika;
    private final IndexWriter writer;

    public LuceneIndexer(Tika tika, IndexWriter writer) {
        this.tika = tika;
        this.writer = writer;
    }

    public void indexDocument(File file) throws Exception {
        Document document = new Document();
        document.add(new Field(
                "filename", file.getName(),
                Store.YES, Index.ANALYZED));
        document.add(new Field(
                "fulltext", tika.parseToString(file),
                Store.NO, Index.ANALYZED));
        writer.addDocument(document);
    }
}
And this main method:
package tikatest01;

import java.io.File;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.tika.Tika;

public class TikaTest01 {
    public static void main(String[] args) throws Exception {
        String filename = "C:\\testdoc.pdf";
        File file = new File(filename);
        IndexWriter writer = new IndexWriter(
                new SimpleFSDirectory(file),
                new StandardAnalyzer(Version.LUCENE_30),
                MaxFieldLength.UNLIMITED);
        try {
            LuceneIndexer indexer = new LuceneIndexer(new Tika(), writer);
            indexer.indexDocument(file);
        }
        finally {
            writer.close();
        }
    }
}
I've added the libraries tika-app-1.5.jar, lucene-core-4.7.0.jar and lucene-analyzers-common-4.7.0.jar to the project.
Questions:
With the current version of Lucene, Field.Index is deprecated. What should I use instead?
MaxFieldLength is not found. Am I missing an import?

For Lucene 4.7, this code works for the indexer:
package tikatest01;

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.tika.Tika;

public class LuceneIndexer {
    private final Tika tika;
    private final IndexWriter writer;

    public LuceneIndexer(Tika tika, IndexWriter writer) {
        this.tika = tika;
        this.writer = writer;
    }

    public void indexDocument(File file) throws Exception {
        Document document = new Document();
        document.add(new TextField(
                "filename", file.getName(), Store.YES));
        document.add(new TextField(
                "fulltext", tika.parseToString(file), Store.NO));
        writer.addDocument(document);
    }
}
And this code for the main class:
package tikatest01;

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;

public class TikaTest01 {
    public static void main(String[] args) throws Exception {
        // The Lucene index lives in its own directory; the document to
        // index is a separate file (the test PDF from the question).
        // Passing the index directory itself to indexDocument would make
        // Tika try to parse a directory.
        File indexDir = new File("C:\\MyTestDir\\");
        File file = new File("C:\\testdoc.pdf");
        IndexWriter writer = new IndexWriter(
                new SimpleFSDirectory(indexDir),
                new IndexWriterConfig(
                        Version.LUCENE_47,
                        new StandardAnalyzer(Version.LUCENE_47)));
        try {
            LuceneIndexer indexer = new LuceneIndexer(new Tika(), writer);
            indexer.indexDocument(file);
        }
        finally {
            writer.close();
        }
    }
}
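If the goal is to index every document in a folder rather than a single file, a minimal sketch of the loop (the docs folder is a hypothetical path; indexer is the LuceneIndexer from the try block above):

// Sketch: index each regular file in a folder with Tika + Lucene.
// "C:\\MyTestDir\\docs\\" is a hypothetical folder holding the source
// documents; it must be different from the index directory itself.
File[] files = new File("C:\\MyTestDir\\docs\\").listFiles();
if (files != null) {
    for (File f : files) {
        if (f.isFile()) {
            indexer.indexDocument(f);
        }
    }
}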

For Lucene 4.7 there is no such constructor for IndexWriter.
Take a look at the API: http://lucene.apache.org/core/4_7_0/core/org/apache/lucene/index/IndexWriter.html
It shows only a constructor with two parameters (a Directory and an IndexWriterConfig), so you need to adapt this example to the new Lucene API.
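For the record, in Lucene 3.x MaxFieldLength was a nested class of IndexWriter, imported as org.apache.lucene.index.IndexWriter.MaxFieldLength; in Lucene 4.x it was removed along with that three-argument constructor, so the IndexWriterConfig variant shown above is the way to go.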

Related

Java: Extract text from PDF and show header and items as separate array lists

I am using PDFBox to read and save the contents of a PDF file. The requirement is that the extracted text should be split into a header part and an item part, held in separate array lists.
Expected: the header details (PO, DeliveryDate, Vendor no) should go into array list 1, and the item details (barcode, item number, description, quantity) into array list 2.
Existing code for extracting the data from the PDF as text:
PDFBoxReadFromFile.java
package pdfboxreadfromfile;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.logging.Level;
import java.util.logging.Logger;

public class PDFBoxReadFromFile {
    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        PDFManager pdfManager = new PDFManager();
        pdfManager.setFilePath("C:\\Users\\34\\Documents\\test.pdf");
        try {
            String text = pdfManager.toText();
            System.out.println(text);
            File file = new File("C:/Users/34/eclipse-workspace/pdfboxreadfromfile/file.txt");
            FileWriter fw = new FileWriter(file);
            PrintWriter pw = new PrintWriter(fw);
            pw.println(text);
            pw.close();
        } catch (IOException ex) {
            //System.err.println(ex.getMessage());
            Logger.getLogger(PDFBoxReadFromFile.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
}
PDFManager.java
package pdfboxreadfromfile;

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

public class PDFManager {
    private PDFParser parser;
    private PDFTextStripper pdfStripper;
    private PDDocument pdDoc;
    private COSDocument cosDoc;
    private String text;
    private String filePath;
    private File file;

    public PDFManager() {
    }

    public String toText() throws IOException {
        this.pdfStripper = null;
        this.pdDoc = null;
        this.cosDoc = null;
        file = new File(filePath);
        parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox v2.0
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdfStripper.setStartPage(1); // PDFTextStripper pages are 1-based
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        text = pdfStripper.getText(pdDoc);
        return text;
    }

    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }

    public PDDocument getPdDoc() {
        return pdDoc;
    }
}
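One possible approach for the split, as a minimal sketch: once toText() has returned the text, partition it line by line using the header key names from the requirement (PO, DeliveryDate, Vendor). The key strings are assumptions taken from the question and must match what actually appears in the extracted text:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class TextSplitter {
    // Sketch: partition extracted lines into a header list and an item list.
    // The header keys below are assumptions taken from the question text.
    public static List<List<String>> split(String text) {
        List<String> headerList = new ArrayList<>();
        List<String> itemList = new ArrayList<>();
        List<String> headerKeys = Arrays.asList("PO", "DeliveryDate", "Vendor");
        for (String line : text.split("\\r?\\n")) {
            if (line.trim().isEmpty()) {
                continue; // skip blank lines
            }
            boolean isHeader = false;
            for (String key : headerKeys) {
                if (line.contains(key)) {
                    isHeader = true;
                    break;
                }
            }
            if (isHeader) {
                headerList.add(line);
            } else {
                itemList.add(line);
            }
        }
        return Arrays.asList(headerList, itemList);
    }
}

Calling TextSplitter.split(pdfManager.toText()) then yields array list 1 (header lines) and array list 2 (item lines).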

Not able to parse text and images from PDF file

I have gone through the Tika documentation and found a solution to extract text, but it does not return the images.
The .java file:
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

public class Imageextractor3 {
    public static void main(String[] args)
            throws IOException, TikaException, SAXException {
        Parser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
        TesseractOCRConfig config = new TesseractOCRConfig();
        PDFParserConfig pdfConfig = new PDFParserConfig();
        pdfConfig.setExtractInlineImages(true);
        ParseContext parseContext = new ParseContext();
        parseContext.set(TesseractOCRConfig.class, config);
        parseContext.set(PDFParserConfig.class, pdfConfig);
        // need to add this to make sure recursive parsing happens!
        parseContext.set(Parser.class, parser);
        File file = new File("C://Users//Vaibhav Shukla//Desktop//8577.00.pdf");
        System.out.println(file);
        FileInputStream stream = new FileInputStream(new File("C://Users//Vaibhav Shukla//Desktop//pdfs//hh.pdf"));
        Metadata metadata = new Metadata();
        parser.parse(stream, handler, metadata, parseContext);
        System.out.println(metadata);
        String content = handler.toString();
        FileOutputStream fos = new FileOutputStream("C://Users//Vaibhav Shukla//Desktop//pdfs//hd.doc");
        fos.write(content.getBytes());
        System.out.println("===============");
        System.out.println(content);
        System.out.println("Done");
    }
}
I need a suggestion on how to add functionality that can detect the images in a PDF file.
A quick solution to extract the images that are embedded in a PDF:
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;

import javax.imageio.ImageIO;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;

public void extract(File file) throws IOException {
    // PDDocument.load is static; new PDDocument().load(file) would leave
    // a stray empty document behind
    PDDocument doc = PDDocument.load(file);
    Iterator<PDPage> itr = doc.getDocumentCatalog().getPages().iterator();
    while (itr.hasNext()) {
        PDResources res = itr.next().getResources();
        Iterable<COSName> cName = res.getXObjectNames();
        Iterator<COSName> citr = cName.iterator();
        while (citr.hasNext()) {
            String imageName = citr.next().getName();
            System.out.println(imageName);
            COSName cosName = COSName.getPDFName(imageName);
            PDXObject xobj = res.getXObject(cosName);
            // XObjects can also be forms; only write out the images
            if (xobj instanceof PDImageXObject) {
                PDImageXObject im = (PDImageXObject) xobj;
                File ff = new File("C://Users//workspace//Desktop//pdfs//" + imageName + ".png");
                BufferedImage bi = im.getImage();
                ImageIO.write(bi, "png", ff);
            }
        }
    }
    doc.close();
}
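With PDFBox 2.x on the classpath this can be called as, for example, extract(new File("C://Users//Vaibhav Shukla//Desktop//pdfs//hh.pdf")); each embedded image is then written out as imageName.png under the hard-coded output folder in the method.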

Lucene special character search

I am trying to move from database search to Lucene search. I have a few text files with data; the sample data in one of the text files is:
N=Ethernet, L=null, IM=XX123, SN=286-054-754, HBF=null, BON=null,
VSR=null, DUID=null, MID=2, IP=10.21.122.136, MAC=60:C7:98:17:57:80,
SYNC=false, GN=null, CustParam3=null, CustParam2=null, VV=1.06.0007,
CustParam5=null, CustParam4=null, CustParam7=null, CustParam6=null,
BUNAME=null, PN=M132-409-01-R, CustParam8=null, CS=2015-09-30
19:49:25.0, CST=Inactive, BL=3.2, EE=Off, TID=190, PRL=VEM, PAV=null,
FAV=null, MON=2016-04-06 11:13:40.507, DON=null, LPDR=2015-09-30
19:50:23.85, SSID=null, PIP=null, DID=null, MDATE=null,
OV=rel-20120625-SC-3.1.2-B, CID=null, ICBI=false, TID=null,
LCR=2015-10-01 01:50:30.297, SS=No-Recent-Communication, CBU=null,
GMVR=, LID=store, FF=167340, HFP=RATNERCO >> blore, ISA=false,
TF=null, FAM=null, LDPDR=2015-09-30 19:50:39.113, STVER=True,
SID=null, LHB=2015-09-30 21:50:30.297, IDSS=false, FR=81796,
LMOS=2015-09-30 19:49:50.503, LCUS=null, MNAME=XX 123, BBUID=null,
CON=null, DBUN=null, ISDRA=false, POSV=null, UUID=2, TRAM=null,
SPOL=000000000, CustomField1=null, CustomField2=null,
CustomField3=null, MUID=2DE02CF3-0663-420A-8918-7A550E29F570,
CustomField4=null, CustomField5=null, HNAME=blore, customparam1=null,
HID=1048, LBDT=2015-07-06 12:03:45.0, DIC=null, AT=None, LID=null,
IDSA=false, LMPS=2015-09-30 15:49:50.457, MBUN=System, CNC=Ethernet,
LOC=null
I am creating the index and searching using StandardAnalyzer, but when I search for the string UUID=1 I also get results that do NOT contain UUID=1 (in total I have two files, and both files' contents are displayed). As the data has special characters, I also tried using WhitespaceAnalyzer, but then it did not return any data. I created a custom analyzer with a whitespace tokenizer, a lowercase filter and a standard token filter, but it did not help. I also extended StopwordAnalyzerBase to create my own analyzer and used NormalizeCharMap to replace the special characters; that helped, but then I was not able to do wildcard searches.
I request someone to help me out with this. I am very new to Lucene.
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class IndexCreator
{
    public void createIndex(String inputFiles, String indexPath)
    {
        //Input Path Variable
        final Path docDir = Paths.get(inputFiles);
        try
        {
            //org.apache.lucene.store.Directory instance
            Directory dir = FSDirectory.open(Paths.get(indexPath));
            //analyzer with the default stop words
            //Analyzer analyzer = new NewStandardAnalyzer();
            //Analyzer analyzer = buildAnalyzer();
            //Analyzer analyzer = new WhitespaceAnalyzer();
            Analyzer analyzer = new StandardAnalyzer();
            //IndexWriter Configuration
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            //IndexWriter writes new index files to the directory
            IndexWriter writer = new IndexWriter(dir, iwc);
            //Recursive method to iterate over all files and directories
            indexDocs(writer, docDir);
            writer.commit();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    private void indexDocs(final IndexWriter writer, Path path) throws IOException
    {
        //Directory?
        if (Files.isDirectory(path))
        {
            //Iterate directory
            Files.walkFileTree(path, new SimpleFileVisitor<Path>()
            {
                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException
                {
                    try
                    {
                        //Index this file
                        indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
                    }
                    catch (IOException ioe)
                    {
                        ioe.printStackTrace();
                    }
                    return FileVisitResult.CONTINUE;
                }
            });
        }
        else
        {
            //Index this file
            indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
        }
    }

    private void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException
    {
        try (InputStream stream = Files.newInputStream(file))
        {
            //Create lucene Document
            Document doc = new Document();
            String content = new String(Files.readAllBytes(file));
            //content = content.replace("-", "\\-");
            //content = content.replace(":", "\\:");
            //content = content.replace("=", "\\=");
            //content = content.replace(".", "\\.");
            doc.add(new StringField("path", file.toString(), Field.Store.YES));
            doc.add(new LongPoint("modified", lastModified));
            doc.add(new TextField("contents", content, Store.YES));
            //Updates a document by first deleting the document(s)
            //containing <code>term</code> and then adding the new
            //document. The delete and then add are atomic as seen
            //by a reader on the same index
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }

    public static Analyzer buildAnalyzer() throws IOException {
        return CustomAnalyzer.builder()
                .withTokenizer("whitespace")
                .addTokenFilter("lowercase")
                .addTokenFilter("standard")
                .build();
    }

    public static void main(String[] args) {
        IndexCreator indexCreator = new IndexCreator();
        indexCreator.createIndex(
                "C:\\Lucene\\LuceneLatest\\LuceneLatestModified\\Data",
                "C:\\Lucene\\LuceneLatest\\LuceneLatestModified\\Index");
        System.out.println("Done");
    }
}
import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searcher
{
    //directory that contains the lucene indexes
    private static final String INDEX_DIR =
            "C:\\Lucene\\LuceneLatest\\LuceneLatestModified\\Index";

    public static void main(String[] args) throws Exception
    {
        //Create lucene searcher. It searches over a single IndexReader.
        Searcher searcher = new Searcher();
        //Search indexed contents using search term
        /*searcher.searchInContent("NETWORKCONFIGURATION=Ethernet AND MACADDRESS=60\\:C7\\:98\\:17\\:57\\:80", searcher.createSearcher());
        searcher.searchInContent("NETWORKCONFIGURATION=Ethern*", searcher.createSearcher());*/
        searcher.searchInContent("UUID=1", searcher.createSearcher());
    }

    private void searchInContent(String textToFind, IndexSearcher searcher) throws Exception
    {
        //Create search query
        QueryParser qp = new QueryParser("contents", new StandardAnalyzer());
        //textToFind = QueryParser.escape(textToFind).toLowerCase();
        Query query = qp.parse(textToFind);
        //search the index
        TopDocs hits = searcher.search(query, 10);
        System.out.println("Total Results :: " + hits.totalHits);
        for (ScoreDoc sd : hits.scoreDocs)
        {
            Document d = searcher.doc(sd.doc);
            System.out.println("Path : " + d.get("path") + ", Score : " + sd.score + ", Content : " + d.get("contents"));
        }
    }

    private IndexSearcher createSearcher() throws IOException
    {
        Directory dir = FSDirectory.open(Paths.get(INDEX_DIR));
        //It is an interface for accessing a point-in-time view of a lucene index
        IndexReader reader = DirectoryReader.open(dir);
        //Index searcher
        IndexSearcher searcher = new IndexSearcher(reader);
        return searcher;
    }

    public static Analyzer buildAnalyzer() throws IOException {
        return CustomAnalyzer.builder()
                .withTokenizer("whitespace")
                .addTokenFilter("lowercase")
                .addTokenFilter("standard")
                .build();
    }
}
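A hedged observation rather than a confirmed fix: StandardAnalyzer splits UUID=1 into the tokens uuid and 1, and QueryParser's default OR operator then matches any document containing uuid, which would explain why both files come back. One sketch of a fix is to use the same whitespace-based analyzer for both indexing and querying, with an extra pattern filter to strip the trailing commas visible in the sample data (patternReplace refers to Lucene's PatternReplaceFilterFactory; whether this covers every value format in the files is an assumption):

// Sketch: one analyzer used for BOTH IndexWriterConfig and QueryParser,
// so index-time and query-time tokens match. The whitespace tokenizer
// keeps "uuid=1" together, lowercase normalizes case, and the pattern
// filter strips the trailing comma seen in tokens like "UUID=2,".
public static Analyzer buildKeyValueAnalyzer() throws IOException {
    return CustomAnalyzer.builder()
            .withTokenizer("whitespace")
            .addTokenFilter("lowercase")
            .addTokenFilter("patternReplace",
                    "pattern", ",$", "replacement", "")
            .build();
}

// Index time: new IndexWriterConfig(buildKeyValueAnalyzer())
// Query time:
// QueryParser qp = new QueryParser("contents", buildKeyValueAnalyzer());
// Query query = qp.parse(QueryParser.escape("UUID=1"));
// Note: QueryParser.escape also disables wildcard syntax, so escape only
// literal searches and leave patterns like uuid=1* unescaped.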

How to parse the value of an HTML input element and convert it into a PDF?

I am unable to parse the value of an HTML input element and convert it into a PDF file. I am using PdfWriter to generate the PDF, with xmlworker-5.5.4.jar and itext.jar. The input element's value from the HTML is not parsed and so never makes it into the PDF file. The problem occurs whether I use HTMLWorker or XMLWorker.
Code:
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.StringReader;

import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.html.simpleparser.HTMLWorker;
import com.itextpdf.text.pdf.PdfWriter;

public class ParseHtml {
    public static final String DEST = "D:/html_1.pdf";

    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        ParseHtml p = new ParseHtml();
        p.createPdf(DEST);
    }

    @SuppressWarnings("deprecation")
    public void createPdf(String file) throws IOException, DocumentException {
        StringBuilder sb = new StringBuilder();
        sb.append("<input type=\'text\' value=\"43645643634\"/>");
        System.out.println("String------" + sb);
        FileOutputStream outStream = new FileOutputStream(file);
        Document document = new Document(PageSize.A4.rotate());
        PdfWriter pdfWriter = PdfWriter.getInstance(document, outStream);
        document.open();
        document.newPage();
        HTMLWorker htmlWorker = new HTMLWorker(document);
        htmlWorker.parse(new StringReader(sb.toString()));
        document.close();
        outStream.close();
        System.out.println("Document is created...");
    }
}
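A hedged workaround: HTMLWorker does not support form elements such as <input>, so the value attribute is never rendered. One sketch is to rewrite each input element into plain text before parsing; the regex below is only an illustration and handles just the simple double-quoted value="..." form used above:

// Sketch: replace <input ... value="X" ...> with X as plain text so the
// value survives as ordinary content that HTMLWorker can render.
String html = sb.toString().replaceAll(
        "<input[^>]*value=\"([^\"]*)\"[^>]*/?>", "$1");
htmlWorker.parse(new StringReader(html));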

iText PDF document failing to open: java.io.IOException: No message found for the.document.has.no.pages

So I am practicing with iText to create a small document. The pdf is created in my downloads folder but when I try to open it I get a message and an error:
Message:
It is either not a supported file type or it has been damaged
Trace:
ExceptionConverter: java.io.IOException: No message found for the.document.has.no.pages
at com.lowagie.text.pdf.PdfPages.writePageTree(Unknown Source)
at com.lowagie.text.pdf.PdfWriter.close(Unknown Source)
at com.lowagie.text.pdf.PdfDocument.close(Unknown Source)
at com.lowagie.text.Document.close(Unknown Source)
at iTextTester.tester.main(tester.java:26)
Code:
package iTextTester;

import com.lowagie.text.Anchor;
import com.lowagie.text.Chapter;
import com.lowagie.text.Document;
import com.lowagie.text.DocumentException;
import com.lowagie.text.Font;
import com.lowagie.text.Paragraph;
import com.lowagie.text.Section;
import com.lowagie.text.pdf.PdfWriter;

import java.io.FileOutputStream;

public class tester {
    private static Font catFont = new Font(Font.TIMES_ROMAN, 18, Font.BOLD);
    private static Font subFont = new Font(Font.TIMES_ROMAN, 12, Font.BOLD);

    public static void main(String[] args) {
        try {
            Document document = new Document();
            PdfWriter.getInstance(document, new FileOutputStream("C:/Users/me/Downloads/FirstPdf.pdf"));
            document.open();
            document.addTitle("TITLE");
            document.addAuthor("AUTHOR");
            document.close();
            addContent(document);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private static void addContent(Document document) throws DocumentException {
        Anchor anchor = new Anchor("First Chapter", catFont);
        anchor.setName("First Chapter");
        Chapter catPart = new Chapter(new Paragraph(anchor), 1);
        Paragraph subPara = new Paragraph("Subcategory 1", subFont);
        Section subCatPart = catPart.addSection(subPara);
        subCatPart.add(new Paragraph("Hello"));
    }
}
Any idea what I am doing incorrectly?
I was following a very poor tutorial, and found a better one here. The original code closed the document before any content was added, and addContent built a Chapter without ever adding it to the document, which is why iText complained that the document has no pages.
This simplified code works well:
package iTextTester;

import java.io.FileOutputStream;
import java.io.IOException;

import com.lowagie.text.Document;
import com.lowagie.text.DocumentException;
import com.lowagie.text.PageSize;
import com.lowagie.text.Paragraph;
import com.lowagie.text.pdf.PdfWriter;

public class tester {
    public static final String RESULT = "C:/Users/me/Downloads/text.pdf";

    public static void main(String[] args) throws DocumentException, IOException {
        new tester().createPdf(RESULT);
    }

    public void createPdf(String filename) throws DocumentException, IOException {
        Document document = new Document(PageSize.LETTER);
        PdfWriter.getInstance(document, new FileOutputStream(filename));
        document.open();
        document.add(new Paragraph("Hello World!"));
        document.close();
    }
}
