Lucene special character search - java
I am trying to move from database search to Lucene search. I have a few text files containing data; the sample data in one of the text files is
N=Ethernet, L=null, IM=XX123, SN=286-054-754, HBF=null, BON=null,
VSR=null, DUID=null, MID=2, IP=10.21.122.136, MAC=60:C7:98:17:57:80,
SYNC=false, GN=null, CustParam3=null, CustParam2=null, VV=1.06.0007,
CustParam5=null, CustParam4=null, CustParam7=null, CustParam6=null,
BUNAME=null, PN=M132-409-01-R, CustParam8=null, CS=2015-09-30
19:49:25.0, CST=Inactive, BL=3.2, EE=Off, TID=190, PRL=VEM, PAV=null,
FAV=null, MON=2016-04-06 11:13:40.507, DON=null, LPDR=2015-09-30
19:50:23.85, SSID=null, PIP=null, DID=null, MDATE=null,
OV=rel-20120625-SC-3.1.2-B, CID=null, ICBI=false, TID=null,
LCR=2015-10-01 01:50:30.297, SS=No-Recent-Communication, CBU=null,
GMVR=, LID=store, FF=167340, HFP=RATNERCO >> blore, ISA=false,
TF=null, FAM=null, LDPDR=2015-09-30 19:50:39.113, STVER=True,
SID=null, LHB=2015-09-30 21:50:30.297, IDSS=false, FR=81796,
LMOS=2015-09-30 19:49:50.503, LCUS=null, MNAME=XX 123, BBUID=null,
CON=null, DBUN=null, ISDRA=false, POSV=null, UUID=2, TRAM=null,
SPOL=000000000, CustomField1=null, CustomField2=null,
CustomField3=null, MUID=2DE02CF3-0663-420A-8918-7A550E29F570,
CustomField4=null, CustomField5=null, HNAME=blore, customparam1=null,
HID=1048, LBDT=2015-07-06 12:03:45.0, DIC=null, AT=None, LID=null,
IDSA=false, LMPS=2015-09-30 15:49:50.457, MBUN=System, CNC=Ethernet,
LOC=null
I am creating the index and searching with StandardAnalyzer, but when I search with the string UUID=1 the results also include the file that does NOT contain UUID=1 (in total I have two files, and the contents of both files are returned). Because the data has special characters, I also tried WhitespaceAnalyzer, but then it returned no data at all. I created a custom analyzer with a whitespace tokenizer plus lowercase and standard token filters, but it did not help. I also extended StopwordAnalyzerBase to create my own analyzer and used NormalizeCharMap to replace the special characters; that helped, but then I was not able to do wildcard searches.
Could someone help me out with this? I am very new to Lucene.
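To see what each analyzer actually produces, I have been printing tokens with a small throwaway class (a sketch of my debugging code; the class name TokenDebugger and the sample string are only for illustration). For the data above, StandardAnalyzer drops the '=' and ',' characters and lowercases the terms, which, if I read the output right, is why a search for UUID=1 also matches the other file:
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class TokenDebugger
{
    public static void main(String[] args) throws IOException
    {
        //Print the tokens StandardAnalyzer produces for a small sample of the file content
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("contents", "UUID=2, MID=2"))
        {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken())
            {
                //Prints "uuid", "2", "mid", "2" - the '=' and ',' are dropped by the tokenizer
                System.out.println(term.toString());
            }
            ts.end();
        }
    }
}
My index creation code is below: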
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class IndexCreator
{
public void createIndex(String inputFiles, String indexPath)
{
//Input Path Variable
final Path docDir = Paths.get(inputFiles);
try
{
//org.apache.lucene.store.Directory instance
Directory dir = FSDirectory.open( Paths.get(indexPath) );
//analyzer with the default stop words
//Analyzer analyzer = new NewStandardAnalyzer();
//Analyzer analyzer = buildAnalyzer();
//Analyzer analyzer = new WhitespaceAnalyzer();
Analyzer analyzer = new StandardAnalyzer();
//IndexWriter Configuration
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
//IndexWriter writes new index files to the directory
IndexWriter writer = new IndexWriter(dir, iwc);
//It is a recursive method to iterate over all files and directories
indexDocs(writer, docDir);
writer.commit();
}
catch (IOException e)
{
e.printStackTrace();
}
}
private void indexDocs(final IndexWriter writer, Path path) throws
IOException
{
//Directory?
if (Files.isDirectory(path))
{
//Iterate directory
Files.walkFileTree(path, new SimpleFileVisitor<Path>()
{
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException
{
try
{
//Index this file
indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
}
catch (IOException ioe)
{
ioe.printStackTrace();
}
return FileVisitResult.CONTINUE;
}
});
}
else
{
//Index this file
indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
}
}
private void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException
{
try (InputStream stream = Files.newInputStream(file))
{
//Create lucene Document
Document doc = new Document();
String content = new String(Files.readAllBytes(file));
//content = content.replace("-", "\\-");
//content = content.replace(":", "\\:");
//content = content.replace("=", "\\=");
//content = content.replace(".", "\\.");
doc.add(new StringField("path", file.toString(), Field.Store.YES));
doc.add(new LongPoint("modified", lastModified));
doc.add(new TextField("contents", content, Store.YES));
//Updates a document by first deleting the document(s)
//containing <code>term</code> and then adding the new
//document. The delete and then add are atomic as seen
//by a reader on the same index
writer.updateDocument(new Term("path", file.toString()), doc);
}
}
public static Analyzer buildAnalyzer() throws IOException {
return CustomAnalyzer.builder()
.withTokenizer("whitespace")
.addTokenFilter("lowercase")
.addTokenFilter("standard")
.build();
}
public static void main(String[] args) {
IndexCreator indexCreator = new IndexCreator();
indexCreator.createIndex(
"C:\\Lucene\\LuceneLatest\\LuceneLatestModified\\Data",
"C:\\Lucene\\LuceneLatest\\LuceneLatestModified\\Index");
System.out.println("Done");
}
}
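One variation I have been sketching (an idea only, not my working code) is to split each line into KEY=value pairs inside indexDoc and add every key as its own untokenized StringField, so that an exact term query on UUID with the value 2 can only match the file that really contains UUID=2. The helper name addKeyValueFields is just for illustration and would live inside IndexCreator, which already imports StringField and Field.Store:
//Sketch of an alternative indexDoc step (not my current code): index each
//KEY=value pair as an untokenized StringField so exact matches are possible.
private void addKeyValueFields(Document doc, String content)
{
    for (String pair : content.split(","))
    {
        String[] kv = pair.trim().split("=", 2);
        if (kv.length == 2)
        {
            //e.g. field name "UUID" with the exact value "2"
            doc.add(new StringField(kv[0].trim(), kv[1].trim(), Field.Store.YES));
        }
    }
}
With that in place, a query like new TermQuery(new Term("UUID", "2")) should hit only the matching file; I have not wired this in yet because I am unsure how it interacts with wildcard searches. My searcher class is below.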
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class Searcher
{
//directory contains the lucene indexes
private static final String INDEX_DIR =
"C:\\Lucene\\LuceneLatest\\LuceneLatestModified\\Index";
public static void main(String[] args) throws Exception
{
//Create lucene searcher. It searches over a single IndexReader.
Searcher searcher = new Searcher();
//Search indexed contents using search term
/*searcher.searchInContent("NETWORKCONFIGURATION=Ethernet AND MACADDRESS=60\\:C7\\:98\\:17\\:57\\:80", searcher.createSearcher());
searcher.searchInContent("NETWORKCONFIGURATION=Ethern*", searcher.createSearcher());*/
searcher.searchInContent("UUID=1", searcher.createSearcher());
}
private void searchInContent(String textToFind, IndexSearcher searcher) throws Exception
{
//Create search query
//QueryParser qp = new QueryParser("contents", new StandardAnalyzer());
QueryParser qp = new QueryParser("contents", new StandardAnalyzer());
//textToFind = QueryParser.escape(textToFind).toLowerCase();
Query query = qp.parse(textToFind);
//search the index
TopDocs hits = searcher.search(query, 10);
System.out.println("Total Results :: " + hits.totalHits);
for (ScoreDoc sd : hits.scoreDocs)
{
Document d = searcher.doc(sd.doc);
System.out.println("Path : "+ d.get("path") + ", Score : " + sd.score + ", Content : "+d.get("contents"));
}
}
private IndexSearcher createSearcher() throws IOException
{
Directory dir = FSDirectory.open(Paths.get(INDEX_DIR));
//It is an interface for accessing a point-in-time view of a lucene index
IndexReader reader = DirectoryReader.open(dir);
//Index searcher
IndexSearcher searcher = new IndexSearcher(reader);
return searcher;
}
public static Analyzer buildAnalyzer() throws IOException {
return CustomAnalyzer.builder()
.withTokenizer("whitespace")
.addTokenFilter("lowercase")
.addTokenFilter("standard")
.build();
}
}
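For completeness, this is the query-side variation I have also been trying (a sketch; it assumes the index was built with the same StandardAnalyzer). Quoting the search text makes the parser build a phrase query over the analyzed tokens, so it should only match a document where "uuid" is immediately followed by "1":
//Sketch of a query-side variation: a quoted phrase query with the same analyzer
//that was used at index time.
QueryParser qp = new QueryParser("contents", new StandardAnalyzer());
Query query = qp.parse("\"UUID=1\""); //parsed as the phrase contents:"uuid 1"
TopDocs hits = searcher.search(query, 10);
System.out.println("Total Results :: " + hits.totalHits);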
Related
Why would a Lucene search using a ComplexPhraseQueryParser throw an exception for some content, but not all content?
I am using Lucene 8.2, but have also verified this on 8.9. My query string is either "by~1 word~1" or "ky~1 word~1". I am looking for a phrase of these two words, with a potential one-character misspelling, or fuzziness. I realize that 'by' is usually a stop word; that is why I also tested with 'ky'. My simplified test content is either "AC-2.b word", "AC-2.k word", or "AC-2.y word". The first part of the test content is pulled from actual data my customers are trying to search. For the query with 'by~1' the exception occurs if the content has '.b' or '.y', but not '.k'. For the query with 'ky~1' the exception occurs if the content has '.k' or '.y', but not '.b'.
Here is the test code:
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.analysis.util.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;
public class phraseTest {
    public static Analyzer analyzer = new StandardAnalyzer();
    public static IndexWriterConfig config = new IndexWriterConfig(analyzer);
    public static RAMDirectory ramDirectory = new RAMDirectory();
    public static IndexWriter indexWriter;
    public static Query queryToSearch = null;
    public static IndexReader idxReader;
    public static IndexSearcher idxSearcher;
    public static TopDocs hits;
    public static String query_field = "Content";
    // Pick only one content string
    // public static String content = "AC-2.b word";
    public static String content = "AC-2.k word";
    // public static String content = "AC-2.y word";
    // Pick only one query string
    // public static String queryString = "\"by~1 word~1\"";
    public static String queryString = "\"ky~1 word~1\"";
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws IOException {
        System.out.println("Content is\n " + content);
        System.out.println("Query field is " + query_field);
        System.out.println("Query String is '" + queryString + "'");
        Document doc = new Document(); // create a new document
        /**
         * Create a field with term vector enabled
         */
        FieldType type = new FieldType();
        type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        type.setStored(true);
        type.setStoreTermVectors(true);
        type.setTokenized(true);
        type.setStoreTermVectorOffsets(true);
        // term vector enabled
        Field cField = new Field(query_field, content, type);
        doc.add(cField);
        try {
            indexWriter = new IndexWriter(ramDirectory, config);
            indexWriter.addDocument(doc);
            indexWriter.close();
            idxReader = DirectoryReader.open(ramDirectory);
            idxSearcher = new IndexSearcher(idxReader);
            ComplexPhraseQueryParser qp = new ComplexPhraseQueryParser(query_field, analyzer);
            queryToSearch = qp.parse(queryString);
            // Here is where the searching, etc. starts
            hits = idxSearcher.search(queryToSearch, idxReader.maxDoc());
            System.out.println("scoreDoc size: " + hits.scoreDocs.length);
            // highlight the hits ...
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}
Here is the exception (using Lucene 8.2):
Exception in thread "main" java.lang.IllegalArgumentException: Unknown query type "org.apache.lucene.search.ConstantScoreQuery" found in phrase query string "ky~1 word~1"
    at org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser$ComplexPhraseQuery.rewrite(ComplexPhraseQueryParser.java:325)
    at org.apache.lucene.search.IndexSearcher.rewrite(IndexSearcher.java:666)
    at org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:439)
    at org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:564)
    at org.apache.lucene.search.IndexSearcher.searchAfter(IndexSearcher.java:416)
    at org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:427)
    at phraseTest.main(phraseTest.java:79)
Am I using ComplexPhraseQueryParser wrong? Is this a bug in Lucene?
Java: Extract text from PDF and show header and items as separate array lists
I am using PDFBox to read and save the contents of a PDF file. The requirement is that the text should be split into header and item data in separate ArrayLists. The PDF looks as below. Expected: header details such as PO, delivery date and vendor number should go into ArrayList 1, and other details such as barcode, item number, description and quantity should go into ArrayList 2. Existing code for extracting the data as text from the PDF:
PDFBoxReadFromFile.java
package pdfboxreadfromfile;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.logging.Level;
import java.util.logging.Logger;
public class PDFBoxReadFromFile {
    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        PDFManager pdfManager = new PDFManager();
        pdfManager.setFilePath("C:\\Users\\34\\Documents\\test.pdf");
        try {
            String text = pdfManager.toText();
            System.out.println(text);
            File file = new File("C:/Users/34/eclipse-workspace/pdfboxreadfromfile/file.txt");
            FileWriter fw = new FileWriter(file);
            PrintWriter pw = new PrintWriter(fw);
            pw.println(text);
            pw.close();
        } catch (IOException ex) {
            //System.err.println(ex.getMessage());
            Logger.getLogger(PDFBoxReadFromFile.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
}
PDFManager.java
package pdfboxreadfromfile;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
public class PDFManager {
    private PDFParser parser;
    private PDFTextStripper pdfStripper;
    private PDDocument pdDoc;
    private COSDocument cosDoc;
    private String Text;
    private String filePath;
    private File file;
    public PDFManager() {
    }
    public String toText() throws IOException {
        this.pdfStripper = null;
        this.pdDoc = null;
        this.cosDoc = null;
        file = new File(filePath);
        parser = new PDFParser(new RandomAccessFile(file, "r")); // updated for PDFBox v2.0
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdDoc.getNumberOfPages();
        pdfStripper.setStartPage(0);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        Text = pdfStripper.getText(pdDoc);
        return Text;
    }
    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }
    public PDDocument getPdDoc() {
        return pdDoc;
    }
}
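For illustration, this is roughly the kind of split I am aiming for (a sketch only, not working code). It would be called with the text returned by pdfManager.toText() and two new ArrayList<String> instances, and needs java.util.List and java.util.ArrayList imports; the keywords "PO", "Delivery Date" and "Vendor" are placeholders for whatever really marks a header line in my PDF layout:
// Sketch only: route each extracted line into a header list or an item list.
// The keyword checks are placeholders and depend on the actual PDF layout.
private static void splitLines(String text, List<String> headerList, List<String> itemList) {
    for (String line : text.split("\\r?\\n")) {
        if (line.contains("PO") || line.contains("Delivery Date") || line.contains("Vendor")) {
            headerList.add(line);
        } else if (!line.trim().isEmpty()) {
            itemList.add(line);
        }
    }
}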
Not able to parse text and images from a PDF file
I have gone through the Tika documentation and found a solution to extract the text, but it does not return the images. The .java file:
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
public class Imageextractor3 {
    public static void main(String[] args) throws IOException, TikaException, SAXException {
        Parser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
        TesseractOCRConfig config = new TesseractOCRConfig();
        PDFParserConfig pdfConfig = new PDFParserConfig();
        pdfConfig.setExtractInlineImages(true);
        ParseContext parseContext = new ParseContext();
        parseContext.set(TesseractOCRConfig.class, config);
        parseContext.set(PDFParserConfig.class, pdfConfig);
        // need to add this to make sure recursive parsing happens!
        parseContext.set(Parser.class, parser);
        File file = new File("C://Users//Vaibhav Shukla//Desktop//8577.00.pdf");
        System.out.println(file);
        FileInputStream stream = new FileInputStream(new File("C://Users//Vaibhav Shukla//Desktop//pdfs//hh.pdf"));
        Metadata metadata = new Metadata();
        parser.parse(stream, handler, metadata, parseContext);
        System.out.println(metadata);
        String content = handler.toString();
        FileOutputStream fos = new FileOutputStream("C://Users//Vaibhav Shukla//Desktop//pdfs//hd.doc");
        fos.write(content.getBytes());
        System.out.println("===============");
        System.out.println(content);
        System.out.println("Done");
    }
}
I need a suggestion on how to add functionality that can detect the images in the PDF file.
A quick solution to extract images that are embedded in a PDF (using PDFBox 2.x; the imports below are the ones the method needs):
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import javax.imageio.ImageIO;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
public void extract(File file) throws IOException {
    PDDocument doc = PDDocument.load(file); // PDDocument.load is a static factory method
    Iterator<PDPage> itr = doc.getDocumentCatalog().getPages().iterator();
    while (itr.hasNext()) {
        PDResources res = itr.next().getResources();
        Iterable<COSName> cName = res.getXObjectNames();
        Iterator<COSName> citr = cName.iterator();
        while (citr.hasNext()) {
            String imageName = citr.next().getName();
            System.out.println(imageName);
            COSName cosName = COSName.getPDFName(imageName);
            PDImageXObject im = (PDImageXObject) res.getXObject(cosName);
            File ff = new File("C://Users//workspace//Desktop//pdfs//" + imageName + ".png");
            BufferedImage bi = im.getImage();
            ImageIO.write(bi, "png", ff);
        }
    }
}
Why does Lucene return no matching results if the BM25 algorithm is used to compute document similarity?
Hello, I'm trying to do some document similarity calculations with the Okapi BM25 algorithm, but I have problems with the query type: I can't get any results except when I'm using the default QueryParser. The basic idea was to index the target documents and compare them with the source documents by building a query from the content of each document. This is a very minimalistic approach, but I have to make it work. Please correct me if I'm doing something stupid. My code looks as follows:
package de.paul.bm25;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
public class DocumentSimilarityBM25 {
    Analyzer analyzer;
    Directory index;
    IndexWriterConfig config;
    IndexWriter writer;
    IndexReader reader;
    IndexSearcher searcher;
    Similarity similarity = new DefaultSimilarity();
    String FIELD_CONTENT = "CONTENT";
    public DocumentSimilarityBM25() throws IOException {
        analyzer = new KeywordAnalyzer();
        index = new RAMDirectory();
        config = new IndexWriterConfig(analyzer);
        writer = new IndexWriter(index, config);
        similarity = new BM25Similarity();
    }
    public void start() {
        try {
            index();
            List<TopDocs> candidates = search();
            printResults(candidates);
        } catch (IOException | ParseException e) {
            e.printStackTrace();
        }
    }
    String[] srcDocuments = new String[]{
        "apples are tastefull",
        "apples and oranges grow an trees",
        "banana are yellow and very sweet",
        "this is a zero"
    };
    String[] trgDocuments = new String[]{
        "apples oranges and banana",
        "apples grow on appletrees",
        "bananes have much suga. so they are high caloric",
        "bananas have a curvy form",
        "oranges have the orangecolor and are bigger than apples"
    };
    private void index() throws IOException {
        for (String target : trgDocuments) {
            addDoc(createDoc(target));
        }
        System.out.println("Number of indexed Files:" + writer.maxDoc());
        writer.close();
    }
    private Query createQuery(Document doc) {
        final DisjunctionMaxQuery qry = new DisjunctionMaxQuery(0.0f);
        BooleanQuery bQuery = new BooleanQuery();
        PhraseQuery pQuery = new PhraseQuery();
        //MultiPhraseQuery mPhrase = new MultiPhraseQuery();
        String content = doc.get(FIELD_CONTENT);
        String[] terms = content.split("\\s");
        for (String term : terms) {
            pQuery = new PhraseQuery();
            pQuery.add(new Term(FIELD_CONTENT, term));
            bQuery.add(pQuery, Occur.SHOULD);
        }
        qry.add(bQuery);
        return qry;
    }
    private List<TopDocs> search() throws IOException, ParseException {
        List<TopDocs> candidates = new ArrayList<>();
        //Query query = new org.apache.lucene.queryparser.classic.QueryParser(FIELD_CONTENT, analyzer).parse(srcDocument);
        reader = DirectoryReader.open(index);
        searcher = new IndexSearcher(reader);
        searcher.setSimilarity(similarity);
        for (String source : srcDocuments) {
            Query query = createQuery(createDoc(source));
            System.out.println("Query:" + query.toString());
            TopDocs candidate = searcher.search(query, reader.maxDoc());
            candidates.add(candidate);
        }
        return candidates;
    }
    private void printResults(List<TopDocs> candidates) throws IOException {
        for (TopDocs candidate : candidates) {
            prinCandidate(candidate);
        }
        reader.close();
    }
    private void prinCandidate(TopDocs candidate) throws IOException {
        float maxScore = candidate.getMaxScore();
        ScoreDoc[] hits = candidate.scoreDocs;
        System.out.println("Found " + hits.length + " hits.");
        System.out.println("MaxScore:" + maxScore);
        for (int i = 0; i < hits.length; ++i) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            float score = hits[i].score;
            System.out.println((i + 1) + ". Score: " + score + " " + d.get(FIELD_CONTENT) + "\t");
        }
    }
    private void addDoc(Document doc) throws IOException {
        writer.addDocument(doc);
        writer.commit();
    }
    private Document createDoc(String content) throws IOException {
        Document doc = new Document();
        doc.add(new TextField(FIELD_CONTENT, content, Field.Store.YES));
        return doc;
    }
}
Your analyzer is the problem. KeywordAnalyzer indexes the entire field as a single token. It should be used for keywords, unique identifiers, part numbers, stuff like that. You are attempting to search text, though. Use StandardAnalyzer instead, and you'll start seeing results:
public DocumentSimilarityBM25() throws IOException {
    analyzer = new StandardAnalyzer();
    index = new RAMDirectory();
    ...
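As a side note (my own addition, not part of the original answer): it may also help to set the same Similarity on the IndexWriterConfig, so the norms written at index time match the BM25 scoring used at search time. A minimal sketch:
// Sketch: configure BM25 on both the writer and the searcher so index-time
// norms and search-time scoring use the same similarity.
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setSimilarity(new BM25Similarity());
IndexWriter writer = new IndexWriter(new RAMDirectory(), config);
// ... and on the search side:
// searcher.setSimilarity(new BM25Similarity());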
Tika in Action book examples Lucene StandardAnalyzer does not work
First of all, I am a total noob when it comes to Tika and Lucene. I am working through the Tika in Action book, trying out the examples. In chapter 5 this example is given:
package tikatest01;
import java.io.File;
import org.apache.tika.Tika;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
public class LuceneIndexer {
    private final Tika tika;
    private final IndexWriter writer;
    public LuceneIndexer(Tika tika, IndexWriter writer) {
        this.tika = tika;
        this.writer = writer;
    }
    public void indexDocument(File file) throws Exception {
        Document document = new Document();
        document.add(new Field("filename", file.getName(), Store.YES, Index.ANALYZED));
        document.add(new Field("fulltext", tika.parseToString(file), Store.NO, Index.ANALYZED));
        writer.addDocument(document);
    }
}
And this main method:
package tikatest01;
import java.io.File;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.tika.Tika;
public class TikaTest01 {
    public static void main(String[] args) throws Exception {
        String filename = "C:\\testdoc.pdf";
        File file = new File(filename);
        IndexWriter writer = new IndexWriter(
            new SimpleFSDirectory(file),
            new StandardAnalyzer(Version.LUCENE_30),
            MaxFieldLength.UNLIMITED);
        try {
            LuceneIndexer indexer = new LuceneIndexer(new Tika(), writer);
            indexer.indexDocument(file);
        } finally {
            writer.close();
        }
    }
}
I've added the libraries tika-app-1.5.jar, lucene-core-4.7.0.jar and lucene-analyzers-common-4.7.0.jar to the project.
Questions:
With the current version of Lucene, Field.Index is deprecated; what should I use instead?
MaxFieldLength is not found. Am I missing an import?
For Lucene 4.7, use this code for the indexer:
package tikatest01;
import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.tika.Tika;
public class LuceneIndexer {
    private final Tika tika;
    private final IndexWriter writer;
    public LuceneIndexer(Tika tika, IndexWriter writer) {
        this.tika = tika;
        this.writer = writer;
    }
    public void indexDocument(File file) throws Exception {
        Document document = new Document();
        document.add(new TextField("filename", file.getName(), Store.YES));
        document.add(new TextField("fulltext", tika.parseToString(file), Store.NO));
        writer.addDocument(document);
    }
}
And this code for the main class:
package tikatest01;
import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;
public class TikaTest01 {
    public static void main(String[] args) throws Exception {
        String dirname = "C:\\MyTestDir\\";
        File dir = new File(dirname);
        IndexWriter writer = new IndexWriter(
            new SimpleFSDirectory(dir),
            new IndexWriterConfig(Version.LUCENE_47, new StandardAnalyzer(Version.LUCENE_47)));
        try {
            LuceneIndexer indexer = new LuceneIndexer(new Tika(), writer);
            indexer.indexDocument(dir);
        } finally {
            writer.close();
        }
    }
}
For Lucene 4.7 there isn't this kind of constructor for IndexWriter. Take a look at the API: http://lucene.apache.org/core/4_7_0/core/org/apache/lucene/index/IndexWriter.html
It only shows a constructor with 2 parameters, so you need to adapt this example to the new Lucene API.