Lucene Apache Only Finds 10 Files

Lucene Apache Only Finds 10 Files - java

I created 500 copies of a file and made Lucene Apache index them all. I named them like "0.txt", "1.txt", "2.txt"..."499.txt". When I search for a specific word it returns files from 0 to 9 while it was supposed to return 0 to 499.
This is my indexer:
/**
 * Adds plain-text files to a Lucene index stored in a file-system directory.
 */
public class Indexer {
    private IndexWriter writer;

    /**
     * Creates the index directory (including missing parents) and opens a writer on it.
     *
     * @param indexDirectoryPath path of the directory that holds the index
     * @throws IOException if the directory or the writer cannot be opened
     */
    public Indexer(String indexDirectoryPath) throws IOException {
        File indexDir = new File(indexDirectoryPath);
        indexDir.mkdirs();
        Directory luceneDir = FSDirectory.open(indexDir);
        IndexWriterConfig writerConfig =
                new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
        writer = new IndexWriter(luceneDir, writerConfig);
    }

    /** Closes the underlying writer. */
    public void close() throws CorruptIndexException, IOException {
        writer.close();
    }

    /**
     * Builds the Lucene document for one file: its contents (read through a
     * FileReader) plus its name and canonical path as stored, non-analyzed fields.
     */
    private Document getDocument(File file) throws IOException {
        Document doc = new Document();
        doc.add(new Field(LuceneConstants.CONTENTS, new FileReader(file)));
        doc.add(new Field(LuceneConstants.FILE_NAME, file.getName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(LuceneConstants.FILE_PATH, file.getCanonicalPath(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /**
     * Adds the given file to the index as a single document.
     *
     * @param file file to index
     * @throws IOException if the file cannot be read or the document cannot be added
     */
    public void indexFile(File file) throws IOException {
        writer.addDocument(getDocument(file));
    }
}
This is my Searcher:
public class Searcher {
IndexSearcher indexSearcher;
QueryParser queryParser;
Query query;
#SuppressWarnings("deprecation")
public Searcher(String indexDirectoryPath) throws IOException {
Directory indexDirectory = FSDirectory
.open(new File(indexDirectoryPath));
indexSearcher = new IndexSearcher(indexDirectory);
queryParser = new QueryParser(Version.LUCENE_36,
LuceneConstants.CONTENTS, new StandardAnalyzer(
Version.LUCENE_36));
}
public TopDocs search(String searchQuery) throws IOException,
ParseException {
query = queryParser.parse(QueryParser.escape(searchQuery));
return indexSearcher.search(query, LuceneConstants.MAX_SEARCH);
}
public Document getDocument(ScoreDoc scoreDoc)
throws CorruptIndexException, IOException {
return indexSearcher.doc(scoreDoc.doc);
}
public void close() throws IOException {
indexSearcher.close();
}}
This is how I'm calling it:
// NOTE(review): the `try {` that matches the catch below is not part of this excerpt.
// Index the 500 numbered files "files/0.txt" .. "files/499.txt".
Indexer indexer = new Indexer(DIR_INDEX);
for (int i = 0; i < 500; i++) {
indexer.indexFile(new File("files/" + i + ".txt"));
}
indexer.close();
Searcher searcher = new Searcher(DIR_INDEX);
// Searcher.search passes LuceneConstants.MAX_SEARCH as the hit limit, so
// hits.scoreDocs never holds more entries than that constant allows.
TopDocs hits = searcher.search("Saude");
// Print the stored path of every returned hit.
for (ScoreDoc scoreDoc : hits.scoreDocs) {
org.apache.lucene.document.Document doc = searcher.getDocument(scoreDoc);
System.out.println(doc.get(LuceneConstants.FILE_PATH));
}
searcher.close();
} catch (Exception e) {
e.printStackTrace();
}

When you call
indexSearcher.search(query, LuceneConstants.MAX_SEARCH);
the second parameter is the maximum number of returned hits; check that value, it is probably 10.

Related

Extract text from pdf file by pdfbox

I am facing an issue when reading a PDF file.
public class GetLinesFromPDF extends PDFTextStripper {
static List<String> lines = new ArrayList<String>();
Map<String, String> auMap = new HashMap();
boolean objFlag = false;
public GetLinesFromPDF() throws IOException {
}
/**
* #throws IOException If there is an error parsing the document.
*/
public static void main(String[] args) throws IOException {
PDDocument document = null;
String fileName = "E:\\sample.pdf";
try {
int i;
document = PDDocument.load(new File(fileName));
PDFTextStripper stripper = new GetLinesFromPDF();
stripper.setSortByPosition(true);
stripper.setStartPage(0);
stripper.setEndPage(document.getNumberOfPages());
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
// print lines
for (String line : lines) {
//System.out.println("line = " + line);
if (line.matches("(.*)Objection(.*)")) {
System.out.println(line);
withObjection(lines);
//System.out.println("iiiiiiiiiiii");
break;
}
//System.out.println("uuuuuuuuuuuuuu");
}
} finally {
if (document != null) {
document.close();
}
}
}
/**
* Override the default functionality of PDFTextStripper.writeString()
*/
#Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
System.out.println("textPositions = " + string);
// System.out.println("tex "+textPositions.get(0).getFont()+ getArticleEnd());
// you may process the line here itself, as and when it is obtained
}
}
I need output like the following:
My pdf have some title, we need to skip the same.
pdf file content is
How can I extract the text into the separate formats specified above?
thanks in advance.

Lucene IndexWriter.commit() doesn't finished in ubuntu

Here is initialize code
public class Main {
public void index(String input_path, String index_dir, String separator, String extension, String field, DataHandler handler) {
Index index = new Index(handler);
index.initWriter(index_dir, new StandardAnalyzer());
index.run(input_path, field, extension, separator);
}
public List<?> search(String input_path, String index_dir, String separator, String extension, String field, DataHandler handler) {
Search search = new Search(handler);
search.initSearcher(index_dir, new StandardAnalyzer());
return search.runUsingFiles(input_path, field, extension, separator);
}
#SuppressWarnings("unchecked")
public static void main(String[] args) {
String lang = "en-US";
String dType = "data";
String train = "res/input/" +lang+ "/" +dType +"/train/";
String test = "res/input/"+ lang+ "/" +dType+ "/test/";
String separator = "\\|";
String extension = "csv";
String index_dir = "res/index/" +lang+ "." +dType+ ".index";
String output_file = "res/result/" +lang+ "." +dType+ ".output.json";
String searched_field = "utterance";
Main main = new Main();
DataHandler handler = new DataHandler();
main.index(train, index_dir, separator, extension, searched_field, handler);
//List<JSONObject> result = (List<JSONObject>) main.search(test, index_dir, separator, extension, searched_field, handler);
//handler.writeOutputJson(result, output_file);
}
}
It is my code
public class Index {
private IndexWriter writer;
private DataHandler handler;
public Index(DataHandler handler) {
this.handler = handler;
}
public Index() {
this(new DataHandler());
}
public void initWriter(String index_path, Directory store, Analyzer analyzer) {
IndexWriterConfig config = new IndexWriterConfig(analyzer);
try {
this.writer = new IndexWriter(store, config);
} catch (IOException e) {
e.printStackTrace();
}
}
public void initWriter(String index_path, Analyzer analyzer) {
try {
initWriter(index_path, FSDirectory.open(Paths.get(index_path)), analyzer);
} catch (IOException e) {
e.printStackTrace();
}
}
public void initWriter(String index_path) {
List<String> stopWords = Arrays.asList();
CharArraySet stopSet = new CharArraySet(stopWords, false);
initWriter(index_path, new StandardAnalyzer(stopSet));
}
#SuppressWarnings("unchecked")
public void indexDocs(List<?> datas, String field) throws IOException {
FieldType fieldType = new FieldType();
FieldType fieldType2 = new FieldType();
fieldType.setStored(true);
fieldType.setTokenized(true);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
fieldType2.setStored(true);
fieldType2.setTokenized(false);
fieldType2.setIndexOptions(IndexOptions.DOCS);
for(int i = 0 ; i < datas.size() ; i++) {
Map<String,String> temp = (Map<String,String>) datas.get(i);
Document doc = new Document();
for(String key : temp.keySet()) {
if(key.equals(field))
continue;
doc.add(new Field(key, temp.get(key), fieldType2));
}
doc.add(new Field(field, temp.get(field), fieldType));
this.writer.addDocument(doc);
}
}
public void run(String path, String field, String extension, String separator) {
List<File> files = this.handler.getInputFiles(path, extension);
List<?> data = this.handler.readDocs(files, separator);
try {
System.out.println("start index");
indexDocs(data, field);
this.writer.commit();
this.writer.close();
System.out.println("done");
} catch (IOException e) {
e.printStackTrace();
}
}
public void run(String path) {
run(path, "search_field", "csv", "\t");
}
I made a simple search module using Java and Lucene.
This module consists of two phases: index and search.
In the index phase, it reads CSV files, converts each row to a Document, and adds it to the IndexWriter object using the IndexWriter.addDocument() method.
Finally, it calls the IndexWriter.commit() method.
It works well on my local PC (Windows),
but on an Ubuntu PC the IndexWriter.commit() call never finishes.
Of course, the IndexWriter.flush() method doesn't finish either.
What is the problem?

How to match exact text in Lucene search?

I'm trying to match the text "Config migration from ASA5505 8.2 to ASA5516" in the column TITLE.
My program looks like this.
// Exact-phrase search on the TITLE field.
Directory directory = FSDirectory.open(indexDir);
// The analyzer given to the parser must be the same one that was used when
// the index was written, otherwise the query terms will not line up with the
// indexed terms.
MultiFieldQueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_35, new String[] {"TITLE"}, new StandardAnalyzer(Version.LUCENE_35));
IndexReader reader = IndexReader.open(directory);
IndexSearcher searcher = new IndexSearcher(reader);
try {
    queryParser.setPhraseSlop(0);
    queryParser.setLowercaseExpandedTerms(true);
    // Double quotes turn the text into one phrase query; without them the
    // parser builds an OR of the individual words.
    String queryStr = "TITLE:\"Config migration from ASA5505 8.2 to ASA5516\"";
    Query query = queryParser.parse(queryStr);
    System.out.println(queryStr);
    TopDocs topDocs = searcher.search(query, 100);
    System.out.println(topDocs.totalHits);
    ScoreDoc[] hits = topDocs.scoreDocs;
    System.out.println(hits.length + " Record(s) Found");
    for (int i = 0; i < hits.length; i++) {
        int docId = hits[i].doc;
        Document d = searcher.doc(docId);
        System.out.println("\"Title :\" " + d.get("TITLE"));
    }
} finally {
    // The searcher was built from a reader, so both are released explicitly.
    searcher.close();
    reader.close();
}
But its returning
"Title :" Config migration from ASA5505 8.2 to ASA5516
"Title :" Firewall migration from ASA5585 to ASA5555
"Title :" Firewall migration from ASA5585 to ASA5555
The last two results are not expected. What modification is required to match exactly the text "Config migration from ASA5505 8.2 to ASA5516"?
And my indexing function looks like this
public class Lucene {
public static final String INDEX_DIR = "./Lucene";
private static final String JDBC_DRIVER = "oracle.jdbc.OracleDriver";
private static final String CONNECTION_URL = "jdbc:oracle:thin:xxxxxxx"
private static final String USER_NAME = "localhost";
private static final String PASSWORD = "localhost";
private static final String QUERY = "select * from TITLE_TABLE";
public static void main(String[] args) throws Exception {
File indexDir = new File(INDEX_DIR);
Lucene indexer = new Lucene();
try {
Date start = new Date();
Class.forName(JDBC_DRIVER).newInstance();
Connection conn = DriverManager.getConnection(CONNECTION_URL, USER_NAME, PASSWORD);
SimpleAnalyzer analyzer = new SimpleAnalyzer(Version.LUCENE_35);
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_35, analyzer);
IndexWriter indexWriter = new IndexWriter(FSDirectory.open(indexDir), indexWriterConfig);
System.out.println("Indexing to directory '" + indexDir + "'...");
int indexedDocumentCount = indexer.indexDocs(indexWriter, conn);
indexWriter.close();
System.out.println(indexedDocumentCount + " records have been indexed successfully");
System.out.println("Total Time:" + (new Date().getTime() - start.getTime()) / (1000));
} catch (Exception e) {
e.printStackTrace();
}
}
int indexDocs(IndexWriter writer, Connection conn) throws Exception {
String sql = QUERY;
Statement stmt = conn.createStatement();
stmt.setFetchSize(100000);
ResultSet rs = stmt.executeQuery(sql);
int i = 0;
while (rs.next()) {
System.out.println("Addind Doc No:" + i);
Document d = new Document();
System.out.println(rs.getString("TITLE"));
d.add(new Field("TITLE", rs.getString("TITLE"), Field.Store.YES, Field.Index.ANALYZED));
d.add(new Field("NAME", rs.getString("NAME"), Field.Store.YES, Field.Index.ANALYZED));
writer.addDocument(d);
i++;
}
return i;
}
}

PVR is correct that using a phrase query is probably the right solution here, but they missed how to use the PhraseQuery class. You are already using QueryParser, though, so just use the query parser syntax by enclosing your search text in quotes:
Query query = queryParser.parse("TITLE:\"Config migration from ASA5505 8.2 to ASA5516\"");
Based on your update, you are using a different analyzer at index-time and query-time. SimpleAnalyzer and StandardAnalyzer don't do the same things. Unless you have a very good reason to do otherwise, you should analyze the same way when indexing and querying.
So, change the analyzer in your indexing code to StandardAnalyzer (or vice-versa, use SimpleAnalyzer when querying), and you should see better results.

Here is what i have written for you which works perfectly:
USE: queryParser.parse("\"Config migration from ASA5505 8.2 to ASA5516\"");
To create indexes
/**
 * Writes three sample TITLE documents to the index opened by getIndexWriter().
 */
public static void main(String[] args)
{
    IndexWriter writer = getIndexWriter();
    if (writer == null) {
        // getIndexWriter() has already printed the cause of the failure.
        return;
    }
    Document doc = new Document();
    Document doc1 = new Document();
    Document doc2 = new Document();
    doc.add(new Field("TITLE", "Config migration from ASA5505 8.2 to ASA5516",Field.Store.YES,Field.Index.ANALYZED));
    doc1.add(new Field("TITLE", "Firewall migration from ASA5585 to ASA5555",Field.Store.YES,Field.Index.ANALYZED));
    doc2.add(new Field("TITLE", "Firewall migration from ASA5585 to ASA5555",Field.Store.YES,Field.Index.ANALYZED));
    try
    {
        try
        {
            writer.addDocument(doc);
            writer.addDocument(doc1);
            writer.addDocument(doc2);
        }
        finally
        {
            // Close the writer even when adding a document fails.
            writer.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Opens an IndexWriter on the directory "D://index//", creating that directory
 * first when it does not exist.
 *
 * @return the writer, or null when opening it failed (the exception is printed)
 */
public static IndexWriter getIndexWriter()
{
    try
    {
        File indexFolder = new File("D://index//");
        if (!indexFolder.exists())
        {
            indexFolder.mkdir();
        }
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_34, analyzer);
        return new IndexWriter(FSDirectory.open(indexFolder), writerConfig);
    }
    catch (IOException e)
    {
        e.printStackTrace();
        return null;
    }
}
}
2.To search string
/**
 * Runs the exact-phrase query against the TITLE field and prints the title of
 * every hit.
 */
public static void main(String[] args)
{
    IndexReader reader = getIndexReader();
    if (reader == null)
    {
        // getIndexReader() has already printed the cause of the failure.
        return;
    }
    try
    {
        IndexSearcher searcher = new IndexSearcher(reader);
        QueryParser parser = new QueryParser(Version.LUCENE_34, "TITLE", new StandardAnalyzer(Version.LUCENE_34));
        try
        {
            Query query = parser.parse("\"Config migration from ASA5505 8.2 to ASA5516\"");
            TopDocs hits = searcher.search(query, 3);
            ScoreDoc[] document = hits.scoreDocs;
            for (int i = 0; i < document.length; i++)
            {
                // Load the document the hit points to; the loop index is only a
                // position in the result array, not a document id.
                Document doc = searcher.doc(document[i].doc);
                System.out.println("TITLE=" + doc.get("TITLE"));
            }
        }
        finally
        {
            searcher.close();
            reader.close();
        }
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
}
/**
 * Opens an IndexReader on the index stored in "D://index//".
 *
 * @return the reader, or null when opening it failed (the exception is printed)
 */
public static IndexReader getIndexReader()
{
    try
    {
        Directory indexLocation = FSDirectory.open(new File("D://index//"));
        return IndexReader.open(indexLocation);
    }
    catch (IOException e)
    {
        e.printStackTrace();
        return null;
    }
}

Try PhraseQuery as follow:
// Builds a BooleanQuery in which every space-separated word of the search
// text becomes its own required (MUST) clause on the TITLE field.
BooleanQuery mainQuery= new BooleanQuery();
String searchTerm="config migration from asa5505 8.2 to asa5516";
String strArray[]= searchTerm.split(" ");
for(int index=0;index<strArray.length;index++)
{
// NOTE(review): a new PhraseQuery holding a single term is created per word,
// so the combined query requires every word to be present but does not by
// itself constrain their order or adjacency. Confirm whether one PhraseQuery
// containing all terms was intended.
PhraseQuery query1 = new PhraseQuery();
query1.add(new Term("TITLE",strArray[index]));
mainQuery.add(query1,BooleanClause.Occur.MUST);
}
And then execute the mainQuery.
Check out this thread of stackoverflow, It may help you to use PhraseQuery for exact search.

Lucene Apache doesn't keep my old index

I found this example in the internet:
Indexer.java
public class Indexer {
private IndexWriter writer;
#SuppressWarnings("deprecation")
public Indexer(String indexDirectoryPath) throws IOException {
Directory indexDirectory = FSDirectory.open(new File(indexDirectoryPath));
writer = new IndexWriter(indexDirectory, new StandardAnalyzer(Version.LUCENE_36), true,
IndexWriter.MaxFieldLength.UNLIMITED);
}
public void close() throws CorruptIndexException, IOException {
writer.close();
}
private Document getDocument(File file) throws IOException {
Document document = new Document();
Field contentField = new Field(LuceneConstants.CONTENTS, new FileReader(file));
Field fileNameField = new Field(LuceneConstants.FILE_NAME, file.getName(), Field.Store.YES,
Field.Index.NOT_ANALYZED);
Field filePathField = new Field(LuceneConstants.FILE_PATH, file.getCanonicalPath(), Field.Store.YES,
Field.Index.NOT_ANALYZED);
document.add(contentField);
document.add(fileNameField);
document.add(filePathField);
return document;
}
public void indexFile(File file) throws IOException {
Document document = getDocument(file);
writer.addDocument(document);
}
public int createIndex(String file) throws IOException {
indexFile(new File(file));
return writer.numDocs();
}
}
Searcher.java
public class Searcher {
IndexSearcher indexSearcher;
QueryParser queryParser;
Query query;
#SuppressWarnings("deprecation")
public Searcher(String indexDirectoryPath) throws IOException {
Directory indexDirectory = FSDirectory
.open(new File(indexDirectoryPath));
indexSearcher = new IndexSearcher(indexDirectory);
queryParser = new QueryParser(Version.LUCENE_36,
LuceneConstants.CONTENTS, new StandardAnalyzer(
Version.LUCENE_36));
}
public TopDocs search(String searchQuery) throws IOException,
ParseException {
query = queryParser.parse(QueryParser.escape(searchQuery));
return indexSearcher.search(query, LuceneConstants.MAX_SEARCH);
}
public Document getDocument(ScoreDoc scoreDoc)
throws CorruptIndexException, IOException {
return indexSearcher.doc(scoreDoc.doc);
}
public void close() throws IOException {
indexSearcher.close();
}
}
LuceneConstants.java
/**
 * Field names and limits shared by the indexing and searching code.
 */
public final class LuceneConstants {
    /** Name of the field that holds the file contents. */
    public static final String CONTENTS = "contents";
    /** Name of the field that holds the file name. */
    public static final String FILE_NAME = "filename";
    /** Name of the field that holds the file path. */
    public static final String FILE_PATH = "filepath";
    /** Value passed as the hit limit of a search; a search returns no more hits than this. */
    public static final int MAX_SEARCH = 10;

    /** Constants holder; not meant to be instantiated. */
    private LuceneConstants() {
    }
}
This is how I use them:
/**
 * Indexes f1.txt and then f2.txt, running the same search after each step and
 * printing the paths of the matching files.
 */
public static void main(String[] args) throws IOException, ParseException {
    // First file
    indexAndSearch("f1.txt");
    System.out.println("-----");
    // Second file
    indexAndSearch("f2.txt");
}

/**
 * Adds one file to the index and prints the stored path of every document
 * that matches "Art. 1°". The indexer and the searcher are closed even when a
 * step fails.
 */
private static void indexAndSearch(String fileName) throws IOException, ParseException {
    // NOTE(review): the index is written to "index" but read from
    // Constante.DIR_INDEX.getValor(); confirm both name the same directory.
    Indexer indexer = new Indexer("index");
    try {
        indexer.createIndex(fileName);
    } finally {
        indexer.close();
    }
    Searcher searcher = new Searcher(Constante.DIR_INDEX.getValor());
    try {
        TopDocs hits = searcher.search("Art. 1°");
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            org.apache.lucene.document.Document doc = searcher.getDocument(scoreDoc);
            String nomeArquivo = doc.get(LuceneConstants.FILE_PATH);
            System.out.println(nomeArquivo);
        }
    } finally {
        searcher.close();
    }
}
It works perfectly fine until the "// second file" line.
After I index my second file I'm not able to find anything in my first file.
If I create an instance of Indexer and use it this same instance to index f1.txt and f2.txt and close it then it works like I want it to be. The problem is that if I close my application and open it and decide to index another file I'd lose both f1.txt and f2.txt.
Is there a way to make Lucene always keep the previous index when it index a new file?

Looks like you are using an old version of Lucene (3.6 or below), correct?
The third argument to the IndexWriter constructor specifies whether it should create a new index or open an existing one. If set to true, it will overwrite the existing index, if one exists in the given directory. If you want to open an existing index without overwriting it, it should be false:
writer = new IndexWriter(indexDirectory, new StandardAnalyzer(Version.LUCENE_36), false, IndexWriter.MaxFieldLength.UNLIMITED);

Simple lucene example not working

I am messing around with Lucene to see how it can help me and I am unable to get a very simple example working. I am using Lucene 5.1
My expectation is that when I search, the document ID of the document I added to the index is printed to the console. Instead I get nothing and no errors (just "Done" printed to the console at the end).
Here is my code:
public static void main(String[] args) throws Exception {
// create structure on file system
IndexWriter writer = createOrGetIndexWriter(LocalDate.now());
writer.close();
// open for writing
writer = createOrGetIndexWriter(LocalDate.now());
Document document = new Document();
document.add(new IntField("test_field", 1, Field.Store.YES));
// write document and close.
writer.addDocument(document);
writer.commit();
writer.close();
// open reader
IndexReader reader = getIndexReader(LocalDate.now());
IndexSearcher indexSearcher = new IndexSearcher(reader);
Query q = new TermQuery(new Term("test_field", "1"));
// callback should be synchronous
indexSearcher.search(q, new SimpleCollector() {
#Override
public void collect(int i) throws IOException {
System.out.println(i);
}
#Override
public boolean needsScores() {
return false;
}
});
System.out.println("Done");
}
/**
 * Opens a writer on the index directory named after the given date
 * (date.toString()), creating the index or appending to an existing one.
 */
public static IndexWriter createOrGetIndexWriter(LocalDate date) throws Exception {
    Directory indexDir = FSDirectory.open(Paths.get(date.toString()));
    IndexWriterConfig writerConfig = new IndexWriterConfig(new StandardAnalyzer())
            .setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    return new IndexWriter(indexDir, writerConfig);
}
/**
 * Opens a reader on the index directory named after the given date
 * (date.toString()).
 */
public static IndexReader getIndexReader(LocalDate date) throws Exception {
    Directory indexDir = FSDirectory.open(Paths.get(date.toString()));
    return DirectoryReader.open(indexDir);
}

We Keep Coding

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

Lucene Apache Only Finds 10 Files - java

When you call indexSearcher.search(query, LuceneConstants.MAX_SEARCH); the second parameter is the maximum number of returned hits; check that value, it is probably 10.

Related

Extract text from pdf file by pdfbox

Lucene IndexWriter.commit() doesn't finished in ubuntu

How to match exact text in Lucene search?

Lucene Apache doesn't keep my old index

Simple lucene example not working

Categories

Resources