Getting TF-IDF values from index

Getting TF-IDF values from index - java

The below code is for getting tf-idf value from indexes. But I get an error while running it, on the line with Correct_ME.
Using Lucene 4.8.
DocIndexing.java
public class DocIndexing {
private DocIndexing() {}
/** Index all text files under a directory.
* #param args
* #throws java.io.IOException */
public static void main(String[] args) throws IOException {
String usage = "java org.apache.lucene.demo.IndexFiles"
+ " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
+ "This indexes the documents in DOCS_PATH, creating a Lucene index"
+ "in INDEX_PATH that can be searched with Searching";
String indexPath = "C:/Users/dell/Documents/NetBeansProjects/IndexingSearching/Index";
String docsPath = "C:/Users/dell/Documents/NetBeansProjects/IndexingSearching/ToBeIndexed";
boolean create = true;
for(int i=0;i<args.length;i++) {
if (null != args[i]) switch (args[i]) {
case "-index":
indexPath = args[i+1];
i++;
break;
case "-docs":
docsPath = args[i+1];
i++;
break;
case "-update":
create = false;
break;
}
}
if (docsPath == null) {
System.err.println("Usage: " + usage);
System.exit(1);
}
final File docDir = new File(docsPath);
if (!docDir.canRead() && !docDir.isDirectory() &&
!docDir.isHidden() &&
!docDir.exists()) {
System.out.println("Document directory '" +docDir.getAbsolutePath()+ "' does not exist or is not readable, please check the path");
System.exit(1);
}
Date start = new Date();
try {
System.out.println("Indexing to directory '" + indexPath + "'...");
Directory dir = FSDirectory.open(new File(indexPath));
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_48);
//Filter filter = new PorterStemFilter();
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, analyzer);
if (create) {
iwc.setOpenMode(OpenMode.CREATE);
} else {
// Add new documents to an existing index:
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
}
try (
IndexWriter writer = new IndexWriter(dir, iwc)) {
indexDocs(writer, docDir);
}
Date end = new Date();
System.out.println(end.getTime() - start.getTime() + " total milliseconds");
} catch (IOException e) {
System.out.println(" caught a " + e.getClass() +
"\n with message: " + e.getMessage());
}
Tf_Idf tfidf = new Tf_Idf();
String field = null,term = null;
tfidf.scoreCalculator(field, term);
}
/*
* #param writer Writer to the index where the given file/dir info will be stored
* #param file The file to index, or the directory to recurse into to find files to index
* #throws IOException If there is a low-level I/O error
*/
static void indexDocs(IndexWriter writer, File file)
throws IOException {
// do not try to index files that cannot be read
if (file.canRead()) {
if (file.isDirectory()) {
String[] files = file.list();
// an IO error could occur
if (files != null) {
for (int i = 0; i < files.length; i++) {
indexDocs(writer, new File(file, files[i]));
}
}
} else {
FileInputStream fis;
try {
fis = new FileInputStream(file);
} catch (FileNotFoundException fnfe) {
return;
}
try {
// make a new, empty document
Document doc = new Document();
// Field termV = new LongField("termVector", file.g)
Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
doc.add(pathField);
Field modifiedField = new LongField("modified", file.lastModified(), Field.Store.NO);
doc.add(modifiedField);
Field titleField = new TextField("title", file.getName(), Field.Store.YES);
doc.add(titleField);
Field contentsField = new TextField("contents", new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8)));
doc.add(contentsField);
//contentsField.setBoost((float)0.5);
//titleField.setBoost((float)2.5);
/* doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));
doc.add(new TextField("title", file.getName(), Field.Store.YES));
doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))));
*/
// StringField..setBoost(1.2F);
if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
// New index, so we just add the document (no old document can be there):
System.out.println("adding " + file);
writer.addDocument(doc);
} else {
// Existing index (an old copy of this document may have been indexed) so
// we use updateDocument instead to replace the old one matching the exact
// path, if present:
System.out.println("updating " + file);
writer.updateDocument(new Term("path", file.getPath()), doc);
}
} finally {
fis.close();
}
}
}
}
}
Tf-idf.java
public class Tf_Idf {
static float tf = 1;
static float idf = 0;
private float tfidf_score;
static float [] tfidf = null;
IndexReader indexReader;
public Tf_Idf() throws IOException {
this.indexReader = DirectoryReader.open(FSDirectory.open(new File("C:/Users/dell/Documents/NetBeansProjects/IndexingSearching/Index")));
}
public void scoreCalculator (String field, String term) throws IOException
{
TFIDFSimilarity tfidfSIM = new DefaultSimilarity();
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
TermsEnum termEnum = MultiFields.getTerms(indexReader, field).iterator(null);
BytesRef bytesRef=null;
while ((bytesRef = termEnum.next()) != null) {
if(bytesRef.utf8ToString().trim().equals(term.trim())) {
if(termEnum.seekExact(bytesRef)) {
idf = tfidfSIM.idf(termEnum.docFreq(), indexReader.numDocs());
DocsEnum docsEnum = termEnum.docs(liveDocs, null);
if(docsEnum != null) {
int doc=0;
while((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
tf = tfidfSIM.tf(docsEnum.freq());
tfidf_score = tf * idf ;
System.out.println(" -tfidf_score-" + tfidf_score);
}
}
}
}
}
}
}

It's obvious that you pass to MultiFields method a null IndexReader
IndexReader reader = null;
tfidf.scoreCalculator( reader, field,term);
You need to write something like this:
IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(PATH_TO_LUCENE_INDEX)));
tfidf.scoreCalculator( reader, field,term);
You need to repalce PATH_TO_LUCENE_INDEX with real path, of course.
Another problem, that I see - you open IndexReader in Tf_Idf, but don't use it anywhere, may be it's a good idea to remove it or use it, inside of scoreCalculator method, e.g.
tfidf.scoreCalculator(field,term);
but in method use field of this class, - this.indexReader instead of just indexReader that you try to pass inside method scoreCalculator
UPD
public Tf_Idf() throws IOException {
this.reader = DirectoryReader.open(FSDirectory.open(new File("Index")));
}
In this code, you need to replace "Index" with real path to your Lucene index, e.g. - /home/user/index or C://index or wherever you have it.

Related

Lucene issues in indexing processus

I have a problem with lucene indexing, my documments conains TEXT and FIELD and DocNO. AND my queries containt Title and description. I have a Judgment of relevance. the problem is when i have calculating the MAP, it is very small (0.017). however my frieds have fiding a value of 0.13. I think that i have a problrm with IndexFiles class?? can you help me ? ^-^
public class IndexFiles {
public IndexFiles() {}
public static void main(String[] args) throws IOException, ParseException {
ReadDocuments t = new ReadDocuments();
List<DocumentsParser> docs = new ArrayList<>();
t.readXml(docs, "documents");
final String FIELD_PATH = "path";
final String FIELD_CONTENTS = "contents";
String indexPath = "index1";
Directory dir = FSDirectory.open(new File(indexPath));
Reader r=new FileReader(new File("stopwords.txt"));
StandardAnalyzer analyzer=new StandardAnalyzer(Version.LUCENE_40,r);
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40,analyzer);
/* use BM25 similarity*/
Similarity bm25similarity = new BM25Similarity();
iwc.setSimilarity(bm25similarity);
IndexWriter indexWriter = new IndexWriter(dir, iwc);
for (DocumentsParser doc : docs){
Document document = new Document();
document.add(new StringField("DocNo", doc.getDOCNO(), Field.Store.YES));
document.add(new TextField("TEXT", doc.getTEXT()+" "+doc.getHEAD(),Field.Store.YES));
indexWriter.addDocument(document); }
indexWriter.close();}}
/class SearchFiles/
public class SearchFiles {
public static void main(String[] args) throws Exception {
SearchFiles ch=new SearchFiles();
searchStemTfidfQLong();
}
SearchFiles() {}
public static void searchStemTfidfQLong() throws ParseException, IOException{
String index = "index1";
String field = "TEXT";
int hitsPerPage = 1000;
IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
IndexSearcher searcher = new IndexSearcher(reader);
/* use BM25 similarity*/
Similarity bm25similarity = new BM25Similarity();
searcher.setSimilarity(bm25similarity);
Reader r=new FileReader(new File("stopwords.txt"));
StandardAnalyzer analyzer=new StandardAnalyzer(Version.LUCENE_40,r);
QueryParser parser = new QueryParser(Version.LUCENE_40,field,analyzer);
int i=0;
File file = new File("fichier.txt");
FileWriter writere=new FileWriter(file.getAbsoluteFile(), true);
for(Topic topic : Parser.getQuerysTopics(Parser.filename)){
/*query chort*/
String queryChort=topic.getTitle();
queryChort=queryChort.replaceAll("([<>\\(\\):/\\\\',\\s\"])", " ").trim();
i++;
//writere.write(queryChort+"\n");
Query query = parser.parse(queryChort);
System.out.println("Query number : "+(i));
searcher.search(query,1000);
doSearch(i, searcher, query, hitsPerPage);
}
reader.close();
writere.close();
}
public static void doSearch(int idReq, IndexSearcher searcher, Query query, int hitsPerPage) throws IOException {
TopDocs results = searcher.search(query, null, hitsPerPage);
System.out.println(query);
ScoreDoc[] hits = results.scoreDocs;
int numTotalHits = results.totalHits;
System.out.println(numTotalHits + " total matching documents");
int start = 1;
int end = Math.min(numTotalHits, hitsPerPage);
File file = new File("file.txt");
FileWriter writer=new FileWriter(file.getAbsoluteFile(), true);
File file1 = new File("fichier.txt");
FileWriter writere=new FileWriter(file1.getAbsoluteFile(), true);
for (int i = start; i < end; i++) {
Document doc = searcher.doc(hits[i].doc);
String DocNo = doc.get("DocNo");
writere.write(DocNo+"\n");
if (DocNo != null) {
writer.write(idReq+" 0 "+DocNo+" "+String.format("%.6f", new Double(hits[i].score))+" "+ i + " "+"ScoreID"+"\n");
} else {
System.out.println((i + 1) + ". " + "No DocNo for this document"); } }
writer.close();
writere.close();}}

Replace Data to word Document In Alfresco using java code excluding junk characters

I am doing Bulk Upload Task in Alfresco.
Before this i created custom action to call java code, i also successfully read data from excel sheet, and i found node reference of target document as well as source Document. Using that node reference i am also able to create new multiple Documents.
Now My requirement is, I want to replace Excel Data in that newly created Document. I tried to replace it, But It replacing the String only in First line of document, and it also deleting Rest of the existing contents inside newly created document. I have written Below code for this.
In below code i am first simply trying to replace some hard coded data to the Document.
But My requirement is i want to replace the data inside document which i already read from excel file.
Java Code:
public class MoveReplacedActionExecuter extends ActionExecuterAbstractBase {
InputStream is;
Cell cell = null;
public static final String NAME = "move-replaced";
private FileFolderService fileFolderService;
private NodeService nodeService;
private ContentService contentService;
private SearchService searchService;
#Override
protected void addParameterDefinitions(List < ParameterDefinition > paramList) {
}
public void executeImpl(Action ruleAction, NodeRef actionedUponNodeRef) {
try {
ContentReader contentReader = contentService.getReader(actionedUponNodeRef, ContentModel.PROP_CONTENT);
is = contentReader.getContentInputStream();
} catch (NullPointerException ne) {
System.out.println("Null Pointer Exception" + ne);
}
try {
Workbook workbook = new XSSFWorkbook(is);
Sheet firstSheet = workbook.getSheetAt(0);
Iterator < Row > iterator = firstSheet.rowIterator();
while (iterator.hasNext()) {
ArrayList < String > al = new ArrayList < > ();
System.out.println("");
Row nextRow = iterator.next();
Iterator < Cell > cellIterator = nextRow.cellIterator();
while (cellIterator.hasNext()) {
cell = cellIterator.next();
switch (cell.getCellType()) {
case Cell.CELL_TYPE_STRING:
System.out.print("\t" + cell.getStringCellValue());
al.add(cell.getStringCellValue());
break;
case Cell.CELL_TYPE_BOOLEAN:
System.out.print("\t" + cell.getBooleanCellValue());
al.add(String.valueOf(cell.getBooleanCellValue()));
break;
case Cell.CELL_TYPE_NUMERIC:
System.out.print("\t" + cell.getNumericCellValue());
al.add(String.valueOf(cell.getNumericCellValue()));
break;
}
}
}
is.close();
} catch (Exception e) {
e.printStackTrace();
}
String query = "PATH:\"/app:company_home/cm:Dipak/cm:OfferLetterTemplate.doc\"";
SearchParameters sp = new SearchParameters();
StoreRef storeRef = new StoreRef(StoreRef.PROTOCOL_WORKSPACE, "SpacesStore");
sp.addStore(storeRef);
sp.setLanguage(SearchService.LANGUAGE_LUCENE);
sp.setQuery(query);
ResultSet resultSet = searchService.query(sp);
System.out.println("Result Set" + resultSet.length());
NodeRef sourceNodeRef = null;
for (ResultSetRow row: resultSet) {
NodeRef currentNodeRef = row.getNodeRef();
sourceNodeRef = currentNodeRef;
System.out.println(currentNodeRef.toString());
}
NodeRef n = new NodeRef("workspace://SpacesStore/78342318-37b8-4b42-aadc-bb0ed5d413d9");
try {
org.alfresco.service.cmr.model.FileInfo fi = fileFolderService.copy(sourceNodeRef, n, "JustCreated" + Math.random() + ".doc");
NodeRef newNode = fi.getNodeRef();
QName TYPE_AUTHORTY = QName.createQName("sunpharma.hr.model", "hrdoctype");
nodeService.setType(newNode, TYPE_AUTHORTY);
ContentReader contentReader1 = contentService.getReader(newNode, ContentModel.PROP_CONTENT);
InputStream is2 = contentReader1.getContentInputStream();
POIFSFileSystem fs = new POIFSFileSystem(is2);
HWPFDocument doc = new HWPFDocument(fs);
doc = replaceText1(doc, "Company", "Datamatics");
ContentWriter writerDoc = contentService.getWriter(newNode, ContentModel.PROP_CONTENT, true);
writerDoc.putContent(doc.getDocumentText());
} catch (FileExistsException | FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
private static HWPFDocument replaceText1(HWPFDocument doc, String findText, String replaceText) {
System.out.println("In the method replacetext" + replaceText);
Range r1 = doc.getRange();
System.out.println("Range of Doc : " + r1);
for (int i = 0; i < r1.numSections(); ++i) {
Section s = r1.getSection(i);
for (int x = 0; x < s.numParagraphs(); x++) {
Paragraph p = s.getParagraph(x);
for (int z = 0; z < p.numCharacterRuns(); z++) {
CharacterRun run = p.getCharacterRun(z);
String text = run.text();
if (text.contains(findText)) {
run.replaceText(findText, replaceText);
} else {
System.out.println("NO text found");
}
}
}
}
return doc;
}
public void setFileFolderService(FileFolderService fileFolderService) {
this.fileFolderService = fileFolderService;
}
public void setNodeService(NodeService nodeService) {
this.nodeService = nodeService;
}
public void setContentService(ContentService contentService) {
this.contentService = contentService;
}
public void setSearchService(SearchService searchService) {
this.searchService = searchService;
}
}

Its not possible to take direct file stream object in alfresco.
so i created one file at local drive, in background i performed all replacement operations. and after that i read all data using file input stream object. and later i used file that stream with node.
and it gave me my desired output. :)

How does regex query work on lucene?

I am trying to implement luecene search engine in my application.
I am using lucene 5.4.1
I have successfully implemented wildequeries and normal queries of lucene.
But my main focus is to search specific text in a text file with regex patterns.
Index Writer code:
public IndexWriter generateIndex(String docsPath) throws IOException {
String indexPath = System.getProperty("java.io.tmpdir") +File.separator+"indexDirectory";
if (indexPath == null) {
throw new IOException("System property 'java.io.tmpdir' does not specify a tmp dir");
}
File tmpDir = new File(indexPath);
if (!tmpDir.exists()) {
boolean created = tmpDir.mkdirs();
if (!created) {
throw new IOException("Unable to create tmp dir " + tmpDir);
}
}
boolean create = true;
final Path docDir = Paths.get(docsPath);
if (!Files.isReadable(docDir)) {
System.out.println("Document directory '" + docDir.toAbsolutePath()
+ "' does not exist or is not readable, please check the path");
System.exit(1);
}
Date start = new Date();
try {
System.out.println("Indexing to directory '" + indexPath + "'...");
Directory dir = FSDirectory.open(Paths.get(indexPath));
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
if (create) {
iwc.setOpenMode(OpenMode.CREATE);
} else {
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
}
IndexWriter writer = new IndexWriter(dir, iwc);
indexDocs(writer, docDir);
setIndexWriter(writer);
Date end = new Date();
System.out.println(end.getTime() - start.getTime() + " total milliseconds");
writer.close();
} catch (IOException e) {
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
}
return getIndexWriter();
}
static void indexDocs(final IndexWriter writer, Path path) throws IOException {
if (Files.isDirectory(path)) {
Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
#Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
try {
indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
} catch (IOException ignore) {
// don't index files that can't be read.
}
return FileVisitResult.CONTINUE;
}
});
} else {
indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
}
}
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
try (InputStream stream = Files.newInputStream(file)) {
Document doc = new Document();
Field pathField = new StringField("path", file.toString(), Field.Store.NO);
doc.add(pathField);
doc.add(new LongField("modified", lastModified, Field.Store.NO));
doc.add(new TextField("contents",
new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));
if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
System.out.println("adding " + file);
writer.addDocument(doc);
} else {
System.out.println("updating " + file);
writer.updateDocument(new Term("path", file.toString()), doc);
}
}
}
Index Searching Code:
public IndexReader searchExecutor(String index, String queryString, RegexCapabilities capability) throws Exception {
String field = "contents";
String queries = null;
boolean raw = false;
int hitsPerPage = Integer.MAX_VALUE;
IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer();
BufferedReader in = null;
Query q = new RegexpQuery(new Term("text", queryString));
q = q.rewrite(reader);
RegexQuery query = new RegexQuery(new Term("\\s*(FIND|find)"));
if (capability != null)
query.setRegexImplementation(capability);
System.out.println("Searching for: " + query.toString(field));
searcher.search(query, null, 1000);
doSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);
//reader.close();
return reader;
}
public static void doSearch(BufferedReader in, IndexSearcher searcher, Query query, int hitsPerPage, boolean raw,
boolean interactive)
throws IOException {
TopDocs results = searcher.search(query, 5 * hitsPerPage);
ScoreDoc[] hits = results.scoreDocs;
//generateIndex.deleteDocuments(query);
//generateIndex.getDirectory();
// TermsEnum.totalTermFreq();
int numTotalHits = results.totalHits;
System.out.println(numTotalHits + " total matching documents");
int start = 0;
int end = Math.min(numTotalHits, hitsPerPage);
for (int i = start; i < end; i++) {
Document doc = searcher.doc(hits[i].doc);
String path = doc.get("path");
File file = new File(path);
if (path != null) {
System.out.println((i + 1) + ". " + path);
String title = doc.get("title");
if (title != null) {
System.out.println(" Title: " + doc.get("title"));
}
} else {
System.out.println((i + 1) + ". " + "No path for this document");
}
}
}
Please help.

Your question is about search with regular expressions in lucene.
You are using RegexQuery which is deprecated so try RegexpQuery
Your regEx-example starts with \s* but you do not use KeywordTokenizer. Most other tokenizer will remove (aka "split at") whitespace
Your regEx-example is not purely lower case. But standard analyzer contains LowerCaseFilter. Be aware: your regEx will go directly against the tokens of your index (not against the original text)
--> read Supported RegExp syntax and syntax in ES and TestRegexpRandom (test class) and play with https://github.com/DmitryKey/luke on your index.

Why indexer doesn't search Persian files?

I use lucene 3 for indexing some txt file like this.
public static void main(String[] args) throws Exception {
String indexDir = "file input";
String dataDir = "file input";
long start = System.currentTimeMillis();
indexer indexer = new indexer(indexDir);
int numIndexed, cnt;
try {
numIndexed = indexer.index(dataDir, new TextFilesFilter());
cnt = indexer.getHitCount("mycontents", "شهردار");
System.out.println("count of search in contents: " + cnt);
} finally {
indexer.close();
}
long end = System.currentTimeMillis();
System.out.println("Indexing " + numIndexed + " files took "
+ (end - start) + " milliseconds");
}
getHitCount function returns number of hits by an English word but by Persian word, it returns zero!
public int getHitCount(String fieldName, String searchString)
throws IOException, ParseException {
IndexSearcher searcher = new IndexSearcher(directory);
Term t = new Term(fieldName, searchString);
Query query = new TermQuery(t);
int hitCount = searcher.search(query, 1).totalHits;
searcher.close();
return hitCount;
}
How to set utf-8 in my project? I use netbeans and create a simple java project.
I just need a simple search in files!
It's my indexer class:
private IndexWriter writer;
private Directory directory;
public indexer(String indexDir) throws IOException {
directory = FSDirectory.open(new File(indexDir));
writer = new IndexWriter(directory,
new StandardAnalyzer(
Version.LUCENE_30),
true,
IndexWriter.MaxFieldLength.UNLIMITED);
}
public void close() throws IOException {
writer.close();
}
public int index(String dataDir, FileFilter filter)
throws Exception {
File[] files = new File(dataDir).listFiles();
for (File f : files) {
if (!f.isDirectory()
&& !f.isHidden()
&& f.exists()
&& f.canRead()
&& (filter == null || filter.accept(f))) {
indexFile(f);
}
}
return writer.numDocs();
}
private static class TextFilesFilter implements FileFilter {
public boolean accept(File path) {
return path.getName().toLowerCase()
.endsWith(".txt");
}
}
protected Document getDocument(File f) throws Exception {
Document doc = new Document();
doc.add(new Field("mycontents", new FileReader(f)));
doc.add(new Field("filename", f.getName(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("fullpath", f.getCanonicalPath(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
return doc;
}
private void indexFile(File f) throws Exception {
System.out.println("Indexing " + f.getCanonicalPath());
Document doc = getDocument(f);
writer.addDocument(doc);
}

I suspect that the issue isn't Lucene's encoding, per se, but the FileReader. From the FileReader docs:
The constructors of this class assume that the default character encoding and the default byte-buffer size are appropriate.
The default character encoding is probably not appropriate, in this case.
Instead of:
doc.add(new Field("mycontents", new FileReader(f)));
try (assuming the file to be indexed is UTF-8 encoded):
doc.add(new Field("mycontents", new InputStreamReader(new FileInputStream(f), "UTF8")));

Creating a jsp search form to run a java Search program

The background info here is that I have a working Indexer and Search (in java) that indexes and searches a file directory for the filenames and then copies the files to a "Results" Directory.
What I need/ don't have much experience in is writing jsp files. I need the jsp file to have a search bar for the text and then a search button. When text is entered in the bar, and the button is clicked, I need it to run my search program with the entered text as an arg.
I have added the IndexFiles and the SearchFiles classes for reference.
Please explain with a good example if you can help out!
public class SearchFiles {
static File searchDirectory = new File(
"C:\\Users\\flood.j.2\\Desktop\\IndexSearch\\Results");
static String v = new String();
static String path = null;
String title = null;
File addedFile = null;
OutputStream out = null;
String dirName = "C:\\Users\\flood.j.2\\Desktop\\IndexSearch\\Results";
public static void main(String[] args) throws Exception {
String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string]";
if (args.length > 0
&& ("-h".equals(args[0]) || "-help".equals(args[0]))) {
System.out.println(usage);
System.exit(0);
}
for (int j = 5; j < args.length; j++) {
v += args[j] + " ";
}
String index = "index";
String field = "contents";
String queries = null;
boolean raw = false;
String queryString = null;
int hits = 100;
for (int i = 0; i < args.length; i++) {
if ("-index".equals(args[i])) {
index = args[i + 1];
i++;
} else if ("-field".equals(args[i])) {
field = args[i + 1];
i++;
} else if ("-queries".equals(args[i])) {
queries = args[i + 1];
i++;
} else if ("-query".equals(args[i])) {
queryString = v;
i++;
}
}
IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(
index)));
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
BufferedReader in = null;
if (queries != null) {
in = new BufferedReader(new InputStreamReader(new FileInputStream(
queries), "UTF-8"));
} else {
in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
}
QueryParser parser = new QueryParser(Version.LUCENE_40, field, analyzer);
for (int m = 0; m < 2; m++) {
if (queries == null && queryString == null) {
System.out.println("Enter query: ");
}
String line = queryString != null ? queryString : in.readLine();
if (line == null || line.length() == -1) {
break;
}
line = line.trim();
if (line.length() == 0) {
break;
}
Query query = parser.parse(line);
System.out.println("Searching for: " + query.toString(field));
doPagingSearch(in, searcher, query, hits, raw, queries == null
&& queryString == null);
if (queryString == null) {
break;
}
}
reader.close();
}
public static void doPagingSearch(BufferedReader in,
IndexSearcher searcher, Query query, int hitsPerPage, boolean raw,
boolean interactive) throws IOException {
// Collect enough docs to show 500 pages
TopDocs results = searcher.search(query, 5 * hitsPerPage);
ScoreDoc[] hits = results.scoreDocs;
int numTotalHits = results.totalHits;
System.out.println(numTotalHits + " total matching documents");
int start = 0;
int end = Math.min(numTotalHits, hitsPerPage);
FileUtils.deleteDirectory(searchDirectory);
while (true) {
for (int i = start; i < end; i++) {
Document doc = searcher.doc(hits[i].doc);
path = doc.get("path");
if (path != null) {
System.out.println((i + 1) + ". " + path);
File addFile = new File(path);
try {
FileUtils.copyFileToDirectory(addFile, searchDirectory);
} catch (IOException e) {
e.printStackTrace();
}
}
}
if (!interactive || end == 0) {
break;
}
System.exit(0);
}
}
}
public class IndexFiles {
private IndexFiles() {
}
public static void main(String[] args) {
String usage = "java org.apache.lucene.demo.IndexFiles"
+ " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
+ "This indexes the documents in DOCS_PATH, creating a Lucene index"
+ "in INDEX_PATH that can be searched with SearchFiles";
String indexPath = null;
String docsPath = null;
boolean create = true;
for (int i = 0; i < args.length; i++) {
if ("-index".equals(args[i])) {
indexPath = args[i + 1];
i++;
} else if ("-docs".equals(args[i])) {
docsPath = args[i + 1];
i++;
} else if ("-update".equals(args[i])) {
create = false;
}
}
if (docsPath == null) {
System.err.println("Usage: " + usage);
System.exit(1);
}
final File docDir = new File(docsPath);
if (!docDir.exists() || !docDir.canRead()) {
System.out
.println("Document directory '"
+ docDir.getAbsolutePath()
+ "' does not exist or is not readable, please check the path");
System.exit(1);
}
Date start = new Date();
try {
System.out.println("Indexing to directory '" + indexPath + "'...");
Directory dir = FSDirectory.open(new File(indexPath));
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40,
analyzer);
if (create) {
iwc.setOpenMode(OpenMode.CREATE);
} else {
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
}
IndexWriter writer = new IndexWriter(dir, iwc);
indexDocs(writer, docDir);
writer.close();
Date end = new Date();
System.out.println(end.getTime() - start.getTime()
+ " total milliseconds");
} catch (IOException e) {
System.out.println(" caught a " + e.getClass()
+ "\n with message: " + e.getMessage());
}
}
static void indexDocs(IndexWriter writer, File file) throws IOException {
if (file.canRead()) {
if (file.isDirectory()) {
String[] files = file.list();
if (files != null) {
for (int i = 0; i < files.length; i++) {
indexDocs(writer, new File(file, files[i]));
}
}
} else {
FileInputStream fis;
try {
fis = new FileInputStream(file);
} catch (FileNotFoundException fnfe) {
return;
}
try {
Document doc = new Document();
Field pathField = new StringField("path",
file.getAbsolutePath(), Field.Store.YES);
doc.add(pathField);
doc.add(new LongField("modified", file.lastModified(),
Field.Store.NO));
doc.add(new TextField("title", file.getName(), null));
System.out.println(pathField);
if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
System.out.println("adding " + file);
writer.addDocument(doc);
} else {
System.out.println("updating " + file);
writer.updateDocument(new Term("path", file.getPath()),
doc);
}
} finally {
fis.close();
}
}
}
}
}

First, you should definitely do this in a servlet rather than a JSP. Putting lots of logic in JSP is bad practice. (See the servlets info page).
Second, it would probably be better on performance to make a cronjob (Linux) or Task (Windows) to run the search program every hour and store the results in a database and just have your servlet pull from there rather than allow the user to initiate the search program.

We Keep Coding

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

Getting TF-IDF values from index - java

Related

Lucene issues in indexing processus

Replace Data to word Document In Alfresco using java code excluding junk characters

How does regex query work on lucene?

Why indexer doesn't search Persian files?

Creating a jsp search form to run a java Search program

Categories

Resources