Lucene indexing of HTML files - Java

Dear users, I am working with Apache Lucene for indexing and searching. I have to index HTML files stored on the local disk of the computer, indexing both the file names and the contents of the HTML files. I am able to store the file names in the Lucene index, but not the HTML file contents, which should cover not only the text but the entire page (images, links, and URLs). Also, how can I access the contents from those indexed files?
For indexing I am using the following code:
File indexDir = new File(indexpath);
File dataDir = new File(datapath);
String suffix = ".htm";

IndexWriter indexWriter = new IndexWriter(
        FSDirectory.open(indexDir),
        new SimpleAnalyzer(),
        true,
        IndexWriter.MaxFieldLength.LIMITED);
indexWriter.setUseCompoundFile(false);

indexDirectory(indexWriter, dataDir, suffix);

numIndexed = indexWriter.maxDoc();
indexWriter.optimize();
indexWriter.close();
private void indexDirectory(IndexWriter indexWriter, File dataDir, String suffix) throws IOException {
    try {
        for (File f : dataDir.listFiles()) {
            if (f.isDirectory()) {
                indexDirectory(indexWriter, f, suffix);
            } else {
                indexFileWithIndexWriter(indexWriter, f, suffix);
            }
        }
    } catch (Exception ex) {
        System.out.println("exception 2 is " + ex);
    }
}
private void indexFileWithIndexWriter(IndexWriter indexWriter, File f,
        String suffix) throws IOException {
    try {
        if (f.isHidden() || f.isDirectory() || !f.canRead() || !f.exists()) {
            return;
        }
        if (suffix != null && !f.getName().endsWith(suffix)) {
            return;
        }
        Document doc = new Document();
        doc.add(new Field("contents", new FileReader(f)));
        // java.io.File has no getFileName(); getName() returns the file's name
        doc.add(new Field("filename", f.getName(),
                Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.addDocument(doc);
    } catch (Exception ex) {
        System.out.println("exception 4 is " + ex);
    }
}
Thanks in advance.

This line of code is the reason why your contents are not being stored:
doc.add(new Field("contents", new FileReader(f)));
This Field constructor DOES NOT STORE the content being indexed: a Reader-based field is tokenized for search, but its text is never saved in the index.
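If you want the text to be retrievable as well as searchable, read the file into a String yourself and add it with Field.Store.YES. A minimal sketch that drops into indexFileWithIndexWriter above, assuming the same Lucene 3.x-era API the question uses:
// Read the whole file into a String so the text can be stored, not just tokenized.
StringBuilder sb = new StringBuilder();
BufferedReader br = new BufferedReader(new FileReader(f));
String line;
while ((line = br.readLine()) != null) {
    sb.append(line).append('\n');
}
br.close();

// Store.YES keeps the original text in the index, so a search hit
// can get it back later via doc.get("contents").
doc.add(new Field("contents", sb.toString(), Field.Store.YES, Field.Index.ANALYZED));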
If you are trying to index HTML files, try using JTidy. It will make the process much easier.
Sample code:
public class JTidyHTMLHandler {

    public org.apache.lucene.document.Document getDocument(InputStream is) throws DocumentHandlerException {
        Tidy tidy = new Tidy();
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);
        org.w3c.dom.Document root = tidy.parseDOM(is, null);
        Element rawDoc = root.getDocumentElement();

        org.apache.lucene.document.Document doc =
                new org.apache.lucene.document.Document();

        String body = getBody(rawDoc);
        if ((body != null) && (!body.equals(""))) {
            doc.add(new Field("contents", body, Field.Store.NO, Field.Index.ANALYZED));
        }
        return doc;
    }

    protected String getTitle(Element rawDoc) {
        if (rawDoc == null) {
            return null;
        }
        String title = "";
        NodeList children = rawDoc.getElementsByTagName("title");
        if (children.getLength() > 0) {
            Element titleElement = ((Element) children.item(0));
            Text text = (Text) titleElement.getFirstChild();
            if (text != null) {
                title = text.getData();
            }
        }
        return title;
    }

    protected String getBody(Element rawDoc) {
        if (rawDoc == null) {
            return null;
        }
        String body = "";
        NodeList children = rawDoc.getElementsByTagName("body");
        if (children.getLength() > 0) {
            body = getText(children.item(0));
        }
        return body;
    }

    protected String getText(Node node) {
        NodeList children = node.getChildNodes();
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            switch (child.getNodeType()) {
                case Node.ELEMENT_NODE:
                    sb.append(getText(child));
                    sb.append(" ");
                    break;
                case Node.TEXT_NODE:
                    sb.append(((Text) child).getData());
                    break;
            }
        }
        return sb.toString();
    }
}
To get an InputStream from a URL:
URL url = new URL(htmlURLlocation);
URLConnection connection = url.openConnection();
InputStream stream = connection.getInputStream();
To get an InputStream from a File:
InputStream stream = new FileInputStream(new File(htmlFile));
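To tie this back to the indexing loop in the question, indexFileWithIndexWriter could delegate content extraction to the handler. A sketch, assuming the JTidyHTMLHandler above (and its DocumentHandlerException) are on the classpath:
// Build the Lucene document for one HTML file via JTidyHTMLHandler.
JTidyHTMLHandler handler = new JTidyHTMLHandler();
InputStream stream = new FileInputStream(f);
org.apache.lucene.document.Document doc = handler.getDocument(stream);
stream.close();

// The handler only fills "contents"; keep the filename field from the question.
doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.ANALYZED));
indexWriter.addDocument(doc);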

Related

Converting a CSV file to hierarchical XML with Java

We have a program in Java that needs to convert a CSV file to hierarchical XML.
The output should look like this:
<?xml version="1.0" encoding="UTF-8"?>
<UteXmlComuniction xmlns="http://www....../data">
  <ClientGeneralData>
    <Client>
      <pfPg></pfPg>
      <name>Arnold</name>
      <Family>Bordon</Family>
    </Client>
    <Contract>
      <ContractDetail>
        <Contract>100020</Contract>
        <ContractYear>2019</ContractYear>
      </ContractDetail>
    </Contract>
  </ClientGeneralData>
</UteXmlComuniction>
But we are flexible about the CSV layout; we can define it as we want. I thought maybe something like this would work:
"UteXmlComuniction/ClientGeneralData/Client/pfPg", "UteXmlComuniction/ClientGeneralData/Client/name",
"UteXmlComuniction/ClientGeneralData/Client/Family", ...
This is our code, but it just gives me flat XML. Also, I cannot put the "/" character in the CSV file, because the program does not accept that character.
public class XMLCreators {
    // Protected properties
    protected DocumentBuilderFactory domFactory = null;
    protected DocumentBuilder domBuilder = null;

    public XMLCreators() {
        try {
            domFactory = DocumentBuilderFactory.newInstance();
            domBuilder = domFactory.newDocumentBuilder();
        } catch (FactoryConfigurationError exp) {
            System.err.println(exp.toString());
        } catch (ParserConfigurationException exp) {
            System.err.println(exp.toString());
        } catch (Exception exp) {
            System.err.println(exp.toString());
        }
    }

    public int convertFile(String csvFileName, String xmlFileName,
            String delimiter) {
        int rowsCount = -1;
        try {
            Document newDoc = domBuilder.newDocument();
            // Root element
            Element rootElement = newDoc.createElement("XMLCreators");
            newDoc.appendChild(rootElement);

            // Read the CSV file
            BufferedReader csvReader = new BufferedReader(new FileReader(csvFileName));
            int line = 0;
            List<String> headers = new ArrayList<String>(5);
            String text = null;
            while ((text = csvReader.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(text, delimiter, false);
                String[] rowValues = new String[st.countTokens()];
                int index = 0;
                while (st.hasMoreTokens()) {
                    String next = st.nextToken();
                    rowValues[index++] = next;
                }
                if (line == 0) { // Header row
                    for (String col : rowValues) {
                        headers.add(col);
                    }
                } else { // Data row
                    rowsCount++;
                    Element rowElement = newDoc.createElement("row");
                    rootElement.appendChild(rowElement);
                    for (int col = 0; col < headers.size(); col++) {
                        String header = headers.get(col);
                        String value = null;
                        if (col < rowValues.length) {
                            value = rowValues[col];
                        } else {
                            // ?? Default value
                            value = "";
                        }
                        Element curElement = newDoc.createElement(header);
                        curElement.appendChild(newDoc.createTextNode(value));
                        rowElement.appendChild(curElement);
                    }
                }
                line++;
            }

            ByteArrayOutputStream baos = null;
            OutputStreamWriter osw = null;
            try {
                baos = new ByteArrayOutputStream();
                osw = new OutputStreamWriter(baos);
                TransformerFactory tranFactory = TransformerFactory.newInstance();
                Transformer aTransformer = tranFactory.newTransformer();
                aTransformer.setOutputProperty(OutputKeys.INDENT, "yes");
                aTransformer.setOutputProperty(OutputKeys.METHOD, "xml");
                aTransformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
                Source src = new DOMSource(newDoc);
                Result result = new StreamResult(osw);
                aTransformer.transform(src, result);
                osw.flush();
                System.out.println(new String(baos.toByteArray()));
            } catch (Exception exp) {
                exp.printStackTrace();
            } finally {
                try {
                    osw.close();
                } catch (Exception e) {
                }
                try {
                    baos.close();
                } catch (Exception e) {
                }
            }
            // Output to console for testing
            // Result result = new StreamResult(System.out);
        } catch (IOException exp) {
            System.err.println(exp.toString());
        } catch (Exception exp) {
            System.err.println(exp.toString());
        }
        return rowsCount;
    }
}
Do you have any suggestions on how I should modify the code, or how I can change my CSV, in order to get hierarchical XML?
csv:
pfPg;name;Family;Contract;ContractYear
There are several libraries for reading CSV in Java. Store the values in a container, e.g. a HashMap.
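For the reading step, a minimal sketch with plain JDK classes (assuming the two-line, semicolon-separated layout shown above; the class and method names are illustrative):
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.HashMap;
import java.util.Map;

public class CsvRowReader {
    // Reads a header row plus one value row into a map, e.g.
    // {pfPg=..., name=Arnold, Family=Bordon, Contract=100020, ContractYear=2019}
    public static Map<String, String> readRow(String csvFile) throws Exception {
        try (BufferedReader in = new BufferedReader(new FileReader(csvFile))) {
            String[] headers = in.readLine().split(";");
            String[] values = in.readLine().split(";");
            Map<String, String> row = new HashMap<>();
            for (int i = 0; i < headers.length && i < values.length; i++) {
                row.put(headers[i], values[i]);
            }
            return row;
        }
    }
}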
Then create Java classes representing your XML structure:
class Client {
    private String pfPg;
    private String name;
    private String Family;
}

class ClientGeneralData {
    private Client client;
    private Contract contract;
}
Do the mapping from CSV to your Java classes by writing custom code or using a mapper like Dozer. Then use XML binding with Jackson or JAXB to create XML from the Java objects.
Jackson xml
Dozer HowTo
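To make the binding step concrete, here is a minimal JAXB sketch. The element names come from the question; the class layout, field defaults, and a Java 8 / JAXB-on-classpath environment are assumptions:
import javax.xml.bind.JAXBContext;
import javax.xml.bind.Marshaller;
import javax.xml.bind.annotation.*;

@XmlRootElement(name = "UteXmlComuniction")
@XmlAccessorType(XmlAccessType.FIELD)
class UteXmlComuniction {
    @XmlElement(name = "ClientGeneralData")
    ClientGeneralData data = new ClientGeneralData();
}

@XmlAccessorType(XmlAccessType.FIELD)
class ClientGeneralData {
    @XmlElement(name = "Client")
    Client client = new Client();
    // Contract would follow the same pattern
}

@XmlAccessorType(XmlAccessType.FIELD)
class Client {
    String pfPg;
    String name;
    @XmlElement(name = "Family")
    String family;
}

public class CsvToXmlSketch {
    public static void main(String[] args) throws Exception {
        // Values would come from the parsed CSV row; hard-coded for the sketch.
        UteXmlComuniction root = new UteXmlComuniction();
        root.data.client.name = "Arnold";
        root.data.client.family = "Bordon";

        Marshaller m = JAXBContext.newInstance(UteXmlComuniction.class).createMarshaller();
        m.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE);
        m.marshal(root, System.out); // nested elements instead of flat <row>s
    }
}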

Counting the number of attachments on a PDF using iText

I am trying to count the number of attachments on a PDF to verify our attachment code. The code I have works most of the time, but it recently started failing as the number and size of the attachments went up. Example: I have one PDF with 700 attachments totaling 1.6 GB, and another with 65 attachments of around 10 MB. The 65-attachment file was built up incrementally, file by file. At 64 files (about 9.8 MB) the routine counted fine; adding file 65 (about 0.5 MB) made the routine fail.
This is on itextpdf-5.5.9.jar under JRE 1.8.0_162.
We are still testing different combinations of file counts and sizes to see where it breaks.
private static String CountFiles() throws IOException, DocumentException {
    boolean errorFound = true;
    PdfDictionary root;
    PdfDictionary names;
    PdfDictionary embeddedFiles;
    PdfReader reader = null;
    String theResult = "unknown";
    try {
        if (!theBaseFile.toLowerCase().endsWith(".pdf"))
            theResult = "file not PDF";
        else {
            reader = new PdfReader(theBaseFile);
            root = reader.getCatalog();
            names = root.getAsDict(PdfName.NAMES);
            if (names == null)
                theResult = "0";
            else {
                embeddedFiles = names.getAsDict(PdfName.EMBEDDEDFILES);
                PdfArray namesArray = embeddedFiles.getAsArray(PdfName.NAMES);
                theResult = String.format("%d", namesArray.size() / 2);
            }
            reader.close();
            errorFound = false;
        }
    } catch (Exception e) {
        theResult = "unknown";
    } finally {
        if (reader != null)
            reader.close();
    }
    if (errorFound)
        sendError(theResult);
    return theResult;
}
private static String AttachFileInDir() throws IOException, DocumentException {
    String theResult = "unknown";
    String outputFile = theBaseFile.replaceFirst("(?i).pdf$", ".attach.pdf");
    int maxFiles = 1000;
    int fileCount = 1;
    PdfReader reader = null;
    PdfStamper stamper = null;
    try {
        if (!theBaseFile.toLowerCase().endsWith(".pdf"))
            theResult = "basefile not PDF";
        else if (theFileDir.length() == 0)
            theResult = "no attach directory";
        else if (!Files.isDirectory(Paths.get(theFileDir)))
            theResult = "invalid attach directory";
        else {
            reader = new PdfReader(theBaseFile);
            stamper = new PdfStamper(reader, new FileOutputStream(outputFile));
            stamper.getWriter().setPdfVersion(PdfWriter.VERSION_1_7);
            Path dir = FileSystems.getDefault().getPath(theFileDir);
            DirectoryStream<Path> stream = Files.newDirectoryStream(dir);
            for (Path path : stream) {
                stamper.addFileAttachment(null, null, path.toFile().toString(), path.toFile().getName());
                if (++fileCount > maxFiles) {
                    theResult = "maxfiles exceeded";
                    break;
                }
            }
            stream.close();
            stamper.close();
            reader.close();
            theResult = "SUCCESS";
        }
    } catch (Exception e) {
        theResult = "unknown";
    } finally {
        if (stamper != null)
            stamper.close();
        if (reader != null)
            reader.close();
    }
    // compare strings with equals(), not the != operator
    if (!"SUCCESS".equals(theResult))
        sendError(theResult);
    return theResult;
}
I expect a simple count of attachments back. What seems to be happening is that namesArray comes back null, so the result stays "unknown". I suspect the names array is trying to hold all the files and choking on the size.
Note: the files are attached using the AttachFileInDir procedure: dump all the files in a directory and run AttachFileInDir. And yes, the error trapping in AttachFileInDir needs work.
Any help would be appreciated, or another method would be welcome.
I finally got it. It turns out each KID is a dictionary of NAMES: each NAMES entry holds 64 file references, and at 65 files and up iText builds a KIDS array of such name dictionaries. So 279 files = (8*64 + 46) / 2 name-tree entries (9 total KIDS array elements).
One thing I had to compensate for: if one deletes all the attachments off a PDF, it leaves artifacts behind, as opposed to a PDF that never had an attachment.
private static String CountFiles() throws IOException, DocumentException {
    boolean errorFound = true;
    int totalFiles = 0;
    PdfArray filesArray;
    PdfDictionary root;
    PdfDictionary names;
    PdfDictionary embeddedFiles;
    PdfReader reader = null;
    String theResult = "unknown";
    try {
        if (!theBaseFile.toLowerCase().endsWith(".pdf"))
            theResult = "file not PDF";
        else {
            reader = new PdfReader(theBaseFile);
            root = reader.getCatalog();
            names = root.getAsDict(PdfName.NAMES);
            if (names == null) {
                theResult = "0";
                errorFound = false;
            } else {
                embeddedFiles = names.getAsDict(PdfName.EMBEDDEDFILES);
                filesArray = embeddedFiles.getAsArray(PdfName.NAMES);
                if (filesArray != null)
                    totalFiles = filesArray.size();
                else {
                    // no flat NAMES array: the name tree has been split into KIDS
                    filesArray = embeddedFiles.getAsArray(PdfName.KIDS);
                    if (filesArray != null) {
                        for (int i = 0; i < filesArray.size(); i++)
                            totalFiles += filesArray.getAsDict(i).getAsArray(PdfName.NAMES).size();
                    }
                }
                theResult = String.format("%d", totalFiles / 2);
                reader.close();
                errorFound = false;
            }
        }
    } catch (Exception e) {
        theResult = "unknown " + e.getMessage();
    } finally {
        if (reader != null)
            reader.close();
    }
    if (errorFound)
        sendError(theResult);
    return theResult;
}

pdfbox writing compressed object streams

I'm merging multiple files, which originally total 19 MB, but the result is 56 MB. How can I make the final size approach the original 19 MB?
[EDIT]
public void concatena(InputStream anterior, InputStream novo, OutputStream saida, List<String> marcadores)
        throws IOException {
    PDFMergerUtility pdfMerger = new PDFMergerUtility();
    pdfMerger.setDestinationStream(saida);
    PDDocument dest;
    PDDocument src;
    MemoryUsageSetting setupMainMemoryOnly = MemoryUsageSetting.setupMainMemoryOnly();
    if (anterior != null) {
        dest = PDDocument.load(anterior, setupMainMemoryOnly);
        src = PDDocument.load(novo, setupMainMemoryOnly);
    } else {
        dest = PDDocument.load(novo, setupMainMemoryOnly);
        src = new PDDocument();
    }
    int totalPages = dest.getNumberOfPages();
    pdfMerger.appendDocument(dest, src);
    criaMarcador(dest, totalPages, marcadores);
    saida = pdfMerger.getDestinationStream();
    dest.save(saida);
    dest.close();
    src.close();
}
Sorry, I still do not know how to use Stack Overflow very well. I'm trying to post the rest of the code, but I'm getting an error.
[Edit 2 - add criaMarcador method]
private void criaMarcador(PDDocument src, int numPaginas, List<String> marcadores) {
    if (marcadores != null && !marcadores.isEmpty()) {
        PDDocumentOutline documentOutline = src.getDocumentCatalog().getDocumentOutline();
        if (documentOutline == null) {
            documentOutline = new PDDocumentOutline();
        }
        PDPage page;
        if (src.getNumberOfPages() == numPaginas) {
            page = src.getPage(0);
        } else {
            page = src.getPage(numPaginas);
        }
        PDOutlineItem bookmark = null;
        PDOutlineItem pai = null;
        String etiquetaAnterior = null;
        for (String etiqueta : marcadores) {
            bookmark = bookmark(pai != null ? pai : documentOutline, etiqueta);
            if (bookmark == null) {
                if (etiquetaAnterior != null && !etiquetaAnterior.equals(etiqueta) && pai == null) {
                    pai = bookmark(documentOutline, etiquetaAnterior);
                }
                bookmark = new PDOutlineItem();
                bookmark.setTitle(etiqueta);
                if (marcadores.indexOf(etiqueta) == marcadores.size() - 1) {
                    bookmark.setDestination(page);
                }
                if (pai != null) {
                    pai.addLast(bookmark);
                    pai.openNode();
                } else {
                    documentOutline.addLast(bookmark);
                }
            } else {
                pai = bookmark;
            }
            etiquetaAnterior = etiqueta;
        }
        src.getDocumentCatalog().setDocumentOutline(documentOutline);
    }
}
private PDOutlineItem bookmark(PDOutlineNode outline, String etiqueta) {
    PDOutlineItem current = outline.getFirstChild();
    while (current != null) {
        if (current.getTitle().equals(etiqueta)) {
            return current;
        }
        // search the children recursively and return a match if one is found
        PDOutlineItem found = bookmark(current, etiqueta);
        if (found != null) {
            return found;
        }
        current = current.getNextSibling();
    }
    return current;
}
[Edit 3] Here is the code used for testing:
public class PDFMergeTeste {
    public static void main(String[] args) throws IOException {
        if (args.length == 1) {
            PDFMergeTeste teste = new PDFMergeTeste();
            teste.executa(args[0]);
        } else {
            System.err.println("The argument must be a directory containing .pdf files named in the Autos pattern");
        }
    }

    private void executa(String diretorioArquivos) throws IOException {
        File[] listFiles = new File(diretorioArquivos).listFiles((pathname) ->
                pathname.getName().endsWith(".pdf") || pathname.getName().endsWith(".PDF"));
        List<File> lista = Arrays.asList(listFiles);
        lista.sort(Comparator.comparing(File::lastModified));

        PDFMerge merge = new PDFMerge();
        InputStream anterior = null;
        ByteArrayOutputStream saida = new ByteArrayOutputStream();
        for (File file : lista) {
            List<String> marcadores = marcadores(file.getName());
            InputStream novo = new FileInputStream(file);
            merge.concatena(anterior, novo, saida, marcadores);
            anterior = new ByteArrayInputStream(saida.toByteArray());
        }
        try (OutputStream pdf = new FileOutputStream(pathDestFile)) {
            saida.writeTo(pdf);
        }
    }

    private List<String> marcadores(String name) {
        String semExtensao = name.substring(0, name.indexOf(".pdf"));
        return Arrays.asList(semExtensao.split("_"));
    }
}
The error is in the executa method:
InputStream anterior = null;
ByteArrayOutputStream saida = new ByteArrayOutputStream();
for (File file : lista) {
    List<String> marcadores = marcadores(file.getName());
    InputStream novo = new FileInputStream(file);
    merge.concatena(anterior, novo, saida, marcadores);
    anterior = new ByteArrayInputStream(saida.toByteArray());
}
Your ByteArrayOutputStream saida is re-used in each loop iteration but is never cleared in between. Thus, it contains:

after processing file 1:
  file 1
after processing file 2:
  file 1
  the concatenation of files 1 and 2
after processing file 3:
  file 1
  the concatenation of files 1 and 2
  the concatenation of files 1, 2, and 3
after processing file 4:
  file 1
  the concatenation of files 1 and 2
  the concatenation of files 1, 2, and 3
  the concatenation of files 1, 2, 3, and 4

(Actually this only works because PDFBox tries to be nice and fixes broken input files under the hood, as these back-to-back concatenations strictly speaking are broken PDFs that PDFBox wouldn't need to be able to parse.)
You can fix this by clearing saida at the start of each iteration:
InputStream anterior = null;
ByteArrayOutputStream saida = new ByteArrayOutputStream();
for (File file : lista) {
    saida.reset();
    List<String> marcadores = marcadores(file.getName());
    InputStream novo = new FileInputStream(file);
    merge.concatena(anterior, novo, saida, marcadores);
    anterior = new ByteArrayInputStream(saida.toByteArray());
}
With your original method the result size for your inputs is nearly 26 MB, with the fixed method it is about 5 MB, and that latter size approximately represents the sum of the sizes of the input files.
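An equivalent fix, arguably clearer, is to give each iteration its own buffer instead of resetting a shared one. A sketch against the same executa method (pathDestFile as in the question; ultimaSaida is an introduced variable):
InputStream anterior = null;
byte[] ultimaSaida = null;
for (File file : lista) {
    // fresh buffer per iteration, so no stale bytes can accumulate
    ByteArrayOutputStream saida = new ByteArrayOutputStream();
    List<String> marcadores = marcadores(file.getName());
    InputStream novo = new FileInputStream(file);
    merge.concatena(anterior, novo, saida, marcadores);
    ultimaSaida = saida.toByteArray();
    anterior = new ByteArrayInputStream(ultimaSaida);
}
try (OutputStream pdf = new FileOutputStream(pathDestFile)) {
    pdf.write(ultimaSaida); // the last buffer holds the full merge
}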

How to replace a part of a string in an xml file?

I have an XML file with something like this:
<Verbiage>
  The whiskers plots are based on the responses of incarcerated
  <Choice>
    <Juvenile> juveniles who have committed sexual offenses. </Juvenile>
    <Adult> adult sexual offenders. </Adult>
  </Choice>
  If the respondent is a
  <Choice>
    <Adult>convicted sexual offender, </Adult>
    <Juvenile>juvenile who has sexually offended, </Juvenile>
  </Choice>
  #his/her_lc# percentile score, which defines #his/her_lc# position
  relative to other such offenders, should be taken into account as well as #his/her_lc# T score. Percentile
  scores in the top decile (> 90 %ile) of such offenders suggest that the respondent
  may be defensive and #his/her_lc# report should be interpreted with this in mind.
</Verbiage>
I am trying to find a way to parse the XML file (I've been using DOM), search for #his/her_lc#, and replace it with "her". I've tried FileReader, BufferedReader, String.replaceAll, and FileWriter, but those didn't work.
Is there a way I could do this using XPath?
Ultimately I want to search this XML file for this string and replace it with another string.
Do I have to add a tag around the string so that I can parse it that way?
Code I tried:
protected void parse() throws ElementNotValidException {
    try {
        // Parse the XML file
        File inputXML = new File("template.xml");
        DocumentBuilderFactory parser = DocumentBuilderFactory.newInstance(); // new instance of doc builder
        DocumentBuilder dParser = parser.newDocumentBuilder();
        Document doc = dParser.parse(inputXML);

        FileReader reader = new FileReader(inputXML);
        String search = "#his/her_lc#";
        String newString;
        BufferedReader br = new BufferedReader(reader);
        while ((newString = br.readLine()) != null) {
            newString.replaceAll(search, "her");
        }
        FileWriter writer = new FileWriter(inputXML);
        writer.write(newString);
        writer.close();
    } catch (ParserConfigurationException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (SAXException e) {
        e.printStackTrace();
    }
}
Code I was given to fix:
try {
    File inputXML = new File("template.xml"); // creates new input file
    DocumentBuilderFactory parser = DocumentBuilderFactory.newInstance(); // new instance of doc builder
    DocumentBuilder dParser = parser.newDocumentBuilder();
    Document doc = dParser.parse(inputXML);
    doc.getDocumentElement().normalize();

    // gets elements by tag name and places them into a list to begin parsing
    NodeList pList = doc.getElementsByTagName("Verbiage");
    int gender = 1; // gender has to be taken from the response file; hard-coded for testing
    System.out.println("----------------------------");

    // loops through the list of Verbiage tags
    for (int temp = 0; temp < pList.getLength(); temp++) {
        Node pNode = pList.item(temp);
        if (pNode.getNodeType() == Node.ELEMENT_NODE) {
            Element eElement = (Element) pNode;
            // gets a list of pronoun element tags
            NodeList pronounList = doc.getElementsByTagName("pronoun");
            if (gender == 0) { // male
                int count1 = 0;
                while (count1 < pronounList.getLength()) {
                    if ("#he/she_lc#".equals(pronounList.item(count1).getTextContent())) {
                        pronounList.item(count1).setTextContent("he");
                    }
                    if ("#he/she_caps#".equals(pronounList.item(count1).getTextContent())) {
                        pronounList.item(count1).setTextContent("He");
                    }
                    if ("#his/her_lc#".equals(pronounList.item(count1).getTextContent())) {
                        pronounList.item(count1).setTextContent("his");
                    }
                    if ("#his/her_caps#".equals(pronounList.item(count1).getTextContent())) {
                        pronounList.item(count1).setTextContent("His");
                    }
                    if ("#him/her_lc#".equals(pronounList.item(count1).getTextContent())) {
                        pronounList.item(count1).setTextContent("him");
                    }
                    count1++;
                }
                pNode.getNextSibling();
            } else if (gender == 1) { // female
                int count = 0;
                while (count < pronounList.getLength()) {
                    if ("#he/she_lc#".equals(pronounList.item(count).getTextContent())) {
                        pronounList.item(count).setTextContent("she");
                    }
                    if ("#he/she_caps#".equals(pronounList.item(count).getTextContent())) {
                        pronounList.item(count).setTextContent("She");
                    }
                    if ("#his/her_lc#".equals(pronounList.item(count).getTextContent())) {
                        pronounList.item(count).setTextContent("her");
                    }
                    if ("#his/her_caps#".equals(pronounList.item(count).getTextContent())) {
                        pronounList.item(count).setTextContent("Her");
                    }
                    if ("#him/her_lc#".equals(pronounList.item(count).getTextContent())) {
                        pronounList.item(count).setTextContent("her");
                    }
                    count++;
                }
                pNode.getNextSibling();
            }
        }
    }

    // write the content to the file
    TransformerFactory transformerFactory = TransformerFactory.newInstance();
    Transformer transformer = transformerFactory.newTransformer();
    DOMSource source = new DOMSource(doc);
    System.out.println("-----------Modified File-----------");
    StreamResult consoleResult = new StreamResult(System.out);
    transformer.transform(source, new StreamResult(new FileOutputStream("template.xml"))); // writes changes to file
} catch (Exception e) {
    e.printStackTrace();
}
}
I think this code would work if I could figure out how to associate the pronoun tag with the pronoun parser this code lives in.
I used this example and your template.xml, and I think it works.
public static void main(String[] args) {
    File inputXML = new File("template.xml");
    BufferedReader br = null;
    String newString = "";
    StringBuilder strTotale = new StringBuilder();
    try {
        FileReader reader = new FileReader(inputXML);
        String search = "#his/her_lc#";
        br = new BufferedReader(reader);
        while ((newString = br.readLine()) != null) {
            newString = newString.replaceAll(search, "her");
            strTotale.append(newString);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    System.out.println(strTotale.toString());
}
First, you must reassign the result of replaceAll:
newString = newString.replaceAll(search, "her");
Second, I used a StringBuilder to collect all the lines.
I hope this helps.
Since strings are immutable you cannot modify them in place; use a StringBuilder/StringBuffer instead of String:
FileReader reader = new FileReader(inputXML);
String search = "#his/her_lc#";
String newString;
StringBuffer str = new StringBuffer(); // must be initialized before use
BufferedReader br = new BufferedReader(reader);
while ((newString = br.readLine()) != null) {
    str.append(newString.replaceAll(search, "her"));
}
br.close();
FileWriter writer = new FileWriter(inputXML);
writer.write(str.toString()); // Writer has no write(StringBuffer) overload
writer.close();
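Since the question also asks about XPath: the same replacement can be done on the parsed DOM instead of on raw text lines, which avoids re-reading the file as text. A sketch, assuming the placeholder only occurs inside text nodes:
import java.io.File;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.*;
import org.w3c.dom.*;

public class PlaceholderReplacer {
    public static void main(String[] args) throws Exception {
        File inputXML = new File("template.xml");
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder().parse(inputXML);

        // Find every text node that contains the placeholder token.
        XPath xpath = XPathFactory.newInstance().newXPath();
        NodeList hits = (NodeList) xpath.evaluate(
                "//text()[contains(., '#his/her_lc#')]", doc, XPathConstants.NODESET);
        for (int i = 0; i < hits.getLength(); i++) {
            Node text = hits.item(i);
            text.setNodeValue(text.getNodeValue().replace("#his/her_lc#", "her"));
        }

        // Serialize the modified DOM back over the original file.
        TransformerFactory.newInstance().newTransformer()
                .transform(new DOMSource(doc), new StreamResult(inputXML));
    }
}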

Apache POI - Split Word document (docx) to pages

I have been trying to segment a docx document into multiple documents based on predefined criteria. The following is my approach to cutting it into paragraphs:
try {
    FileInputStream in = new FileInputStream(file);
    XWPFDocument doc = new XWPFDocument(in);
    List<XWPFParagraph> paragraphs = doc.getParagraphs();
    for (int idx = 0; idx < paragraphs.size(); idx++) {
        XWPFDocument outputDocument = new XWPFDocument();
        createParagraphInAnotherDocument(outputDocument, paragraphs.get(idx).getText());
        String fullPath = String.format("./content/output/%1$s_%2$s_%3$04d.docx", FileUtils.getFileName(file), getName(), idx);
        FileOutputStream outputStream = new FileOutputStream(fullPath);
        outputDocument.write(outputStream);
        outputDocument.close();
    }
    doc.close(); // close the source document after the loop, not inside it
} catch (IOException e) {
    e.printStackTrace();
}
While I am able to extract paragraphs with the code above, I can't find a way to extract pages. My understanding is that pagination in Word is a rendering concern; it happens at runtime in the Word application.
As far as I can see, the only way to do this is by interrogating the DOM model of the Word document and determining how many paragraphs there are on each page. Below is a possible solution to the problem (it only works if the pages are explicitly separated by page breaks):
public static void main(String[] args) {
    XWPFDocument doc = null;
    try {
        // Input Word document
        File file = new File("C:/TestDoc.docx");
        FileInputStream in = new FileInputStream(file);
        doc = new XWPFDocument(in);
        // Determine how many paragraphs per page
        List<Integer> paragraphCountList = getParagraphCountPerPage(doc);
        if (paragraphCountList != null && paragraphCountList.size() > 0) {
            int docCount = 0;
            int startIndex = 0;
            int endIndex = paragraphCountList.get(0);
            // Loop through the paragraph counts for each page
            for (int i = 0; i < paragraphCountList.size(); i++) {
                XWPFDocument outputDocument = new XWPFDocument();
                List<XWPFParagraph> paragraphs = doc.getParagraphs();
                List<XWPFParagraph> pageParagraphs = new ArrayList<XWPFParagraph>();
                if (paragraphs != null && paragraphs.size() > 0) {
                    // Get the paragraphs from the input Word document
                    for (int j = startIndex; j < endIndex; j++) {
                        if (paragraphs.get(j) != null) {
                            pageParagraphs.add(paragraphs.get(j));
                        }
                    }
                    // Set the start and end points for the next set of paragraphs
                    startIndex = endIndex;
                    if (i < paragraphCountList.size() - 2) {
                        endIndex = endIndex + paragraphCountList.get(i + 1);
                    } else {
                        endIndex = paragraphs.size() - 1;
                    }
                    // Create a new Word doc with the paragraph subset
                    createPageInAnotherDocument(outputDocument, pageParagraphs);
                    // Write the file
                    String outputPath = "C:/TestDocOutput" + docCount + ".docx";
                    FileOutputStream outputStream = new FileOutputStream(outputPath);
                    outputDocument.write(outputStream);
                    outputDocument.close();
                    docCount++;
                    pageParagraphs = new ArrayList<XWPFParagraph>();
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            doc.close();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
}
private static List<Integer> getParagraphCountPerPage(XWPFDocument doc) throws Exception {
    List<Integer> paragraphCountList = new ArrayList<>();
    int paragraphCount = 0;
    Document domDoc = convertStringToDOM(doc.getDocument().getBody().toString());
    NodeList rootChildNodeList = domDoc.getChildNodes().item(0).getChildNodes();
    for (int i = 0; i < rootChildNodeList.getLength(); i++) {
        Node childNode = rootChildNodeList.item(i);
        if (childNode.getNodeName().equals("w:p")) {
            paragraphCount++;
            if (childNode.getChildNodes() != null) {
                for (int k = 0; k < childNode.getChildNodes().getLength(); k++) {
                    if (childNode.getChildNodes().item(k).getNodeName().equals("w:r")) {
                        for (int m = 0; m < childNode.getChildNodes().item(k).getChildNodes().getLength(); m++) {
                            if (childNode.getChildNodes().item(k).getChildNodes().item(m).getNodeName().equals("w:br")) {
                                // an explicit break ends the current page's paragraph count
                                paragraphCountList.add(paragraphCount);
                                paragraphCount = 0;
                            }
                        }
                    }
                }
            }
        }
    }
    paragraphCountList.add(paragraphCount + 1);
    return paragraphCountList;
}
private static Document convertStringToDOM(String xmlData) throws Exception {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    DocumentBuilder builder = factory.newDocumentBuilder();
    Document document = builder.parse(new InputSource(new StringReader(xmlData)));
    return document;
}

private static void createPageInAnotherDocument(XWPFDocument outputDocument, List<XWPFParagraph> pageParagraphs) throws IOException {
    for (int i = 0; i < pageParagraphs.size(); i++) {
        addParagraphToDocument(outputDocument, pageParagraphs.get(i).getText());
    }
}

private static void addParagraphToDocument(XWPFDocument outputDocument, String text) throws IOException {
    XWPFParagraph paragraph = outputDocument.createParagraph();
    XWPFRun run = paragraph.createRun();
    run.setText(text);
}
