Apache POI - Split Word document (docx) to pages

Apache POI - Split Word document (docx) to pages - java

I have been trying to split a docx document into multiple documents based on predefined criteria. The following is my approach for cutting it into paragraphs:
try {
FileInputStream in = new FileInputStream(file);
XWPFDocument doc = new XWPFDocument(in);
List<XWPFParagraph> paragraphs = doc.getParagraphs();
for (int idx = 0; idx < paragraphs.size(); idx++) {
XWPFDocument outputDocument = new XWPFDocument();
createParagraphInAnotherDocument(outputDocument, paragraphs.get(idx).getText());
String fullPath = String.format("./content/output/%1$s_%2$s_%3$04d.docx", FileUtils.getFileName(file), getName(), idx);
FileOutputStream outputStream = new FileOutputStream(fullPath);
outputDocument.write(outputStream);
outputDocument.close();
doc.close();
}
} catch (IOException e) {
e.printStackTrace();
}
While I am able to extract paragraphs with the code above, I can't find a way to extract pages. My understanding is that pages in Word are a rendering concern: they are computed at runtime by the Word application.

As far as I can see, the only way to do this is by interrogating the DOM model for the Word doc, and then determining how many paragraphs there are on each page. Below is a possible solution to the problem (it only works if the pages are explicitly separated by page breaks)
/**
 * Splits {@code C:/TestDoc.docx} into one output document per page and writes them as
 * {@code C:/TestDocOutput<N>.docx}.
 *
 * <p>Page boundaries come from {@code getParagraphCountPerPage}; each entry of that list is
 * the number of consecutive body paragraphs that belong to one page. The last page always
 * runs to the end of the paragraph list, and every range is clamped to the list size, so no
 * paragraph is dropped and no index can run past the end.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Input Word document
    File file = new File("C:/TestDoc.docx");
    try (FileInputStream in = new FileInputStream(file);
         XWPFDocument doc = new XWPFDocument(in)) {
        // Determine how many paragraphs there are per page
        List<Integer> paragraphCountList = getParagraphCountPerPage(doc);
        List<XWPFParagraph> paragraphs = doc.getParagraphs();
        if (paragraphCountList == null || paragraphCountList.isEmpty()
                || paragraphs == null || paragraphs.isEmpty()) {
            return;
        }

        int startIndex = 0;
        for (int page = 0; page < paragraphCountList.size(); page++) {
            boolean lastPage = page == paragraphCountList.size() - 1;
            // The last page takes everything that is left; earlier pages take their
            // counted number of paragraphs, never reaching past the end of the list.
            int endIndex = lastPage
                    ? paragraphs.size()
                    : Math.min(startIndex + paragraphCountList.get(page), paragraphs.size());

            // Collect the paragraphs of this page from the input document
            List<XWPFParagraph> pageParagraphs = new ArrayList<XWPFParagraph>();
            for (int j = startIndex; j < endIndex; j++) {
                if (paragraphs.get(j) != null) {
                    pageParagraphs.add(paragraphs.get(j));
                }
            }
            startIndex = endIndex;

            // Create a new Word document with the paragraph subset and write it out
            String outputPath = "C:/TestDocOutput" + page + ".docx";
            try (XWPFDocument outputDocument = new XWPFDocument();
                 FileOutputStream outputStream = new FileOutputStream(outputPath)) {
                createPageInAnotherDocument(outputDocument, pageParagraphs);
                outputDocument.write(outputStream);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Counts how many body-level paragraphs fall on each page, using explicit page breaks
 * as the only page separators.
 *
 * <p>The body XML is re-parsed into a DOM and walked directly: every {@code w:p} child of
 * the root increments the running count, and every {@code w:br} with
 * {@code w:type="page"} inside one of its {@code w:r} runs ends the current page. The
 * paragraph that contains the break is counted on the page that ends there. Plain line
 * breaks ({@code w:br} without a page type) do not start a new page.
 *
 * @param doc the document to analyse
 * @return one entry per page, in order; the last entry is the number of paragraphs after
 *         the final page break (possibly 0)
 * @throws Exception if the body XML cannot be parsed
 */
private static List<Integer> getParagraphCountPerPage(XWPFDocument doc) throws Exception {
    List<Integer> paragraphCountList = new ArrayList<>();
    int paragraphCount = 0;
    Document domDoc = convertStringToDOM(doc.getDocument().getBody().toString());
    NodeList bodyChildren = domDoc.getChildNodes().item(0).getChildNodes();
    for (int i = 0; i < bodyChildren.getLength(); i++) {
        Node paragraphNode = bodyChildren.item(i);
        if (!paragraphNode.getNodeName().equals("w:p")) {
            continue;
        }
        paragraphCount++;
        NodeList paragraphChildren = paragraphNode.getChildNodes();
        for (int k = 0; k < paragraphChildren.getLength(); k++) {
            Node runNode = paragraphChildren.item(k);
            if (!runNode.getNodeName().equals("w:r")) {
                continue;
            }
            NodeList runChildren = runNode.getChildNodes();
            for (int m = 0; m < runChildren.getLength(); m++) {
                if (isPageBreak(runChildren.item(m))) {
                    paragraphCountList.add(paragraphCount);
                    paragraphCount = 0;
                }
            }
        }
    }
    // Whatever follows the last page break forms the final page.
    paragraphCountList.add(paragraphCount);
    return paragraphCountList;
}

/** Returns true when the node is a {@code w:br} element whose {@code w:type} is "page". */
private static boolean isPageBreak(Node node) {
    if (!"w:br".equals(node.getNodeName()) || node.getAttributes() == null) {
        return false;
    }
    Node breakType = node.getAttributes().getNamedItem("w:type");
    return breakType != null && "page".equals(breakType.getNodeValue());
}
/**
 * Parses XML text into a DOM document using a default-configured builder.
 *
 * @param xmlData the XML markup to parse
 * @return the parsed DOM document
 * @throws Exception if the parser cannot be created or the markup is not well-formed
 */
private static Document convertStringToDOM(String xmlData) throws Exception {
    InputSource xmlSource = new InputSource(new StringReader(xmlData));
    return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(xmlSource);
}
/**
 * Appends the text of each given paragraph, in order, to the output document as a new
 * paragraph.
 *
 * @param outputDocument the document being built
 * @param pageParagraphs the source paragraphs whose text is copied
 * @throws IOException declared for callers; propagated from the paragraph helper
 */
private static void createPageInAnotherDocument(XWPFDocument outputDocument, List<XWPFParagraph> pageParagraphs) throws IOException {
    for (XWPFParagraph sourceParagraph : pageParagraphs) {
        String paragraphText = sourceParagraph.getText();
        addParagraphToDocument(outputDocument, paragraphText);
    }
}
/**
 * Adds one new paragraph with a single run holding the given text to the document.
 *
 * @param outputDocument the document to append to
 * @param text the text placed in the new run
 * @throws IOException declared for callers
 */
private static void addParagraphToDocument(XWPFDocument outputDocument, String text) throws IOException {
    outputDocument.createParagraph().createRun().setText(text);
}

Related

Resize Pages with lowagie.itext from PdfImportedPage

I have this code for copying a PDF. How can I resize the pages to A4?
Currently the PDF can contain pages of different sizes; I want them all to be the same size.
/**
 * Appends every page of the given PDF to the target copy.
 *
 * <p>The reader is always closed, including when importing a page fails.
 *
 * @param pdf1 the bytes of the source PDF
 * @param copy the copy that receives the imported pages
 * @throws IOException if the source cannot be read or the copy cannot be written
 * @throws BadPdfFormatException if a page cannot be added to the copy
 */
private void getPdfPages(byte[] pdf1, PdfCopy copy) throws IOException, BadPdfFormatException {
    PdfReader reader = new PdfReader(new ByteArrayInputStream(pdf1));
    try {
        try {
            // Force the reader's private "encrypted" flag to false via reflection.
            // NOTE(review): presumably this is meant to allow copying protected PDFs — confirm.
            Field f = reader.getClass().getDeclaredField("encrypted");
            f.setAccessible(true);
            f.set(reader, false);
        } catch (Exception ignored) {
            // Best effort: continue with the reader as it is if the flag cannot be changed.
        }
        int pageCount = reader.getNumberOfPages();
        // Page numbers are 1-based.
        for (int page = 1; page <= pageCount; page++) {
            copy.addPage(copy.getImportedPage(reader, page));
        }
        copy.freeReader(reader);
    } finally {
        reader.close();
    }
}

why file is not deleted inspite of using delete function?

for (int i = 0; i < listOfTempFiles.length; i++) {
for (int j = 0; j < listOfFAQFiles.length; j++) {
if (listOfTempFiles[i].isFile() && listOfTempFiles[i].length() > 0) {
if (listOfTempFiles[i].getName().toLowerCase().contains(".pdf")) {
if (listOfTempFiles[i].getName().substring(listOfTempFiles[i].getName().lastIndexOf("#") + 1).equals(listOfFAQFiles[j].getName())) {
try {
List<InputStream> list = new ArrayList<InputStream>();
list.add(new FileInputStream(listOfTempFiles[i]));
list.add(new FileInputStream(listOfFAQFiles[j]));
System.out.println(listOfTempFiles[i].getName() + "with FAQ: " + listOfFAQFiles[j].getName());
int iend = listOfTempFiles[i].getName().lastIndexOf("#");
if (iend != -1) {
outputFilename = listOfTempFiles[i].getName().substring(0, iend);
}
OutputStream out = new FileOutputStream(new File(finalPDFParh + "/" + outputFilename + ".pdf"));
doMerge(list, out);
boolean flag=listOfTempFiles[i].delete();
System.out.println("Flag----->"+flag);
list.clear();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (DocumentException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
 * Concatenates all pages of the given PDFs, in list order, into one PDF written to
 * {@code outputStream}.
 *
 * <p>Each source page is imported and drawn at the origin of a fresh page of the target
 * document. All readers and the output stream are closed before returning, also when the
 * merge fails. The input streams themselves are not closed here; they remain the caller's
 * responsibility.
 *
 * @param list the source PDFs, one stream each
 * @param outputStream the destination of the merged PDF; closed by this method
 * @throws DocumentException if the target document cannot be created or written
 * @throws IOException if a source cannot be read or the destination cannot be written
 */
public static void doMerge(List<InputStream> list, OutputStream outputStream)
        throws DocumentException, IOException {
    Document document = new Document();
    // Readers are kept open until the document has been written, then closed together.
    List<PdfReader> readers = new ArrayList<PdfReader>();
    try {
        PdfWriter writer = PdfWriter.getInstance(document, outputStream);
        document.open();
        PdfContentByte cb = writer.getDirectContent();
        for (InputStream in : list) {
            PdfReader reader = new PdfReader(in);
            readers.add(reader);
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                document.newPage();
                // import the page from the source pdf
                PdfImportedPage page = writer.getImportedPage(reader, i);
                // add the page to the destination pdf
                cb.addTemplate(page, 0, 0);
            }
        }
        // Closing the document finishes writing the PDF.
        document.close();
    } finally {
        for (PdfReader reader : readers) {
            reader.close();
        }
        outputStream.close();
    }
}
I want to delete the original file from listOfTempFiles after it has been merged with the FAQ file. The doMerge method merges the PDFs that were added to the list. I call delete() on the file, but it is not deleted. What can I do about it?

Reading formulas which are embedded objects from some OLE formula editor in a Docx file using Apache POI

I am trying to read the contents of a Docx file and save them in a database using the Apache POI API. I am able to read plain text and extract pictures, but when it comes to reading equations embedded from an OLE formula editor, I wasn't able to find a method to read them. I have tried many solutions, but I still cannot read the equations. Below is my code:
/**
 * Prints the metadata extractor and the text of all text-box paragraphs of a .docx file.
 */
public class TestMyCode {

    /**
     * XPath selecting the content container of every text box below a paragraph. The
     * paragraphs inside it are then fetched with {@code selectChildren}; selecting the
     * {@code w:p} elements here as well would leave that child lookup with nothing to find.
     */
    private static final String TEXT_BOX_CONTENT_PATH =
            "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent";

    /**
     * Opens {@code testTextBox.docx} from the working directory and prints the result of
     * {@link #extractDataDocx(FileInputStream)}.
     *
     * @param args unused
     */
    public static void main(String[] args) throws InvalidFormatException,
            IOException {
        TestMyCode t = new TestMyCode();
        String fileSelected = "testTextBox.docx";
        File file = new File(fileSelected);
        try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
            System.out.println(t.extractDataDocx(fis));
        }
    }

    /**
     * Loads the document from the stream, prints its metadata text extractor and the text
     * of every text-box paragraph found in its body paragraphs.
     *
     * @param fis stream positioned at the start of a .docx file; not closed here
     * @return always the empty string; the extracted text is printed, not returned
     * @throws IOException if the document cannot be read
     * @throws InvalidFormatException if the stream is not a valid OPC package
     */
    public String extractDataDocx(FileInputStream fis) throws IOException,
            InvalidFormatException {
        String str = "";
        try (XWPFDocument wordDoc = new XWPFDocument(OPCPackage.open(fis))) {
            XWPFWordExtractor we = new XWPFWordExtractor(wordDoc);
            System.out.println(we.getMetadataTextExtractor());
            for (XWPFParagraph p : wordDoc.getParagraphs()) {
                printContentsOfTextBox(p);
            }
        }
        return str;
    }

    /**
     * Prints the text of every paragraph embedded in a text box below the given paragraph.
     *
     * @param paragraph the paragraph to search for text boxes
     */
    private void printContentsOfTextBox(XWPFParagraph paragraph) {
        XmlObject[] textBoxObjects = paragraph.getCTP().selectPath(TEXT_BOX_CONTENT_PATH);
        QName paragraphName = new QName(
                "http://schemas.openxmlformats.org/wordprocessingml/2006/main", "p");
        for (int i = 0; i < textBoxObjects.length; i++) {
            try {
                XmlObject[] paraObjects = textBoxObjects[i].selectChildren(paragraphName);
                for (int j = 0; j < paraObjects.length; j++) {
                    // Re-parse the fragment so it can be wrapped as a regular paragraph.
                    XWPFParagraph embeddedPara = new XWPFParagraph(
                            CTP.Factory.parse(paraObjects[j].xmlText()),
                            paragraph.getBody());
                    System.out.println(embeddedPara.getText());
                }
            } catch (XmlException e) {
                // Report the problem and continue with the next text box.
                e.printStackTrace();
            }
        }
    }
}

how to refresh the content of the jtable if you're using notepad to putting data in the jtable

/**
 * Loads the data file that belongs to the current selection of {@code car_combo1} into the
 * first two columns of {@code car_table}.
 *
 * <p>Each line of the file is split at {@code *}: the first part goes into column 0, the
 * second into column 1. At most a selection-specific number of lines is read. Rows below the
 * last loaded line are blanked, so data from a previously loaded, longer file does not stay
 * visible. Nothing happens when the selection is not one of the known entries.
 */
public void readCar() {
    String choice = String.valueOf(car_combo1.getSelectedItem());
    int ctr = 0;
    String filename = null;
    if (choice.equals("STATE-Description")) {
        filename = TEXT_CAR1.getText();
        ctr = 12;
    } else if (choice.equals("DISCAS-Camera")) {
        filename = TEXT_CAR2.getText();
        ctr = 3;
    } else if (choice.equals("DISCAS-CAR(for removal)")) {
        filename = TEXT_CAR3.getText();
        ctr = 11;
    } else if (choice.equals("DISCAS-PAR(for removal)")) {
        filename = TEXT_CAR4.getText();
        ctr = 9;
    } else if (choice.equals("Closing CAS for chg w/ customer initiated")) {
        filename = TEXT_CAR5.getText();
        ctr = 11;
    }
    if (filename == null) {
        // Unknown selection: there is no file to load.
        return;
    }
    File file = new File(filename);
    try (BufferedReader br = new BufferedReader(new FileReader(file.getAbsoluteFile()))) {
        int row = 0;
        for (; row < ctr; row++) {
            String line = br.readLine();
            if (line == null) {
                // The file has fewer lines than expected; stop at its end.
                break;
            }
            dataCar[row] = line;
            String[] temp = line.split("\\*");
            // A line without '*' (or consisting only of '*') has fewer than two parts.
            car_table.setValueAt(temp.length > 0 ? temp[0] : "", row, 0);
            car_table.setValueAt(temp.length > 1 ? temp[1] : "", row, 1);
        }
        // Blank the remaining rows so leftovers from an earlier load disappear.
        for (int stale = row; stale < car_table.getRowCount(); stale++) {
            car_table.setValueAt(" ", stale, 0);
            car_table.setValueAt(" ", stale, 1);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
This is the read function
/**
 * Reacts to a selection change in {@code car_combo1}. For "STATE-Description" it renames
 * the two table columns, fills {@code car_combo2} with the matching choices and reloads the
 * table data through {@code readCar()}.
 *
 * @param evt the action event (unused)
 */
private void car_combo1ActionPerformed(java.awt.event.ActionEvent evt) {
    String choice = String.valueOf(car_combo1.getSelectedItem());
    if (choice.equals("STATE-Description")) {
        car_table.getColumnModel().getColumn(0).setHeaderValue("STATE");
        car_table.getColumnModel().getColumn(1).setHeaderValue("Description");
        // Changing a header value does not by itself redraw the header.
        JTableHeader th = car_table.getTableHeader();
        th.repaint();
        String[] change_choice_to = {"-", "1006", "1009", "1011", "1013", "1015", "1017", "1018", "1019", "1020", "1021", "1022", "1023"};
        DefaultComboBoxModel model = new DefaultComboBoxModel(change_choice_to);
        car_combo2.setModel(model);
        // Load the data first, then notify the table's listeners about the new content.
        readCar();
        DefaultTableModel model2 = (DefaultTableModel) car_table.getModel();
        model2.fireTableDataChanged();
    }
}
Neither table.repaint() nor tableModel.fireTableDataChanged() works. I tried both, but the JTable does not refresh: when a text file with fewer lines is loaded after one with more lines, the extra rows of the longer file stay in the JTable.
I'm using BufferedReader and FileReader to read the data from the text file.
Is there any way to reset the table to its default state and then show the data from the text file again? Thank you.

So, the basic idea is that you either update the existing TableModel or replace it with a new one.
Because the number of rows seems to change between requests, replacing the model seems to be the safer bet, for example:
This is taken from your readCar method, as it's the important part...
// Build a fresh two-column model from the file and swap it into the table, so rows from a
// previous load cannot survive. Reading stops after ctr lines or at the end of the file,
// whichever comes first.
try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
    DefaultTableModel model = new DefaultTableModel(new String[]{"STATE", "Description"}, 0);
    String line;
    for (int i = 0; i < ctr && (line = br.readLine()) != null; i++) {
        dataCar[i] = line;
        // Each line holds the column values separated by '*'.
        model.addRow(line.split("\\*"));
    }
    car_table.setModel(model);
} catch (IOException e) {
    e.printStackTrace();
}
All this does is create a new DefaultTableModel, add a row to the model for each line of the file, and then apply the model to the table when it's done.
If this fails, either there is something else in your code which isn't working or the file isn't being read (or saved) properly.
Take a look at How to Use Tables and The try-with-resources Statement for more details.

/**
 * Blanks the first two columns of every row of {@code car_table} by writing a single space
 * into each cell. The row count is taken from the table itself, so the method works for
 * tables of any size.
 */
public void eraseTableCar() {
    int rowCount = car_table.getRowCount();
    for (int row = 0; row < rowCount; row++) {
        car_table.setValueAt(" ", row, 0);
        car_table.setValueAt(" ", row, 1);
    }
}
/**
 * Clears {@code car_table} and then loads the data file that belongs to the current
 * selection of {@code car_combo1} into its first two columns.
 *
 * <p>Each line of the file is split at {@code *}: the first part goes into column 0, the
 * second into column 1. At most a selection-specific number of lines is read; reading stops
 * early at the end of the file. When the selection is not one of the known entries the
 * table is only cleared.
 */
public void readCar() {
    String choice = String.valueOf(car_combo1.getSelectedItem());
    int ctr = 0;
    String filename = null;
    if (choice.equals("STATE-Description")) {
        filename = TEXT_CAR1.getText();
        ctr = 12;
    } else if (choice.equals("DISCAS-Camera")) {
        filename = TEXT_CAR2.getText();
        ctr = 3;
    } else if (choice.equals("DISCAS-CAR(for removal)")) {
        filename = TEXT_CAR3.getText();
        ctr = 11;
    } else if (choice.equals("DISCAS-PAR(for removal)")) {
        filename = TEXT_CAR4.getText();
        ctr = 9;
    } else if (choice.equals("Closing CAS for chg w/ customer initiated")) {
        filename = TEXT_CAR5.getText();
        ctr = 11;
    }
    // Remove whatever an earlier load left in the table.
    eraseTableCar();
    if (filename == null) {
        // Unknown selection: there is no file to load.
        return;
    }
    File file = new File(filename);
    try (BufferedReader br = new BufferedReader(new FileReader(file.getAbsoluteFile()))) {
        for (int i = 0; i < ctr; i++) {
            String line = br.readLine();
            if (line == null) {
                // The file has fewer lines than expected; stop at its end.
                break;
            }
            dataCar[i] = line;
            String[] temp = line.split("\\*");
            // A line without '*' (or consisting only of '*') has fewer than two parts.
            car_table.setValueAt(temp.length > 0 ? temp[0] : "", i, 0);
            car_table.setValueAt(temp.length > 1 ? temp[1] : "", i, 1);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
This code works, although I don't know whether it is good coding practice.

lucene indexing of html files

Dear users, I am working with Apache Lucene for indexing and searching.
I have to index HTML files stored on the local disk of a computer. The index has to cover both the file names and the contents of the HTML files. I am able to store the file names in the Lucene index, but not the contents of the HTML files; the index should cover not only the text but the entire page, including images, links and URLs. How can I access the contents of the indexed files?
For indexing I am using the following code:
// Creates a new index in indexpath, indexes every ".htm" file below datapath and records
// the writer's document count in numIndexed. The writer is closed in all cases, including
// when indexing or optimizing fails.
File indexDir = new File(indexpath);
File dataDir = new File(datapath);
String suffix = ".htm";
IndexWriter indexWriter = new IndexWriter(
        FSDirectory.open(indexDir),
        new SimpleAnalyzer(),
        true,
        IndexWriter.MaxFieldLength.LIMITED);
try {
    indexWriter.setUseCompoundFile(false);
    indexDirectory(indexWriter, dataDir, suffix);
    numIndexed = indexWriter.maxDoc();
    indexWriter.optimize();
} finally {
    indexWriter.close();
}
/**
 * Recursively indexes all files below the given directory.
 *
 * <p>Sub-directories are descended into; every other entry is handed to
 * {@code indexFileWithIndexWriter}. A path that cannot be listed (not a directory, or an
 * I/O error while listing) is skipped.
 *
 * @param indexWriter the writer receiving the documents
 * @param dataDir the directory to walk
 * @param suffix the file-name suffix passed on to the per-file indexing
 * @throws IOException declared for callers; failures are currently reported on stdout
 */
private void indexDirectory(IndexWriter indexWriter, File dataDir, String suffix) throws IOException {
    try {
        File[] entries = dataDir.listFiles();
        if (entries == null) {
            // listFiles() returns null when dataDir is not a directory or cannot be read.
            return;
        }
        for (File f : entries) {
            if (f.isDirectory()) {
                indexDirectory(indexWriter, f, suffix);
            } else {
                indexFileWithIndexWriter(indexWriter, f, suffix);
            }
        }
    } catch (Exception ex) {
        System.out.println("exception 2 is" + ex);
    }
}
/**
 * Adds one file to the index as a document with a "contents" field (read from the file)
 * and a stored, analyzed "filename" field.
 *
 * <p>Hidden, unreadable or missing files, directories, and files whose name does not end
 * with {@code suffix} are skipped.
 *
 * @param indexWriter the writer receiving the document
 * @param f the file to index
 * @param suffix required file-name suffix, or {@code null} to accept every file
 * @throws IOException declared for callers; failures are currently reported on stdout
 */
private void indexFileWithIndexWriter(IndexWriter indexWriter, File f,
        String suffix) throws IOException {
    try {
        if (f.isHidden() || f.isDirectory() || !f.canRead() || !f.exists()) {
            return;
        }
        if (suffix != null && !f.getName().endsWith(suffix)) {
            return;
        }
        Document doc = new Document();
        // A Reader-valued field is indexed for searching but its text is not stored.
        doc.add(new Field("contents", new FileReader(f)));
        // java.io.File exposes its name through getName(); there is no getFileName().
        doc.add(new Field("filename", f.getName(),
                Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.addDocument(doc);
    } catch (Exception ex) {
        System.out.println("exception 4 is" + ex);
    }
}
thanks in advance

This line of code is the reason why your contents is not being stored:
// A field built from a Reader is indexed for searching, but its text is not stored in the
// index, so the content cannot be read back from a search hit.
doc.add(new Field("contents", new FileReader(f)));
This constructor DOES NOT STORE the contents being indexed; it only makes them searchable.
If you are trying to index HTML files, try using JTidy. It will make the process much easier.
Sample code:
/**
 * Turns an HTML page into a Lucene document by parsing it with JTidy and indexing the text
 * of its body.
 */
public class JTidyHTMLHandler {

    /**
     * Parses the HTML from the stream and builds a Lucene document with an analyzed,
     * non-stored "contents" field holding the body text. The field is omitted when the
     * body text is empty.
     *
     * @param is the HTML input; not closed by this method
     * @return the Lucene document for the page
     * @throws DocumentHandlerException declared for callers
     */
    public org.apache.lucene.document.Document getDocument(InputStream is) throws DocumentHandlerException {
        Tidy tidy = new Tidy();
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);
        org.w3c.dom.Document root = tidy.parseDOM(is, null);
        Element rawDoc = root.getDocumentElement();
        org.apache.lucene.document.Document doc =
                new org.apache.lucene.document.Document();
        String body = getBody(rawDoc);
        if ((body != null) && (!body.equals(""))) {
            doc.add(new Field("contents", body, Field.Store.NO, Field.Index.ANALYZED));
        }
        return doc;
    }

    /**
     * Returns the text of the first {@code title} element.
     *
     * @param rawDoc the document element, may be {@code null}
     * @return the title text; "" when there is no title or its first child is not a text
     *         node; {@code null} when {@code rawDoc} is {@code null}
     */
    protected String getTitle(Element rawDoc) {
        if (rawDoc == null) {
            return null;
        }
        String title = "";
        NodeList children = rawDoc.getElementsByTagName("title");
        if (children.getLength() > 0) {
            Element titleElement = ((Element) children.item(0));
            // The first child may be missing or something other than text (e.g. a comment).
            Node firstChild = titleElement.getFirstChild();
            if (firstChild instanceof Text) {
                title = ((Text) firstChild).getData();
            }
        }
        return title;
    }

    /**
     * Returns the concatenated text of the first {@code body} element.
     *
     * @param rawDoc the document element, may be {@code null}
     * @return the body text; "" when there is no body; {@code null} when {@code rawDoc}
     *         is {@code null}
     */
    protected String getBody(Element rawDoc) {
        if (rawDoc == null) {
            return null;
        }
        String body = "";
        NodeList children = rawDoc.getElementsByTagName("body");
        if (children.getLength() > 0) {
            body = getText(children.item(0));
        }
        return body;
    }

    /**
     * Collects the text below a node, depth first. The text of each child element is
     * followed by a single space; other node types are ignored.
     *
     * @param node the node whose descendants are read
     * @return the collected text
     */
    protected String getText(Node node) {
        NodeList children = node.getChildNodes();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            switch (child.getNodeType()) {
                case Node.ELEMENT_NODE:
                    sb.append(getText(child));
                    sb.append(" ");
                    break;
                case Node.TEXT_NODE:
                    sb.append(((Text) child).getData());
                    break;
                default:
                    // Comments, processing instructions etc. carry no indexable text.
                    break;
            }
        }
        return sb.toString();
    }
}
To get an InputStream from a URL:
// Open a connection to the address in htmlURLlocation and obtain a stream over its content.
URL url = new URL(htmlURLlocation);
URLConnection connection = url.openConnection();
// The caller is responsible for closing the returned stream.
InputStream stream = connection.getInputStream();
To get an InputStream from a File:
// Open the file at the path htmlFile for reading; the caller must close the stream.
InputStream stream = new FileInputStream(new File (htmlFile));

We Keep Coding

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

Apache POI - Split Word document (docx) to pages - java

Related

Resize Pages with lowagie.itext from PdfImportedPage

why file is not deleted inspite of using delete function?

Reading formulas which are embedded objects from some OLE formula editor in a Docx file using Apache POI

how to refresh the content of the jtable if you're using notepad to putting data in the jtable

lucene indexing of html files

Categories

Resources