I have two docx files that I want to merge, and the following docx4j code gives me some errors. Both docx files are used for track changes, so the output file should contain the details of both docx files.
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.io.SaveToZipFile;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import com.plutext.merge.BlockRange;
import com.plutext.merge.BlockRange.HfBehaviour;
import com.plutext.merge.BlockRange.SectionBreakBefore;
import com.plutext.merge.BlockRange.NumberingHandler;
import com.plutext.merge.BlockRange.StyleHandler;
import com.plutext.merge.DocumentBuilder;
/**
 * Merges whole .docx documents into a single output document by wrapping each
 * input package in a {@link BlockRange} and passing the ranges to
 * {@link DocumentBuilder}.
 */
public class MergeWholeDocumentsUsingBlockRange
{
    /** Directory the input documents are read from (the current working directory). */
    public final static String DIR_IN = System.getProperty("user.dir") + "/";

    /** Directory the merged document is written to (the current working directory). */
    public final static String DIR_OUT = System.getProperty("user.dir") + "/";

    /**
     * Loads every input document, applies the same merge settings to each one,
     * builds the merged package and saves it under {@link #DIR_OUT}.
     *
     * @param args command line arguments (not used)
     * @throws Exception if a document cannot be loaded, merged or saved
     */
    public static void main(String[] args) throws Exception {
        String[] files = {"table.docx", "Table1.docx"};

        // Typed list instead of a raw List so that only BlockRange instances can be added.
        List<BlockRange> blockRanges = new ArrayList<BlockRange>();
        for (String fileName : files) {
            BlockRange block = new BlockRange(
                    WordprocessingMLPackage.load(new File(DIR_IN + fileName)));
            block.setStyleHandler(StyleHandler.RENAME_RETAIN);
            block.setNumberingHandler(NumberingHandler.ADD_NEW_LIST);
            block.setRestartPageNumbering(false);
            block.setHeaderBehaviour(HfBehaviour.DEFAULT);
            block.setFooterBehaviour(HfBehaviour.DEFAULT);
            block.setSectionBreakBefore(SectionBreakBefore.NEXT_PAGE);
            blockRanges.add(block);
        }

        // Perform the actual merge
        DocumentBuilder documentBuilder = new DocumentBuilder();
        WordprocessingMLPackage output = documentBuilder.buildOpenDocument(blockRanges);

        // Save the result
        SaveToZipFile saver = new SaveToZipFile(output);
        saver.save(DIR_OUT + "OUT_MergeWholeDocumentsUsingBlockRange.docx");
    }
}
Related
I have a workbook with multiple tabs. I want to read all the tab names and return only the tab names that contain a particular string; in my case it is "data".
Example assumption:
Expected tab names -> welcome_data.xslx, hello_value.xslx
Output -> welcome_data.xslx
Any help is appreciated.
The code I was trying is below. I was able to get the tab names, but how do I filter them?
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
/**
 * Opens an .xlsx workbook, prints how many sheets it has and collects the
 * sheet names.
 */
public class EXcelsheet {

    /**
     * Reads the workbook at a fixed path, prints its sheet count and gathers
     * every sheet name into a list.
     *
     * @param args command line arguments (not used)
     * @throws IOException if the workbook cannot be opened, read or closed
     */
    public static void main(String[] args) throws IOException {
        String xlsxFile = "/home/working/git/ui_spec.xlsx";

        // try-with-resources: the original never closed the stream or the workbook.
        try (FileInputStream file = new FileInputStream(new File(xlsxFile));
             XSSFWorkbook workbook = new XSSFWorkbook(file)) {
            int sheetCount = workbook.getNumberOfSheets();
            System.out.println("number of sheet::" + sheetCount);

            List<String> sheetNames = new ArrayList<String>(sheetCount);
            for (int i = 0; i < sheetCount; i++) {
                sheetNames.add(workbook.getSheetName(i));
            }
        }
    }
}
I don't get the issue.
Does replacing
sheetNames.add(workbook.getSheetName(i));
with
String sheetName = workbook.getSheetName(i);
if (sheetName.contains(<char>)
{
sheetNames.add(workbook.getSheetName(i));
}
not work?
I am using PDFBox to read and save the contents of a PDF file. The requirement is that the text should be split into header and item data in separate array lists.
The PDF looks like the one below.
Expected:
The following details (PO, DeliveryDate, Vendor no) should be shown in array list 1, and the other details such as barcode, item number, description and quantity should be shown in array list 2.
Existing code for extracting the data as text from the PDF:
PDFBoxReadFromFile.java
package pdfboxreadfromfile;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.io.*;
/**
 * Extracts the text of a PDF through {@code PDFManager}, prints it to standard
 * output and stores it in a text file.
 */
public class PDFBoxReadFromFile {

    /**
     * Extracts the text of a fixed PDF file, prints it and writes it to
     * {@code file.txt}. Any {@link IOException} is logged at SEVERE level
     * instead of being propagated.
     *
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        PDFManager pdfManager = new PDFManager();
        pdfManager.setFilePath("C:\\Users\\34\\Documents\\test.pdf");
        try {
            String text = pdfManager.toText();
            System.out.println(text);

            File file = new File("C:/Users/34/eclipse-workspace/pdfboxreadfromfile/file.txt");
            // try-with-resources closes the writer on every path.
            try (PrintWriter pw = new PrintWriter(new FileWriter(file))) {
                pw.println(text);
                // PrintWriter never throws on write failures; surface them explicitly
                // so that a failed write is logged below instead of being lost.
                if (pw.checkError()) {
                    throw new IOException("Failed to write extracted text to " + file);
                }
            }
        } catch (IOException ex) {
            Logger.getLogger(PDFBoxReadFromFile.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
}
PDFManager.Java
package pdfboxreadfromfile;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Parses a PDF file with PDFBox and extracts the text of all its pages.
 *
 * <p>The parsed document is kept in a field and exposed through
 * {@link #getPdDoc()}; this class never closes it, so closing it is left to
 * the caller.
 */
public class PDFManager {
    private PDFParser parser;
    private PDFTextStripper pdfStripper;
    private PDDocument pdDoc;
    private COSDocument cosDoc;
    private String text;
    private String filePath;
    private File file;

    public PDFManager() {
    }

    /**
     * Parses the file set with {@link #setFilePath(String)} and returns the
     * text of every page.
     *
     * @return the extracted text
     * @throws IOException if the file cannot be opened, parsed or stripped
     */
    public String toText() throws IOException {
        this.pdfStripper = null;
        this.pdDoc = null;
        this.cosDoc = null;

        file = new File(filePath);
        RandomAccessFile source = new RandomAccessFile(file, "r"); // update for PDFBox V 2.0
        try {
            parser = new PDFParser(source);
            parser.parse();
        } catch (IOException e) {
            // Parsing failed, so no document will own the file handle: release it here.
            try {
                source.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }

        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);

        // PDFTextStripper page numbers start at 1, so extract pages 1..N.
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());

        text = pdfStripper.getText(pdDoc);
        return text;
    }

    /**
     * Sets the path of the PDF file that {@link #toText()} reads.
     *
     * @param filePath path of the PDF file
     */
    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }

    /**
     * Returns the document created by the most recent {@link #toText()} call.
     *
     * @return the parsed document, or {@code null} if none has been created yet
     */
    public PDDocument getPdDoc() {
        return pdDoc;
    }
}
java
Using Java, I need to read a WARC archive file, filter it depending on the content of the HTML page, and write a new archive file.
The following code reads the archive. How can I reconstruct an org.archive.io.warc.WARCRecordInfo from an org.archive.io.ArchiveRecord?
import org.apache.commons.io.IOUtils;
import org.archive.io.ArchiveRecord;
import org.archive.io.warc.*;
import org.archive.wayback.resourcestore.resourcefile.WarcResource;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;
public class Test126b {
public static void main() throws Exception {
File out = new java.io.File("out.warc.gz");
OutputStream bos = new BufferedOutputStream(new FileOutputStream(out));
WARCWriterPoolSettings settings = ...
WARCWriter writer = new WARCWriter(new AtomicInteger(), bos, out, settings);
File in = new java.io.File("in.warc.gz");
WARCReader reader = WARCReaderFactory.get(in);
Iterator<ArchiveRecord> it = reader.iterator();
while (it.hasNext()) {
ArchiveRecord archiveRecord = it.next();
if (archiveRecord.getHeader().getHeaderValue("WARC-Type") == "response") {
WARCRecord warcRecord = (WARCRecord) archiveRecord;
WarcResource warcResource = new WarcResource(warcRecord, reader);
warcResource.parseHeaders();
String url = warcResource.getWarcHeaders().getUrl();
System.out.println("+++ url: " + url);
byte[] content = IOUtils.toByteArray(warcResource);
String htmlPage = new String(content);
if (htmlPage.contains("hello world")) {
writer.writeRecord(warcRecordInfo) // how to reconstruct the WARCRecordInfo
}
}
}
reader.close();
writer.close();
}
}
I'm working on a resume parser. I can extract some of the data, e.g. company details, from plain text, but I cannot get it when it is kept in a grid or table.
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.JOptionPane;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
public class CmpnyNameex {
public static void main(String[] args)throws IOException {
String text="";
String name="";
XWPFDocument msDocx = new XWPFDocument(new FileInputStream("A:\\Resumes\\Anwesh.docx"));
XWPFWordExtractor extractor = new XWPFWordExtractor(msDocx);
text = extractor.getText();
}
catch(FileNotFoundException ex){ex.printStackTrace();
JOptionPane.showMessageDialog(null,"The system cannot find the file specified file it may be because of old file format","Error",JOptionPane.ERROR_MESSAGE);
}
String rx13="(?<=Have been associated with).*.(.*Ltd?)";
Pattern p1 = Pattern.compile(rx13);
Matcher found1 = p1.matcher(text);
while(found1.find())
{
name= found1.group(0);
}
}
}
I am able to read the input document using Apache POI and can also find the data between the tags (the part to be hidden), but I am unable to write the data to the output file. How can I write the data and hide it in the generated output file?
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Range;
/**
 * Reads a .doc file, looks for text wrapped in {@code <hidden>} tags, marks a
 * character run as vanished for each tag found and writes the document to a
 * new file.
 */
public class Hidden {

    /**
     * Loads {@code D://me1.doc}, processes every paragraph and saves the result
     * to {@code D://Test.doc}.
     *
     * @param args command line arguments (not used)
     * @throws Exception if the document cannot be read or written
     */
    public static void main(String args[]) throws Exception
    {
        File file = new File("D://me1.doc");

        HWPFDocument doc;
        try (FileInputStream fin = new FileInputStream(file)) {
            doc = new HWPFDocument(fin);
        }

        Range range = doc.getRange();
        WordExtractor extractor = new WordExtractor(doc);
        String para[] = extractor.getParagraphText();

        for (String p : para) {
            // Splitting on '<' and '>' yields: text, tag name, tagged text, closing tag, ...
            String[] w = p.split("[<\\>]");
            for (int k = 0; k < w.length; k++) {
                if (!"hidden".equalsIgnoreCase(w[k])) {
                    continue;
                }
                int contentIndex = k + 1;
                if (contentIndex >= w.length) {
                    // A "hidden" tag with nothing after it: nothing to hide in this paragraph.
                    break;
                }
                // NOTE(review): the token index is used as the character-run index, as in
                // the original; confirm that this actually selects the run holding the text.
                CharacterRun run = range.getCharacterRun(contentIndex);
                String hidden = w[contentIndex];
                System.out.println(hidden);
                run.setVanished(true);
                // Skip the tagged text; the loop increment then skips the closing tag.
                k += 2;
            }
        }

        // Write the document exactly once, after all runs are updated. The original
        // wrote inside the loop, appending a full copy to the same stream per match.
        try (FileOutputStream fout = new FileOutputStream("D://Test.doc")) {
            doc.write(fout);
        }
    }
}