I'm working on a resume parser. I can extract some of the data (e.g. company details) when it appears as plain text, but not when it is kept in a grid or table.
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.JOptionPane;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
public class CmpnyNameex {
public static void main(String[] args)throws IOException {
String text="";
String name="";
XWPFDocument msDocx = new XWPFDocument(new FileInputStream("A:\\Resumes\\Anwesh.docx"));
XWPFWordExtractor extractor = new XWPFWordExtractor(msDocx);
text = extractor.getText();
}
catch(FileNotFoundException ex){ex.printStackTrace();
JOptionPane.showMessageDialog(null,"The system cannot find the file specified file it may be because of old file format","Error",JOptionPane.ERROR_MESSAGE);
}
String rx13="(?<=Have been associated with).*.(.*Ltd?)";
Pattern p1 = Pattern.compile(rx13);
Matcher found1 = p1.matcher(text);
while(found1.find())
{
name= found1.group(0);
}
}
}
Related
I am trying to convert a .prn file to HTML, but because of the file format I am not able to parse it the way I want.
I have tried many approaches. Some of them are:
package main;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
/**
 * Converts the fixed-width text file {@code .\Workbook2.prn} into an HTML table
 * written to {@code output_prn.html}.
 */
public class PrnToHtml {
    /**
     * Writes one table row per input line; a line is split into cells wherever
     * two or more whitespace characters occur in a row.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try (BufferedReader reader = new BufferedReader(new FileReader(".\\Workbook2.prn"));
             FileWriter writer = new FileWriter("output_prn.html")) {
            writer.write("<html><body><h3>PRN to HTML</h3><table border>\n");
            String currentLine;
            while ((currentLine = reader.readLine()) != null) {
                writer.write("<tr>");
                for (String field : currentLine.split("\\s{2,}")) {
                    // Escape the cell text so characters such as & and < in the data
                    // cannot break the generated markup.
                    writer.write("<td>" + escapeHtml(field) + "</td>");
                }
                writer.write("</tr>\n");
            }
            writer.write("</table></body></html>\n");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Replaces the characters that are special in HTML with their entities. */
    private static String escapeHtml(String raw) {
        StringBuilder escaped = new StringBuilder(raw.length());
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }
}
The output of this is an HTML page that looks like this:
The .prn file data looks like this:
The other approach I tried for reading the file is:
package main;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Reads {@code .\Workbook2.prn}, splits every line on single spaces and echoes
 * the tokens to standard output.
 */
public class PRNToHtml {
    private static final String DILIM_PRN = " ";
    private static final Pattern PRN_SPLITTER = Pattern.compile(DILIM_PRN);

    /**
     * Prints each input line as its space-separated tokens, each stripped of any
     * remaining whitespace and followed by one space.
     *
     * @param args unused
     * @throws URISyntaxException declared for compatibility; not thrown by this code
     * @throws IOException if the input file cannot be opened
     */
    public static void main(String[] args) throws URISyntaxException, IOException {
        // The reader is declared as its own resource so that it is closed explicitly,
        // rather than relying on closing the derived stream.
        try (BufferedReader reader = new BufferedReader(new FileReader(".\\Workbook2.prn"));
             Stream<String> lines = reader.lines()) {
            List<String[]> inputValuesInLines = lines.map(l -> PRN_SPLITTER.split(l)).collect(Collectors.toList());
            for (String[] strings : inputValuesInLines) {
                for (String s : strings) {
                    System.out.print(s.replaceAll("\\s+", "") + " ");
                }
                System.out.println();
            }
        }
    }
}
The output of this looks exactly like the data in the .prn file. But when I try to embed it in HTML, it looks weird, like this:
Help will be appreciated.
Thank you :)
I am using PDFBox to read and save the contents of a PDF file. The requirement is that the text should be split into header and item data, held in separate array lists.
The PDF looks like this:
Expected:
The header details (PO, delivery date, vendor number) should be stored in array list 1, and the other details (barcode, item number, description, quantity) should be stored in array list 2.
Existing code for extracting the data as text from the PDF:
PDFBoxReadFromFile.java
package pdfboxreadfromfile;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.io.*;
/**
 * Command-line entry point: extracts the text of a PDF through {@link PDFManager},
 * echoes it to standard output and saves it to a text file.
 */
public class PDFBoxReadFromFile {
    /**
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) {
        PDFManager pdfManager = new PDFManager();
        pdfManager.setFilePath("C:\\Users\\34\\Documents\\test.pdf");
        try {
            String text = pdfManager.toText();
            System.out.println(text);
            File file = new File("C:/Users/34/eclipse-workspace/pdfboxreadfromfile/file.txt");
            // try-with-resources closes the writer even if println fails part-way.
            try (PrintWriter pw = new PrintWriter(new FileWriter(file))) {
                pw.println(text);
            }
        } catch (IOException ex) {
            Logger.getLogger(PDFBoxReadFromFile.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
}
PDFManager.Java
package pdfboxreadfromfile;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Loads a PDF from a configured path and extracts its text with PDFBox.
 *
 * <p>The parsed document stays open after {@link #toText()} so that
 * {@link #getPdDoc()} can hand it out; call {@link #close()} when finished
 * to release it.
 */
public class PDFManager implements java.io.Closeable {
    private PDFParser parser;
    private PDFTextStripper pdfStripper;
    private PDDocument pdDoc;
    private COSDocument cosDoc;
    private String text;
    private String filePath;
    private File file;

    public PDFManager() {
    }

    /**
     * Parses the file set through {@link #setFilePath(String)} and returns the
     * text of all of its pages.
     *
     * @return the extracted text
     * @throws IOException if the file cannot be read or parsed
     * @throws NullPointerException if no file path has been set
     */
    public String toText() throws IOException {
        // Fail with a descriptive message instead of the bare NPE that new File(null) raises.
        java.util.Objects.requireNonNull(filePath, "filePath must be set via setFilePath() before calling toText()");
        this.pdfStripper = null;
        this.pdDoc = null;
        this.cosDoc = null;
        file = new File(filePath);
        parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        // PDFTextStripper page numbers are 1-based, so the first page is 1.
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        text = pdfStripper.getText(pdDoc);
        return text;
    }

    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }

    /**
     * @return the document produced by the most recent {@link #toText()} call, or
     *         {@code null} if none has been loaded or it has been closed
     */
    public PDDocument getPdDoc() {
        return pdDoc;
    }

    /**
     * Closes the document opened by the most recent {@link #toText()} call, if any.
     * Calling it again without a new {@code toText()} does nothing.
     *
     * @throws IOException if closing the document fails
     */
    @Override
    public void close() throws IOException {
        if (pdDoc != null) {
            pdDoc.close();
            pdDoc = null;
            cosDoc = null;
        }
    }
}
java
I have two docx files which I want to merge, and I have the following docx4j code, which gives some errors. Both docx files are used for track changes, so the output file should contain the details of both docx files.
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.io.SaveToZipFile;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import com.plutext.merge.BlockRange;
import com.plutext.merge.BlockRange.HfBehaviour;
import com.plutext.merge.BlockRange.SectionBreakBefore;
import com.plutext.merge.BlockRange.NumberingHandler;
import com.plutext.merge.BlockRange.StyleHandler;
import com.plutext.merge.DocumentBuilder;
/**
 * Merges whole .docx files from the working directory into a single document,
 * one {@link BlockRange} per input file.
 */
public class MergeWholeDocumentsUsingBlockRange
{
    public final static String DIR_IN = System.getProperty("user.dir")+ "/";
    public final static String DIR_OUT = System.getProperty("user.dir")+ "/";

    /**
     * Loads each input file, configures how it is merged, and saves the result
     * as {@code OUT_MergeWholeDocumentsUsingBlockRange.docx}.
     *
     * @param args unused
     * @throws Exception if loading, merging or saving fails
     */
    public static void main(String[] args) throws Exception {
        String[] files = {"table.docx", "Table1.docx"};
        // Typed list instead of the raw List, so only BlockRange instances can be added.
        List<BlockRange> blockRanges = new ArrayList<>();
        for (String fileName : files) {
            BlockRange block = new BlockRange(WordprocessingMLPackage.load(new File(DIR_IN + fileName)));
            blockRanges.add( block );
            block.setStyleHandler(StyleHandler.RENAME_RETAIN);
            block.setNumberingHandler(NumberingHandler.ADD_NEW_LIST);
            block.setRestartPageNumbering(false);
            block.setHeaderBehaviour(HfBehaviour.DEFAULT);
            block.setFooterBehaviour(HfBehaviour.DEFAULT);
            block.setSectionBreakBefore(SectionBreakBefore.NEXT_PAGE);
        }
        // Perform the actual merge
        DocumentBuilder documentBuilder = new DocumentBuilder();
        WordprocessingMLPackage output = documentBuilder.buildOpenDocument(blockRanges);
        // Save the result
        SaveToZipFile saver = new SaveToZipFile(output);
        saver.save(DIR_OUT+"OUT_MergeWholeDocumentsUsingBlockRange.docx");
    }
}
I've written two separate pieces of code and now I want to merge them. The first piece opens a text file and displays its contents; the second validates manually entered postcodes. I want to read a text file and then automatically validate the postcodes within it, but I'm not sure how to merge the two. If you have any questions please ask, as I'm stuck.
package postcodesort;
import java.util.*;
import java.util.Random;
import java.util.Queue;
import java.util.TreeSet;
import java.io.File;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.LinkedList;
import java.util.StringTokenizer;
/**
 * Prints every line of {@code postcodes1.txt} to standard output.
 */
public class PostCodeSort
{
    Queue<String> postcodeStack = new LinkedList<String>();

    /**
     * Echoes the postcode file line by line. A file that cannot be opened or
     * read is reported on standard error rather than aborting the program.
     *
     * @param args unused
     * @throws IOException declared for compatibility; read errors are reported, not rethrown
     */
    public static void main(String[] args) throws IOException
    {
        // try-with-resources closes the reader on every path and, unlike an
        // unconditional close() in finally, cannot fail with a
        // NullPointerException when the file could not be opened.
        try (BufferedReader br = new BufferedReader(new FileReader("postcodes1.txt"))) {
            String str;
            while ((str = br.readLine()) != null) {
                System.out.println(str + "");
            }
        } catch (IOException ex) {
            System.err.println("Could not read postcodes1.txt: " + ex.getMessage());
        }
    }
}
Second part that manually validates postcodes:
// Candidate postcodes to run through the pattern.
List<String> candidates = new ArrayList<String>();
// These four are expected to match
candidates.add("SW1W 0NY");
candidates.add("PO16 7GZ");
candidates.add("GU16 7HF");
candidates.add("L1 8JQ");
// These two are expected to be rejected
candidates.add("Z1A 0B1");
candidates.add("A1A 0B11");
// Two parts separated by exactly one space; the whole string must match.
Pattern postcodePattern = Pattern.compile("^[A-Z]{1,2}[0-9R][0-9A-Z]? [0-9][ABD-HJLNP-UW-Z]{2}$");
for (String candidate : candidates)
{
    boolean accepted = postcodePattern.matcher(candidate).matches();
    System.out.println(accepted);
}
You should create a class called something like ZipCodeValidator that contains the functionality of your second snippet. It will look something like this
/**
 * Checks strings against a fixed postcode pattern.
 */
public class ZipCodeValidator {
    private static final String REGEX = "^[A-Z]{1,2}[0-9R][0-9A-Z]? [0-9][ABD-HJLNP-UW-Z]{2}$";
    // Compiled once and shared by every call.
    private static final Pattern PATTERN = Pattern.compile(REGEX);

    /**
     * Tells whether the whole of {@code zipCode} matches the postcode pattern.
     *
     * @param zipCode the text to check; may be {@code null}
     * @return {@code true} if it matches, {@code false} otherwise (including for {@code null})
     */
    public boolean isValid(String zipCode) {
        if (zipCode == null) {
            return false;
        }
        // Match against the parameter itself; the earlier code referred to an undefined name.
        return PATTERN.matcher(zipCode).matches();
    }
}
Then you can create an instance of this class
ZipCodeValidator zipCodeValidator = new ZipCodeValidator();
and then use it in your main method
boolean valid = zipCodeValidator.isValid(zipCode);
Merging your question and the answer by @hiflyer, I have posted this answer. It assumes that the file postcodes1.txt has each zip code on a separate line.
package postcodesort;
import java.util.*;
import java.util.Random;
import java.util.Queue;
import java.util.TreeSet;
import java.io.File;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.LinkedList;
import java.util.StringTokenizer;
/**
 * Reads {@code postcodes1.txt} and reports, line by line, whether each entry
 * is a valid postcode according to {@link ZipCodeValidator}.
 */
public class PostCodeSort
{
    Queue<String> postcodeStack = new LinkedList<String>();

    /**
     * Validates every line of the postcode file and prints the verdict. A file
     * that cannot be opened or read is reported on standard error rather than
     * aborting the program.
     *
     * @param args unused
     * @throws IOException declared for compatibility; read errors are reported, not rethrown
     */
    public static void main(String[] args) throws IOException
    {
        ZipCodeValidator zipCodeValidator = new ZipCodeValidator();
        // try-with-resources closes the reader on every path and, unlike an
        // unconditional close() in finally, cannot fail with a
        // NullPointerException when the file could not be opened.
        try (BufferedReader br = new BufferedReader(new FileReader("postcodes1.txt"))) {
            String str;
            while ((str = br.readLine()) != null) {
                if (zipCodeValidator.isValid(str)) {
                    System.out.println(str + " is valid");
                } else {
                    System.out.println(str + " is not valid");
                }
            }
        } catch (IOException ex) {
            System.err.println("Could not read postcodes1.txt: " + ex.getMessage());
        }
    }
}
/**
 * Checks strings against a fixed postcode pattern.
 */
public class ZipCodeValidator {
    private static final String REGEX = "^[A-Z]{1,2}[0-9R][0-9A-Z]? [0-9][ABD-HJLNP-UW-Z]{2}$";
    // Compiled once and shared by every call.
    private static final Pattern PATTERN = Pattern.compile(REGEX);

    /**
     * Tells whether the whole of {@code zipCode} matches the postcode pattern.
     *
     * @param zipCode the text to check; may be {@code null}
     * @return {@code true} if it matches, {@code false} otherwise (including for {@code null})
     */
    public boolean isValid(String zipCode) {
        if (zipCode == null) {
            return false;
        }
        // Match against the parameter itself; the earlier code referred to an undefined name.
        return PATTERN.matcher(zipCode).matches();
    }
}
I'm trying to find a specific string of numbers in a CSV file, and I keep getting a FileNotFoundException even though the file exists. I can't seem to fix the problem.
Sample Csv file
12141895, LM051
12148963, Lm402
12418954, Lm876
User Input : 12141895
Desired Result : True
import javax.swing.*;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
import javax.swing.*;
import java.io.*;
import java.awt.*;
import java.awt.List;
/**
 * Asks the user for a student ID and reports whether it is listed in the
 * ID/course CSV file.
 */
public class tester
{
    /** Location of the CSV file whose first column holds the student IDs. */
    private static final String ID_FILE_PATH = "C:\\Users\\Packard Bell\\Desktop\\ProjectOOD\\IDandCourse.csv";

    public static void main (String [] args ) throws IOException
    {
        boolean cool = checkValidID();
        System.out.println(cool);
    }

    /**
     * Prompts for a student ID and looks it up in the default CSV file.
     *
     * @return {@code true} if the entered ID is listed; {@code false} if it is not,
     *         or if the dialog was cancelled
     * @throws IOException if the CSV file does not exist or cannot be opened
     */
    public static boolean checkValidID() throws IOException
    {
        File csvFile = new File(ID_FILE_PATH);
        // Check before prompting, and name the resolved path so a wrong location is easy to spot.
        if (!csvFile.isFile()) {
            throw new FileNotFoundException("ID file not found: " + csvFile.getAbsolutePath());
        }
        String word = JOptionPane.showInputDialog("Enter your student ID");
        return checkValidID(csvFile, word);
    }

    /**
     * Looks up a student ID in the first column of a comma-separated file.
     *
     * @param csvFile   file with one record per line, the ID in the first column
     * @param studentId the ID to look for; surrounding whitespace is ignored
     * @return {@code true} if some line's first column equals the ID; {@code false}
     *         otherwise, including when the ID is {@code null} or blank
     * @throws IOException if the file cannot be opened
     */
    public static boolean checkValidID(File csvFile, String studentId) throws IOException
    {
        // A cancelled input dialog yields null; treat that (and blank input) as "not found".
        if (studentId == null) {
            return false;
        }
        String wanted = studentId.trim();
        if (wanted.isEmpty()) {
            return false;
        }
        try (Scanner scan = new Scanner(csvFile)) {
            while (scan.hasNextLine()) {
                String line = scan.nextLine();
                int comma = line.indexOf(',');
                // Compare the whole ID column, so partial IDs and course codes do not match.
                String idColumn = (comma >= 0 ? line.substring(0, comma) : line).trim();
                if (idColumn.equals(wanted)) {
                    return true;
                }
            }
        }
        return false;
    }
}
use:
import java.io.File;
import java.io.FileNotFoundException;
Also, in your function declaration try using
public static boolean checkValidID() throws FileNotFoundException
and likewise with the main function.
If the file is present and named correctly, this should handle it.