convert pdf to html page wise using pdfbox library

convert pdf to html page wise using pdfbox library - java

public class ExtractText
{
/**
* private constructor.
*/
private ExtractText()
{
//static class
}
public static void main( String[] args ) throws Exception
{
if(l!=null)
{
System.out.println("HERE"+l.length);
deleteSubs(op);
System.out.println("Then"+l.length);
}
else
{
System.out.println("WHERE");
}
File y=new File(imgDes);
if(!y.exists())
{
y.mkdirs();
}
File z=new File(imgDestination);
if(!z.exists())
{
z.mkdirs();
}
File fr=new File(outputFile);
if(!fr.isDirectory())
{
fr.delete();
}
// Defaults to text files
String ext = ".txt";
int startPage = 1;
int endPage = Integer.MAX_VALUE;
Writer output = null;
PDDocument document =null;
try
{
try
{
URL url = new URL( pdfFile );
document = PDDocument.load(url, force);
String fileName = url.getFile();
if( outputFile == null && fileName.length() >4)
{
outputFile = new File( fileName.substring( 0, fileName.length() -4 ) + ext ).getName();
}
}
catch( MalformedURLException e)
{
document = PDDocument.load(pdfFile, force);
if( outputFile == null && pdfFile.length() >4 )
{
outputFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ext;
}
}
//document.print();
if( document.isEncrypted() )
{
StandardDecryptionMaterial sdm = new StandardDecryptionMaterial( password );
document.openProtection(sdm);
AccessPermission ap = document.getCurrentAccessPermission();
if( ! ap.canExtractContent() )
{
throw new IOException("You do not have permission to extract text" );
}
}
if ((encoding == null) && (toHTML))
{
encoding = "UTF-8";
}
if( toConsole )
{
output = new OutputStreamWriter(System.out);
}
else
{
if( encoding != null )
{
output = new OutputStreamWriter(new FileOutputStream( outputFile ), encoding );
}
else
{
//use default encoding
output = new OutputStreamWriter(new FileOutputStream( outputFile ) );
}
}
PDFTextStripper4 stripper = null;
if(toHTML)
{
stripper = new PDFText2HTML(encoding);
}
else
{
stripper = new PDFTextStripper4(encoding);
}
File f= new File(imgDestination);
PDDocument pd;
int i=0;
if(f.exists())
{
pd=PDDocument.load(pdfFile);
PDFontDescriptor fd;
fd = new PDFontDescriptorDictionary();
List<PDPage> li=pd.getDocumentCatalog().getAllPages();
for(PDPage page:li)
{
PDResources pdr=page.getResources();
Map<String, PDFont> m=pdr.getFonts();
PDStream pst;
for(PDFont pdd:m.values())
{
System.out.println("----------"+pdd.getBaseFont());
pdd.getFontDescriptor();
fd = pdd.getFontDescriptor();
pdd.setFontDescriptor((PDFontDescriptorDictionary)fd);
System.out.println("tititititi"+pdd.getFontEncoding());
if(pdd.isType1Font())
{
pst=((PDFontDescriptorDictionary) fd).getFontFile3();
System.out.println("In If "+pst);
if(pst!= null)
{
FileOutputStream fos = new FileOutputStream(new File(imgDestination+pdd.getBaseFont().toString()+".pfb"));
IOUtils.copy(pst.createInputStream(), fos);
i++;
System.out.println(i);
fos.close();
}
}
else
if(pdd.isTrueTypeFont())
{
pst= ((PDFontDescriptorDictionary) fd).getFontFile2();
System.out.println("In Else-if"+pst);
if (pst!= null)
{
FileOutputStream fos = new FileOutputStream(new File(imgDestination+pdd.getBaseFont().toString()+".ttf"));
IOUtils.copy(pst.createInputStream(), fos);
i++;
System.out.println(i);
fos.close();
}
}
else
if(pdd.isSymbolicFont())
{
System.out.println("Symbol.......");
}
else
{
System.out.println("In Else");
}
}
}
int pageCount = document.getDocumentCatalog().getAllPages().size();
for (int p = 0; p < pageCount; ++p)
{
System.out.println("I am in for loop");
stripper.setForceParsing( force );
stripper.setSortByPosition( true );
stripper.setShouldSeparateByBeads(separateBeads);
stripper.setStartPage( p);
stripper.setEndPage( p);
stripper.writeText( document, output );
FileOutputStream fos = new FileOutputStream(new File(f5+(p+1)+".html"));
output.close();
}
PDDocumentInformation info = document.getDocumentInformation();
System.out.println( "Page Count=" + document.getNumberOfPages());
System.out.println( "Title=" + info.getTitle());
System.out.println( "Author=" + info.getAuthor());
System.out.println( "Subject=" + info.getSubject() );
System.out.println( "Keywords=" + info.getKeywords() );
System.out.println( "Creator=" + info.getCreator() );
System.out.println( "Producer=" + info.getProducer() );
System.out.println( "Creation Date=" + info.getCreationDate() );
System.out.println( "Modification Date=" + info.getModificationDate());
System.out.println( "Trapped=" + info.getTrapped());
}
}catch(Exception e)
{
e.printStackTrace();
}
finally
{
if( output != null)
{
output.close();
}
if( document != null )
{
document.close();
}
}
}
private static void deleteSubs(File op)
{
// TODO Auto-generated method stub
File[] files = op.listFiles();
System.out.print("In delete folder");
if(files!=null)
{
//some JVMs return null for empty dirs
for(File f: files)
{
if(f.isDirectory())
{
deleteSubs(f);
}
else
{
f.delete();
}
}
}
op.delete();
}
}
now i am able to get entire pdf to a html file i.e.. I am extracting text only not images but i want to get every page of a pdf in to single html so any solution for this is quite helpful to me.. ThankYou

The answer is in your question: just set
stripper.setStartPage( p );
stripper.setEndPage( p );
accordingly. So you would loop somewhat like this:
int pageCount = document.getDocumentCatalog().getAllPages().size();
for (int p = 0; p < pageCount; ++p)
{
//... your options
stripper.setStartPage(p);
stripper.setEndPage(p);
FileOutputStream fos = new FileOutputStream(new File(f5+(p+1)+".html"));
stripper.writeText(document, fos);
fos.close();
}
Btw if you get an exception relating to the sorting comparator, use setSortByPosition(false), or wait for version 1.8.8 where this problem is fixed.

Related

Read a string from a txt file and compare it with items in drop down list using webdriver/java

Here is the scenario:
1) create a file with input string=Sep 2015
2) Collect the drop down list into an array
3) if array equals string come out of loop else downloads new month report and overwrites txt file with new month name.
I tried below code, but I'm unable to implement the txt comparision part and txt overwrite part, please help.
driver.get("http://www.depreportingservices.state.pa.us/ReportServer/Pages/ReportViewer.aspx?%2fOil_Gas%2fOil_Gas_Well_Historical_Production_Report");
//maximizing the window
driver.manage().window().maximize();
List<WebElement> options;
int i = 0;
do
{
options = driver.findElement(By.id("ReportViewerControl_ctl04_ctl03_ddValue")).findElements(By.tagName("option"));
if(options.get(i).getText().equals("Sep 2015 (Unconventional wells)"))
{
System.out.println("old month");
break;
}
else
{ if (options.get(i).getText().equalsIgnoreCase("All" )){
System.out.println("Download new month");
WebElement identifier = driver.findElement(By.xpath(".//*[#id='ReportViewerControl_ctl04_ctl03_ddValue']"));
Select select1 = new Select(identifier);
//select1.selectByVisibleText("Oct");
select1.selectByVisibleText("Oct 2015 (Unconventional wells)");
Wait(20000);
driver.findElement(By.xpath(".//*[#id='ReportViewerControl_ctl04_ctl00']")).click();
Wait(70000);
//Click on File save button
driver.findElement(By.xpath(".//*[#id='ReportViewerControl_ctl05_ctl04_ctl00_Button']")).click();
//wait time to load the options
Wait(20000);
driver.findElement(By.xpath(".//*[#id='ReportViewerControl_ctl05_ctl04_ctl00_Menu']/div[2]/a")).click();
//fprofile.setPreference( "browser.download.manager.showWhenStarting", false );
//fprofile.setPreference( "pdfjs.disabled", true );
Wait(10000);
String str=options.get(2).getText();
System.out.println("str: " + str);
// driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
System.out.println("Oct month data downloaded in csv format");
//System.out.println("New month");
}
} } while (i++ < options.size());
}

Once try like this:
//Global Variable:
private WebDriver driver;
private String fileName = "/home/saritha/Desktop/MySeleniumFile.txt";
private File file;
In Tesng method:
#Test
public void oilGasTestng() throws InterruptedException {
driver.get("http://www.depreportingservices.state.pa.us/ReportServer/Pages/ReportViewer.aspx?%2fOil_Gas%2fOil_Gas_Well_Historical_Production_Report");
WebElement mSelectElement = driver
.findElement(By
.xpath("//select[#id='ReportViewerControl_ctl04_ctl03_ddValue']"));
List<WebElement> optionsList = mSelectElement.findElements(By
.tagName("option"));
for (int i = 2; i < optionsList.size(); i++) {
WebElement element = optionsList.get(i);
String newMonth = element.getText();
/*
* First we have read the data from file, if the file is empty then
* download the file and save the downloaded month(which is old
* month when v done with the downloading).
*/
String oldMonth = "";
if (i > 2) {
oldMonth = getTheOldMonthFromFile();
}
System.out.println("Old Month= " + oldMonth + " NewMonth= "
+ newMonth);
if (newMonth.equals(oldMonth)) {
// IF the string are same, nthng we need to do
} else if (!newMonth.equals(oldMonth)) {
/*
* If the string are not same,then i.e., considered as new
* Month, download the new month details
*/
element.click();
driver.findElement(
By.xpath(".//*[#id='ReportViewerControl_ctl04_ctl00']"))
.click();
System.out.println(newMonth
+ " month data downloaded in csv format");
saveIntoAFile(newMonth);
/*
* You can which is oldMonth which is new month, by unCommenting
* below condition
*/
// if (i == 4)
break;
}
}
}
//Save data into a file
private void saveIntoAFile(String oldMonth) {
BufferedWriter bw = null;
if (oldMonth != null) {
file = new File(fileName);
try {
if (!file.exists()) {
file.createNewFile();
}
Writer writer = new FileWriter(file);
bw = new BufferedWriter(writer);
bw.write(oldMonth);
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (bw != null) {
bw.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
//Get the oldMonth string from the file
private String getTheOldMonthFromFile() {
if (file == null && !file.exists()) {
return null;
}
String oldMonth = "";
StringBuffer strBuffer = new StringBuffer();
BufferedReader br = null;
java.io.FileReader reader = null;
try {
reader = new java.io.FileReader(file);
br = new BufferedReader(reader);
while ((oldMonth = br.readLine()) != null) {
strBuffer.append(oldMonth);
}
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (br != null) {
br.close();
}
if (reader != null) {
reader.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
return strBuffer.toString();
}

Some of my Users can not download the zip?

hello so I have been writing an updater for my game.
1) It checks a .version file on drop box and compares it to the local .version file.
2) If there is any link missing from the local version of the file, it downloads the required link one by one.
The issue I am having is some of the users can download the zips and some cannot.
One of my users who was having the issue was using windows xp. So some of them have old computers.
I was wondering if anyone could help me to get an idea on what could be causing this.
This is the main method that is ran
public void UpdateStart() {
System.out.println("Starting Updater..");
if(new File(cache_dir).exists() == false) {
System.out.print("Creating cache dir.. ");
while(new File(cache_dir).mkdir() == false);
System.out.println("Done");
}
try {
version_live = new Version(new URL(version_file_live));
} catch(MalformedURLException e) {
e.printStackTrace();
}
version_local = new Version(new File(version_file_local));
Version updates = version_live.differences(version_local);
System.out.println("Updated");
int i = 1;
try {
byte[] b = null, data = null;
FileOutputStream fos = null;
BufferedWriter bw = null;
for(String s : updates.files) {
if(s.equals(""))
continue;
System.out.println("Reading file "+s);
text = "Downloading file "+ i + " of "+updates.files.size();
b = readFile(new URL(s));
progress_a = 0;
progress_b = b.length;
text = "Unzipping file "+ i++ +" of "+updates.files.size();
ZipInputStream zipStream = new ZipInputStream(new ByteArrayInputStream(b));
File f = null, parent = null;
ZipEntry entry = null;
int read = 0, entry_read = 0;
long entry_size = 0;
progress_b = 0;
while((entry = zipStream.getNextEntry()) != null)
progress_b += entry.getSize();
zipStream = new ZipInputStream(new ByteArrayInputStream(b));
while((entry = zipStream.getNextEntry()) != null) {
f = new File(cache_dir+entry.getName());
if(entry.isDirectory())
continue;
System.out.println("Making file "+f.toString());
parent = f.getParentFile();
if(parent != null && !parent.exists()) {
System.out.println("Trying to create directory "+parent.getAbsolutePath());
while(parent.mkdirs() == false);
}
entry_read = 0;
entry_size = entry.getSize();
data = new byte[1024];
fos = new FileOutputStream(f);
while(entry_read < entry_size) {
read = zipStream.read(data, 0, (int)Math.min(1024, entry_size-entry_read));
entry_read += read;
progress_a += read;
fos.write(data, 0, read);
}
fos.close();
}
bw = new BufferedWriter(new FileWriter(new File(version_file_local), true));
bw.write(s);
bw.newLine();
bw.close();
}
} catch(Exception e) {
this.e = e;
e.printStackTrace();
return;
}
System.out.println(version_live);
System.out.println(version_local);
System.out.println(updates);
try {
} catch (Exception er) {
er.printStackTrace();
}
}
I have been trying to fix this for the last two days and I am just so stumped at this point
All the best,
Christian

Java - Cannot view modified metadata

I have been trying to access modified metadata of images in Java for the past few hours. I know I am one or two steps away from getting the correct output. Would really appreciate if someone can help me with this.
I want to add an extra field in the metadata, which is like an text similar to:
Writing image metadata in Java, preferably PNG
My issue is that when i add the custom data to my image and when i read the image, i do not see the change in the modified metadata when i call the method readAndDisplay. I think the reason is that i am not saving the image properly into a new file with the modified metadata. Can someone have a look at what i am missing from my code:
public class Metadata {
public static void main(String[] args) throws Exception {
String imageFile = "134.png";
BufferedImage img = null;
try {
img = ImageIO.read(new File(imageFile));
} catch (IOException e) {
System.out.println(e.getMessage());
}
Metadata meta = new Metadata();
byte[] result = meta.writeCustomData(img, "decimalID", "211");
BufferedImage output = ImageIO.read(new ByteArrayInputStream(result));
ImageIO.write(output, "png", new File("output.png"));
meta.readAndDisplayMetadata("output.png");
}
void readAndDisplayMetadata( String fileName ) {
try {
File file = new File( fileName );
ImageInputStream iis = ImageIO.createImageInputStream(file);
Iterator<ImageReader> readers = ImageIO.getImageReaders(iis);
if (readers.hasNext()) {
// pick the first available ImageReader
ImageReader reader = readers.next();
// attach source to the reader
reader.setInput(iis, true);
// read metadata of first image
IIOMetadata metadata = reader.getImageMetadata(0);
String[] names = metadata.getMetadataFormatNames();
int length = names.length;
System.out.println(length);
for (int i = 0; i < length; i++) {
System.out.println( "Format name: " + names[ i ] );
displayMetadata(metadata.getAsTree(names[i]));
}
}
}
catch (Exception e) {
e.printStackTrace();
}
}
void readAndDisplayMetadata( byte[] image ) {
try {
// File file = new File( fileName );
ImageInputStream iis = ImageIO.createImageInputStream(new ByteArrayInputStream(image));
Iterator<ImageReader> readers = ImageIO.getImageReaders(iis);
if (readers.hasNext()) {
// pick the first available ImageReader
ImageReader reader = readers.next();
// attach source to the reader
reader.setInput(iis, true);
// read metadata of first image
IIOMetadata metadata = reader.getImageMetadata(0);
String[] names = metadata.getMetadataFormatNames();
int length = names.length;
System.out.println(length);
for (int i = 0; i < length; i++) {
System.out.println( "Format name: " + names[ i ] );
displayMetadata(metadata.getAsTree(names[i]));
}
}
}
catch (Exception e) {
e.printStackTrace();
}
}
void displayMetadata(Node root) {
displayMetadata(root, 0);
}
void indent(int level) {
for (int i = 0; i < level; i++)
System.out.print(" ");
}
void displayMetadata(Node node, int level) {
// print open tag of element
indent(level);
//System.out.println("Attributes: " + node.getAttributes().getLength());
System.out.print("<" + node.getNodeName());
NamedNodeMap map = node.getAttributes();
if (map != null) {
// print attribute values
int length = map.getLength();
for (int i = 0; i < length; i++) {
Node attr = map.item(i);
System.out.print(" " + attr.getNodeName() +
"=\"" + attr.getNodeValue() + "\"");
}
}
Node child = node.getFirstChild();
if (child == null) {
// no children, so close element and return
System.out.println("/>");
return;
}
// children, so close current tag
System.out.println(">");
while (child != null) {
// print children recursively
displayMetadata(child, level + 1);
child = child.getNextSibling();
}
// print close tag of element
indent(level);
System.out.println("</" + node.getNodeName() + ">");
}
public byte[] writeCustomData(BufferedImage buffImg, String key, String value) throws Exception {
ImageWriter writer = ImageIO.getImageWritersByFormatName("png").next();
ImageWriteParam writeParam = writer.getDefaultWriteParam();
ImageTypeSpecifier typeSpecifier = ImageTypeSpecifier.createFromBufferedImageType(BufferedImage.TYPE_INT_RGB);
//adding metadata
IIOMetadata metadata = writer.getDefaultImageMetadata(typeSpecifier, writeParam);
IIOMetadataNode textEntry = new IIOMetadataNode("tEXtEntry");
textEntry.setAttribute("keyword", key);
textEntry.setAttribute("value", value);
IIOMetadataNode text = new IIOMetadataNode("tEXt");
text.appendChild(textEntry);
IIOMetadataNode root = new IIOMetadataNode("javax_imageio_png_1.0");
root.appendChild(text);
metadata.mergeTree("javax_imageio_png_1.0", root);
//writing the data
ByteArrayOutputStream baos = new ByteArrayOutputStream();
ImageOutputStream stream = ImageIO.createImageOutputStream(baos);
writer.setOutput(stream);
writer.write(metadata, new IIOImage(buffImg, null, metadata), writeParam);
stream.close();
return baos.toByteArray();
}

Reading incorrect values?

I have the follow class, extending Thread.
The idea is to extract the date inside the thread, and everything goes well, until the sometime when the received data is larger than a few kilobytes, and then i am starting to reading complete incorrect data.
public class ThreadBooksPositions extends Thread
{
public ThreadBooksPositions()
{
}
..
// default constructors
public void run()
{
InputStream iSS = null;
HttpURLConnection connection = null;
Integer sectionsDescriptorSize1 = 0;
Integer sectionsDescriptorSize2 = 0;
try
{
URL url = new URL( "192.168.1.4/bookstore.asp?getbooks" );
connection = (HttpURLConnection) url.openConnection();
connection.setRequestMethod( "GET" );
connection.connect();
iSS = connection.getInputStream();
BufferedInputStream bIS = new BufferedInputStream( iSS );
if( bIS.available() > 4 )
{
Float lat = 0F;
Float lng = 0F;
ByteArrayOutputStream out = new ByteArrayOutputStream();
byte[] bf;
try
{
bf = new byte[ bIS.available() ];
while ( bIS.read( bf ) != -1)
out.write( bf ); //copy streams
out.flush();
}
catch ( IOException e )
{
// TODO Auto-generated catch block
e.printStackTrace();
} //you can configure the buffer size
byte[] bO = out.toByteArray();
if( out != null )
{
try
{
out.close();
}
catch ( IOException e )
{
// TODO Auto-generated catch block
e.printStackTrace();
}
}
ByteBuffer data = ByteBuffer.wrap( bO );
sectionsDescriptorSize1 = data.getInt();
sectionsDescriptorSize2 = data.getInt();
ByteBuffer sectionData;
try
{
if( sectionsDescriptorSize1 > 0 )
{
byte[] bAS0 = new byte[ sectionsDescriptorSize1 ];
data.get( bAS0 );
}
if( sectionsDescriptorSize2 > 1 )
{
// trajectory
byte[] bAS1 = new byte[ sectionsDescriptorSize2 ];
data.get( bAS1, 0, sectionsDescriptorSize2 );
sectionData = ByteBuffer.wrap( bAS1 );
Boolean readingFailed = true;
if( sectionData != null )
{
while( sectionData.available() > 1 )
{
try
{
readingFailed = false;
lat = sectionData.getFloat(); // 4
lng = sectionData.getFloat(); // 4
}
catch( Exception e )
{
readingFailed = true;
}
try
{
if( readingFailed == false )
{
addBookStorePosition( lat, lng );
}
}
catch (Exception e)
{
}
}
}
}
catch( Error e )
{
}
}
}
catch( IOException e )
{
}
finally
{
if( iSS != null )
{
try
{
iSS.close();
}
catch( IOException e )
{
}
}
if( connection != null )
{
connection.disconnect();
}
}
}
}
What might cause reading of incorrect data ?

Found the issue.
Seems like .available() is causing the issue, especially in a threads.

BufferedReader.nextLine() will not advance to the next line so I'm stuck in a loop

I am trying to reuse input method for both Scanner(System.in) and BufferedReader(FileReader). When the user types "readfile" as a command, the program adds the first person from the file (ie the first line from the file) to a list, but the BufferedReader is stuck on the same line and therefore is stuck in an infinite loop.
The Scanner method from keyboard input works like a charm.
class Menu {
public static final int ARRAYINDEXFOREMAIL = 3;
private static final String NEWPERSON = "newperson";
private static final String FROMFILE = "readfile";
public static final String READFILE = "saved.txt";
private Scanner keyboardInput = new Scanner( System.in );
private String name;
private String tlf;
private String[] email;
private String [] stringTmp;
private String in;
private boolean readFromFile = false;
Menu( boolean fileRead ) {
if( fileRead ) {
readFromFile = true;
}
else {
readFromFile = false;
}
}
void run() {
while(true) {
System.out.print("\n" + "ordre> ");
if ( !readFromFile ) {
in = keyboardInput.nextLine();
}
else {
try {
BufferedReader fileOut = new BufferedReader(
new FileReader( READFILE ) );
if( ( in = fileOut.readLine() ) == null ) {
readFromFile = false;
fileOut.close();
break;
}
else {
System.out.println("in stringTmp is: " + in); // For debugging purposes
}
}
catch (FileNotFoundException filenotfoundexception) {
System.out.println("Cannot find file: " + READFILE + " !");
}
catch (IOException ioexception) {
ioexception.printStackTrace();
}
}
stringTmp = in.split("\\s+");
if( stringTmp[0].equalsIgnoreCase(NEWPERSON) ) {
if( stringTmp.length <= 1 ) {
System.out.println("ERROR! Enter one or more parameters..");
continue;
}
else {
name = stringTmp[1];
tlf = stringTmp[2];
if( stringTmp.length > ARRAYINDEXFOREMAIL ) {
email = new String[stringTmp.length - ARRAYINDEXFOREMAIL];
for( int i = ARRAYINDEXFOREMAIL, j = i - ARRAYINDEXFOREMAIL ; i < stringTmp.length; i++, j++ ) {
email[j] = stringTmp[i];
}
}
else {
email = new String[0];
}
mainlist.addPerson( name, tlf, email );
}
}
else if ( stringTmp[0].equalsIgnoreCase(FROMFILE) ) {
Meny m = new Menu( true );
m.run();
}
}
}
}

Note that in the loop itself you set
BufferedReader fileOut = new BufferedReader(new FileReader( READFILE ) );
This will cause a new BufferedReader to be created!
In the loop's next iteration - you will repeat this line, reset fileOut as a result - and when you invoke fileOut.readLine() - it will read the first line again.
You should initialize fileOut before the loop and not in it to solve it.

We Keep Coding

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

convert pdf to html page wise using pdfbox library - java

Related

Read a string from a txt file and compare it with items in drop down list using webdriver/java

Some of my Users can not download the zip?

Java - Cannot view modified metadata

Reading incorrect values?

BufferedReader.nextLine() will not advance to the next line so I'm stuck in a loop

Categories

Resources