How to parse HTML with java properly? - java

Scenario/requirement:
Download an HTML page from some URL.
Download the images referenced in its HTML tags.
Rewrite the image tags in my saved file, so I can open it in my browser offline and still see the images.
I have done the first two points, but I am having difficulties with the third one. The tags do not change. What am I doing wrong?
The job is to open a file, find the img src attribute and replace it with another value. Can you give me an example?
Code:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import java.awt.image.BufferedImage;
import java.net.URL;
import java.net.URLConnection;
import javax.imageio.ImageIO;
import javax.swing.text.AttributeSet;
import javax.swing.text.html.HTMLDocument;
public class ExtractAllImages {
static String result_doc = "/home/foo/index.html";
static String home_folder = "/home/foo/";
static String start_webURL = "http://www.oracle.com/";
public static void main(String args[]) throws Exception {
String webUrl = start_webURL;
URL url = new URL(webUrl);
URLConnection connection = url.openConnection();
InputStream is = connection.getInputStream();
InputStreamReader isr = new InputStreamReader(is);
BufferedReader br = new BufferedReader(isr);
HTMLEditorKit htmlKit = new HTMLEditorKit();
HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
HTMLEditorKit.Parser parser = new ParserDelegator();
HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
parser.parse(br, callback, true);
FileWriter writer = new FileWriter(result_doc);
htmlKit.write(writer, htmlDoc, 0, htmlDoc.getLength());
writer.close();
int number_or_images = 0;
String[] array = new String[4096];
for (HTMLDocument.Iterator iterator = htmlDoc.getIterator(HTML.Tag.IMG); iterator.isValid(); iterator.next()) {
AttributeSet attributes = iterator.getAttributes();
String imgSrc = (String) attributes.getAttribute(HTML.Attribute.SRC);
System.out.println("img_src = " + imgSrc);
if (imgSrc != null && (imgSrc.endsWith(".jpg") || (imgSrc.endsWith(".png")) || (imgSrc.endsWith(".jpeg")) || (imgSrc.endsWith(".bmp")) || (imgSrc.endsWith(".ico")))) {
try {
downloadImage(webUrl, imgSrc);
} catch (IOException ex) {
System.out.println(ex.getMessage());
}
}
array[number_or_images] = imgSrc;
number_or_images++;
///TODO change
}
for(int i =0; i < number_or_images; i++)
{
System.out.println("before = "+array[i]);
while(true)
{
int count = array[i].indexOf('/');
if(count == -1) break;
array[i] = array[i].substring(count+1);
}
System.out.println("after = " + array[i]);
}
//TODO open file and replace tags
int i =0;
File input = new File(result_doc);
Document doc = Jsoup.parse(input, "UTF-8");
System.out.println( input.canWrite());
for( Element img : doc.select("img[src]") )
{
String s = img.attr("src");
System.out.println(s);
img.attr("src", "/home/foo/"+array[i]); // set attribute 'src' to 'your-source-here'
s = img.attr("src");
System.out.println(s);
++i;
}
}
private static void downloadImage(String url, String imgSrc) throws IOException {
BufferedImage image = null;
try {
if (!(imgSrc.startsWith("http"))) {
url = url + imgSrc;
} else {
url = imgSrc;
}
imgSrc = imgSrc.substring(imgSrc.lastIndexOf("/") + 1);
String imageFormat = null;
imageFormat = imgSrc.substring(imgSrc.lastIndexOf(".") + 1);
String imgPath = null;
imgPath = home_folder + imgSrc + "";
URL imageUrl = new URL(url);
image = ImageIO.read(imageUrl);
if (image != null) {
File file = new File(imgPath);
ImageIO.write(image, imageFormat, file);
}
} catch (Exception ex) {
ex.printStackTrace();
}
}
}

Solved.
I wasn't saving the changes. The following code needs to be added before the "downloadImage()" call:
int i = 0;
File input = new File(result_doc);
Document doc = Jsoup.parse(input, "UTF-8");
for (Element img : doc.select("img[src]")) {
    img.attr("src", array[i]); // point the src attribute at the local copy
    ++i;
}
try {
    String strmb = doc.outerHtml();
    BufferedWriter bw = new BufferedWriter(new FileWriter(result_doc)); // declaration added; bw was undeclared in the post
    bw.write(strmb);
    bw.close();
} catch (Exception ex) {
    System.out.println("Program stopped. The problem is \"" + ex.getMessage() + "\"");
}
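For reference, a minimal end-to-end sketch of just the rewrite step with jsoup (paths reused from the question; this is an illustration rather than the exact original program). The key point is the final write-back, which the first version of the code was missing:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
public class RewriteImgSrc {
    public static void main(String[] args) throws Exception {
        File input = new File("/home/foo/index.html"); // path from the question
        Document doc = Jsoup.parse(input, "UTF-8");
        // Point every img at its locally downloaded copy (file name only).
        for (Element img : doc.select("img[src]")) {
            String src = img.attr("src");
            String fileName = src.substring(src.lastIndexOf('/') + 1);
            img.attr("src", "/home/foo/" + fileName);
        }
        // Write the modified DOM back to disk; without this, the tags never change on disk.
        Files.write(input.toPath(), doc.outerHtml().getBytes(StandardCharsets.UTF_8));
    }
}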

You can go with jsoup.
Try something like the code below:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
public static void getAllTags() {
    try {
        File input = new File("H:\\html pages\\index1.html");
        Document document = Jsoup.parse(input, "UTF-8");
        Document parse = Jsoup.parse(document.html());
        Elements body = parse.select("body");
        Elements bodyTags = body.select("*");
        for (Element element : bodyTags) {
            // Do what you want with the tag
            System.out.println(element.tagName());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
If you want to parse the HTML and pick out specific tags, try this (FilterHtml appears to be a helper class from the answerer's own project):
public static void parseHTML() {
    try {
        File input = new File("H:\\html\\index1.html");
        Document document = Jsoup.parse(input, "UTF-8");
        Document parse = Jsoup.parse(document.html());
        Elements bodyElements = parse.select("div");
        Elements elements = bodyElements.select("*");
        for (Element element : elements) {
            FilterHtml.setHtmlTAG(element.tagName());
            FilterHtml.ParseXml();
            Elements body = bodyElements.select(FilterHtml.getXmlTAG());
            if (body.is(FilterHtml.getXmlTAG())) {
                Elements tag = parse.select(FilterHtml.getXmlTAG());
                // Do something meaningful with the tag
                System.out.println(tag.text());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Hope this helps. If it does, please mark it as accepted.

Related

Java and Weka: Creating a string attribute

I'm trying to create a Java program that converts a text file to an ARFF file for Weka. Somehow my name attribute is set to numeric, but it should be a string. I tried everything; for instance, I tried changing
attr.add(new Attribute("name"));
to
attr.add(new Attribute("name",true));
But when I run it, it still prints the names as numbers (shown in the 2nd column):
1,0,?,?,?
1000,1,?,?,?
1002,2,?,?,?
2,3,?,?,?
3000,4,?,?,?
What am I doing wrong?
import java.util.ArrayList;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.*;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import java.util.*;
import weka.core.Instances;
import weka.core.converters.ArffSaver;
public class WekaCreateARFF {
private static final String FILENAME = "Some File";
public static void main(String[] args) throws IOException {
ArrayList<String> input = new ArrayList<String>();
ArrayList<Attribute> attr = new ArrayList<Attribute>();
Instances dataset;
double [] values;
BufferedReader br = null;
FileReader fr = null;
String date = null;
double id;
String n = null;
Instance inst = new DenseInstance(5);
List nominal_state = new ArrayList(5);
nominal_state.add("CA");
nominal_state.add("NC");
nominal_state.add("TX");
nominal_state.add("SC");
nominal_state.add("NY");
List nominal_party = new ArrayList(2);
nominal_party.add("republican");
nominal_party.add("democrat");
attr.add(new Attribute("id"));
attr.add(new Attribute("name",true));
attr.add(new Attribute("political party", nominal_party));
attr.add(new Attribute("state", nominal_state));
attr.add(new Attribute("birth date", date));
try {
fr = new FileReader(FILENAME);
br = new BufferedReader(fr);
String entry;
dataset = new Instances("SimpleARFF",attr,0);
values = new double[dataset.numAttributes()];
while ((entry = br.readLine()) != null) {
//System.out.println(entry);
input.add(entry);
for (int i = 0; i<5; i++ ) {
String[] parts = entry.split(",");
String part1 = parts[0];
String name = parts[1];
id = Double.parseDouble(part1);
inst.setValue(attr.get(0), id);
inst.setValue(attr.get(1), name);
}
System.out.println(inst);
dataset.add(new DenseInstance(1.0, values));
}
//System.out.println(dataset);
//ArffSaver arff = new ArffSaver();
//arff.setInstances(dataset);
//arff.setFile(new File("Simple.arff"));
//arff.writeBatch();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (br != null)
br.close();
if (fr != null)
fr.close();
} catch (IOException ex) {
ex.printStackTrace();
}
}
}
}
You probably want this constructor:
http://weka.sourceforge.net/doc.dev/weka/core/Attribute.html#Attribute-java.lang.String-boolean-
That is, you essentially have to add a boolean flag to tell Weka that you want a String attribute, and not a numeric attribute (the default):
new Attribute("blah", true)
should give you a String-attribute.
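To make that concrete, here is a minimal sketch of building a dataset with a genuine string attribute (the relation name, attribute names and the "Alice" value are made-up examples). The string value goes into the attribute's value pool via addStringValue, and the returned index is what is stored in the instance's double slot:
import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;
public class StringAttributeDemo {
    public static void main(String[] args) {
        ArrayList<Attribute> attrs = new ArrayList<Attribute>();
        attrs.add(new Attribute("id"));         // numeric (the default)
        attrs.add(new Attribute("name", true)); // true -> string attribute, not numeric
        Instances dataset = new Instances("demo", attrs, 0);
        double[] values = new double[dataset.numAttributes()];
        values[0] = 42;
        // addStringValue stores the text and returns the index to put in the double[] slot.
        values[1] = dataset.attribute(1).addStringValue("Alice");
        dataset.add(new DenseInstance(1.0, values));
        System.out.println(dataset);
    }
}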

java - xpath to get rows inside a table

I have an HTML file like: http://scholar.google.gr/citations?user=v9xULZwAAAAJ&hl=el
In this file there is a table with articles. I want to get the first 20 articles (if they exist) with XPath.
I try to find the first article:
String str = (String) xpath.evaluate("//form[contains(@id,'citationsForm')]/div[2]/div[1]/table/tbody/tr[2]/td[@id='col-title']/a", docList.get(0), XPathConstants.STRING);
And it's OK! Result: Modern information retrieval
For all articles:
String str = (String) xpath.evaluate("//form[contains(@id,'citationsForm')]/div[2]/div[1]/table/tbody/tr/td[@id='col-title']/a", docList.get(0), XPathConstants.STRING);
but it does not work.
Any idea?
Thank you!
EDIT:
Also I try:
NodeList result = (NodeList) xpath.evaluate("//form[contains(@id,'citationsForm')]/div[2]/div[1]/table/tbody/tr/td[@id='col-title']/a",
        docList.get(0), XPathConstants.NODESET);
ArrayList<String> liste = new ArrayList<String>();
for (int i = 0; i < result.getLength(); i++) {
    System.out.println(result.item(i).getNodeValue());
    liste.add(result.item(i).getNodeName());
}
EDIT 2 All code
Class FileOperation:
package xmlparse;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.ParserConfigurationException;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
public class FileOperations {
private static final String path = "C:\\Users\\Dimitris\\Desktop\\authors";
public ArrayList<Document> getXmlDocumt() {
ArrayList<Document> xmlFileList = new ArrayList<>();
try {
ArrayList<File> listFiles = listFiles(path);
for (File f : listFiles) {
String html = readfile(f.getAbsolutePath());
xmlFileList.add(ConvertHtml2Xml(html) );
}
} catch (IOException ex) {
Logger.getLogger(FileOperations.class.getName()).log(Level.SEVERE, null, ex);
}
return xmlFileList;
}
private ArrayList<File> listFiles(String directoryName) throws IOException {
ArrayList<File> htmlfilelist = new ArrayList<>();
File directory = new File(directoryName);
//get all the files from a directory
File[] fList = directory.listFiles();
for (File file : fList) {
if (file.isFile()) {
htmlfilelist.add(file);
}
}
return htmlfilelist;
}
private String readfile(String file) throws FileNotFoundException, IOException {
String s = "";
FileReader fr = new FileReader(file);
BufferedReader br = new BufferedReader(fr);
StringBuilder content = new StringBuilder(1024);
while ((s = br.readLine()) != null) {
content.append(s);
}
//System.out.println(content.toString());
return content.toString();
}
private Document ConvertHtml2Xml(String html) {
TagNode tagNode = new HtmlCleaner().clean(html);
Document doc = null;
try {
doc = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
} catch (ParserConfigurationException ex) {
Logger.getLogger(FileOperations.class.getName()).log(Level.SEVERE, null, ex);
}
return doc;
}
}
Class XpathQueries:
XPath xpath;
ArrayList<Document> docList;
public XpathQueries() {
xpath = XPathFactory.newInstance().newXPath();
FileOperations fo = new FileOperations();
docList = new ArrayList<>(fo.getXmlDocumt());
}
public void getArticle() throws XPathExpressionException {
// String str = (String) xpath.evaluate("//form[contains(@id,'citationsForm')]/div[2]/div[1]/table/tbody//td[1]/a",
//         docList.get(0), XPathConstants.STRING);
String str = (String) xpath.evaluate("//*[@id='col-title']/a", docList.get(0), XPathConstants.STRING);
System.out.println(str);
}
}
Try with this (note XPathConstants.NODESET: evaluating with STRING returns only the first match and cannot be cast to a NodeList):
Object result = xpath.evaluate("//*[@id='col-title']/a", docList.get(0), XPathConstants.NODESET);
NodeList nodes = (NodeList) result;
for (int i = 0; i < nodes.getLength(); i++) {
    System.out.println(nodes.item(i).getNodeValue());
}
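One more detail: getNodeValue() returns null for element nodes, so even with a NODESET the loop above may only print nulls. getTextContent() (or drilling down to the text child, as the solution below does) returns the anchor text. A small sketch, assuming the same xpath and docList as in the question:
NodeList titles = (NodeList) xpath.evaluate("//td[@id='col-title']/a", docList.get(0), XPathConstants.NODESET);
for (int i = 0; i < titles.getLength(); i++) {
    // getTextContent() concatenates the text of the element and all of its descendants.
    System.out.println(titles.item(i).getTextContent());
}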
Thank you for the help.
The solution is:
int length;
Object result = xpath.evaluate("//a[contains(@href,'citation_for_view')]", docList.get(0), XPathConstants.NODESET);
NodeList nodes = (NodeList) result;
length = nodes.getLength();
if (length > 20) {
    length = 20;
}
for (int i = 0; i < length; i++) {
    System.out.println(nodes.item(i).getFirstChild().getNodeValue());
}

java - How to print link text available on the URL?

Currently I am able to print all the URLs on a page, but not the text shown for each link.
For example:
<a class="fbl" href="/preferences?hl=en" jsaction="foot.cst" id="fsettl">Settings</a>
The code prints only "/preferences?hl=en", but not the text of the link, i.e. Settings.
public static List<String> getLinks(String uriStr) {
    List<String> result = new ArrayList<String>();
    // create a reader on the html content
    try {
        System.out.println("in the getlinks try");
        URL url = new URI(uriStr).toURL();
        URLConnection conn = url.openConnection();
        Reader rd = new InputStreamReader(conn.getInputStream());
        // Parse the HTML
        EditorKit kit = new HTMLEditorKit();
        HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
        kit.read(rd, doc, 0);
        // Find all the A elements in the HTML document
        HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
        while (it.isValid()) {
            SimpleAttributeSet s = (SimpleAttributeSet) it.getAttributes();
            String link = (String) s.getAttribute(HTML.Attribute.HREF);
            if (link != null) {
                // Add the link to the result list
                System.out.println(link);
                result.add(link);
            }
            it.next();
        }
    } catch (Exception e) { // catch and return added; they were cut off in the original post
        e.printStackTrace();
    }
    return result;
}
How can I also print the text of the link?
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.Iterator;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class PrintURL {
public static void main(String[] args) throws Exception
{
    Reader r = null;
    try {
        // Note: this URL/stream is opened but never used below; Jsoup.connect() fetches the page itself.
        URL u = new URL("https://www.google.co.in/");
        // URL u = new URL(args[0]);
        InputStream in = u.openStream();
        r = new InputStreamReader(in);
        Document jsoup = Jsoup.connect("https://www.google.co.in/").get();
        Elements aHref = jsoup.getElementsByTag("a");
        Iterator<Element> iterator = aHref.iterator();
        while (iterator.hasNext())
        {
            Element element = iterator.next();
            System.out.println("\nLink: " + element.attr("href"));
            System.out.println("Link Name: " + element.text());
        }
    } finally {
        if (r != null) {
            r.close();
        }
    }
}
}
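If you would rather stay with HTMLEditorKit as in the question instead of switching to jsoup, the link text can be read from the document between the anchor's offsets. A sketch of a variant of the loop in getLinks(), where doc is the HTMLDocument already built there (the checked BadLocationException from getText() is handled by the surrounding try/catch):
HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
while (it.isValid()) {
    Object href = it.getAttributes().getAttribute(HTML.Attribute.HREF);
    if (href != null) {
        int start = it.getStartOffset();
        int length = it.getEndOffset() - start;
        // The text between the anchor's offsets is the visible link text, e.g. "Settings".
        String linkText = doc.getText(start, length).trim();
        System.out.println(href + " -> " + linkText);
    }
    it.next();
}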

XML Document traverser in java

Everyone knows we can traverse an entire XML document using DocumentTraversal's NodeIterator.
My application requires some extra work, so I decided to write my own XML traverser with the support of a Java Stack<>.
Here is my code (I am not good at coding, so the code and logic might look messy).
public class test
{
private static Stack<Node> gStack = new Stack<Node>();
public static void main(String[] args) throws XPathExpressionException
{
String str =
"<section>"
+ "<paragraph>This example combines regular wysiwyg editing of a document with very controlled editing of semantic rich content. The main content can be"
+ "edited like you would in a normal word processor. Though the difference is that the content remains schema valid XML because Xopus will not allow you to perform actions"
+ "on the document that would render it invalid.</paragraph>"
+ "<paragraph>The table is an example of controlled style. The style of the table is controlled by three attributes:</paragraph>"
+ "<unorderedlist>"
+ "<item><paragraph><emphasis>alternaterowcolor</emphasis>, do all rows have the same color, or should the background color alternate?</paragraph></item>"
+ "<item><paragraph><emphasis>border</emphasis>, a limited choice of border styles.</paragraph></item>"
+ "<item><paragraph><emphasis>color</emphasis>, a limited choice of colors.</paragraph></item>"
+ "</unorderedlist>"
+ "<paragraph>You have quite some freedom to style the table, but you can't break the predefined style.</paragraph>"
+ "</section>";
Document domDoc = null;
try
{
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
ByteArrayInputStream bis = new ByteArrayInputStream(str.getBytes());
domDoc = docBuilder.parse(bis);
}
catch (Exception e)
{
e.printStackTrace();
}
Element root = null;
NodeList list = domDoc.getChildNodes();
for (int i = 0; i < list.getLength(); i++)
{
if (list.item(i) instanceof Element)
{
root = (Element) list.item(i);
break;
}
}
NodeList nlist = root.getChildNodes();
System.out.println("root = " + root.getNodeName() + " child count = " + nlist.getLength());
domTraversor(root);
}
private static void domTraversor(Node node)
{
if (node.getNodeName().equals("#text"))
{
System.out.println("textElem = " + node.getTextContent());
if (node.getNextSibling() != null)
{
gStack.push(node.getNextSibling());
domTraversor(node.getNextSibling());
}
else
{
if (node.getParentNode().getNextSibling() != null)
domTraversor(node.getParentNode().getNextSibling());
}
}
else
{
if (node.getChildNodes().getLength() > 1)
{
gStack.push(node);
Node n = node.getFirstChild();
if (n.getNodeName().equals("#text"))
{
System.out.println("textElem = " + n.getTextContent());
if (n.getNextSibling() != null)
{
gStack.push(n.getNextSibling());
domTraversor(n.getNextSibling());
}
}
else
{
gStack.push(n);
domTraversor(n);
}
}
else if (node.getChildNodes().getLength() == 1)
{
Node fnode = node.getFirstChild();
if (fnode.getChildNodes().getLength() > 1)
{
gStack.push(fnode);
domTraversor(fnode);
}
else
{
if (!fnode.getNodeName().equals("#text"))
{
gStack.push(fnode);
domTraversor(fnode);
}
else
{
System.out.println("textElem = " + fnode.getTextContent());
if (fnode.getNodeName().equals("#text"))
{
if (node.getNextSibling() != null)
{
gStack.push(node.getNextSibling());
domTraversor(node.getNextSibling());
}
else
{
if (!gStack.empty())
{
Node sibPn = gStack.pop();
if (sibPn.getNextSibling() == null)
{
sibPn = gStack.pop();
}
domTraversor(sibPn.getNextSibling());
}
}
}
else
{
if (fnode.getNextSibling() != null)
{
domTraversor(fnode.getNextSibling());
}
else
{
if (!gStack.empty())
{
Node sibPn = gStack.pop().getNextSibling();
domTraversor(sibPn);
}
}
}
}
}
}
}
}
}
It works fine with some XML documents, but not with a document that has markup like:
<unorderedlist>
<item>
<paragraph>
<emphasis>alternaterowcolor</emphasis>
, do all rows have the same color, or should the background
color
alternate?
</paragraph>
</item>
<item>
<paragraph>
<emphasis>border</emphasis>
, a limited choice of border styles.
</paragraph>
</item>
<item>
<paragraph>
<emphasis>color</emphasis>
, a limited choice of colors.
</paragraph>
</item>
</unorderedlist>
Here is the scenario: if any element has more than three nested children, my code stops and does not go any further.
Does anyone have a better implementation? Please suggest one.
Try it this way:
Element e;
NodeList n;
Document doc = StudyParser.XMLfromString(xmlString);
String starttag = doc.getFirstChild().getNodeName();
Log.e("start", starttag);
n = doc.getElementsByTagName(starttag);
for (int i = 0; i < n.getLength(); i++) {
    e = (Element) n.item(i);
    NodeList np = e.getElementsByTagName("item");
    for (int j = 0; j < np.getLength(); j++) {
        e = (Element) np.item(j); // note: the original post re-read n.item(i) here, which repeats the same element
        try {
            String para = StudyParser.getValue(e, "paragraph");
            Log.e("paravalue", para);
            String emp = StudyParser.getValue(e, "emphasis");
            Log.e("empval", emp);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
StudyParser Class
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
public class StudyParser {
public StudyParser() {
}
public final static Document XMLfromString(String xml){
Document doc = null;
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
try {
DocumentBuilder db = dbf.newDocumentBuilder();
InputSource is = new InputSource();
is.setCharacterStream(new StringReader(xml));
doc = db.parse(is);
} catch (ParserConfigurationException e) {
System.out.println("XML parse error: " + e.getMessage());
return null;
} catch (SAXException e) {
System.out.println("Wrong XML file structure: " + e.getMessage());
return null;
} catch (IOException e) {
System.out.println("I/O exeption: " + e.getMessage());
return null;
}
return doc;
}
public static String getXMLstring(String xml){
String line = null;
try {
DefaultHttpClient httpClient = new DefaultHttpClient();
HttpPost httpPost = new HttpPost(xml);
HttpResponse httpResponse = httpClient.execute(httpPost);
HttpEntity httpEntity = httpResponse.getEntity();
line = EntityUtils.toString(httpEntity);
} catch (UnsupportedEncodingException e) {
line = "<results status=\"error\"><msg>Can't connect to server</msg></results>";
} catch (MalformedURLException e) {
line = "<results status=\"error\"><msg>Can't connect to server</msg></results>";
} catch (IOException e) {
line = "<results status=\"error\"><msg>Can't connect to server</msg></results>";
}
return line;
}
public static String getXML(InputStream is)throws IOException {
BufferedInputStream bis = new BufferedInputStream(is);
ByteArrayOutputStream buf = new ByteArrayOutputStream();
int result = bis.read();
while(result != -1) {
byte b = (byte)result;
buf.write(b);
result = bis.read();
}
return buf.toString();
}
public final static String getElementValue( Node elem ) {
Node kid;
if( elem != null){
if (elem.hasChildNodes()){
for( kid = elem.getFirstChild(); kid != null; kid = kid.getNextSibling() ){
if( kid.getNodeType() == Node.TEXT_NODE ){
return kid.getNodeValue();
}
}
}
}
return "";
}
public static int numResults(Document doc){
Node results = doc.getDocumentElement();
int res = -1;
try{
res = Integer.valueOf(results.getAttributes().getNamedItem("Categories").getNodeValue());
}catch(Exception e ){
res = -1;
}
return res;
}
public static String getValue(Element item, String str) {
NodeList n = item.getElementsByTagName(str);
return StudyParser.getElementValue(n.item(0));
}
}
Just a normal demo for dynamic XML. I have assumed the same XML, but this time without using getElementsByTagName for every child tag; there are many node properties you can check accordingly, see:
doc = StudyParser.XMLfromString(xml);
String starttag = doc.getFirstChild().getNodeName();
Log.e("start", starttag);
n = doc.getElementsByTagName(starttag);
for (int i = 0; i < n.getLength(); i++) {
    e = (Element) n.item(i);
    try {
        Log.e("1234", "" + e.getTextContent());
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
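As an alternative to both snippets above, the stack bookkeeping from the question can usually be replaced by a plain recursive walk over the org.w3c.dom tree, which handles any nesting depth. A minimal sketch (the class and method names are made up); call it as walk(domDoc.getDocumentElement()):
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
public class SimpleDomWalker {
    // Visits every node in document order, whatever the nesting depth.
    public static void walk(Node node) {
        if (node.getNodeType() == Node.TEXT_NODE) {
            String text = node.getTextContent().trim();
            if (!text.isEmpty()) {
                System.out.println("textElem = " + text);
            }
        } else if (node.getNodeType() == Node.ELEMENT_NODE) {
            System.out.println("element = " + node.getNodeName());
        }
        NodeList children = node.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            walk(children.item(i));
        }
    }
}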

extract images from pdf using pdfbox

I am trying to extract images from a PDF using PDFBox. The example PDF is here.
But I am getting blank images only.
The code I am trying:
public static void main(String[] args) {
PDFImageExtract obj = new PDFImageExtract();
try {
obj.read_pdf();
} catch (IOException ex) {
System.out.println("" + ex);
}
}
void read_pdf() throws IOException {
PDDocument document = null;
try {
document = PDDocument.load("C:\\Users\\Pradyut\\Documents\\MCS-034.pdf");
} catch (IOException ex) {
System.out.println("" + ex);
}
List pages = document.getDocumentCatalog().getAllPages();
Iterator iter = pages.iterator();
int i =1;
String name = null;
while (iter.hasNext()) {
PDPage page = (PDPage) iter.next();
PDResources resources = page.getResources();
Map pageImages = resources.getImages();
if (pageImages != null) {
Iterator imageIter = pageImages.keySet().iterator();
while (imageIter.hasNext()) {
String key = (String) imageIter.next();
PDXObjectImage image = (PDXObjectImage) pageImages.get(key);
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
i ++;
}
}
}
}
Thanks
Here is code using PDFBox 2.0.1 that will get a list of all images from the PDF. It differs from the other answers in that it recurses through the document's resources instead of only looking at the top level.
public List<RenderedImage> getImagesFromPDF(PDDocument document) throws IOException {
    List<RenderedImage> images = new ArrayList<>();
    for (PDPage page : document.getPages()) {
        images.addAll(getImagesFromResources(page.getResources()));
    }
    return images;
}

private List<RenderedImage> getImagesFromResources(PDResources resources) throws IOException {
    List<RenderedImage> images = new ArrayList<>();
    for (COSName xObjectName : resources.getXObjectNames()) {
        PDXObject xObject = resources.getXObject(xObjectName);
        if (xObject instanceof PDFormXObject) {
            images.addAll(getImagesFromResources(((PDFormXObject) xObject).getResources()));
        } else if (xObject instanceof PDImageXObject) {
            images.add(((PDImageXObject) xObject).getImage());
        }
    }
    return images;
}
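A possible way to call it and write each image to disk (the paths are placeholders; besides the classes used above you also need java.io.File and javax.imageio.ImageIO):
try (PDDocument document = PDDocument.load(new File("/path/to/input.pdf"))) {
    List<RenderedImage> images = getImagesFromPDF(document);
    int i = 0;
    for (RenderedImage image : images) {
        // A RenderedImage can be handed straight to ImageIO.
        ImageIO.write(image, "png", new File("/path/to/output/image-" + i++ + ".png"));
    }
}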
The GetImagesFromPDF class below gets all images in the 04-Request-Headers.pdf file and saves them into the destination folder PDFCopy.
import java.io.File;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
@SuppressWarnings({ "unchecked", "rawtypes", "deprecation" })
public class GetImagesFromPDF {
public static void main(String[] args) {
try {
String sourceDir = "C:/PDFCopy/04-Request-Headers.pdf";// Paste pdf files in PDFCopy folder to read
String destinationDir = "C:/PDFCopy/";
File oldFile = new File(sourceDir);
if (oldFile.exists()) {
PDDocument document = PDDocument.load(sourceDir);
List<PDPage> list = document.getDocumentCatalog().getAllPages();
String fileName = oldFile.getName().replace(".pdf", "_cover");
int totalImages = 1;
for (PDPage page : list) {
PDResources pdResources = page.getResources();
Map pageImages = pdResources.getImages();
if (pageImages != null) {
Iterator imageIter = pageImages.keySet().iterator();
while (imageIter.hasNext()) {
String key = (String) imageIter.next();
PDXObjectImage pdxObjectImage = (PDXObjectImage) pageImages.get(key);
pdxObjectImage.write2file(destinationDir + fileName+ "_" + totalImages);
totalImages++;
}
}
}
} else {
System.err.println("File not exists");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
For PDFBox 2.0.1, pudaykiran's answer must be slightly modified since some APIs have been changed.
public static void testPDFBoxExtractImages() throws Exception {
    PDDocument document = PDDocument.load(new File("D:/Temp/Test.pdf"));
    PDPageTree list = document.getPages();
    for (PDPage page : list) {
        PDResources pdResources = page.getResources();
        for (COSName c : pdResources.getXObjectNames()) {
            PDXObject o = pdResources.getXObject(c);
            if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                File file = new File("D:/Temp/" + System.nanoTime() + ".png");
                ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(), "png", file);
            }
        }
    }
}
Just add the .jpeg to the end of your path:
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i + ".jpeg");
That works for me.
You can use the PDPage.convertToImage() method, which converts a PDF page into a BufferedImage; you can then use that BufferedImage to create an Image.
Use the following reference for further detail:
all of the PDF-related classes in PDFBox are documented in the Apache PDFBox 1.8.3 API.
There you can find the PDPage documentation; do not forget to look at the PDPage.convertToImage() method in the PDPage class.
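Note that convertToImage() rasterizes each whole page rather than extracting the embedded image objects. A rough sketch against the 1.8.x API, with placeholder paths:
PDDocument document = PDDocument.load("/path/to/input.pdf");
try {
    List<?> pages = document.getDocumentCatalog().getAllPages();
    int pageNo = 1;
    for (Object p : pages) {
        PDPage page = (PDPage) p;
        BufferedImage pageImage = page.convertToImage(); // renders the full page to a BufferedImage
        ImageIO.write(pageImage, "png", new File("/path/to/output/page-" + pageNo++ + ".png"));
    }
} finally {
    document.close();
}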
This is a Kotlin version of @Matt's answer.
fun <R> PDResources.onImageResources(block: (RenderedImage) -> (R)): List<R> =
    this.xObjectNames.flatMap {
        when (val xObject = this.getXObject(it)) {
            is PDFormXObject -> xObject.resources.onImageResources(block)
            is PDImageXObject -> listOf(block(xObject.image))
            else -> emptyList()
        }
    }
You can use it on PDPage resources like this:
page.resources.onImageResources { image ->
    Files.createTempFile("image", "xxx").also { path ->
        if (!ImageIO.write(image, "xxx", path.toFile()))
            throw IllegalStateException("Couldn't write image to file")
    }
}
Where "xxx" is the format you need (like "jpeg")
For someone who wants to just copy and paste, here is ready-to-use code:
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.UUID;
public class ExtractImagesUseCase extends PDFStreamEngine{
private final String filePath;
private final String outputDir;
// Constructor
public ExtractImagesUseCase(String filePath,
String outputDir){
this.filePath = filePath;
this.outputDir = outputDir;
}
// Execute
public void execute(){
try{
File file = new File(filePath);
PDDocument document = PDDocument.load(file);
for(PDPage page : document.getPages()){
processPage(page);
}
}catch(IOException e){
e.printStackTrace();
}
}
@Override
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException{
String operation = operator.getName();
if("Do".equals(operation)){
COSName objectName = (COSName) operands.get(0);
PDXObject pdxObject = getResources().getXObject(objectName);
if(pdxObject instanceof PDImageXObject){
// Image
PDImageXObject image = (PDImageXObject) pdxObject;
BufferedImage bImage = image.getImage();
// File
String randomName = UUID.randomUUID().toString();
File outputFile = new File(outputDir,randomName + ".png");
// Write image to file
ImageIO.write(bImage, "PNG", outputFile);
}else if(pdxObject instanceof PDFormXObject){
PDFormXObject form = (PDFormXObject) pdxObject;
showForm(form);
}
}
else super.processOperator(operator, operands);
}
}
Demo
public class ExtractImageDemo{
public static void main(String[] args){
String filePath = "C:\\Users\\John\\Downloads\\Documents\\sample-file.pdf";
String outputDir = "C:\\Users\\John\\Downloads\\Documents\\Output";
ExtractImagesUseCase useCase = new ExtractImagesUseCase(
filePath,
outputDir
);
useCase.execute();
}
}
Instead of calling
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
You can use the ImageIO.write() static method to write the RGB image out in whatever format you need. Here I've used PNG:
File outputFile = new File( "C:\\Users\\Pradyut\\Documents\\image" + i + ".png");
ImageIO.write( image.getRGBImage(), "png", outputFile);
