XML Document traverser in java

XML Document traverser in java - java

every one knows we can traverse entire xml document using DocumentTraversal's NodeIterator.
my application require some extra work so i decided to write my own XML traverser with the support of java Stack<>.
here is my code (i am not good at coding so the code and logic might look messy).
public class test
{
private static Stack<Node> gStack = new Stack<Node>();
public static void main(String[] args) throws XPathExpressionException
{
String str =
"<section>"
+ "<paragraph>This example combines regular wysiwyg editing of a document with very controlled editing of semantic rich content. The main content can be"
+ "edited like you would in a normal word processor. Though the difference is that the content remains schema valid XML because Xopus will not allow you to perform actions"
+ "on the document that would render it invalid.</paragraph>"
+ "<paragraph>The table is an example of controlled style. The style of the table is controlled by three attributes:</paragraph>"
+ "<unorderedlist>"
+ "<item><paragraph><emphasis>alternaterowcolor</emphasis>, do all rows have the same color, or should the background color alternate?</paragraph></item>"
+ "<item><paragraph><emphasis>border</emphasis>, a limited choice of border styles.</paragraph></item>"
+ "<item><paragraph><emphasis>color</emphasis>, a limited choice of colors.</paragraph></item>"
+ "</unorderedlist>"
+ "<paragraph>You have quite some freedom to style the table, but you can't break the predefined style.</paragraph>"
+ "</section>";
Document domDoc = null;
try
{
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
ByteArrayInputStream bis = new ByteArrayInputStream(str.getBytes());
domDoc = docBuilder.parse(bis);
}
catch (Exception e)
{
e.printStackTrace();
}
Element root = null;
NodeList list = domDoc.getChildNodes();
for (int i = 0; i < list.getLength(); i++)
{
if (list.item(i) instanceof Element)
{
root = (Element) list.item(i);
break;
}
}
NodeList nlist = root.getChildNodes();
System.out.println("root = " + root.getNodeName() + " child count = " + nlist.getLength());
domTraversor(root);
}
private static void domTraversor(Node node)
{
if (node.getNodeName().equals("#text"))
{
System.out.println("textElem = " + node.getTextContent());
if (node.getNextSibling() != null)
{
gStack.push(node.getNextSibling());
domTraversor(node.getNextSibling());
}
else
{
if (node.getParentNode().getNextSibling() != null)
domTraversor(node.getParentNode().getNextSibling());
}
}
else
{
if (node.getChildNodes().getLength() > 1)
{
gStack.push(node);
Node n = node.getFirstChild();
if (n.getNodeName().equals("#text"))
{
System.out.println("textElem = " + n.getTextContent());
if (n.getNextSibling() != null)
{
gStack.push(n.getNextSibling());
domTraversor(n.getNextSibling());
}
}
else
{
gStack.push(n);
domTraversor(n);
}
}
else if (node.getChildNodes().getLength() == 1)
{
Node fnode = node.getFirstChild();
if (fnode.getChildNodes().getLength() > 1)
{
gStack.push(fnode);
domTraversor(fnode);
}
else
{
if (!fnode.getNodeName().equals("#text"))
{
gStack.push(fnode);
domTraversor(fnode);
}
else
{
System.out.println("textElem = " + fnode.getTextContent());
if (fnode.getNodeName().equals("#text"))
{
if (node.getNextSibling() != null)
{
gStack.push(node.getNextSibling());
domTraversor(node.getNextSibling());
}
else
{
if (!gStack.empty())
{
Node sibPn = gStack.pop();
if (sibPn.getNextSibling() == null)
{
sibPn = gStack.pop();
}
domTraversor(sibPn.getNextSibling());
}
}
}
else
{
if (fnode.getNextSibling() != null)
{
domTraversor(fnode.getNextSibling());
}
else
{
if (!gStack.empty())
{
Node sibPn = gStack.pop().getNextSibling();
domTraversor(sibPn);
}
}
}
}
}
}
}
}
}
and it is working fine with some xml document, but not with the document which has tag like.
<unorderedlist>
<item>
<paragraph>
<emphasis>alternaterowcolor</emphasis>
, do all rows have the same color, or should the background
color
alternate?
</paragraph>
</item>
<item>
<paragraph>
<emphasis>border</emphasis>
, a limited choice of border styles.
</paragraph>
</item>
<item>
<paragraph>
<emphasis>color</emphasis>
, a limited choice of colors.
</paragraph>
</item>
</unorderedlist>
here is the scenario if any element has more than three nested children my code get stopped and not going further.
is there any one has a better implementation, please suggest.

try this way
Element e;
NodeList n;
Document doc=StudyParser.XMLfromString(xmlString);
String starttag=doc.getFirstChild().getNodeName();
Log.e("start",starttag );
n=doc.getElementsByTagName(starttag);
for(int i=0;i<n.getLength();i++){
e=(Element)n.item(i);
NodeList np = e.getElementsByTagName("item");
for(int j=0;j<np.getLength();j++){
e=(Element)n.item(i);
try{
String para=StudyParser.getValue(e, "paragraph");
Log.e("paravalue",para);
String emp=StudyParser.getValue(e, "emphasis");
Log.e("empval",emp);
}catch(Exception e){
e.printStackTrace();
}
}
}
StudyParser Class
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
public class StudyParser {
public StudyParser() {
}
public final static Document XMLfromString(String xml){
Document doc = null;
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
try {
DocumentBuilder db = dbf.newDocumentBuilder();
InputSource is = new InputSource();
is.setCharacterStream(new StringReader(xml));
doc = db.parse(is);
} catch (ParserConfigurationException e) {
System.out.println("XML parse error: " + e.getMessage());
return null;
} catch (SAXException e) {
System.out.println("Wrong XML file structure: " + e.getMessage());
return null;
} catch (IOException e) {
System.out.println("I/O exeption: " + e.getMessage());
return null;
}
return doc;
}
public static String getXMLstring(String xml){
String line = null;
try {
DefaultHttpClient httpClient = new DefaultHttpClient();
HttpPost httpPost = new HttpPost(xml);
HttpResponse httpResponse = httpClient.execute(httpPost);
HttpEntity httpEntity = httpResponse.getEntity();
line = EntityUtils.toString(httpEntity);
} catch (UnsupportedEncodingException e) {
line = "<results status=\"error\"><msg>Can't connect to server</msg></results>";
} catch (MalformedURLException e) {
line = "<results status=\"error\"><msg>Can't connect to server</msg></results>";
} catch (IOException e) {
line = "<results status=\"error\"><msg>Can't connect to server</msg></results>";
}
return line;
}
public static String getXML(InputStream is)throws IOException {
BufferedInputStream bis = new BufferedInputStream(is);
ByteArrayOutputStream buf = new ByteArrayOutputStream();
int result = bis.read();
while(result != -1) {
byte b = (byte)result;
buf.write(b);
result = bis.read();
}
return buf.toString();
}
public final static String getElementValue( Node elem ) {
Node kid;
if( elem != null){
if (elem.hasChildNodes()){
for( kid = elem.getFirstChild(); kid != null; kid = kid.getNextSibling() ){
if( kid.getNodeType() == Node.TEXT_NODE ){
return kid.getNodeValue();
}
}
}
}
return "";
}
public static int numResults(Document doc){
Node results = doc.getDocumentElement();
int res = -1;
try{
res = Integer.valueOf(results.getAttributes().getNamedItem("Categories").getNodeValue());
}catch(Exception e ){
res = -1;
}
return res;
}
public static String getValue(Element item, String str) {
NodeList n = item.getElementsByTagName(str);
return StudyParser.getElementValue(n.item(0));
}
}
Just Normal demo for dynamic xml i have assumed the same xml and but without using getElementByTagName there are many properties you can check accodringlly see
doc = StudyParser.XMLfromString(xml);
String starttag=doc.getFirstChild().getNodeName();
Log.e("start",starttag );
n=doc.getElementsByTagName(starttag);
for(int i=0;i<n.getLength();i++){
e=(Element)n.item(i);
try{
Log.e("1234",""+ e.getTextContent());
}catch(Exception e){
e.printStackTrace();
}
}

Related

Converting html files with *ngFor to pdf in Java

I have to generate documents in my Java Web application (Maven, runs on a server) and have to insert data from a Java class into this document.
I would like to be able to write a HTML file with placeholders. The placeholder should be replaced from the application with data from a Java class.
I also would like to be able to use conditionals like *ngFor (e.g. inserting a list into a ) or *ngIf from Angular (or attributes with a similar function).
Does anyone know a library for this?
I have a good knowledge of Java, HTML etc. so using such a library (if there is one) will not be a problem for me
In the meantime I've written a little script myself. In case someone needs something similar, I've included it in an answer

In the meantime I have searched further and unfortunately I have not found a suitable solution so far. Therefore, I have now set about programming a solution myself. Much less effort is required than expected. Here is my current code. It is currently a rough draft and certainly needs some improvement.
package com.XYZ.file.bo;
import java.beans.IntrospectionException;
import java.beans.PropertyDescriptor;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.InputStream;
import java.io.StringWriter;
import java.lang.reflect.InvocationTargetException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.XYZ.file.service.FileService;
import com.XYZ.servicelocator.ServiceLocator;
import com.XYZ.util.TechnicalException;
import com.itextpdf.html2pdf.ConverterProperties;
import com.itextpdf.html2pdf.HtmlConverter;
public class TemplateGeneratorBO {
private FileService fileService;
private static final String DOC_TEMPLATE_DIR = FileBO.BASE_DIR + "templates/";
public File generateDoc(String tempFileName, String saveFolder, String saveFileName, Object entity) {
String htmlDoc = parseHtmlDoc(tempFileName, entity);
htmlDoc = replaceSpecialChars(htmlDoc);
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
ConverterProperties converterProperties = new ConverterProperties();
HtmlConverter.convertToPdf(htmlDoc, outStream, converterProperties);
InputStream inStream = new ByteArrayInputStream(outStream.toByteArray());
saveFolder += "/" + callGetter(entity, "id") + "/templates";
if (!getFileService().createAndSaveFile(saveFolder, saveFileName + ".pdf", inStream)) {
int counter = 0;
boolean success = false;
do {
counter++;
success = getFileService().createAndSaveFile(saveFolder, saveFileName + "-" + counter + ".pdf",
inStream);
} while (!success);
return getFileService().getFile(saveFolder, saveFileName + "-" + counter + ".pdf");
}
return getFileService().getFile(saveFolder, saveFileName + ".pdf");
}
private String parseHtmlDoc(String fileName, Object entity) {
try {
File htmlFile = new File(DOC_TEMPLATE_DIR + fileName);
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
Document doc = dBuilder.parse(htmlFile);
doc.getDocumentElement().normalize();
Element elm = doc.getDocumentElement();
NodeList headList = elm.getElementsByTagName("head");
NodeList bodyList = elm.getElementsByTagName("body");
verifyTemplate(fileName, elm, headList, bodyList);
Node head = headList.item(0);
String html = "<html>" + xmlToString(head) + "<body>";
html += nodeToString(bodyList.item(0),
newTempEntList(new TemplateGenEntity(entity.getClass().getSimpleName(), entity)));
html += "</body></html>";
return html;
} catch (Exception exc) {
throw new TechnicalException("DocGenerator Exception with file " + fileName, exc);
}
}
private String nodeToString(Node parentNode, List<TemplateGenEntity> entities)
throws TransformerException, ScriptException {
NodeList nodes = parentNode.getChildNodes();
StringBuilder string = new StringBuilder("");
for (int i = 0; i < nodes.getLength(); i++) {
Node node = nodes.item(i);
if (node.getNodeType() == Node.ELEMENT_NODE) {
Element elm = (Element) node;
string.append(elementToString(elm, entities));
} else {
string.append(insertValues(xmlToString(node), entities));
}
}
return string.toString();
}
private String elementToString(Element elm, List<TemplateGenEntity> entities)
throws ScriptException, TransformerException {
if (!proofNgIf(elm, entities)) {
return "";
}
if (elm.hasAttribute("ngFor")) {
return ngForElementToString(elm, entities);
}
return "<" + elm.getNodeName() + getElementAttributes(elm) + ">" + nodeToString(elm, entities) + "</"
+ elm.getNodeName() + ">";
}
#SuppressWarnings("unchecked")
private String ngForElementToString(Element elm, List<TemplateGenEntity> entities)
throws ScriptException, TransformerException {
String attrs = getElementAttributes(elm);
String ngFor = elm.getAttribute("ngFor");
String[] ngForList = ngFor.split(" of ");
StringBuilder string = new StringBuilder();
ScriptEngineManager factory = new ScriptEngineManager();
ScriptEngine engine = factory.getEngineByName("JavaScript");
for (TemplateGenEntity ent : entities) {
engine.put(ent.getEntityName(), ent.getEntity());
}
List<Object> list = (List<Object>) engine.eval(ngForList[1]);
for (Object obj : list) {
string.append("<" + elm.getNodeName() + attrs + ">"
+ nodeToString(elm, newTempEntList(entities, new TemplateGenEntity(ngForList[0], obj))) + "</"
+ elm.getNodeName() + ">");
}
return string.toString();
}
/**
*
* #return true if no ngIf or ngIf condition is true
* #throws ScriptException
*/
private boolean proofNgIf(Element elm, List<TemplateGenEntity> entities) throws ScriptException {
if (!elm.hasAttribute("ngIf")) {
return true;
}
String ngIf = elm.getAttribute("ngIf");
if (ngIf.isBlank()) {
throw new TechnicalException("Document template contains empty ngIf!");
}
ScriptEngineManager factory = new ScriptEngineManager();
ScriptEngine engine = factory.getEngineByName("JavaScript");
for (TemplateGenEntity ent : entities) {
engine.put(ent.getEntityName(), ent.getEntity());
}
return (boolean) engine.eval(ngIf);
}
private String insertValues(String strIn, List<TemplateGenEntity> entities) throws ScriptException {
StringBuilder str = new StringBuilder(strIn);
int begin = str.indexOf("{{");
int end = str.indexOf("}}") + 2;
while (begin != -1 && end != 1) {
String var = str.substring(begin, end);
var = var.replace("{{", "");
var = var.replace("}}", "");
ScriptEngineManager factory = new ScriptEngineManager();
ScriptEngine engine = factory.getEngineByName("JavaScript");
for (TemplateGenEntity ent : entities) {
engine.put(ent.getEntityName(), ent.getEntity());
}
Object val = engine.eval(var);
String valStr = objectToStr(val);
str = str.replace(begin, end, valStr);
begin = str.indexOf("{{");
end = str.indexOf("}}") + 2;
}
return str.toString();
}
private String xmlToString(Node node) throws TransformerException {
StringWriter writer = new StringWriter();
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
transformer.setOutputProperty(OutputKeys.INDENT, "no");
transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, "yes");
transformer.transform(new DOMSource(node), new StreamResult(writer));
return writer.toString();
}
private String replaceSpecialChars(String str) {
str = str.replace(">", ">");
return str;
}
private String getElementAttributes(Element elm) {
StringBuilder attrStr = new StringBuilder();
NamedNodeMap attrs = elm.getAttributes();
for (int i = 0; i < attrs.getLength(); i++) {
Attr attr = (Attr) attrs.item(i);
String attrName = attr.getName();
String attrVal = attr.getValue();
if (attrName.equals("ngIf") || attrName.equals("ngFor")) {
continue;
}
attrStr.append(" " + attrName + "=\"" + attrVal + "\"");
}
return attrStr.toString();
}
private void verifyTemplate(String fileName, Element elm, NodeList head, NodeList body) {
if (!elm.getNodeName().equalsIgnoreCase("html")) {
throw new TechnicalException("Document template " + fileName + " doesn't starts with html node!");
}
if (head.getLength() != 1 || head.item(0) == null) {
throw new TechnicalException("Document template " + fileName + " doesn't contains head!");
}
if (body.getLength() != 1 || head.item(0) == null) {
throw new TechnicalException("Document template " + fileName + " doesn't contains body!");
}
}
private FileService getFileService() {
if (fileService == null) {
fileService = ServiceLocator.locateService(FileService.class);
}
return fileService;
}
private Object callGetter(Object obj, String fieldName) {
PropertyDescriptor pd;
try {
pd = new PropertyDescriptor(fieldName, obj.getClass());
return pd.getReadMethod().invoke(obj);
} catch (IntrospectionException | IllegalAccessException | IllegalArgumentException
| InvocationTargetException e) {
throw new TechnicalException(e.getMessage(), e);
}
}
private String objectToStr(Object obj) {
if (obj instanceof Date) {
return new SimpleDateFormat("dd.MM.yyyy").format(obj);
}
return obj.toString();
}
public static List<TemplateGenEntity> newTempEntList(TemplateGenEntity entity) {
List<TemplateGenEntity> list = new ArrayList<>();
list.add(entity);
return list;
}
public static List<TemplateGenEntity> newTempEntList(List<TemplateGenEntity> entities, TemplateGenEntity entity) {
List<TemplateGenEntity> list = new ArrayList<>();
for (TemplateGenEntity ent : entities) {
list.add(ent);
}
list.add(entity);
return list;
}
public class TemplateGenEntity {
private String entityName;
private Object entity;
public TemplateGenEntity(String entityName, Object entity) {
this.entityName = entityName;
this.entity = entity;
}
public String getEntityName() {
return entityName;
}
public void setEntityName(String entityName) {
this.entityName = entityName;
}
public Object getEntity() {
return entity;
}
public void setEntity(Object entity) {
this.entity = entity;
}
}
}

use freemarker for the placeholder replacement and pd4ml for the html convertion, it works well for me.

How to parse HTML with java properly?

Scenario/Requirement:
Download html page from some URL
Download images that were mentioned in html tags.
Change tags for images in my file, so I can open it with my browser offline and see them.
I made first 2 points, but am having difficulties with the third one.Tags do not change.What am I doing wrong?
The job is to open a file, find img src tag and replace it by another tag! Can you give me an example?
Code:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import java.awt.image.BufferedImage;
import java.net.URL;
import java.net.URLConnection;
import javax.imageio.ImageIO;
import javax.swing.text.AttributeSet;
import javax.swing.text.html.HTMLDocument;
public class ExtractAllImages {
static String result_doc = "/home/foo/index.html";
static String home_folder = "/home/foo/";
static String start_webURL = "http://www.oracle.com/";
public static void main(String args[]) throws Exception {
String webUrl = start_webURL;
URL url = new URL(webUrl);
URLConnection connection = url.openConnection();
InputStream is = connection.getInputStream();
InputStreamReader isr = new InputStreamReader(is);
BufferedReader br = new BufferedReader(isr);
HTMLEditorKit htmlKit = new HTMLEditorKit();
HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
HTMLEditorKit.Parser parser = new ParserDelegator();
HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
parser.parse(br, callback, true);
FileWriter writer = new FileWriter(result_doc);
htmlKit.write(writer, htmlDoc, 0, htmlDoc.getLength());
writer.close();
int number_or_images = 0;
String[] array = new String[4096];
for (HTMLDocument.Iterator iterator = htmlDoc.getIterator(HTML.Tag.IMG); iterator.isValid(); iterator.next()) {
AttributeSet attributes = iterator.getAttributes();
String imgSrc = (String) attributes.getAttribute(HTML.Attribute.SRC);
System.out.println("img_src = " + imgSrc);
if (imgSrc != null && (imgSrc.endsWith(".jpg") || (imgSrc.endsWith(".png")) || (imgSrc.endsWith(".jpeg")) || (imgSrc.endsWith(".bmp")) || (imgSrc.endsWith(".ico")))) {
try {
downloadImage(webUrl, imgSrc);
} catch (IOException ex) {
System.out.println(ex.getMessage());
}
}
array[number_or_images] = imgSrc;
number_or_images++;
///TODO change
}
for(int i =0; i < number_or_images; i++)
{
System.out.println("before = "+array[i]);
while(true)
{
int count = array[i].indexOf('/');
if(count == -1) break;
array[i] = array[i].substring(count+1);
}
System.out.println("after = " + array[i]);
}
//TODO open file and replace tags
int i =0;
File input = new File(result_doc);
Document doc = Jsoup.parse(input, "UTF-8");
System.out.println( input.canWrite());
for( Element img : doc.select("img[src]") )
{
String s = img.attr("src");
System.out.println(s);
img.attr("src", "/home/foo/"+array[i]); // set attribute 'src' to 'your-source-here'
s = img.attr("src");
System.out.println(s);
++i;
}
}
private static void downloadImage(String url, String imgSrc) throws IOException {
BufferedImage image = null;
try {
if (!(imgSrc.startsWith("http"))) {
url = url + imgSrc;
} else {
url = imgSrc;
}
imgSrc = imgSrc.substring(imgSrc.lastIndexOf("/") + 1);
String imageFormat = null;
imageFormat = imgSrc.substring(imgSrc.lastIndexOf(".") + 1);
String imgPath = null;
imgPath = home_folder + imgSrc + "";
URL imageUrl = new URL(url);
image = ImageIO.read(imageUrl);
if (image != null) {
File file = new File(imgPath);
ImageIO.write(image, imageFormat, file);
}
} catch (Exception ex) {
ex.printStackTrace();
}
}
}

Solved.
I didn't save changes. Need to add code befire "downloadImage()"
int i = 0;
File input = new File(result_doc);
Document doc = Jsoup.parse(input, "UTF-8");
for( Element img : doc.select("img[src]") ) {
img.attr("src",array[i]); // set attribute 'src' to 'your-source-here'
++i;
}
try {
String strmb = doc.outerHtml();
bw = new BufferedWriter(new FileWriter(result_doc));
bw.write(strmb);
bw.close();
}
catch (Exception ex) {
System.out.println("Program stopped. The problem is " + "\"" +
ex.getMessage()+"\"");
}

You can go with JSOUP
Try something like below
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public static void getAllTags(){
try {
File input=new File("H:\\html pages\\index1.html");
Document document=Jsoup.parse(input, "UTF-8");
Document parse=Jsoup.parse(document.html());
Elements body=parse.select("body");
Elements bodyTags=body.select("*");
for (Element element : bodyTags) {
//Do what you want with tag
System.out.println(element.tagName());
}
} catch (Exception e) {
e.printStackTrace();
}
If you want to parse html then try this
public static void parseHTML(){
try {
File input = new File("H:\\html\\index1.html");
Document document = Jsoup.parse(input, "UTF-8");
Document parse = Jsoup.parse(document.html());
Elements bodyElements = parse.select("div");
Elements elements = bodyElements.select("*");
for (Element element : elements) {
FilterHtml.setHtmlTAG(element.tagName());
FilterHtml.ParseXml();
Elements body = bodyElements.select(FilterHtml.getXmlTAG());
if (body.is(FilterHtml.getXmlTAG())) {
Elements tag = parse.select(FilterHtml.getXmlTAG());
//Do something meaning full with tag
System.out.println(tag.text());
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
Hope this would help. if yes please mark it green.

Document parsing shows null

I need help in the below concept.
I want to get attributes of xref node in the code. i.e id and its value, location and its value, type and its value.
I am passing xml as string. But the document shows null on parsing.
PLease help me in this.
import java.io.StringReader;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
public class GetAtrribute {
/**
* #param args
*/
public static void main(String[] args) {
String xml = "<xref id=\"19703675\" location=\"abstract\" type=\"external\">PubMed Abstract: http://www.abcd.nlm.nih.gov/...</xref>"; //Populated XML String....
GetAtrribute ga = new GetAtrribute();
try {
ga.getValues(xml);
} catch (Exception e) {
e.printStackTrace();
}
}
public String getValues(String xmlStr) throws Exception {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder;
xmlStr = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + xmlStr;
try {
builder = factory.newDocumentBuilder();
Document document = builder.parse(new InputSource(new StringReader(
xmlStr)));
Element element = document.getDocumentElement();
NodeList list = element.getElementsByTagName("xref");
if (list != null && list.getLength() > 0) {
NodeList subList = list.item(0).getChildNodes();
if (subList != null && subList.getLength() > 0) {
return subList.item(0).getNodeValue();
}
for (int count = 0; count < subList.getLength(); count++) {
System.out.println(subList.item(count).getNodeValue());
}
}
} catch (Exception e) {
e.printStackTrace();
}
return xmlStr;
}
}

Your problem is that when you run this line:
Element element = document.getDocumentElement();
you're actually selecting xref already, because its the only xml element. You could either wrap another object around xref, or just use the variable 'element' to get the details.
p.s. your class name is spelt wrong: GetAtrribute -> GetAttribute

I suggest you to use XPath to find data in your XML:
XPath xPath = XPathFactory.newInstance().newXPath();
Document baseDoc;
try (InputStream pStm = new ByteArrayInputStream(baseXmlString.getBytes("utf-8"))) {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
baseDoc = builder.parse(pStm);
} catch (SAXException | IOException | ParserConfigurationException ex) {
getLogger().error(null, ex);
return null;
}
try {
XPathExpression expression = xPath.compile(xPathExpression);
return (T) expression.evaluate(baseDoc, pathType);
} catch (XPathExpressionException ex) {
getLogger().error(null, ex);
}
return null;
For example take a look at here

How to get a specific event/attribute content from an xml string via stAX or SAX

I have a xml POST response which I receive as a string. I need the content of the particular "pnr" (see in xml) to pass it on to another GET request.
I am trying sax and stAX to achieve this but failing miserably.
I used getElementsByTagName and also getAttribute, but no go...
Here's my code and later the xml string that I receive.
Any kind of help will be a gift
package rest;
import javax.xml.parsers.*;
import org.xml.sax.InputSource;
import org.w3c.dom.*;
import java.io.*;
public class ParseXMLString {
public static void main(String arg[]) {
String outputString = RESTClient.postConfirm(); // this is the xml string response I am getting
try {
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = dbf.newDocumentBuilder();
InputSource is = new InputSource();
is.setCharacterStream(new StringReader(outputString));
Document doc = db.parse(is);
String Parentnode = doc.getDocumentElement().getAttribute("pnr");
// Element element = (Element) Parentnode.;
// NodeList name = element.getElementsByTagName("pnr");
// Element line = (Element) name.item(0);
//String IDList = getCharacterDataFromElement(line);
System.out.println(Parentnode);
}
catch (Exception e) {
e.printStackTrace();
}
}
public static String getCharacterDataFromElement(Element e) {
Node child = e.getFirstChild();
if (child instanceof CharacterData) {
CharacterData cd = (CharacterData) child;
return cd.getData();
}
return "?";
}
}
And here is the the XML String I have received:
<?xml version="1.0" encoding="UTF-8"?><Ticket><bookedSeats>3</bookedSeats><bpAddress>Anand Rao Circle</bpAddress><bpLandMark>ganesha temple</bpLandMark><bpLocation> Ghousia College</bpLocation><bpPhoneNo>98798679769</bpPhoneNo><bpTime>1200</bpTime><busServiceName>efdf</busServiceName><busType>Volvo A/C Semi Sleeper (2+2)</busType><commission>66.19</commission><dateOfJourney>2012-10-05</dateOfJourney><destination>Chennai</destination><fare>600.0</fare><issueTime>2012-10-04T15:46:45.073+05:30</issueTime><noOfSeats>1</noOfSeats><passengerMobile>1234567890</passengerMobile><passengerName>Test</passengerName><pnr>RATPKES44974756</pnr><seatDetails><seatDetail><commission>66.19</commission><fare>600.0</fare><gender>MALE</gender><passengerAge>0</passengerAge><passengerMobile>1234567890</passengerMobile><passengerName>Test</passengerName><prime>false</prime><seatName>3</seatName></seatDetail></seatDetails><source>Bangalore</source><status>BOOKED</status><travelsName>Rajratan Travels</travelsName></Ticket>

Instead of SAX or StAX you could do the following with the javax.xml.xpath APIs in the JDK/JRE since Java SE 5:
Demo
import java.io.StringReader;
import javax.xml.xpath.*;
import org.xml.sax.InputSource;
public class Demo {
public static void main(String[] args) throws Exception {
String xml = "<?xml version='1.0' encoding='UTF-8'?><Ticket><bookedSeats>3</bookedSeats><bpAddress>Anand Rao Circle</bpAddress><bpLandMark>ganesha temple</bpLandMark><bpLocation> Ghousia College</bpLocation><bpPhoneNo>98798679769</bpPhoneNo><bpTime>1200</bpTime><busServiceName>efdf</busServiceName><busType>Volvo A/C Semi Sleeper (2+2)</busType><commission>66.19</commission><dateOfJourney>2012-10-05</dateOfJourney><destination>Chennai</destination><fare>600.0</fare><issueTime>2012-10-04T15:46:45.073+05:30</issueTime><noOfSeats>1</noOfSeats><passengerMobile>1234567890</passengerMobile><passengerName>Test</passengerName><pnr>RATPKES44974756</pnr><seatDetails><seatDetail><commission>66.19</commission><fare>600.0</fare><gender>MALE</gender><passengerAge>0</passengerAge><passengerMobile>1234567890</passengerMobile><passengerName>Test</passengerName><prime>false</prime><seatName>3</seatName></seatDetail></seatDetails><source>Bangalore</source><status>BOOKED</status><travelsName>Rajratan Travels</travelsName></Ticket>";
XPathFactory xpf = XPathFactory.newInstance();
XPath xpath = xpf.newXPath();
InputSource inputSource = new InputSource(new StringReader(xml));
String pnr = (String) xpath.evaluate("/Ticket/pnr", inputSource, XPathConstants.STRING);
System.out.println(pnr);
}
}
Output
RATPKES44974756

this bit of code will get you the pnr :
NodeList nodeLst = doc.getElementsByTagName("Ticket");
Node ticket = nodeLst.item(0);
NodeList attr = ticket.getChildNodes();
for (int i = 0; i < attr.getLength(); i++){
if (attr.item(i).getNodeName().equals("pnr"))
System.out.println(attr.item(i).getTextContent());
}

If I were to solve this problem, I'd probably use XPath. But since you specifically asked for StAX, here's an example parser (note that this is just skeleton code to get you started).
import java.io.StringReader;
import java.util.Iterator;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
public class ParseXMLStringStAX {
private static final String PNR = "pnr";
private String characters;
public String parse(String xmlString) throws XMLStreamException, Exception {
XMLEventReader reader = null;
try {
if (xmlString == null || xmlString.isEmpty()) {
throw new IllegalArgumentException("Illegal initializiation (xmlString is null or empty)");
}
StringReader stringReader = new StringReader(xmlString);
XMLInputFactory inputFact = XMLInputFactory.newInstance();
XMLStreamReader streamReader = inputFact.createXMLStreamReader(stringReader);
reader = inputFact.createXMLEventReader(streamReader);
while (reader.hasNext()) {
XMLEvent event = reader.nextEvent();
if (event.isCharacters()) {
characters(event);
}
if (event.isStartElement()) {
startElement(event);
// handle attributes
Iterator<Attribute> attributes = event.asStartElement().getAttributes();
while(attributes.hasNext()) {
attribute(attributes.next());
}
}
if (event.isEndElement()) {
// found what we want?
if (endElement(event)) {
endDocument(null);
break;
}
}
if (event.isStartDocument()) {
startDocument(event);
}
if (event.isEndDocument()) {
endDocument(event);
}
}
} catch (XMLStreamException ex) {
throw ex;
} finally {
try {
if (reader != null) {
reader.close();
}
} catch (XMLStreamException ex) {
}
}
return characters;
}
private void attribute(XMLEvent event) throws Exception {
}
private void characters(XMLEvent event) throws Exception {
Characters asCharacters = event.asCharacters();
if (asCharacters.isWhiteSpace())
return;
if (characters == null) {
characters = asCharacters.getData();
} else {
characters += asCharacters.getData();
}
}
private void startElement(XMLEvent event) throws Exception {
StartElement startElement = event.asStartElement();
String name = startElement.getName().getLocalPart();
characters = null;
}
private boolean endElement(XMLEvent event) throws Exception {
EndElement endElement = event.asEndElement();
String name = endElement.getName().getLocalPart();
if (PNR.equals(name)) {
return true;
}
return false;
}
private void startDocument(XMLEvent event) {
System.out.println("Parsing started");
}
private void endDocument(XMLEvent event) {
System.out.println("Parsing ended");
}
public static void main(String[] argv) throws XMLStreamException, Exception {
String xml = "";
xml += "<Ticket>";
xml += " <bookedSeats>3</bookedSeats>";
xml += " <bpAddress>Anand Rao Circle</bpAddress>";
xml += " <bpLandMark>ganesha temple</bpLandMark>";
xml += " <bpLocation> Ghousia College</bpLocation>";
xml += " <bpPhoneNo>98798679769</bpPhoneNo>";
xml += " <bpTime>1200</bpTime>";
xml += " <busServiceName>efdf</busServiceName>";
xml += " <busType>Volvo A/C Semi Sleeper (2+2)</busType>";
xml += " <commission>66.19</commission>";
xml += " <dateOfJourney>2012-10-05</dateOfJourney>";
xml += " <destination>Chennai</destination>";
xml += " <fare>600.0</fare>";
xml += " <issueTime>2012-10-04T15:46:45.073+05:30</issueTime>";
xml += " <noOfSeats>1</noOfSeats>";
xml += " <passengerMobile>1234567890</passengerMobile>";
xml += " <passengerName>Test</passengerName>";
xml += " <pnr>RATPKES44974756</pnr>";
xml += " <seatDetails>";
xml += " <seatDetail>";
xml += " <commission>66.19</commission>";
xml += " <fare>600.0</fare>";
xml += " <gender>MALE</gender>";
xml += " <passengerAge>0</passengerAge>";
xml += " <passengerMobile>1234567890</passengerMobile>";
xml += " <passengerName>Test</passengerName>";
xml += " <prime>false</prime>";
xml += " <seatName>3</seatName>";
xml += " </seatDetail>";
xml += " </seatDetails>";
xml += " <source>Bangalore</source>";
xml += " <status>BOOKED</status>";
xml += " <travelsName>Rajratan Travels</travelsName>";
xml += "</Ticket>";
ParseXMLStringStAX parser = new ParseXMLStringStAX();
String pnr = parser.parse(xml);
System.out.println("--> Result: " + String.valueOf(pnr));
}
}

java find value of an xml attribute

this is my xml :
<-tobject.subject tobject.subject.refnum="01016000" />
<-tobject.subject tobject.subject.refnum="10004000" />
I want to extract 01016000 and 10004000 from it .
I used this code:
NodeList nodeLst4 = doc.getElementsByTagName("tobject.subject");
if (nodeLst4 != null) {
int numberofCOdes = nodeLst4.getLength();
aSubjectCodes = new String[numberofCOdes];
for (int i = 0; i < numberofCOdes; i++) {
XPath xpath = XPathFactory.newInstance().newXPath();
aSubjectCodes[i] = xpath.evaluate("//tobject.subject/#tobject.subject.refnum", doc);
the problem is that when i loop through it the evaluate method just return first number and do not give me the second value.
and i am not sure if using xpath.evaluate is good idea or not.
Thanks

there is no need to use the doc.getElementsByTagName.
You are mixing plain DOM with XPath.
Your xpath is correct:
package net.davymeers;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
public class XpathTest {
private static String XMLSTRING = "<data>"
+ "<tobject.subject tobject.subject.refnum=\"01016000\" />\r\n"
+ "\r\n"
+ "<tobject.subject tobject.subject.refnum=\"10004000\" />"
+ "</data>";
/**
* #param args
*/
public static void main(final String[] args) {
final Document doc = createDocument();
final XPath xpath = createXpath();
final NodeList nodes = findElements(
"//tobject.subject/#tobject.subject.refnum", doc, xpath);
final Collection<String> results = convertToCollection(nodes);
for (final String result : results) {
System.out.println(result);
}
}
private static Document createDocument() {
Document doc = null;
try {
final DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory
.newInstance();
documentBuilderFactory.setNamespaceAware(true); // never forget
// this!
final DocumentBuilder builder = documentBuilderFactory
.newDocumentBuilder();
doc = builder.parse(new ByteArrayInputStream(XMLSTRING
.getBytes("ISO-8859-1")));
} catch (final UnsupportedEncodingException exception) {
// TODO handle exception
} catch (final SAXException exception) {
// TODO handle exception
} catch (final IOException exception) {
// TODO handle exception
} catch (final ParserConfigurationException exception) {
// TODO handle exception
}
return doc;
}
private static XPath createXpath() {
final XPathFactory xpathFactory = XPathFactory.newInstance();
final XPath xpath = xpathFactory.newXPath();
return xpath;
}
private static NodeList findElements(final String xpathExpression,
final Document doc, final XPath xpath) {
NodeList nodes = null;
if (doc != null) {
try {
final XPathExpression expr = xpath.compile(xpathExpression);
final Object result = expr
.evaluate(doc, XPathConstants.NODESET);
nodes = (NodeList) result;
} catch (final XPathExpressionException exception) {
// TODO handle exception
}
}
return nodes;
}
private static Collection<String> convertToCollection(final NodeList nodes) {
final Collection<String> result = new ArrayList<String>();
if (nodes != null) {
for (int i = 0; i < nodes.getLength(); i++) {
result.add(nodes.item(i).getNodeValue());
}
}
return result;
}
}

Here's a useful class I found a while back for XMLFiles. Takes a lot of the work off of your shoulders.
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
* XMLFile.java
*
* XML file object that represents an xml file and its properties. Used to
* simplify the process of reading from and writing to XML files.
*
* Derived from unknown source. Implemented on 12/03/09. Permission given to
* implement and modify code.
*/
public class XMLFile {
private String name;
private String content;
private Map<String, String> nameAttributes = new HashMap<String, String>();
private Map<String, List<XMLFile>> nameChildren = new HashMap<String, List<XMLFile>>();
private static Element rootElement(String filename, String rootName) {
FileInputStream fileInputStream = null;
try {
fileInputStream = new FileInputStream(filename);
DocumentBuilderFactory builderFactory = DocumentBuilderFactory
.newInstance();
DocumentBuilder builder = builderFactory.newDocumentBuilder();
Document document = builder.parse(fileInputStream);
Element rootElement = document.getDocumentElement();
if (!rootElement.getNodeName().equals(rootName))
throw new RuntimeException("Could not find root node: "
+ rootName);
return rootElement;
} catch (Exception exception) {
throw new RuntimeException(exception);
} finally {
if (fileInputStream != null) {
try {
fileInputStream.close();
} catch (Exception exception) {
throw new RuntimeException(exception);
}
}
}
}
/**
* #param (String) Filepath of XML File (String) Root of XML File
**/
public XMLFile(String filename, String rootName) {
this(rootElement(filename, rootName));
}
/**
* #param (Element) XML File Element
**/
private XMLFile(Element element) {
this.name = element.getNodeName();
this.content = element.getTextContent();
NamedNodeMap namedNodeMap = element.getAttributes();
int n = namedNodeMap.getLength();
for (int i = 0; i < n; i++) {
Node node = namedNodeMap.item(i);
String name = node.getNodeName();
addAttribute(name, node.getNodeValue());
}
NodeList nodes = element.getChildNodes();
n = nodes.getLength();
for (int i = 0; i < n; i++) {
Node node = nodes.item(i);
int type = node.getNodeType();
if (type == Node.ELEMENT_NODE)
addChild(node.getNodeName(), new XMLFile((Element) node));
}
}
/**
* Adds attribute to ???
*
* #param (String) Attribute Name (String) Attribute Value
**/
private void addAttribute(String name, String value) {
nameAttributes.put(name, value);
}
/**
* Adds child directory to ???
*
* #param (String) Name of New Child Directory (XMLFile) XML Documentation
* of Child
**/
private void addChild(String name, XMLFile child) {
List<XMLFile> children = nameChildren.get(name);
if (children == null) {
children = new ArrayList<XMLFile>();
nameChildren.put(name, children);
}
children.add(child);
}
public String name() {
return name;
}
public String content() {
return content;
}
/**
*
**/
public XMLFile child(String name) {
List<XMLFile> children = children(name);
if (children.size() != 1)
throw new RuntimeException("Could not find individual child node: "
+ name);
return children.get(0);
}
/**
*
**/
public List<XMLFile> children(String name) {
List<XMLFile> children = nameChildren.get(name);
return children == null ? new ArrayList<XMLFile>() : children;
}
/**
* Gets the value of a specific field and converts it to a String object
*
* #param (String) Name of Field
**/
public String string(String name) {
String value = nameAttributes.get(name);
if (value == null)
throw new RuntimeException("Could not find attribute: " + name
+ ", in node: " + this.name);
return value;
}
/**
* Gets the value of a specific field and converts it to an int
*
* #param (String) Name of Field
**/
public int integer(String name) {
return Integer.parseInt(string(name));
}
/**
* Gets the value of a specific field and converts it to an
* ArrayList<String>
*
* #param (String) Name of Field
**/
public ArrayList<String> arrayListString(String name) {
String left = new String();
int finished = 0;
ArrayList<String> list = new ArrayList<String>();
try {
left = nameAttributes.get(name);
} catch (Exception e) {
System.err.println("Exception: " + e.getMessage());
}
while (finished == 0) {
if (left.indexOf(", ") > -1) {
list.add(left.substring(0, left.indexOf(", ")));
left = left.substring(left.indexOf(", ") + 2);
} else {
list.add(left);
finished = 1;
}
}
return list;
}
}

Would this work for you? I am parsing RSS XML from here:
http://www.kraftfoods.com/rss/dinnerRecipes.aspx
Look at Media and URL towards the bottom:
package recipeSearchAndFinder.xml;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
public class DomFeedParser extends BaseFeedParser {
public DomFeedParser(String feedUrl) {
super(feedUrl);
}
public List<Message> parse() {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
List<Message> messages = new ArrayList<Message>();
try {
DocumentBuilder builder = factory.newDocumentBuilder();
Document dom = builder.parse(this.getInputStream());
Element root = dom.getDocumentElement();
NodeList items = root.getElementsByTagName(ITEM);
for (int i = 0; i < items.getLength(); i++) {
Message message = new Message();
Node item = items.item(i);
NodeList properties = item.getChildNodes();
for (int j = 0; j < properties.getLength(); j++) {
Node property = properties.item(j);
String name = property.getNodeName();
if (name.equalsIgnoreCase(TITLE)) {
message.setTitle(property.getFirstChild()
.getNodeValue());
} else if (name.equalsIgnoreCase(LINK)) {
message.setLink(property.getFirstChild().getNodeValue());
} else if (name.equalsIgnoreCase(DESCRIPTION)) {
StringBuilder text = new StringBuilder();
NodeList chars = property.getChildNodes();
for (int k = 0; k < chars.getLength(); k++) {
text.append(chars.item(k).getNodeValue());
}
message.setDescription(text.toString());
} else if (name.equalsIgnoreCase(PUB_DATE)) {
message.setDate(property.getFirstChild().getNodeValue());
} else if (name.equalsIgnoreCase(MEDIA)) {
NamedNodeMap nMap = property.getAttributes();
String mediaurl = nMap.getNamedItem("url")
.getNodeValue();
message.setMedia(mediaurl);
}
}
messages.add(message);
}
} catch (Exception e) {
throw new RuntimeException(e);
}
return messages;
}
}

We Keep Coding

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

XML Document traverser in java - java

Related

Converting html files with *ngFor to pdf in Java

How to parse HTML with java properly?

Document parsing shows null

How to get a specific event/attribute content from an xml string via stAX or SAX

java find value of an xml attribute

Categories

Resources