How to find an attribute using HTMLDocument? - java

Possibly the terminology is different with HTML than with XML, but here is a HTML document from which attributes are being retrieved. Here the attributes a1, a2, a3 are part of the Body tag.
<html>
<head>
Hello World
</head>
<body a1="ABC" a2="3974" a3="A1B2"> <------These attributes
<H1>Start Here<H1>
<p>This is the body</p>
</body>
</html>
Using the following file to parse the above HTML file.
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.Reader;
import javax.swing.text.AttributeSet;
import javax.swing.text.Element;
import javax.swing.text.ElementIterator;
import javax.swing.text.StyleConstants;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Parses an HTML file into an {@link HTMLDocument} and, for every element whose name attribute
 * is an {@link HTML.Tag}, prints each direct child together with its attribute count and
 * whether an attribute keyed by the string {@code "a1"} is defined on that child.
 */
public class HTMLParserTest
{
    public static void main(String args[]) throws Exception {
        HTMLEditorKit htmlKit = new HTMLEditorKit();
        HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
        HTMLEditorKit.Parser parser = new ParserDelegator();
        HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
        // try-with-resources: the file handle is released even if parsing throws.
        try (Reader reader = new FileReader("C:/Downloads/DeleteMe/Example1.html");
             BufferedReader br = new BufferedReader(reader)) {
            // Parse
            parser.parse(br, callback, true);
        }
        ElementIterator iterator = new ElementIterator(htmlDoc);
        Element element;
        while ((element = iterator.next()) != null)
        {
            System.out.println("Element : " + element);
            AttributeSet attributes = element.getAttributes();
            Object name = attributes.getAttribute(StyleConstants.NameAttribute);
            if ((name instanceof HTML.Tag))
            //&& ((name == HTML.Tag.H1) || (name == HTML.Tag.H2) || (name == HTML.Tag.H3)))
            {
                // Build up content text as it may be within multiple elements
                StringBuilder text = new StringBuilder();
                int count = element.getElementCount();
                for (int i = 0; i < count; i++) {
                    Element child = element.getElement(i);
                    AttributeSet childAttributes = child.getAttributes();
                    System.out.println("Element : " + child);
                    System.out.println(" Attribute count : " + childAttributes.getAttributeCount());
                    // Looks the attribute up by the plain String key "a1" on the CHILD element.
                    System.out.println(" a1 exists : " + childAttributes.isDefined("a1"));
                    int startOffset = child.getStartOffset();
                    int endOffset = child.getEndOffset();
                    int length = endOffset - startOffset;
                    text.append(htmlDoc.getText(startOffset, length));
                }
            }
        }
        System.exit(0);
    }
}
The output is here.
Element : BranchElement(html) 0,1
Element : BranchElement(body) 0,1
Attribute count : 1
a1 exists : false <-----expected true here.
Element : BranchElement(body) 0,1
Element : BranchElement(p) 0,1
Attribute count : 3
a1 exists : false
Element : BranchElement(p) 0,1
Element : LeafElement(content) 0,1
Attribute count : 1
a1 exists : false
Element : LeafElement(content) 0,1
The expectation is that the "a1 exists" check should have returned true once, but it did not.
Eventually all 3 (a1, a2, a3) will be searched.
Is the above code the proper implementation or is this not feasible with the HTML parser?

Maybe this will help:
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
 * Reads an HTML document (local file or HTTP(S) URL given as the first command-line argument)
 * with {@link HTMLEditorKit} and prints every attribute name/value pair found on elements
 * named {@code body}.
 */
class AttributeHTML
{
    public static void main(String[] args)
    {
        EditorKit kit = new HTMLEditorKit();
        Document doc = kit.createDefaultDocument();
        // The Document class does not yet handle charset's properly.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
        int exitStatus = 0;
        // Create a reader on the HTML content; try-with-resources closes it on every path.
        try (Reader rd = getReader(args[0]))
        {
            // Parse the HTML.
            kit.read(rd, doc, 0);
            // Iterate through the elements of the HTML document.
            ElementIterator it = new ElementIterator(doc);
            Element elem;
            while ( (elem = it.next()) != null )
            {
                if (elem.getName().equals("body"))
                {
                    AttributeSet as = elem.getAttributes();
                    Enumeration<?> names = as.getAttributeNames();
                    while( names.hasMoreElements() )
                    {
                        Object name = names.nextElement();
                        Object value = as.getAttribute( name );
                        System.out.println( "\t" + name + " : " + value );
                    }
                }
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
            exitStatus = 1;
        }
        // Report failure to the caller only when something actually went wrong.
        System.exit(exitStatus);
    }

    /**
     * Returns a reader on the HTML data.
     *
     * @param uri if it begins with "http:" or "https:" it is treated as a URL; otherwise it is
     *            assumed to be a local filename
     * @return a reader positioned at the start of the content; the caller must close it
     * @throws IOException if the URL or file cannot be opened
     */
    static Reader getReader(String uri)
        throws IOException
    {
        // Retrieve from Internet.
        if (uri.startsWith("http:") || uri.startsWith("https:"))
        {
            URLConnection conn = new URL(uri).openConnection();
            return new InputStreamReader(conn.getInputStream());
        }
        // Retrieve from file.
        return new FileReader(uri);
    }
}
Test using:
java AttributeHTML yourFile.html

I am not familiar with HTMLEditorKit, but you can achieve a similar result using a regex:
/**
 * Demonstrates extracting the attributes of the {@code <body>} tag from an HTML string with
 * regular expressions: first the opening body tag is located, then every
 * {@code name="value"} pair inside it is printed.
 */
public static void main(String[] args) throws UnirestException {
    String html = "<html>\r\n" +
            " <head>\r\n" +
            " Hello World\r\n" +
            " </head>\r\n" +
            " <body a1=\"ABC\" a2=\"3974\" a3=\"A1B2\">\r\n" +
            " <H1>Start Here<H1>\r\n" +
            " <p>This is the body</p>\r\n" +
            " </body>\r\n" +
            "</html>";
    Pattern regexBodyPattern = Pattern.compile("<body[^>]*>", Pattern.MULTILINE);
    // Compiled once up front instead of once per matched body tag.
    Pattern regexBodyAttrPattern = Pattern.compile("(\\S*)=(\\\"\\w*\\\")", Pattern.MULTILINE);
    Matcher matcher = regexBodyPattern.matcher(html);
    while (matcher.find()) {
        String bodyTag = matcher.group();
        Matcher attrMatcher = regexBodyAttrPattern.matcher(bodyTag);
        while (attrMatcher.find()) {
            // group(1) is the attribute name, group(2) the value including its double quotes.
            System.out.println("Key :: "+attrMatcher.group(1)+" , Value "+attrMatcher.group(2));
        }
    }
}
output
Key :: a1 , Value "ABC"
Key :: a2 , Value "3974"
Key :: a3 , Value "A1B2"

To retrieve the attributes, you can provide your own ParserCallback
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Extracts tag attributes from an HTML document by plugging a custom
 * {@link ParserCallback} into {@link ParserDelegator}.
 */
public class HTMLParserTest2
{
    public static void main(String args[]) throws Exception {
        // try-with-resources closes the file even if parsing fails.
        try (Reader reader = new FileReader("d:/temp/Example.html");
             BufferedReader br = new BufferedReader(reader)) {
            System.out.println(HTMLParserTest2.extractTagsAttributes(br));
            // output : [title-_implied_=true, body-a1=ABC, body-a2=3974, body-a3=A1B2]
        }
        System.exit(0);
    }

    /**
     * Parses the HTML supplied by {@code r} and collects one entry per attribute of every
     * start tag, formatted as {@code tag-name=value}.
     *
     * @param r source of the HTML text; not closed by this method
     * @return the collected entries in the order the parser reported them
     * @throws IOException if reading from {@code r} fails
     */
    public static List<String> extractTagsAttributes(Reader r) throws IOException {
        final ArrayList<String> list = new ArrayList<String>();
        ParserDelegator parserDelegator = new ParserDelegator();
        // Only start tags are of interest; the other callbacks keep the base-class behaviour.
        ParserCallback parserCallback = new ParserCallback() {
            @Override
            public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
                Enumeration<?> e = attribute.getAttributeNames();
                while (e.hasMoreElements()) {
                    Object name = e.nextElement();
                    Object value = attribute.getAttribute(name);
                    list.add(tag.toString() + "-" + name + "=" + value);
                }
            }
        };
        parserDelegator.parse(r, parserCallback, true);
        return list;
    }
}

Related

Converting html files with *ngFor to pdf in Java

I have to generate documents in my Java Web application (Maven, runs on a server) and have to insert data from a Java class into this document.
I would like to be able to write a HTML file with placeholders. The placeholder should be replaced from the application with data from a Java class.
I also would like to be able to use conditionals like *ngFor (e.g. inserting a list into a ) or *ngIf from Angular (or attributes with a similar function).
Does anyone know a library for this?
I have a good knowledge of Java, HTML etc. so using such a library (if there is one) will not be a problem for me
In the meantime I've written a little script myself. In case someone needs something similar, I've included it in an answer
In the meantime I have searched further and unfortunately I have not found a suitable solution so far. Therefore, I have now set about programming a solution myself. Much less effort is required than expected. Here is my current code. It is currently a rough draft and certainly needs some improvement.
package com.XYZ.file.bo;
import java.beans.IntrospectionException;
import java.beans.PropertyDescriptor;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.InputStream;
import java.io.StringWriter;
import java.lang.reflect.InvocationTargetException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.XYZ.file.service.FileService;
import com.XYZ.servicelocator.ServiceLocator;
import com.XYZ.util.TechnicalException;
import com.itextpdf.html2pdf.ConverterProperties;
import com.itextpdf.html2pdf.HtmlConverter;
public class TemplateGeneratorBO {
private FileService fileService;
private static final String DOC_TEMPLATE_DIR = FileBO.BASE_DIR + "templates/";
/**
 * Renders the given HTML template with values from {@code entity}, converts the result to PDF
 * and stores it below {@code saveFolder/<entity id>/templates}.
 * <p>
 * If saving under {@code saveFileName + ".pdf"} is rejected by the file service, the name
 * {@code saveFileName-1.pdf}, {@code saveFileName-2.pdf}, ... is tried until one succeeds.
 * NOTE(review): the retry loop is unbounded, exactly as before; it never terminates if the
 * file service keeps returning {@code false}.
 *
 * @param tempFileName name of the template file inside the template directory
 * @param saveFolder   base folder the PDF is stored under
 * @param saveFileName file name without the ".pdf" extension
 * @param entity       object whose properties are inserted into the template; must expose an
 *                     {@code id} property
 * @return the stored PDF file as returned by the file service
 */
public File generateDoc(String tempFileName, String saveFolder, String saveFileName, Object entity) {
    String htmlDoc = parseHtmlDoc(tempFileName, entity);
    htmlDoc = replaceSpecialChars(htmlDoc);
    ByteArrayOutputStream outStream = new ByteArrayOutputStream();
    ConverterProperties converterProperties = new ConverterProperties();
    HtmlConverter.convertToPdf(htmlDoc, outStream, converterProperties);
    byte[] pdfBytes = outStream.toByteArray();
    saveFolder += "/" + callGetter(entity, "id") + "/templates";
    String fileName = saveFileName + ".pdf";
    int counter = 0;
    // Every attempt gets a fresh stream over the PDF bytes: re-using one stream would hand a
    // later attempt a stream that an earlier attempt may already have read from.
    while (!getFileService().createAndSaveFile(saveFolder, fileName, new ByteArrayInputStream(pdfBytes))) {
        counter++;
        fileName = saveFileName + "-" + counter + ".pdf";
    }
    return getFileService().getFile(saveFolder, fileName);
}
private String parseHtmlDoc(String fileName, Object entity) {
try {
File htmlFile = new File(DOC_TEMPLATE_DIR + fileName);
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
Document doc = dBuilder.parse(htmlFile);
doc.getDocumentElement().normalize();
Element elm = doc.getDocumentElement();
NodeList headList = elm.getElementsByTagName("head");
NodeList bodyList = elm.getElementsByTagName("body");
verifyTemplate(fileName, elm, headList, bodyList);
Node head = headList.item(0);
String html = "<html>" + xmlToString(head) + "<body>";
html += nodeToString(bodyList.item(0),
newTempEntList(new TemplateGenEntity(entity.getClass().getSimpleName(), entity)));
html += "</body></html>";
return html;
} catch (Exception exc) {
throw new TechnicalException("DocGenerator Exception with file " + fileName, exc);
}
}
private String nodeToString(Node parentNode, List<TemplateGenEntity> entities)
throws TransformerException, ScriptException {
NodeList nodes = parentNode.getChildNodes();
StringBuilder string = new StringBuilder("");
for (int i = 0; i < nodes.getLength(); i++) {
Node node = nodes.item(i);
if (node.getNodeType() == Node.ELEMENT_NODE) {
Element elm = (Element) node;
string.append(elementToString(elm, entities));
} else {
string.append(insertValues(xmlToString(node), entities));
}
}
return string.toString();
}
private String elementToString(Element elm, List<TemplateGenEntity> entities)
throws ScriptException, TransformerException {
if (!proofNgIf(elm, entities)) {
return "";
}
if (elm.hasAttribute("ngFor")) {
return ngForElementToString(elm, entities);
}
return "<" + elm.getNodeName() + getElementAttributes(elm) + ">" + nodeToString(elm, entities) + "</"
+ elm.getNodeName() + ">";
}
#SuppressWarnings("unchecked")
private String ngForElementToString(Element elm, List<TemplateGenEntity> entities)
throws ScriptException, TransformerException {
String attrs = getElementAttributes(elm);
String ngFor = elm.getAttribute("ngFor");
String[] ngForList = ngFor.split(" of ");
StringBuilder string = new StringBuilder();
ScriptEngineManager factory = new ScriptEngineManager();
ScriptEngine engine = factory.getEngineByName("JavaScript");
for (TemplateGenEntity ent : entities) {
engine.put(ent.getEntityName(), ent.getEntity());
}
List<Object> list = (List<Object>) engine.eval(ngForList[1]);
for (Object obj : list) {
string.append("<" + elm.getNodeName() + attrs + ">"
+ nodeToString(elm, newTempEntList(entities, new TemplateGenEntity(ngForList[0], obj))) + "</"
+ elm.getNodeName() + ">");
}
return string.toString();
}
/**
 * Decides whether an element is rendered, based on its optional {@code ngIf} attribute.
 * The attribute value is evaluated as a JavaScript expression with every entity in
 * {@code entities} bound under its entity name.
 *
 * @param elm element to check
 * @param entities entities made available to the expression
 * @return true if no ngIf or ngIf condition is true
 * @throws ScriptException if the ngIf expression cannot be evaluated
 */
private boolean proofNgIf(Element elm, List<TemplateGenEntity> entities) throws ScriptException {
if (!elm.hasAttribute("ngIf")) {
return true;
}
String ngIf = elm.getAttribute("ngIf");
// An ngIf attribute without an expression is treated as a template error.
if (ngIf.isBlank()) {
throw new TechnicalException("Document template contains empty ngIf!");
}
ScriptEngineManager factory = new ScriptEngineManager();
ScriptEngine engine = factory.getEngineByName("JavaScript");
for (TemplateGenEntity ent : entities) {
engine.put(ent.getEntityName(), ent.getEntity());
}
// The cast fails with a ClassCastException if the expression does not yield a Boolean.
return (boolean) engine.eval(ngIf);
}
/**
 * Replaces every <code>{{expression}}</code> placeholder in {@code strIn} with the string form of
 * the expression's value, evaluated by the JavaScript engine with all {@code entities} bound
 * under their entity names.
 * <p>
 * The closing braces are searched for after the opening ones, so a stray "}}" earlier in the
 * text no longer causes an out-of-range substring. Scanning resumes behind each inserted
 * value, so a value that itself contains braces is not expanded again. An opening "{{"
 * without a matching "}}" is left in the output unchanged.
 *
 * @param strIn    text that may contain placeholders
 * @param entities entities made available to the expressions
 * @return the text with all complete placeholders substituted
 * @throws ScriptException if a placeholder expression cannot be evaluated
 */
private String insertValues(String strIn, List<TemplateGenEntity> entities) throws ScriptException {
    int begin = strIn.indexOf("{{");
    if (begin == -1) {
        return strIn;
    }
    // One engine serves all placeholders of this text; the bindings are identical for each.
    ScriptEngine engine = new ScriptEngineManager().getEngineByName("JavaScript");
    for (TemplateGenEntity ent : entities) {
        engine.put(ent.getEntityName(), ent.getEntity());
    }
    StringBuilder str = new StringBuilder(strIn);
    while (begin != -1) {
        int close = str.indexOf("}}", begin + 2);
        if (close == -1) {
            break;
        }
        String expression = str.substring(begin + 2, close);
        String valStr = objectToStr(engine.eval(expression));
        str.replace(begin, close + 2, valStr);
        begin = str.indexOf("{{", begin + valStr.length());
    }
    return str.toString();
}
private String xmlToString(Node node) throws TransformerException {
StringWriter writer = new StringWriter();
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
transformer.setOutputProperty(OutputKeys.INDENT, "no");
transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, "yes");
transformer.transform(new DOMSource(node), new StreamResult(writer));
return writer.toString();
}
/**
 * Post-processes the generated HTML string before it is converted to PDF.
 * NOTE(review): as written this replaces ">" with ">" and therefore returns the input
 * unchanged; presumably the search string was meant to be an escaped entity such as
 * "&amp;gt;" - confirm against the original source.
 */
private String replaceSpecialChars(String str) {
str = str.replace(">", ">");
return str;
}
private String getElementAttributes(Element elm) {
StringBuilder attrStr = new StringBuilder();
NamedNodeMap attrs = elm.getAttributes();
for (int i = 0; i < attrs.getLength(); i++) {
Attr attr = (Attr) attrs.item(i);
String attrName = attr.getName();
String attrVal = attr.getValue();
if (attrName.equals("ngIf") || attrName.equals("ngFor")) {
continue;
}
attrStr.append(" " + attrName + "=\"" + attrVal + "\"");
}
return attrStr.toString();
}
/**
 * Checks the basic structure of a parsed template: the root element must be {@code html} and
 * it must contain exactly one {@code head} and exactly one {@code body} element.
 *
 * @param fileName template file name, used in error messages only
 * @param elm      root element of the parsed template
 * @param head     all {@code head} elements found below the root
 * @param body     all {@code body} elements found below the root
 * @throws TechnicalException if any of the structural requirements is violated
 */
private void verifyTemplate(String fileName, Element elm, NodeList head, NodeList body) {
    if (!elm.getNodeName().equalsIgnoreCase("html")) {
        throw new TechnicalException("Document template " + fileName + " doesn't starts with html node!");
    }
    if (head.getLength() != 1 || head.item(0) == null) {
        throw new TechnicalException("Document template " + fileName + " doesn't contains head!");
    }
    // Inspect the body list here; the previous code re-checked head.item(0) by mistake.
    if (body.getLength() != 1 || body.item(0) == null) {
        throw new TechnicalException("Document template " + fileName + " doesn't contains body!");
    }
}
private FileService getFileService() {
if (fileService == null) {
fileService = ServiceLocator.locateService(FileService.class);
}
return fileService;
}
private Object callGetter(Object obj, String fieldName) {
PropertyDescriptor pd;
try {
pd = new PropertyDescriptor(fieldName, obj.getClass());
return pd.getReadMethod().invoke(obj);
} catch (IntrospectionException | IllegalAccessException | IllegalArgumentException
| InvocationTargetException e) {
throw new TechnicalException(e.getMessage(), e);
}
}
private String objectToStr(Object obj) {
if (obj instanceof Date) {
return new SimpleDateFormat("dd.MM.yyyy").format(obj);
}
return obj.toString();
}
public static List<TemplateGenEntity> newTempEntList(TemplateGenEntity entity) {
List<TemplateGenEntity> list = new ArrayList<>();
list.add(entity);
return list;
}
public static List<TemplateGenEntity> newTempEntList(List<TemplateGenEntity> entities, TemplateGenEntity entity) {
List<TemplateGenEntity> list = new ArrayList<>();
for (TemplateGenEntity ent : entities) {
list.add(ent);
}
list.add(entity);
return list;
}
public class TemplateGenEntity {
private String entityName;
private Object entity;
public TemplateGenEntity(String entityName, Object entity) {
this.entityName = entityName;
this.entity = entity;
}
public String getEntityName() {
return entityName;
}
public void setEntityName(String entityName) {
this.entityName = entityName;
}
public Object getEntity() {
return entity;
}
public void setEntity(Object entity) {
this.entity = entity;
}
}
}
Use FreeMarker for the placeholder replacement and PD4ML for the HTML-to-PDF conversion; this combination works well for me.

Java Exception not understood

I am writing a search engine code using java, and I'm getting this error without knowing the cause:
Exception in thread "main" java.lang.NullPointerException
at WriteToFile.fileWriter(WriteToFile.java:29)
at Main.main(Main.java:14)
Process finished with exit code 1
this is my code :
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Scanner;
/**
 * Asks the user for a keyword, fetches the Google result page for it, stores the page HTML in
 * a file and keeps the {@code cite} elements of the page as result links.
 * <p>
 * {@link #getDoc()}, {@link #getHtml()} and {@link #getLinks()} return {@code null} until
 * {@link #SearchWeb()} has been called on the same instance.
 */
public class Search {
    private static final String URL = "https://www.google.com/search?q=";
    private Document doc;
    private Elements links;
    private String html;

    public Search() throws IOException {}

    /**
     * Reads a keyword from standard input and runs the search.
     *
     * @throws IOException if the page cannot be fetched or the HTML file cannot be written
     */
    public void SearchWeb() throws IOException {
        //to get the keywords from the user
        Scanner sc = new Scanner(System.in);
        System.out.println("Please enter the keyword you want to search for: ");
        String word = sc.nextLine();
        //Search for the keyword over the net
        // Encode the keyword so that spaces and characters such as '&' or '#' stay part of
        // the query value instead of breaking the URL.
        String url = URL + java.net.URLEncoder.encode(word, "UTF-8");
        doc = Jsoup.connect(url).get();
        html = doc.html();
        // Explicit charset: getBytes() without one depends on the platform default.
        Files.write(Paths.get("D:\\OOP\\OOPproj\\data.txt"),
                html.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        links = doc.select("cite");
    }

    public Document getDoc() {
        return doc;
    }

    public String getHtml() {
        return html;
    }

    public Elements getLinks() {
        return links;
    }
}
and this is the class writeToFile:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
/**
 * Visits every link collected by {@link Search#SearchWeb()} and appends one line per page
 * (link, title, meta description, meta keywords) to a text file.
 */
public class WriteToFile extends Search {
    private static final String NO_DESCRIPTION = "<!> Could not fetch description <!>";
    private static final String NO_KEYWORDS = "<!> Could not fetch keywords <!>";

    public WriteToFile() throws IOException {}

    String description = NO_DESCRIPTION;
    String keywords = NO_KEYWORDS;
    private ArrayList<String> detail = new ArrayList<String>();
    BufferedWriter bw = null;

    /**
     * Collects the link texts, prints them, then fetches each page and appends its details
     * to the output file.
     * <p>
     * {@code getLinks()} is only populated by {@code SearchWeb()} on this same instance; if
     * that has not been called, iterating the links fails with a NullPointerException.
     *
     * @throws IOException if a page cannot be fetched or the output file cannot be written
     */
    public void fileWriter() throws IOException {
        for (Element link : super.getLinks()) {
            String text = link.text();
            if (text.contains("›")) {
                text = text.replaceAll(" › ", "/");
            }
            detail.add(text);
            System.out.println(text);
        }
        System.out.println("***************************************************");
        for (int i = 0; i < detail.size(); i++) {
            System.out.println("detail [" + (i + 1) + "]" + detail.get(i));
        }
        System.out.println("###################################################################");
        for (int j = 0; j < detail.size(); j++) {
            Document document = Jsoup.connect(detail.get(j)).get();
            String web = document.html();
            Document d = Jsoup.parse(web);
            // Start every page from the fallback texts; otherwise a page without these meta
            // tags would be written with the values of the previously visited page.
            description = NO_DESCRIPTION;
            keywords = NO_KEYWORDS;
            Elements metaTags = d.getElementsByTag("meta");
            for (Element metaTag : metaTags) {
                String content = metaTag.attr("content");
                String name = metaTag.attr("name");
                if ("description".equals(name)) {
                    description = content;
                }
                if ("keywords".equals(name)) {
                    keywords = content;
                }
            }
            String title = d.title();
            String line = detail.get(j) + "\t" + "|" + "\t" + title + "\t" + "|" + "\t" + description
                    + "\t" + "|" + "\t" + keywords + System.lineSeparator();
            // CREATE lets the first run succeed when the file does not exist yet; APPEND alone
            // would throw in that case.
            Files.write(Paths.get("D:\\OOP\\OOPproj\\search.txt"),
                    line.getBytes(java.nio.charset.StandardCharsets.UTF_8),
                    StandardOpenOption.CREATE, StandardOpenOption.APPEND);
        }
    }
}
This is the Main class:
import java.io.IOException;
/**
 * Entry point: runs a search and then writes the result details to a file.
 */
public class Main {
public static void main(String[] args) throws IOException {
// 'a' performs the search, so the links are stored in the fields of 'a'.
Search a = new Search();
a.SearchWeb();
// 'b' is a separate instance on which SearchWeb() was never called; fileWriter() reads
// b's own getLinks(), not the links collected by 'a'.
WriteToFile b = new WriteToFile();
b.fileWriter();
}
}
I tried to print the getLinks() method in main to check if it was null , but it wasn't , the links were cited.
I would be really grateful if someone helps me out.
You are calling SearchWeb() on object a, but you're calling fileWriter() on object b. This means the links are set in a, but not in b.
Since WriteToFile extends Search, you just need an instance of that:
WriteToFile a = new WriteToFile();
a.SearchWeb();
a.fileWriter();

I am trying to use a url xml parse but it looks like i keep getting an empty xml

I am trying to get information from a weather API. When I make the request I do get a response, but when I try to extract only a specific part of that response I get a null value every time. Can someone help? Here is the code for my handler:
package weathercalls;
import java.util.ArrayList;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that records data from the ATTRIBUTES of selected start tags:
 * the local name of the first attribute of {@code region}, the number of attributes of
 * {@code wind_degree}, and the integer value of the first attribute of {@code high}.
 * Text content between tags is not looked at by this handler.
 */
public class Handler extends DefaultHandler
{
    // Create three array lists to store the data
    public ArrayList<Integer> lows = new ArrayList<Integer>();
    public ArrayList<Integer> highs = new ArrayList<Integer>();
    public ArrayList<String> regions = new ArrayList<String>();

    // Make sure that the code in DefaultHandler's
    // constructor is called:
    public Handler()
    {
        super();
    }

    /*** Below are the three methods that we are extending ***/
    @Override
    public void startDocument()
    {
        System.out.println("Start document");
    }

    @Override
    public void endDocument()
    {
        System.out.println("End document");
    }

    // This is where all the work is happening:
    @Override
    public void startElement(String uri, String name, String qName, Attributes atts)
    {
        if (qName.equals("region"))
        {
            // Name (not value) of the first attribute.
            String region = atts.getLocalName(0);
            System.out.println("Day: " + region);
            this.regions.add(region);
        }
        if (qName.equalsIgnoreCase("wind_degree"))
        {
            // Number of attributes on the tag.
            int low = atts.getLength();
            System.out.println("Low: " + low);
            this.lows.add(low);
        }
        if (qName.equalsIgnoreCase("high"))
        {
            // Value of the first attribute, parsed as an integer.
            int high = Integer.parseInt(atts.getValue(0));
            System.out.println("High: " + high);
            this.highs.add(high);
        }
    }
}
and here is my main file code :
package weathercalls;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.SAXException;
/**
 * Requests the current weather for a location as XML and feeds the response through a SAX
 * parser that uses {@code Handler} as its content handler.
 */
public class weatherCalls {
    public static void main(String[] args) throws Exception {
        //Main url
        String main_url = "http://api.weatherapi.com/v1/";
        //Live or Weekly forecast
        String live_weather = "current.xml?key=";
        //API Key + q
        // NOTE(review): a literal API key is embedded in the source; consider loading it from
        // configuration instead of committing it.
        String API_Key = "c2e285e55db74def97f151114201701&q=";
        //Location Setters
        String location = "London";
        try
        {
            // Turn the string into a URL object
            String complete_url = main_url + live_weather + API_Key + location;
            URL urlObject = new URL(complete_url);
            // Open the stream (which returns an InputStream); try-with-resources closes it
            // once parsing is done or has failed.
            try (InputStream in = urlObject.openStream())
            {
                /** Now parse the data (the stream) that we received back ***/
                // Create an XML reader
                SAXParserFactory parserFactory = SAXParserFactory.newInstance();
                SAXParser parser = parserFactory.newSAXParser();
                XMLReader xr = parser.getXMLReader();
                // Tell that XML reader to use our special Google Handler
                Handler ourSpecialHandler = new Handler();
                xr.setContentHandler(ourSpecialHandler);
                // We have an InputStream, but let's just wrap it in
                // an InputSource (the SAX parser likes it that way)
                InputSource inSource = new InputSource(in);
                // And parse it!
                xr.parse(inSource);
                System.out.println(complete_url);
                System.out.println(urlObject);
                System.out.println(in);
                System.out.println(xr);
                System.out.println(inSource);
                System.out.println(parser);
            }
        }
        catch(IOException ioe)
        {
            ioe.printStackTrace();
        }
        catch(SAXException se)
        {
            se.printStackTrace();
        }
    }
}
and this is my console print:
Start document
Day: null
Low: 0
End document
http://api.weatherapi.com/v1/current.xml?key=c2e285e55db74def97f151114201701&q=London
http://api.weatherapi.com/v1/current.xml?key=c2e285e55db74def97f151114201701&q=London
sun.net.www.protocol.http.HttpURLConnection$HttpInputStream#2471cca7
com.sun.org.apache.xerces.internal.jaxp.SAXParserImpl$JAXPSAXParser#5fe5c6f
org.xml.sax.InputSource#6979e8cb
com.sun.org.apache.xerces.internal.jaxp.SAXParserImpl#763d9750
I think you are trying to extract the values from the XML tags and if it is the case then you are doing it wrong. Attributes object contains the attributes of a particular tag and to get the value you have to do some extra work. Similar to the start of a tag, there are separate events for the contents and the end of a tag. current_tag variable will keep track of the current tag being processed. Below is a sample code:
/**
 * SAX handler that records the TEXT CONTENT of the {@code region}, {@code wind_degree} and
 * {@code high} elements.
 * <p>
 * A SAX parser may deliver the text of one element in several {@code characters()} calls, so
 * the text is buffered and only interpreted when the element's end tag arrives.
 */
class Handler extends DefaultHandler {
    // Create three array lists to store the data
    public ArrayList<Integer> lows = new ArrayList<Integer>();
    public ArrayList<Integer> highs = new ArrayList<Integer>();
    public ArrayList<String> regions = new ArrayList<String>();

    //Keeps track of the current tag;
    String currentTag = "";
    // Text collected since the most recent start tag.
    private final StringBuilder text = new StringBuilder();

    // Make sure that the code in DefaultHandler's
    // constructor is called:
    public Handler() {
        super();
    }

    @Override
    public void startDocument() {
        System.out.println("Start document");
    }

    @Override
    public void endDocument() {
        System.out.println("End document");
    }

    @Override
    public void startElement(String uri, String name, String qName, Attributes atts) {
        //Save the current tag being handled and start with an empty text buffer
        currentTag = qName;
        text.setLength(0);
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        // Text outside of an open leaf element (currentTag reset) is ignored.
        if (!currentTag.isEmpty()) {
            text.append(ch, start, length);
        }
    }

    //Detect end tag and apply the rules for the element that just ended
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        // Only elements whose start tag was the most recent one (i.e. leaf elements) and that
        // actually contained text are processed.
        if (qName.equals(currentTag) && text.length() > 0) {
            String value = text.toString();
            switch (qName) {
                case "region":
                    this.regions.add(value);
                    System.out.println("Day: " + value);
                    break;
                case "wind_degree":
                    // trim(): whitespace around the number would make parseInt fail.
                    int low = Integer.parseInt(value.trim());
                    System.out.println("Low: " + low);
                    this.lows.add(low);
                    break;
                case "high":
                    int high = Integer.parseInt(value.trim());
                    System.out.println("High: " + high);
                    this.highs.add(high);
                    break;
                default:
                    break;
            }
        }
        //Reset it
        currentTag = "";
        text.setLength(0);
    }
}
NOTE: Please refrain from sharing your API keys or passwords on the internet.

Date Format getting disturb when creating .CSV file in Java

I am creating a web scraper and then store the data in the .CSV file.
My program is running fine but, there is a problem that the website from where I am retrieving data have a date which is in (Month Day, Year) format. So when I save the data in .CSV file it will consider the Year as another column due to which all the data gets manipulated. I actually want to store that data into (MM-MON-YYYY) and store Validity date in one column. I am posting my code below. Kindly, help me out. Thanks!
P.S: I am sorry for not writing the format I want in the original post.
package com.mufapscraping;
//import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
//import java.util.Collections;
import java.util.Iterator;
//import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes the first table body of a fund-performance web page with jsoup and appends the cell
 * texts of every table row to a CSV file.
 */
public class ComMufapScraping {
boolean writeCSVToConsole = true;
boolean writeCSVToFile = true;
//String destinationCSVFile = "C:\\convertedCSV.csv";
boolean sortTheList = true;
boolean writeToConsole;
boolean writeToFile;
// Parsing state is kept in static fields shared by createConnection() and parsingHTML().
public static Document doc = null;
public static Elements tbodyElements = null;
public static Elements elements = null;
public static Elements tdElements = null;
public static Elements trElement2 = null;
public static String Dcomma = ", 2";
public static ArrayList<Elements> sampleList = new ArrayList<Elements>();
/**
 * Sets the HTTP proxy system properties and downloads the page into {@code doc}.
 *
 * @throws IOException if the page cannot be fetched
 */
public static void createConnection() throws IOException {
System.setProperty("http.proxyHost", "191.1.1.123");
System.setProperty("http.proxyPort", "8080");
String tempUrl = "http://www.mufap.com.pk/nav_returns_performance.php?tab=01";
doc = Jsoup.connect(tempUrl).get();
}
/**
 * Walks the rows of the first {@code tbody} in {@code doc} and appends their cell texts to
 * the CSV file. Cell text is written as-is (unquoted), so a comma inside a cell - e.g. in a
 * date such as "Apr 18, 2016" - ends up looking like an additional column separator.
 *
 * @throws Exception if the document contains no {@code tbody} element, or on I/O failure
 */
public static void parsingHTML() throws Exception {
// The loop bounds make this body run exactly once.
for (int i = 1; i <= 1; i++) {
tbodyElements = doc.getElementsByTag("tbody");
//Element table = doc.getElementById("dataTable");
if (tbodyElements.isEmpty()) {
throw new Exception("Table is not found");
}
elements = tbodyElements.get(0).getElementsByTag("tr");
for (Element trElement : elements) {
trElement2 = trElement.getElementsByTag("tr");
tdElements = trElement.getElementsByTag("td");
// A new writer in append mode is opened for every row.
FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true);
for (Iterator<Element> it = tdElements.iterator(); it.hasNext();) {
if (it.hasNext()) {
sb.append(" \n ");
}
// NOTE: the loop condition tests 'it' (the cell iterator), not 'it2'; 'it2' is never
// advanced, so this inner loop consumes all remaining cells of the row and the separator
// is written after every cell for as long as it2.hasNext() stays true.
for (Iterator<Element> it2 = trElement2.iterator(); it.hasNext();) {
Element tdElement = it.next();
sb.append(tdElement.text());
if (it2.hasNext()) {
sb.append(" , ");
}
}
// Prints the writer object itself, not the text that was written.
System.out.println(sb.toString());
// The writer is closed inside the cell loop; for a row without any td cell this point is
// never reached and the writer opened above stays open.
sb.flush();
sb.close();
}
// Prints the boolean returned by add().
System.out.println(sampleList.add(tdElements));
/* for (Elements elements2 : zakazky) {
System.out.println(elements2);
}*/
}
}
}
public static void main(String[] args) throws IOException, Exception {
createConnection();
parsingHTML();
}
}
Instead of appending the element text directly to the FileWriter, format it first and then append it.
So, replace the following line:
sb.append(tdElement.text());
into
sb.append(formatData(tdElement.text()));
// Input format of the dates on the scraped page, e.g. "Apr 18, 2016".
private static final SimpleDateFormat FORMATTER_MMM_d_yyyy = new SimpleDateFormat("MMM d, yyyy", Locale.US);
// Output format, e.g. "18-Apr-2016". Lower-case "yyyy" is the calendar year; upper-case
// "YYYY" is the week year, which yields the wrong year for dates around New Year.
private static final SimpleDateFormat FORMATTER_dd_MMM_yyyy = new SimpleDateFormat("dd-MMM-yyyy", Locale.US);

/**
 * Converts a date written as "MMM d, yyyy" (e.g. "Apr 18, 2016") into "dd-MMM-yyyy"
 * (e.g. "18-Apr-2016"), which contains no comma and is therefore safe as a CSV field.
 * Any text that does not parse as such a date is returned unchanged.
 * <p>
 * Not thread-safe: the shared {@link SimpleDateFormat} instances must not be used from
 * several threads at once.
 *
 * @param text cell text to convert
 * @return the reformatted date, or {@code text} itself if it is not a date
 */
public static String formatData(String text) {
    try {
        Date d = FORMATTER_MMM_d_yyyy.parse(text);
        return FORMATTER_dd_MMM_yyyy.format(d);
    } catch (ParseException pe) {
        // Not a date: pass the value through untouched.
        return text;
    }
}
SAMPLE
/**
 * Prints each sample value followed by the result of passing it through
 * {@code formatData}, with a blank line after every pair.
 */
public static void main(String[] args) {
    final String[] samples = {
        "ABL Cash Fund",
        "AA(f)",
        "Apr 18, 2016",
        "10.4729"
    };
    for (int i = 0; i < samples.length; i++) {
        String raw = samples[i];
        String formatted = formatData(raw);
        System.out.format("%s\n%s\n\n", raw, formatted);
    }
}
OUTPUT
ABL Cash Fund
ABL Cash Fund
AA(f)
AA(f)
Apr 18, 2016
18-Apr-2016
10.4729
10.4729
Instead of using the method getElementsByTag many times you can use cssSelector which can be much easier and enables you to get the same output in few lines of code
/**
 * Downloads the fund table and appends it to the CSV file, one line per table
 * row, with every cell followed by a semicolon.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the page cannot be fetched or the file cannot be written
 */
public static void main (String []args) throws IOException{
    String tempUrl = "http://www.mufap.com.pk/nav_returns_performance.php?tab=01";
    Document doc = Jsoup.connect(tempUrl).get();
    Elements trElements = doc.select("#dataTable tbody tr");
    // try-with-resources flushes and closes the writer even if a write fails;
    // without the close, buffered rows may never reach the file.
    try (FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true)) {
        for (Element tr : trElements) {
            Elements tdElements = tr.select("td");
            for (Element td : tdElements) {
                sb.append(td.text());
                sb.append(";");
            }
            sb.append("\n");
        }
    }
}
This could be achieved by simply surrounding your data with double quotes, so month day, year would become "month day, year". Here's modified code that does the job for you:
package com.mufapscraping;
//import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
//import java.util.Collections;
import java.util.Iterator;
//import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes the fund performance table from the MUFAP site and appends its rows
 * to a CSV file, quoting every cell so values containing commas (such as
 * "Apr 18, 2016") stay in a single column.
 */
public class ComMufapScraping {
    // NOTE(review): the instance flags below are not read anywhere in this class.
    boolean writeCSVToConsole = true;
    boolean writeCSVToFile = true;
    //String destinationCSVFile = "C:\\convertedCSV.csv";
    boolean sortTheList = true;
    boolean writeToConsole;
    boolean writeToFile;
    /** Parsed page; set by {@link #createConnection()}. */
    public static Document doc = null;
    /** All {@code tbody} elements of the page; set by {@link #parsingHTML()}. */
    public static Elements tbodyElements = null;
    /** The {@code tr} elements of the first {@code tbody}; set by {@link #parsingHTML()}. */
    public static Elements elements = null;
    /** The {@code td} cells of the row processed most recently. */
    public static Elements tdElements = null;
    /** The {@code tr} elements found under the row processed most recently (the row itself included). */
    public static Elements trElement2 = null;
    // NOTE(review): not referenced in this class.
    public static String Dcomma = ", 2";
    /** The cells of every processed row, in document order. */
    public static ArrayList<Elements> sampleList = new ArrayList<Elements>();

    /**
     * Configures the HTTP proxy and downloads the page into {@link #doc}.
     *
     * @throws IOException if the page cannot be fetched
     */
    public static void createConnection() throws IOException {
        System.setProperty("http.proxyHost", "191.1.1.123");
        System.setProperty("http.proxyPort", "8080");
        String tempUrl = "http://www.mufap.com.pk/nav_returns_performance.php?tab=01";
        doc = Jsoup.connect(tempUrl).get();
    }

    /**
     * Appends every row of the first {@code tbody} to the CSV file and echoes
     * each written line to the console. Rows without {@code td} cells are
     * recorded in {@link #sampleList} but produce no CSV line.
     *
     * @throws IllegalStateException if {@link #createConnection()} has not been called
     * @throws Exception if the page contains no {@code tbody}, or on I/O failure
     */
    public static void parsingHTML() throws Exception {
        if (doc == null) {
            throw new IllegalStateException("createConnection() must be called before parsingHTML()");
        }
        tbodyElements = doc.getElementsByTag("tbody");
        //Element table = doc.getElementById("dataTable");
        if (tbodyElements.isEmpty()) {
            throw new Exception("Table is not found");
        }
        elements = tbodyElements.get(0).getElementsByTag("tr");
        // One writer for the whole table; try-with-resources closes it on every path,
        // including rows that have no cells and write failures.
        try (FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true)) {
            for (Element trElement : elements) {
                trElement2 = trElement.getElementsByTag("tr");
                tdElements = trElement.getElementsByTag("td");
                if (!tdElements.isEmpty()) {
                    String row = toCsvRow(tdElements);
                    sb.append(row);
                    sb.append('\n');
                    System.out.println(row);
                }
                sampleList.add(tdElements);
            }
        }
    }

    /** Joins the text of the given cells into one CSV line (no line terminator). */
    private static String toCsvRow(Elements cells) {
        StringBuilder row = new StringBuilder();
        for (Iterator<Element> it = cells.iterator(); it.hasNext();) {
            row.append(quote(it.next().text()));
            if (it.hasNext()) {
                // No spaces around the delimiter: a quoted field is only recognised
                // when the opening quote directly follows the comma.
                row.append(',');
            }
        }
        return row.toString();
    }

    /** Wraps a value in double quotes, doubling any embedded double quote. */
    private static String quote(String value) {
        return '"' + value.replace("\"", "\"\"") + '"';
    }

    /**
     * Fetches the page and converts its table to CSV.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) throws IOException, Exception {
        createConnection();
        parsingHTML();
    }
}
So you do want to split it. OK, then modify the first line by adding a "Validity Year" column:
Element tdElement = it.next();
// Read the cell text once so it can be both written and inspected.
final String content = tdElement.text();
sb.append(content);
if (it2.hasNext()) {
sb.append(" , ");
// Right after the "Validity Date" header cell, add an extra header cell.
if (content.equals("Validity Date")) {
sb.append("Validity Year,");
}
You probably want to break after the for loop, otherwise the file will be written elements.size()-1 more times:
FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true);
for (Iterator<Element> it = tdElements.iterator(); it.hasNext();) { ... }
break;

How to create a regex that verifies the existence of a number in an array in Java

I want to verify whether a number, for example 701234567, is an element of my array in Java. To do this, my code checks whether a number that begins with 7 and has 9 digits is an element of my array, which is loaded from "numbercall.txt" and has 5 elements. This is my text file:
numbercall.txt [ 702345678, 714326578, 701234567, 791234567,751234567]
This is my code:
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Reads the numbers stored in a text file and reports whether a given number
 * is among them.
 */
public class TestNumberLt {
    /** The numbers read from the file, as an array; filled by {@link #main(String[])}. */
    static String[] arr= null;
    /** Path of the file to read. Static so that the static {@code main} can use it. */
    static String filename = "fichiers/numbercall.txt";
    static String a = null ;
    /** The numbers read from the file, in file order. */
    static List<String> list = new ArrayList<String>();
    /** Matches one run of decimal digits, i.e. one number inside a line. */
    private static final Pattern DIGITS = Pattern.compile("[0-9]+");

    /**
     * Loads every number from {@link #filename} and prints whether the
     * searched number is present.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String [] args) throws IOException{
        // try-with-resources closes the reader (and the underlying stream) on every path.
        try (BufferedReader buffer = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename)))) {
            String str_line;
            while ((str_line = buffer.readLine()) != null) {
                // Works for one number per line as well as for a single
                // "[ n1, n2, ... ]" line.
                list.addAll(extractNumbers(str_line));
            }
        }
        int b = 773214576;
        //convert the arraylist to a array
        arr = list.toArray(new String[list.size()]);
        if (contains(list, b)) {
            System.out.println("Trouvé "+b);
        } else {
            System.out.println("ce numéro ("+b+") n'existe pas!");
        }
    }

    /**
     * Extracts every run of digits from a line of text.
     *
     * @param line the text to scan, e.g. {@code "[ 702345678, 714326578]"}
     * @return the digit runs in order of appearance; empty when there are none
     */
    static List<String> extractNumbers(String line) {
        List<String> numbers = new ArrayList<String>();
        Matcher m = DIGITS.matcher(line);
        while (m.find()) {
            numbers.add(m.group());
        }
        return numbers;
    }

    /**
     * Tells whether {@code number} occurs in {@code numbers}.
     * The number is compared in its decimal text form, because a
     * {@code String} is never {@code equals} to an {@code Integer}.
     *
     * @param numbers the numbers to search, as decimal strings
     * @param number the number to look for
     * @return {@code true} if the number is present
     */
    static boolean contains(List<String> numbers, int number) {
        return numbers.contains(String.valueOf(number));
    }
}
Do it simply like this
// Split the comma-separated numbers and report every entry equal to the key.
String str_line = "702345678,714326578,701234567,791234567,751234567";
String key = "702345678";
String[] strArray = str_line.split(",");
for (int idx = 0; idx < strArray.length; idx++) {
    final String candidate = strArray[idx];
    if (!candidate.equals(key)) {
        continue;
    }
    System.out.println("found");
}
I'm not really sure what you want, but if you just need the index of b in your list, do this:
// Sketch of main: the "..." lines stand for code omitted from this excerpt.
public static void main(String [] args) throws IOException{
...
int b = 773214576;
// b+"" turns the int into its decimal String, which is the form passed to indexOf;
// indexOf returns -1 when no element equals it.
int tmp = list.indexOf(b+"");
if(tmp!=-1) {
System.out.println("Trouvé "+ b + " à l'index " + tmp);
} else {
System.out.println("Ce numéro ("+b+") n'existe pas!");
}
...
}
Another answer, using Guava :
(In this case there is really no need for it — you could simply use the split() method of String — but I like Guava's readability and return types.)
package stackoverflow;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import com.google.common.base.Splitter;
/**
 * Checks whether a number appears in a comma-separated list stored in a
 * classpath resource.
 */
public class RegexExample {
    /** Resource name, resolved relative to this class by {@code getResourceAsStream}. */
    String filename = "numbercall.txt";

    /**
     * Tells whether {@code numberToCheck} is one of the entries of the list.
     * Entries are trimmed before being compared.
     *
     * @param numberToCheck the number to look for, as text
     * @return {@code true} if an entry equals {@code numberToCheck}
     * @throws IOException if the resource is missing or cannot be read
     */
    public boolean isInList(String numberToCheck) throws IOException {
        // try-with-resources closes the reader on every path, including the early return.
        try (BufferedReader file = loadFile()) {
            for (String number : extractNumberListFrom(file)) {
                if (number.trim().equals(numberToCheck)) {
                    return true;
                }
            }
        }
        return false;
    }

    /** Concatenates all lines of the reader and splits the result on commas. */
    private Iterable<String> extractNumberListFrom(BufferedReader buffer) throws IOException {
        StringBuilder numberList = new StringBuilder();
        String line;
        while ((line = buffer.readLine()) != null) {
            numberList.append(line);
        }
        return Splitter.on(",").split(numberList.toString());
    }

    /** Opens the resource for reading; fails with a descriptive error when it is absent. */
    private BufferedReader loadFile() throws IOException {
        InputStream fstream_school = RegexExample.class.getResourceAsStream(filename);
        if (fstream_school == null) {
            // getResourceAsStream signals a missing resource with null, which would
            // otherwise surface as a bare NullPointerException below.
            throw new IOException("Resource not found: " + filename);
        }
        BufferedReader buffer = new BufferedReader(new InputStreamReader(fstream_school));
        return buffer;
    }
}

Categories