I am creating a web scraper and then store the data in the .CSV file.
My program is running fine but, there is a problem that the website from where I am retrieving data have a date which is in (Month Day, Year) format. So when I save the data in .CSV file it will consider the Year as another column due to which all the data gets manipulated. I actually want to store that data into (MM-MON-YYYY) and store Validity date in one column. I am posting my code below. Kindly, help me out. Thanks!
P.S: I am sorry for not writing the format I want in the original post.
package com.mufapscraping;
//import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
//import java.util.Collections;
import java.util.Iterator;
//import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class ComMufapScraping {
boolean writeCSVToConsole = true;
boolean writeCSVToFile = true;
//String destinationCSVFile = "C:\\convertedCSV.csv";
boolean sortTheList = true;
boolean writeToConsole;
boolean writeToFile;
public static Document doc = null;
public static Elements tbodyElements = null;
public static Elements elements = null;
public static Elements tdElements = null;
public static Elements trElement2 = null;
public static String Dcomma = ", 2";
public static ArrayList<Elements> sampleList = new ArrayList<Elements>();
public static void createConnection() throws IOException {
System.setProperty("http.proxyHost", "191.1.1.123");
System.setProperty("http.proxyPort", "8080");
String tempUrl = "http://www.mufap.com.pk/nav_returns_performance.php?tab=01";
doc = Jsoup.connect(tempUrl).get();
}
public static void parsingHTML() throws Exception {
for (int i = 1; i <= 1; i++) {
tbodyElements = doc.getElementsByTag("tbody");
//Element table = doc.getElementById("dataTable");
if (tbodyElements.isEmpty()) {
throw new Exception("Table is not found");
}
elements = tbodyElements.get(0).getElementsByTag("tr");
for (Element trElement : elements) {
trElement2 = trElement.getElementsByTag("tr");
tdElements = trElement.getElementsByTag("td");
FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true);
for (Iterator<Element> it = tdElements.iterator(); it.hasNext();) {
if (it.hasNext()) {
sb.append(" \n ");
}
for (Iterator<Element> it2 = trElement2.iterator(); it.hasNext();) {
Element tdElement = it.next();
sb.append(tdElement.text());
if (it2.hasNext()) {
sb.append(" , ");
}
}
System.out.println(sb.toString());
sb.flush();
sb.close();
}
System.out.println(sampleList.add(tdElements));
/* for (Elements elements2 : zakazky) {
System.out.println(elements2);
}*/
}
}
}
public static void main(String[] args) throws IOException, Exception {
createConnection();
parsingHTML();
}
}
Instead of appeding directly the element text in the FileWriter, format it first then append it.
So, replace the following line:
sb.append(tdElement.text());
into
sb.append(formatData(tdElement.text()));
private static final SimpleDateFormat FORMATTER_MMM_d_yyyy = new SimpleDateFormat("MMM d, yyyy", Locale.US);
private static final SimpleDateFormat FORMATTER_dd_MMM_yyyy = new SimpleDateFormat("dd-MMM-YYYY", Locale.US);
public static String formatData(String text) {
String tmp = null;
try {
Date d = FORMATTER_MMM_d_yyyy.parse(text);
tmp = FORMATTER_dd_MMM_yyyy.format(d);
} catch (ParseException pe) {
tmp = text;
}
return tmp;
}
SAMPLE
public static void main(String[] args) {
String[] fields = new String[] { //
"ABL Cash Fund", //
"AA(f)", //
"Apr 18, 2016", //
"10.4729" //
};
for (String field : fields) {
System.out.format("%s\n%s\n\n", field, formatData(field));
}
}
OUTPUT
ABL Cash Fund
ABL Cash Fund
AA(f)
AA(f)
Apr 18, 2016
18-Apr-2016
10.4729
10.4729
Instead of using the method getElementsByTag many times you can use cssSelector which can be much easier and enables you to get the same output in few lines of code
public static void main (String []args) throws IOException{
String tempUrl = "http://www.mufap.com.pk/nav_returns_performance.php?tab=01";
Document doc = Jsoup.connect(tempUrl).get();
Elements trElements = doc.select("#dataTable tbody tr");
FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true);
for(Element tr : trElements){
Elements tdElements = tr.select("td");
for (Element td : tdElements){
sb.append(td.text());
sb.append(";");
}
sb.append("\n");
}
}
This could be achieved by simply surrounding your data with double quotes, so month day, year would become "month day, year". Here's modified code that does the job for you:
package com.mufapscraping;
//import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
//import java.util.Collections;
import java.util.Iterator;
//import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class ComMufapScraping {
boolean writeCSVToConsole = true;
boolean writeCSVToFile = true;
//String destinationCSVFile = "C:\\convertedCSV.csv";
boolean sortTheList = true;
boolean writeToConsole;
boolean writeToFile;
public static Document doc = null;
public static Elements tbodyElements = null;
public static Elements elements = null;
public static Elements tdElements = null;
public static Elements trElement2 = null;
public static String Dcomma = ", 2";
public static ArrayList<Elements> sampleList = new ArrayList<Elements>();
public static void createConnection() throws IOException {
System.setProperty("http.proxyHost", "191.1.1.123");
System.setProperty("http.proxyPort", "8080");
String tempUrl = "http://www.mufap.com.pk/nav_returns_performance.php?tab=01";
doc = Jsoup.connect(tempUrl).get();
}
public static void parsingHTML() throws Exception {
for (int i = 1; i <= 1; i++) {
tbodyElements = doc.getElementsByTag("tbody");
//Element table = doc.getElementById("dataTable");
if (tbodyElements.isEmpty()) {
throw new Exception("Table is not found");
}
elements = tbodyElements.get(0).getElementsByTag("tr");
for (Element trElement : elements) {
trElement2 = trElement.getElementsByTag("tr");
tdElements = trElement.getElementsByTag("td");
FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true);
for (Iterator<Element> it = tdElements.iterator(); it.hasNext();) {
if (it.hasNext()) {
sb.append(" \n ");
}
for (Iterator<Element> it2 = trElement2.iterator(); it.hasNext();) {
Element tdElement = it.next();
sb.append('\"'); // surround your data
sb.append(tdElement.text());
sb.append('\"'); // with double quotes
if (it2.hasNext()) {
sb.append(" , ");
}
}
System.out.println(sb.toString());
sb.flush();
sb.close();
}
System.out.println(sampleList.add(tdElements));
/* for (Elements elements2 : zakazky) {
System.out.println(elements2);
}*/
}
}
}
public static void main(String[] args) throws IOException, Exception {
createConnection();
parsingHTML();
}
}
Then you do want to split it. ok, then modify the first line by adding "year," column:
Element tdElement = it.next();
final String content = tdElement.text()
sb.append(content);
if (it2.hasNext()) {
sb.append(" , ");
if (content.equals("Validity Date"))
sb.append("Validity Year,");
you probably want to break after the for? or you'll overwrite the file elements.size()-1 times...
FileWriter sb = new FileWriter("C:\\convertedCSV2.csv", true);
for (Iterator<Element> it = tdElements.iterator(); it.hasNext();) { ... }
break;
Related
Possibly the terminology is different with HTML than with XML, but here is a HTML document from which attributes are being retrieved. Here the attributes a1, a2, a3 are part of the Body tag.
<html>
<head>
Hello World
</head>
<body a1="ABC" a2="3974" a3="A1B2"> <------These attributes
<H1>Start Here<H1>
<p>This is the body</p>
</body>
</html>
Using the following file to parse the above HTML file.
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.Reader;
import javax.swing.text.AttributeSet;
import javax.swing.text.Element;
import javax.swing.text.ElementIterator;
import javax.swing.text.StyleConstants;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
public class HTMLParserTest
{
public static void main(String args[]) throws Exception {
Reader reader = new FileReader("C:/Downloads/DeleteMe/Example1.html");
BufferedReader br = new BufferedReader(reader );
HTMLEditorKit htmlKit = new HTMLEditorKit();
HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
HTMLEditorKit.Parser parser = new ParserDelegator();
HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
parser.parse(br, callback, true);
// Parse
ElementIterator iterator = new ElementIterator(htmlDoc);
Element element;
while ((element = iterator.next()) != null)
{
System.out.println("Element : " + element);
AttributeSet attributes = element.getAttributes();
Object name = attributes.getAttribute(StyleConstants.NameAttribute);
if ((name instanceof HTML.Tag))
//&& ((name == HTML.Tag.H1) || (name == HTML.Tag.H2) || (name == HTML.Tag.H3)))
{
// Build up content text as it may be within multiple elements
StringBuffer text = new StringBuffer();
int count = element.getElementCount();
for (int i = 0; i < count; i++) {
Element child = element.getElement(i);
AttributeSet childAttributes = child.getAttributes();
System.out.println("Element : " + child);
System.out.println(" Attribute count : " + childAttributes.getAttributeCount());
System.out.println(" a1 exists : " + childAttributes.isDefined("a1"));
int startOffset = child.getStartOffset();
int endOffset = child.getEndOffset();
int length = endOffset - startOffset;
text.append(htmlDoc.getText(startOffset, length));
}
}
}
System.exit(0);
}
}
The output is here.
Element : BranchElement(html) 0,1
Element : BranchElement(body) 0,1
Attribute count : 1
a1 exists : false <-----expected true here.
Element : BranchElement(body) 0,1
Element : BranchElement(p) 0,1
Attribute count : 3
a1 exists : false
Element : BranchElement(p) 0,1
Element : LeafElement(content) 0,1
Attribute count : 1
a1 exists : false
Element : LeafElement(content) 0,1
The expectation is that the "a1 exists" check should have returned true once, but it did not.
Eventually all 3 (a1, a2, a3) will be searched.
Is the above code the proper implementation or is this not feasible with the HTML parser?
Maybe this will help:
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
class AttributeHTML
{
public static void main(String[] args)
{
EditorKit kit = new HTMLEditorKit();
Document doc = kit.createDefaultDocument();
// The Document class does not yet handle charset's properly.
doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
try
{
// Create a reader on the HTML content.
Reader rd = getReader(args[0]);
// Parse the HTML.
kit.read(rd, doc, 0);
// Iterate through the elements of the HTML document.
ElementIterator it = new ElementIterator(doc);
Element elem = null;
while ( (elem = it.next()) != null )
{
if (elem.getName().equals("body"))
{
AttributeSet as = elem.getAttributes();
Enumeration enum1 = as.getAttributeNames();
while( enum1.hasMoreElements() )
{
Object name = enum1.nextElement();
Object value = as.getAttribute( name );
System.out.println( "\t" + name + " : " + value );
}
}
}
}
catch (Exception e)
{
e.printStackTrace();
}
System.exit(1);
}
// Returns a reader on the HTML data. If 'uri' begins
// with "http:", it's treated as a URL; otherwise,
// it's assumed to be a local filename.
static Reader getReader(String uri)
throws IOException
{
// Retrieve from Internet.
if (uri.startsWith("http:"))
{
URLConnection conn = new URL(uri).openConnection();
return new InputStreamReader(conn.getInputStream());
}
// Retrieve from file.
else
{
return new FileReader(uri);
}
}
}
Test using:
java AttributeHTML yourFile.html
I am not aware about HtmlKitbut u can achieve similar result using regex
public static void main(String[] args) throws UnirestException {
String html = "<html>\r\n" +
" <head>\r\n" +
" Hello World\r\n" +
" </head>\r\n" +
" <body a1=\"ABC\" a2=\"3974\" a3=\"A1B2\">\r\n" +
" <H1>Start Here<H1>\r\n" +
" <p>This is the body</p>\r\n" +
" </body>\r\n" +
"</html>";
Pattern regexBodyPattern = Pattern.compile("<body[^>]*>", Pattern.MULTILINE);
Matcher matcher = regexBodyPattern.matcher(html);
while(matcher.find()) {
String bodyTag = matcher.group();
Pattern regexBodyAttrPattern = Pattern.compile("(\\S*)=(\\\"\\w*\\\")", Pattern.MULTILINE);
Matcher attrMatcher = regexBodyAttrPattern.matcher(bodyTag);
while(attrMatcher.find()) {
System.out.println("Key :: "+attrMatcher.group(1)+" , Value "+attrMatcher.group(2));
}
}
}
output
Key :: a1 , Value "ABC"
Key :: a2 , Value "3974"
Key :: a3 , Value "A1B2"
To retrieve the attributes, you can provide your own ParserCallback
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
public class HTMLParserTest2
{
public static void main(String args[]) throws Exception {
Reader reader = new FileReader("d:/temp/Example.html");
BufferedReader br = new BufferedReader(reader);
System.out.println(HTMLParserTest2.extractTagsAttributes(br));
// output : [title-_implied_=true, body-a1=ABC, body-a2=3974, body-a3=A1B2]
System.exit(0);
}
public static List<String> extractTagsAttributes(Reader r) throws IOException {
final ArrayList<String> list = new ArrayList<String>();
ParserDelegator parserDelegator = new ParserDelegator();
ParserCallback parserCallback = new ParserCallback() {
#Override
public void handleText(final char[] data, final int pos) { }
#Override
public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
Enumeration<?> e=attribute.getAttributeNames();
while(e.hasMoreElements()) {
Object name=e.nextElement();
Object value=attribute.getAttribute(name);
list.add(tag.toString() + "-" + name + "=" +value);
}
}
#Override
public void handleEndTag(Tag t, final int pos) { }
#Override
public void handleSimpleTag(Tag t, MutableAttributeSet a, final int pos) { }
#Override
public void handleComment(final char[] data, final int pos) { }
#Override
public void handleError(final java.lang.String errMsg, final int pos) { }
};
parserDelegator.parse(r, parserCallback, true);
return list;
}
}
I'm working on two word document comparison manually where i should not miss any Strings, Special chars, space and all the stuff and that document is around 150 pages or more. so its very headache to do comparison. Then I have written small java program to compare two documents but I'm not able to list the missing words.
Using Apche POI Library
Thanks in advance.
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFFooter;
import org.apache.poi.xwpf.usermodel.XWPFHeader;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
public class ReadDocFile {
private static XWPFDocument docx;
// private static String path = "C:\\States wise\\NH\\Assessment
// 2nd\\test.docx";
private static ArrayList<String> firstList = new ArrayList<String>(); // refers to first document list
private static ArrayList<String> secondList = new ArrayList<String>(); // refers to second document list
private static List<XWPFParagraph> paragraphList;
private static Map<String, String> map = null;
private static LinkedHashSet<String> firstMissedArray = new LinkedHashSet<String>(); // refers to first document Linked hash set
private static LinkedHashSet<String> secondMissedArray = new LinkedHashSet<String>(); // refers to second document Linked hash set
public static void getFilePath(String path) {
FileInputStream fis;
try {
fis = new FileInputStream(path);
docx = new XWPFDocument(fis);
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static void get_First_Doc_Data() {
getFilePath("C:\\States wise\\NH\\Assessment 2nd\\test.docx");
paragraphList = docx.getParagraphs();
System.out.println("******************** first list Starts here ******************** ");
System.out.println();
for (int i = 0; i < paragraphList.size() - 1; i++) {
firstList.add(paragraphList.get(i).getText().toString());
System.out.println(firstList.get(i).toString());
}
System.out.println("*********** first list Ends here ********************");
}
public static void get_Second_Doc_Data() {
getFilePath("C:\\States wise\\NH\\Assessment 2nd\\test1.docx");
paragraphList = docx.getParagraphs();
System.out.println("******************** Second list Starts here ******************** ");
System.out.println();
for (int i = 0; i < paragraphList.size() - 1; i++) {
secondList.add(paragraphList.get(i).getText().toString());
System.out.println(secondList.get(i).toString());
}
System.out.println("*********** Second list Ends here ********************");
}
public static void main(String[] args) {
get_First_Doc_Data();
get_Second_Doc_Data();
//System.out.println("First Para: " + firstList.contains(secondList));
compare();
compare_Two_List();
}
private static void compare() {
String firstMiss = null;
//String secondMiss = null;
for (int i = 0; i < firstList.size(); i++) {
for (int j = 0; j < secondList.size(); j++) {
if (!firstList.get(i).toString().equals(secondList.get(i).toString())) {
firstMiss = firstList.get(i).toString();
//secondMiss = secondList.get(i).toString();
map = new HashMap<String, String>();
}
}
firstMissedArray.add(firstMiss);
//secondMissedArray.add(secondMiss);
// System.out.println(missedArray.get(i).toString());
}
}
private static void compare_Two_List() {
int num = 0;
map.clear();
Iterator<String> first = firstMissedArray.iterator();
//Iterator<String> second = secondMissedArray.iterator();
while (first.hasNext()) {
map.put(""+num, first.next());
num++;
}
System.out.println(firstMissedArray.size());
Iterator it = map.entrySet().iterator();
while (it.hasNext()) {
Map.Entry pair = (Map.Entry) it.next();
System.out.println(pair.getKey() + " = " + pair.getValue());
// it.remove(); // avoids a ConcurrentModificationException
}
}
}
I have taken liberty to modify your code to arrive at the solution for your problem. Please go through this.
This should pretty much solve your problem - put SYSO statements wherever you think is necessary and tweak the flow of the program to achieve desired checks as per you requirement. In my hurry, I may not have made use of coding standards of using try catch block for error handling and handling the negative scenarios, so please take care of that when implementing it live.
In case if the documents are not .DOCX but .PDF make use of the Apache PDFBox api.
Here is the Code:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
public class Comapre_Docs {
private static final String FIRST_DOC_PATH = "E:\\Workspace_Luna\\assignments\\Expected.docx";
private static final String SECOND_DOC_PATH = "E:\\Workspace_Luna\\assignments\\Actual.docx";
private static XWPFDocument docx;
private static List<XWPFParagraph> paragraphList;
private static ArrayList<String> firstList = new ArrayList<String>();
private static ArrayList<String> secondList = new ArrayList<String>();
public static void get_Doc_Data(String filePath, ArrayList listName)
throws IOException {
File file = new File(filePath);
FileInputStream fis = new FileInputStream(file);
docx = new XWPFDocument(fis);
paragraphList = docx.getParagraphs();
for (int i = 0; i <= paragraphList.size() - 1; i++) {
listName.add(paragraphList.get(i).getText().toString());
}
fis.close();
}
public static void main(String[] args) throws IOException {
get_Doc_Data(FIRST_DOC_PATH, firstList);
get_Doc_Data(SECOND_DOC_PATH, secondList);
compare(firstList, secondList);
}
private static void compare(ArrayList<String> firstList_1,
ArrayList<String> secondList_1) {
simpleCheck(firstList_1, secondList_1);
int size = firstList_1.size();
for (int i = 0; i < size; i++) {
paragraphCheck(firstList_1.get(i).toString().split(" "),
secondList_1.get(i).toString().split(" "), i);
}
}
private static void paragraphCheck(String[] firstParaArray,
String[] secondParaArray, int paraNumber) {
System.out
.println("=============================================================");
System.out.println("Paragraph No." + (paraNumber + 1) + ": Started");
if (firstParaArray.length != secondParaArray.length) {
System.out.println("There is mismatch of "
+ Math.abs(firstParaArray.length - secondParaArray.length)
+ " words in this paragraph");
}
TreeMap<String, Integer> firstDocPara = getOccurence(firstParaArray);
TreeMap<String, Integer> secondDocPara = getOccurence(secondParaArray);
ArrayList<String> keyData = new ArrayList<String>(firstDocPara.keySet());
for (int i = 0; i < keyData.size(); i++) {
if (firstDocPara.get(keyData.get(i)) != secondDocPara.get(keyData
.get(i))) {
System.out
.println("The following word is missing in actual document : "
+ keyData.get(i));
}
}
System.out.println("Paragraph No." + (paraNumber + 1) + ": Done");
System.out
.println("=============================================================");
}
private static TreeMap<String, Integer> getOccurence(String[] paraArray) {
TreeMap<String, Integer> paragraphStringCountHolder = new TreeMap<String, Integer>();
paragraphStringCountHolder.clear();
for (String a : paraArray) {
int count = 1;
if (paragraphStringCountHolder.containsKey(a)) {
count = paragraphStringCountHolder.get(a) + 1;
paragraphStringCountHolder.put(a, count);
} else {
paragraphStringCountHolder.put(a, count);
}
}
return paragraphStringCountHolder;
}
private static boolean simpleCheck(ArrayList<String> firstList,
ArrayList<String> secondList) {
boolean flag = false;
if (firstList.size() > secondList.size()) {
System.out
.println("There are more paragraph in Expected document than in Actual document");
} else if (firstList.size() < secondList.size()) {
System.out
.println("There are more paragraph in Actual document than in Expected document");
} else if (firstList.size() == secondList.size()) {
System.out.println("The paragraph count in both documents match");
flag = true;
}
return flag;
}
}
I have created a web scraper which brings the market data of share rates from the website of stock exchange. www.psx.com.pk in that site there is a hyperlink of Market Summary. From that link I have to scrap the data. I have created a program which is as follows.
package com.market_summary;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class ComMarket_summary {
boolean writeCSVToConsole = true;
boolean writeCSVToFile = true;
boolean sortTheList = true;
boolean writeToConsole;
boolean writeToFile;
public static Document doc = null;
public static Elements tbodyElements = null;
public static Elements elements = null;
public static Elements tdElements = null;
public static Elements trElement2 = null;
public static String Dcomma = ",";
public static String line = "";
public static ArrayList<Elements> sampleList = new ArrayList<Elements>();
public static void createConnection() throws IOException {
System.setProperty("http.proxyHost", "191.1.1.202");
System.setProperty("http.proxyPort", "8080");
String tempUrl = "http://www.psx.com.pk/index.php";
doc = Jsoup.connect(tempUrl).get();
System.out.println("Successfully Connected");
}
public static void parsingHTML() throws Exception {
File fold = new File("C:\\market_smry.csv");
fold.delete();
File fnew = new File("C:\\market_smry.csv");
for (Element table : doc.getElementsByTag("table")) {
for (Element trElement : table.getElementsByTag("tr")) {
trElement2 = trElement.getElementsByTag("td");
tdElements = trElement.getElementsByTag("td");
FileWriter sb = new FileWriter(fnew, true);
if (trElement.hasClass("marketData")) {
for (Iterator<Element> it = tdElements.iterator(); it.hasNext();) {
if (it.hasNext()) {
sb.append("\r\n");
}
for (Iterator<Element> it2 = trElement2.iterator(); it.hasNext();) {
Element tdElement2 = it.next();
final String content = tdElement2.text();
if (it2.hasNext()) {
sb.append(formatData(content));
sb.append(" | ");
}
}
System.out.println(sb.toString());
sb.flush();
sb.close();
}
}
System.out.println(sampleList.add(tdElements));
}
}
}
private static final SimpleDateFormat FORMATTER_MMM_d_yyyy = new SimpleDateFormat("MMM d, yyyy", Locale.US);
private static final SimpleDateFormat FORMATTER_dd_MMM_yyyy = new SimpleDateFormat("dd-MMM-YYYY", Locale.US);
public static String formatData(String text) {
String tmp = null;
try {
Date d = FORMATTER_MMM_d_yyyy.parse(text);
tmp = FORMATTER_dd_MMM_yyyy.format(d);
} catch (ParseException pe) {
tmp = text;
}
return tmp;
}
public static void main(String[] args) throws IOException, Exception {
createConnection();
parsingHTML();
}
}
Now, the problem is when I execute this program it should create a .csv file but what actually happens is it's not creating any file. When I debug this code I found that program is not entering in the loop. I don't understand that why it is doing so. While when I run the same program on the other website which have slightly different page structure it is running fine.
What I understand that this data is present in the #document which is a virtual element and doesn't mean anything that's why program can't read it while there is no such thing in other website. Kindly, help me out to read the data inside the #document element.
Long Story Short
Change your temp url to http://www.psx.com.pk/phps/index1.php
Explanation
There is no table in the document of http://www.psx.com.pk/index.php.
Instead it is showing it's content in two frameset.
One is dummy with url http://www.psx.com.pk/phps/blank.php.
Another one is the real page which is showing actual data and it's url is
http://www.psx.com.pk/phps/index1.php
I'm a new french user on stack and I have a problem ^^
I use an HTML parse Jsoup for parsing a html page. For that it's ok but I can't parse more url in same time.
This is my code:
first class for parsing a web page
package test2;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
public final class Utils {
public static Map<String, String> parse(String url){
Map<String, String> out = new HashMap<String, String>();
try
{
Document doc = Jsoup.connect(url).get();
doc.select("img").remove();
Elements denomination = doc.select(".AmmDenomination");
Elements composition = doc.select(".AmmComposition");
Elements corptexte = doc.select(".AmmCorpTexte");
for(int i = 0; i < denomination.size(); i++)
{
out.put("denomination" + i, denomination.get(i).text());
}
for(int i = 0; i < composition.size(); i++)
{
out.put("composition" + i, composition.get(i).text());
}
for(int i = 0; i < corptexte.size(); i++)
{
out.put("corptexte" + i, corptexte.get(i).text());
System.out.println(corptexte.get(i));
}
} catch(IOException e){
e.printStackTrace();
}
return out;
}//Fin Methode parse
public static void excelizer(int fileId, Map<String, String> values){
try
{
FileOutputStream out = new FileOutputStream("C:/Documents and Settings/c.bon/git/clinsearch/drugs/src/main/resources/META-INF/test/fichier2.xls" );
Workbook wb = new HSSFWorkbook();
Sheet mySheet = wb.createSheet();
Row row1 = mySheet.createRow(0);
Row row2 = mySheet.createRow(1);
String entete[] = {"CIS", "Denomination", "Composition", "Form pharma", "Indication therapeutiques", "Posologie", "Contre indication", "Mise en garde",
"Interraction", "Effet indesirable", "Surdosage", "Pharmacodinamie", "Liste excipients", "Incompatibilité", "Duree conservation",
"Conservation", "Emballage", "Utilisation Manipulation", "TitulaireAMM"};
for (int i = 0; i < entete.length; i++)
{
row1.createCell(i).setCellValue(entete[i]);
}
Set<String> set = values.keySet();
int rowIndexDenom = 1;
int rowIndexCompo = 1;
for(String key : set)
{
if(key.contains("denomination"))
{
mySheet.createRow(1).createCell(1).setCellValue(values.get(key));
rowIndexDenom++;
}
else if(key.contains("composition"))
{
row2.createCell(2).setCellValue(values.get(key));
rowIndexDenom++;
}
}
wb.write(out);
out.close();
}
catch(Exception e)
{
e.printStackTrace();
}
}
}
second class
package test2;
public final class Task extends Thread {
private static int fileId = 0;
private int id;
private String url;
public Task(String url)
{
this.url = url;
id = fileId;
fileId++;
}
#Override
public void run()
{
Utils.excelizer(id, Utils.parse(url));
}
}
the main class (entry point)
package test2;
import java.util.ArrayList;
public class Main {
public static void main(String[] args)
{
ArrayList<String> urls = new ArrayList<String>();
urls.add("http://base-donnees-publique.medicaments.gouv.fr/affichageDoc.php?specid=61266250&typedoc=R");
urls.add("http://base-donnees-publique.medicaments.gouv.fr/affichageDoc.php?specid=66207341&typedoc=R");
for(String url : urls)
{
new Task(url).run();
}
}
}
When the data was copied to my excel file, the second url doesn't work.
Can you help me solve my problem please?
Thanks
I think its because your main() exits before your second thread has a chance to do its job. You should wait for all spawned threads to complete using Thread.join(). Or better yet, create one of the ExecutorService's and use awaitTermination(...) to block until all URLs are parsed.
EDIT See some examples here http://www.javacodegeeks.com/2013/01/java-thread-pool-example-using-executors-and-threadpoolexecutor.html
i want to verify if a number for example 701234567 is an element of my array in java. For this, my code search if my number who is begening with 7 and have 9 digits is a element of my array "numbercall.txt" who have 5 elements. This is my text file:
numbercall.txt [ 702345678, 714326578, 701234567, 791234567,751234567]
This is my code:
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class TestNumberLt {
static String[] arr= null;
String filename = "fichiers/numbercall.txt";
static String a = null ;
static List<String> list = new ArrayList<String>();
public static void main(String [] args) throws IOException{
FileInputStream fstream_school = new FileInputStream(filename);
DataInputStream data_input = new DataInputStream(fstream_school);
BufferedReader buffer = new BufferedReader(new InputStreamReader(data_input));
String str_line;
while ((str_line = buffer.readLine()) != null)
{
str_line = str_line.trim();
if ((str_line.length()!=0))
{
list.add(str_line);
}
}
int b = 773214576;
//convert the arraylist to a array
arr = (String[])list.toArray(new String[list.size()]);
Pattern p = Pattern.compile("^7[0|6|7][0-9]{7}$");
Matcher m ;
//a loop for verify if a number exist in this array
for (int j = 0; j < list.size();)
{
System.out.print(" "+list.get(j)+ " ");
m = p.matcher(list.get(j));
/*while(m.find())
System.out.println(m.group());*/
if(list.get(j).equals(b))
{
System.out.println("Trouvé "+list.get(j));
break;
}
else
{
System.out.println("ce numéro ("+b+") n'existe pas!");
}
break;
}
}
}
Do it simply like this
String str_line= "702345678,714326578,701234567,791234567,751234567";
String[] strArray = str_line.split(",");
String key = "702345678";
for(String v:strArray) {
if(v.equals(key)) {
System.out.println("found");
}
}
I'm not realy sure of what you want, but if you just need the index of b in your array just do this:
public static void main(String [] args) throws IOException{
...
int b = 773214576;
int tmp = list.indexOf(b+"");
if(tmp!=-1) {
System.out.println("Trouvé "+ b + " à l'index " + tmp);
} else {
System.out.println("Ce numéro ("+b+") n'existe pas!");
}
...
}
Another answer, using Guava :
(in this case, there really is no need, you could simply use split() method from String object, but like Guava readibility and returns)
package stackoverflow;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import com.google.common.base.Splitter;
public class RegexExample {
String filename = "numbercall.txt";
public boolean isInList(String numberToCheck) throws IOException {
BufferedReader file = loadFile();
for (String number : extractNumberListFrom(file)) {
if (number.trim().equals(numberToCheck)) {
return true;
}
}
return false;
}
private Iterable<String> extractNumberListFrom(BufferedReader buffer) throws IOException {
StringBuilder numberList = new StringBuilder();
String line;
while ((line = buffer.readLine()) != null) {
numberList.append(line);
}
return Splitter.on(",").split(numberList.toString());
}
private BufferedReader loadFile() {
InputStream fstream_school = RegexExample.class.getResourceAsStream(filename);
BufferedReader buffer = new BufferedReader(new InputStreamReader(fstream_school));
return buffer;
}
}