Search address by name link - Jsoup - java

How can I get the web address of a link by its visible text (in this case "następna strona", which means "next page") rather than by its title attribute, given the HTML code?
More specifically, I want to extract the URL of the link whose text is:
następna strona
package outerDictionary;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches one page of a pl.wiktionary.org category index with jsoup, prints the page
 * title, and then tries to print the absolute URL of one link on that page.
 */
public class adressWWW {
public static void main(String[] args) {
Document doc;
// Never filled anywhere in this method, so the loop at the end prints nothing and the size printed is 0.
List<String> wikiWords = new ArrayList<String>();
String addresWWW="http://pl.wiktionary.org/w/index.php?title=Kategoria:angielski_(indeks)&pagefrom=abducent#mw-pages";
try {
doc = Jsoup .connect(addresWWW).get();
String title = doc.title();
System.out.println(title);
//Element inDiv = doc.select("a[title=Kategoria:angielski (indeks)]").first();
// NOTE(review): the link text "następna strona" is appended after the closing ']' of the
// attribute selector; that is presumably not how link text is matched, so the selection is
// likely empty and inDiv null — confirm against the jsoup selector syntax.
Element inDiv = doc.select("a[title=Kategoria:angielski (indeks)]następna strona").first();
System.out.println(inDiv);
// "abs:href" asks for the href attribute through jsoup's "abs:" prefix; inDiv is dereferenced without a null check.
String row = inDiv.attr("abs:href");
System.out.println("xxx "+row);
// System.out.println(row.text());}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
for (String x : wikiWords)
System.out.println(x);
System.out.println(wikiWords.size());
}}

You can test the text of each link:
// Fetch the category page, then walk every anchor and keep the ones whose
// visible text equals the wanted label (ignoring case).
Document doc = Jsoup.connect("http://pl.wiktionary.org/w/index.php?title=Kategoria:angielski_(indeks)&pagefrom=abducent#mw-pages").get();
for (Element link : doc.select("a")) {
    String label = link.text();
    if (!label.equalsIgnoreCase("następna strona")) {
        continue; // not the "next page" link
    }
    System.out.println(link);
}
Or using the selector syntax:
// ...
for( Element element : doc.select("a:contains(następna strona)") )
{
System.out.println(element);
}
In both cases, the result is:
następna strona
następna strona

Related

how can i do web scraping in this case?

I am trying to scrape text from https://in-the-sky.org/data/object.php?id=A216&day=17&month=6&year=2022
so I wrote the following code:
import java.util.Iterator;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Builds an in-the-sky.org object URL from hard-coded id and date parts, fetches it with
 * jsoup, and prints the text of the paragraphs found under the selected elements.
 */
public class Main {
public static void main(String args[]) {
// Hard-coded query parts that are concatenated into the URL below.
int num = 216;
int day = 17;
int month = 6;
int year = 2022;
String url ="https://in-the-sky.org/data/object.php?id=A"+Integer.toString(num)+"&day="+Integer.toString(day)+"&month="+Integer.toString(month)+"&year="+Integer.toString(year);
System.out.println(url);
// doc stays null if the fetch below throws, because the catch block only prints the stack trace.
Document doc = null;
try {
doc = Jsoup.connect(url).get();
} catch (Exception e) {
// TODO: handle exception
e.printStackTrace();
}
System.out.println("=======================================================");
// NOTE(review): in CSS selector syntax a space is a descendant combinator, so this presumably
// looks for a <col-md-pull-6> tag inside div.col-md-6 instead of a div carrying both classes — confirm.
Elements element = doc.select("div.col-md-6 col-md-pull-6");
// Joins the text of every <p> found under the selected elements into one string.
String output = element.select("p").text();
System.out.println(output);
System.out.println("=======================================================");
}
}
But it doesn't work well. I would appreciate it if someone could help me.
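I believe that you can use Elements element = doc.select("div.col-md-6 > p"); to get your desired output. In your selector the space acts as a descendant combinator, so col-md-pull-6 is read as a tag name rather than as a second class; to require both classes on the same div, chain them without a space: div.col-md-6.col-md-pull-6.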

JSoup can not parse a website

I am trying to get some data from a website. I copied and pasted the code from an old program of mine, but it is no longer working. My code is below.
import java.io.IOException;
import javax.swing.JOptionPane;
import org.jsoup.Jsoup;
import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Fetches a page from isyatirim.com.tr with jsoup and prints table-cell text from the
 * first table matching a fixed class attribute.
 */
public class Veri {
public static void main(String[] args) {
Veri();
}
// A static method that shares the class name; it has a return type, so it is not a constructor.
public static void Veri() {
try {
String url = "https://www.isyatirim.com.tr/tr-tr/analiz/hisse/Sayfalar/default.aspx";
// 6000 is passed to timeout(); the response is fetched first and parsed separately.
Response res = Jsoup.connect(url).timeout(6000).execute();
Document doc = res.parse();
// NOTE(review): first() presumably returns null when no table has exactly this class
// attribute, which would explain the NullPointerException reported below — confirm.
Element ele = doc.select("table[class=dataTable hover nowrap excelexport data-tables no-footer]").first();
for (int i = 0; i < 100; i++) {
// A new iterator is created on every pass, so next() always yields the first <td>:
// the same cell text is printed 100 times rather than 100 different cells.
System.out.println(ele.select("td").iterator().next().text());
}
} catch (IOException c) {
// Shows a dialog and prints the stack trace when the fetch or parse fails.
JOptionPane.showMessageDialog(null, "Veriler Alınırken Bir Harta Oluştu!");
c.printStackTrace();
}
}
}
I got the below error
Exception in thread "main" java.lang.NullPointerException at
Veri.Veri(Veri.java:37) at Veri.main(Veri.java:20)
The page has probably changed a little bit since you last used your program.
Try this:
import java.io.IOException;
import javax.swing.JOptionPane;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches the page at a hard-coded isyatirim.com.tr URL with jsoup and prints the text of
 * every row of the first table whose class attribute matches the expected value.
 */
public class Veri {

    public static void main(String[] args) {
        Veri();
    }

    /**
     * Downloads and parses the page, then prints each {@code <tr>} of the matched table
     * followed by a separator line.
     *
     * <p>If no table matches the selector, a message is written to standard error and the
     * method returns instead of failing with a {@link NullPointerException}. I/O failures
     * are reported in a dialog and their stack trace is printed.
     */
    public static void Veri() {
        try {
            String url = "https://www.isyatirim.com.tr/tr-tr/analiz/hisse/Sayfalar/default.aspx";
            Response res = Jsoup.connect(url).timeout(6000).execute();
            Document doc = res.parse();
            Element ele = doc.select("table[class=dataTable hover nowrap excelexport]").first();
            // first() returns null when the selection is empty, e.g. after the page markup changes.
            if (ele == null) {
                System.err.println("No table matching the selector was found at " + url);
                return;
            }
            Elements lines = ele.select("tr");
            for (Element elt : lines) {
                System.out.println(elt.text());
                System.out.println("------------------------");
            }
        } catch (IOException c) {
            JOptionPane.showMessageDialog(null, "Veriler Alınırken Bir Harta Oluştu!");
            c.printStackTrace();
        }
    }
}
I think you get all the information needed this way.

Jsoup How do I parse this span for its text?

<span class="c-city__hrMin" data-bind="{attr:{id:'p'+id()}}" id="p64">10:52</span>
How do I get this to print out just 10:52
So far I have tried
import java.io.IOException;
import org.jsoup.*;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.w3c.dom.Node;
import org.jsoup.select.*;
/**
 * Fetches the timeanddate.com personal world clock page with jsoup and prints the page
 * title followed by the text of all elements with class {@code c-city__hrMin}.
 */
public class Main
{
public static void main(String [] args) {
Document doc = null;
try {
doc = Jsoup.connect("https://www.timeanddate.com/worldclock/personal.html").get();
String title = doc.title();
Elements elements = doc.select(".c-city__hrMin");
// The title and the selected text are concatenated with no separator between them.
// NOTE(review): the span shown in the question has a data-bind attribute, so its content may be
// filled in by client-side script that is absent from the fetched HTML — confirm against the raw page source.
System.out.println("Website : " + title + elements.text());
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
With this code the output is just "Website : The Personal World Clock", without the time, even though there isn't any syntax error.
Simply
doc.select(".c-city__hrMin") should work.
But if the class c-city__hrMin is also present on other elements, then try
doc.select("span[class=c-city__hrMin]"). It will select only span elements whose class attribute is exactly that value.
NB: For more reference and idea about Jsoup CSS Selectors follow this. You can try the selectors for a documents here also.

how can i get all the hyperlinks and its paragraphs in an website?

I want to get all the hyperlinks on a website and, for each one, store all the paragraphs of the linked page in a .txt file named after the article title.
I have the code below and have been trying to fix it for 2 months, but I could not get the crawling/scraping logic to work.
Could anyone please help me fix it?
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.BadLocationException;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Reads a blog front page twice: once through Swing's HTML parser, printing each text
 * chunk, and once through jsoup, printing element text and links and appending the HTML
 * of every linked page to a single file, {@code ram.txt}.
 */
public class App {
public static void main(String[] args) throws URISyntaxException,
IOException, BadLocationException {
// Anonymous HTMLDocument whose parser callback prints every text chunk it is handed.
HTMLDocument doc = new HTMLDocument() {
public HTMLEditorKit.ParserCallback getReader(int pos) {
return new HTMLEditorKit.ParserCallback() {
public void handleText(char[] data, int pos) {
System.out.println(data);
}
};
}
};
URL url = new URI("http://tamilblog.ishafoundation.org/").toURL();
URLConnection conn = url.openConnection();
// No charset is passed to the reader; it is also never closed in this method.
Reader rd = new InputStreamReader(conn.getInputStream());
// Single output file, written as UTF-8; every fetched page below is appended to it.
OutputStreamWriter writer = new OutputStreamWriter(
new FileOutputStream("ram.txt"), "UTF-8");
EditorKit kit = new HTMLEditorKit();
kit.read(rd, doc, 0);
try {
Document docs = Jsoup.connect(
"http://tamilblog.ishafoundation.org/").get();
// All anchors that carry an href attribute, and separately every element of the page.
Elements links = docs.select("a[href]");
Elements elements = docs.select("*");
System.out.println("Total Links :" + links.size());
// Prints the own text of every element, one line per element.
for (Element element : elements) {
System.out.println(element.ownText());
}
for (Element link : links) {
// NOTE(review): this reads the raw href value; a relative href would presumably not be a
// fetchable URL for Jsoup.connect — confirm whether "abs:href" is needed.
String hrefUrl = link.attr("href");
if (!"#".equals(hrefUrl) && !hrefUrl.isEmpty()) {
System.out.println(" * a: link :" + hrefUrl);
System.out.println(" * a: text :" + link.text());
Document document = Jsoup.connect(hrefUrl)
.timeout(0) //Infinite timeout
.get();
// The whole document HTML is written, not only its paragraphs.
String html = document.toString();
writer.write(html);
}
}
// The try wraps both loops, so one failing link ends processing of all remaining links.
} catch (Exception e) {
e.printStackTrace();
} finally {
writer.close();
}
}
}
Try something like this
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Prints a title and a summary for every article found on the page at
 * http://tamilblog.ishafoundation.org.
 */
public class NewClass {

    /**
     * Fetches the page and, for each {@code article} inside {@code section#content},
     * prints the combined text of its anchors and then the text of its
     * {@code div.entry-summary} elements.
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        Document page = Jsoup.connect("http://tamilblog.ishafoundation.org").get();
        Elements posts = page.select("section#content").select("article");
        for (Element post : posts) {
            String heading = post.select("a").text();
            System.out.println("Title : \n" + heading);
            String summary = post.select("div.entry-summary").text();
            System.out.println("Article summary: \n" + summary);
        }
    }
}

How to extract more than one substring from a larger substring in java?

I need some help in extracting the sub strings from the table from the link (http://www.informatik.uni-trier.de/~ley/pers/hd/k/Kumar:G=_Praveen.htm)..
I need to extract ONLY the names of the authors and store it into a 2D array..
For example:
a[0][0]= G. Praveen kumar
a[0][1]= Anirban Sakar.
a[1][0]= G. Praveen Kumar,
a[1][1]= Arjun Kumar Murmu,
a[1][2]= Biswas Parajuli ,
a[1][3]= Prasenjit Choudhury
and so on for the next row (till the end of the table)...
the code which i tried is given below..
I need to extract the names of the authors (substring) and store in a 2D array ,as the names are separated by commas and : followed by the name of the article..
I do not want the name of the article to be stored in the 2D array but only the names of person while the end of table.
Any help would be appreciated. Thanks in advance.
package codetrial;
import java.io.*;
import java.lang.String.*;
import org.jsoup.*;
import org.jsoup.nodes.*;
import java.io.BufferedWriter.*;
import java.io.FileWriter.*;
import java.io.IOException.*;
import java.util.*;
import org.apache.commons.lang.StringUtils;
/**
 * Fetches a publication page with jsoup and, for each {@code div.data} inside a table,
 * prints two substrings followed by the comma-separated pieces of the element text.
 */
public class Main {
public static void main(String[] args) {
try{
String a;
// NOTE(review): the host here is written "unitrier.de" and the path ends in ".html", while the
// link quoted in the question text uses "uni-trier.de" and ".htm" — confirm which is intended.
final String url="http://www.informatik.unitrier.de/~ley/pers/hd/k/Kumar:G=_Praveen.html";
Document doc = Jsoup.connect(url).get();
for(Element element : doc.select("table div.data") ) {
a = element.text();
// The whole element text is split, so the pieces also include whatever follows the author list.
String[] names = a.split(", "); // comma and space
// NOTE(review): both lookups search the constant `url`, not the element text `a` —
// presumably unintended, since the same two values are printed for every element.
String name_one = StringUtils.substringBetween(url, " ", ",");
String name_two = StringUtils.substringBetween(url, ",", ":");
System.out.println("person1 = " + name_one);
System.out.println("person2 = " +name_two);
for(String name : names) {
System.out.println(name);
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
You can use Jsoup library to do this. See my example:
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Entry point that prints the author lists read from one hard-coded publication page.
 */
public class SourceCodeProgram {

    /**
     * Passes the page URL to {@code PageParser.readAuthors} and prints the returned lists.
     *
     * @param args unused
     * @throws Exception if reading the authors fails
     */
    public static void main(String[] args) throws Exception {
        String pageUrl = "http://www.informatik.uni-trier.de/~ley/pers/hd/k/Kumar:G=_Praveen.htm";
        List<List<String>> authorsPerEntry = PageParser.readAuthors(pageUrl);
        System.out.println(authorsPerEntry);
    }
}
/**
 * Extracts author names from a page whose entries are elements with class {@code data}.
 */
class PageParser {

    /**
     * Fetches {@code url} and returns one list of author names per entry.
     *
     * <p>For every element with class {@code data}, the children that precede the first
     * child with class {@code title} are taken as authors. Each entry gets its own list,
     * so an entry without a title child is skipped and cannot leak its children into the
     * list of the following entry.
     *
     * @param url address of the page to read
     * @return the author names of each entry that has a title child, in document order
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static List<List<String>> readAuthors(String url) throws Exception {
        Document document = Jsoup.connect(url).get();
        List<List<String>> result = new ArrayList<List<String>>();
        for (Element entry : document.getElementsByClass("data")) {
            List<String> authors = new ArrayList<String>();
            for (Element child : entry.children()) {
                // hasClass also matches when the element carries additional classes.
                if (child.hasClass("title")) {
                    result.add(authors);
                    break;
                }
                // text() yields the plain name even if the child wraps it in nested markup.
                authors.add(child.text());
            }
        }
        return result;
    }
}
Output:
[[G. Praveen Kumar, Anirban Sarkar], [G. Praveen Kumar, Arjun Kumar Murmu, Biswas Parajuli, Prasenjit Choudhury], [G. Praveen Kumar, Anirban Sarkar, Narayan C. Debnath]]
Use the code below inside the for loop:
// Keep only the part before the first ':' (the author list) and split it on commas.
String htmlString = element.text();
a = htmlString.replaceAll("\\<.*?>","");
// split() returns an array, so the variable must be String[] rather than String.
String[] names = a.split(":")[0].split(",");
for (String name : names) {
    // trim() drops the space that follows each comma separator.
    System.out.println(name.trim());
}

Categories