I am trying to create a web content crawler for a specific website
http://v1000.vn/bang-xep-hang?ref=bang-xep-hang-1000-doanh-nghiep-dong-thue-thu-nhap-nhieu-nhat-2012
In short, my XPath for finding the link that changes the page (via JavaScript) is not working, which causes a NullPointerException. I have tried modifying the XPath in various ways, but nothing worked.
Also, do I need to run any method to get the new page after the script run?
> package gimasys.webService;
import java.io.IOException;
import java.net.MalformedURLException;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.ThreadedRefreshHandler;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlButton;
import com.gargoylesoftware.htmlunit.html.HtmlLink;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
public class Crawlv1000 {

    /** Ranking page that is loaded and then paged through a JavaScript link. */
    private static final String RANKING_URL =
            "http://v1000.vn/bang-xep-hang?ref=bang-xep-hang-1000-doanh-nghiep-dong-thue-thu-nhap-nhieu-nhat-2012";

    /**
     * XPath of the pager link. Attributes are addressed with {@code @} in XPath, and the
     * value must match the href exactly, including the trailing semicolon.
     */
    private static final String PAGE_LINK_XPATH = "//a[@href='javascript:loadRankingTable(3);']";

    /** Upper bound, in milliseconds, to wait for scripts started by the click. */
    private static final long JS_WAIT_MILLIS = 10000L;

    /**
     * Loads the ranking page, clicks the JavaScript pager link and prints the text of the
     * page returned by the click.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Only referenced by the disabled crawl loop at the end of this method.
        final WebCrawler wc = new WebCrawler();
        final PageCrawler pc = new PageCrawler();
        final WebClient webClient = new WebClient(BrowserVersion.CHROME_16);
        // This is to allow handling the page operation using threads else an exception will pop up
        webClient.setRefreshHandler(new ThreadedRefreshHandler());
        try {
            HtmlPage page = webClient.getPage(RANKING_URL);
            HtmlAnchor link = page.getFirstByXPath(PAGE_LINK_XPATH);
            if (link == null) {
                // getFirstByXPath returns null when nothing matches; report it instead of
                // failing with a NullPointerException on click().
                System.err.println("No anchor matches XPath: " + PAGE_LINK_XPATH);
                return;
            }
            // click() hands back the page that results from the click; read that one rather
            // than the reference obtained before the script ran.
            HtmlPage updated = link.click();
            webClient.waitForBackgroundJavaScript(JS_WAIT_MILLIS);
            System.out.println(updated.getTextContent());
        } catch (FailingHttpStatusCodeException | IOException e) {
            e.printStackTrace();
        } finally {
            // Release the windows and background script threads held by the client.
            webClient.closeAllWindows();
        }
        /*
        wc.crawl("http://v1000.vn/bang-xep-hang?ref=bang-xep-hang-1000-doanh-nghiep-dong-thue-thu-nhap-nhieu-nhat-2012");
        for (String url:wc.urlList)
        {
        pc.crawl(url);
        }
        */
    }
}
Thanks,
Minh Nguyen
You made a very small mistake: the semicolon at the end of the href value is missing from your XPath.
// XPath addresses attributes with '@'. Per this answer, the href value on the page ends
// with ';', so the literal has to include it to match.
HtmlAnchor link = page.getFirstByXPath("//a[@href='javascript:loadRankingTable(3);']");
link.click();
Related
I am trying to get data from a webpage (http://steamcommunity.com/id/Winning117/games/?tab=all) using a specific tag but I keep getting null. My desired result is to get the "hours played" for a specific game - Cluckles' Adventure in this case.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class TestScrape {

    /**
     * Downloads the games page of the Steam profile and prints the first element matching
     * {@code div#game_605250}; prints {@code null} when the fetched HTML has no such element.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        final String profileGamesUrl = "http://steamcommunity.com/id/Winning117/games/?tab=all";
        final Document gamesPage = Jsoup.connect(profileGamesUrl).get();
        final Element gameRow = gamesPage.select("div#game_605250").first();
        System.out.println(gameRow);
    }
}
Edit: How can I tell if a webpage is using JavaScript and is therefore unable to be parsed by Jsoup?
To execute javascript in java code there is Selenium :
Selenium-WebDriver makes direct calls to the browser using each
browser’s native support for automation.
To include it with maven use this dependency:
<!-- Selenium server artifact, version 3.4.0, from the org.seleniumhq.selenium group. -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-server</artifactId>
<version>3.4.0</version>
</dependency>
Next I give you code of simple JUnit test that creates instance of WebDriver and goes to given url and executes simple script to get rgGames .
File chromedriver you have to download at https://sites.google.com/a/chromium.org/chromedriver/downloads.
package SeleniumProject.selenium;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;
import junit.framework.TestCase;
#RunWith(JUnit4.class)
public class ChromeTest extends TestCase {
private static ChromeDriverService service;
private WebDriver driver;
#BeforeClass
public static void createAndStartService() {
service = new ChromeDriverService.Builder()
.usingDriverExecutable(new File("D:\\Downloads\\chromedriver_win32\\chromedriver.exe"))
.withVerbose(false).usingAnyFreePort().build();
try {
service.start();
} catch (IOException e) {
System.out.println("service didn't start");
// TODO Auto-generated catch block
e.printStackTrace();
}
}
#AfterClass
public static void createAndStopService() {
service.stop();
}
#Before
public void createDriver() {
ChromeOptions chromeOptions = new ChromeOptions();
DesiredCapabilities capabilities = DesiredCapabilities.chrome();
capabilities.setCapability(ChromeOptions.CAPABILITY, chromeOptions);
driver = new RemoteWebDriver(service.getUrl(), capabilities);
}
#After
public void quitDriver() {
driver.quit();
}
#Test
public void testJS() {
JavascriptExecutor js = (JavascriptExecutor) driver;
// Load a new web page in the current browser window.
driver.get("http://steamcommunity.com/id/Winning117/games/?tab=all");
// Executes JavaScript in the context of the currently selected frame or
// window.
ArrayList<Map> list = (ArrayList<Map>) js.executeScript("return rgGames;");
// Map represent properties for one game
for (Map map : list) {
for (Object key : map.keySet()) {
// take each key to find key "name" and compare its vale to
// Cluckles' Adventure
if (key instanceof String && key.equals("name") && map.get(key).equals("Cluckles' Adventure")) {
// print all properties for game Cluckles' Adventure
map.forEach((key1, value) -> {
System.out.println(key1 + " : " + value);
});
}
}
}
}
}
As you can see selenium loads page at
driver.get("http://steamcommunity.com/id/Winning117/games/?tab=all");
And to get data of all games by Winning117 it returns rgGames variable:
ArrayList<Map> list = (ArrayList<Map>) js.executeScript("return rgGames;");
The page you want to scrape is rendered by JavaScript, so the HTML that jsoup receives does not contain any #game_605250 element. All of the data is written into the page by JavaScript.
But when I print document to a file ,I see some data like this:
<script language="javascript">
var rgGames = [{"appid":224260,"name":"No More Room in Hell","logo":"http:\/\/cdn.steamstatic.com.8686c.com\/steamcommunity\/public\/images\/apps\/224260\/670e9aba35dc53a6eb2bc686d302d357a4939489.jpg","friendlyURL":224260,"availStatLinks":{"achievements":true,"global_achievements":true,"stats":false,"leaderboards":false,"global_leaderboards":false},"hours_forever":"515","last_played":1492042097},{"appid":241540,"name":"State of Decay","logo":"http:\/\/....
Then you can extract 'rgGames' with some string tools and parse it into a JSON object.
It's not a clever method, but it worked.
try this :
public class TestScrape {

    /**
     * Fetches the games page of the Steam profile and prints the text of every
     * {@code .hours_played} element found inside {@code div#game_605250}.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        String url = "http://steamcommunity.com/id/Winning117/games/?tab=all";
        Document document = Jsoup.connect(url).get();
        // select() returns an Elements collection, not a single Element. Keeping it as
        // Elements also means an absent div yields an empty result rather than null.
        Elements playTime = document.select("div#game_605250");
        Elements val = playTime.select(".hours_played");
        System.out.println(val.text());
    }
}
Community:
Recently, while working on a project with Elasticsearch [2.3.1], I tried to make a simple query to ES using the Java API packaged in a .jar file (elasticsearch.2.3.1.jar) that I added to my project, but when I write the following code:
QueryBuilder qb = simpleQueryStringQuery("+kimchy -elasticsearch");
The IDE doesn't recognize the call simpleQueryStringQuery("+kimchy -elasticsearch"), although it appears in this form in all the examples on the internet and on the official ES site. What am I doing wrong? Thanks in advance.
import java.net.InetAddress;
import java.net.UnknownHostException;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.sort.SortOrder;
import org.elasticsearch.search.sort.SortParseElement;
public class Search {

    /**
     * Connects to a local node over the transport protocol, runs a simple query string
     * search against the "thing" index and scrolls through the results until a scroll
     * request returns no hits.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Settings settings = Settings.settingsBuilder()
                .put("client.transport.ignore_cluster_name", true)
                .build();

        Client client;
        try {
            client = TransportClient.builder()
                    .settings(settings)
                    .build()
                    .addTransportAddress(
                            new InetSocketTransportAddress(InetAddress.getByName("localhost"), 9300));
        } catch (UnknownHostException e) {
            e.printStackTrace();
            // Without a client there is nothing to query; returning here also lets the
            // compiler prove that 'client' is assigned below.
            return;
        }

        try {
            // simpleQueryStringQuery is a static method of QueryBuilders, so it has to be
            // qualified (or statically imported) to resolve.
            QueryBuilder qb = QueryBuilders.simpleQueryStringQuery("+kimchy -elasticsearch");
            SearchResponse scrollResp = client.prepareSearch("thing")
                    .addSort(SortParseElement.DOC_FIELD_NAME, SortOrder.ASC)
                    .setScroll(new TimeValue(60000))
                    .setQuery(qb)
                    .setSize(100) // 100 hits per shard will be returned for each scroll
                    .execute()
                    .actionGet();
            // Scroll until no hits are returned.
            do {
                for (SearchHit hit : scrollResp.getHits().getHits()) {
                    // Handle the hit...
                }
                scrollResp = client.prepareSearchScroll(scrollResp.getScrollId())
                        .setScroll(new TimeValue(60000))
                        .execute()
                        .actionGet();
            } while (scrollResp.getHits().getHits().length != 0);
        } finally {
            // Release the transport client once the search is done or has failed.
            client.close();
        }
    }
}
You know how methods and imports work? The error comes because your class doesn't have a method called simpleQueryStringQuery and you don't import that method.
What you really want is: either use QueryBuilders.simpleQueryStringQuery("...")
Or use a static import for QueryBuilders.simpleQueryStringQuery. See: http://docs.oracle.com/javase/1.5.0/docs/guide/language/static-import.html or https://en.wikipedia.org/wiki/Static_import
I'm writing a Java servlet using Selenium + PhantomJS to log into Alipay (it's like Chinese version of Paypal). I want to get the authentication code by taking screenshot of the login page. My code is as below:
package com.alipay.login.test;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URL;
import javax.imageio.ImageIO;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.openqa.selenium.By;
import org.openqa.selenium.Dimension;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.Point;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import com.Constants;
public class alipayGetAuthCodeServlet extends HttpServlet {
/**
*
*/
private static final long serialVersionUID = 1L;
public byte[] takeScreenshot() throws IOException {
TakesScreenshot takesScreenshot = (TakesScreenshot) Constants.driver;
return takesScreenshot.getScreenshotAs(OutputType.BYTES);
}
public BufferedImage createElementImage(WebElement webElement)
throws IOException {
Point location = webElement.getLocation();
Dimension size = webElement.getSize();
System.out.println(location + " / " + size);
BufferedImage originalImage = ImageIO.read(new ByteArrayInputStream(takeScreenshot()));
/*BufferedImage croppedImage = originalImage.getSubimage(
location.getX(),
location.getY(),
size.getWidth(),
size.getHeight());*/
return originalImage; // here I return the full screenshot for testing
}
protected void service (HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
DesiredCapabilities caps = new DesiredCapabilities();
caps.setJavascriptEnabled(true);
caps.setCapability("takeScreenshot", true);
caps.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, "F:\\phantomjs-2.0.0-windows\\bin\\phantomjs.exe");
Constants.driver = new PhantomJSDriver(caps);
Constants.driver.get("https://auth.alipay.com/login/index.htm");
try {
Thread.sleep(5000);
} catch (InterruptedException e1) {
e1.printStackTrace();
}
try {
Constants.img = Constants.driver.findElement(By.id("J-checkcode-img")).getAttribute("src");
} catch (Exception e1) {
}
if (!Constants.img.equals("")) {
BufferedImage captcha = createElementImage(Constants.driver.findElement(By.id("J-checkcode-img")));
response.setContentType("image/jpeg");
try {
ImageIO.write(captcha, "jpeg", response.getOutputStream());
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
When I run my servlet, the screenshot I get is like this: http://i.stack.imgur.com/rBxI9.png
However, if I use ChromeDriver instead of PhantomJSDriver, the screenshot would be like this: http://i.stack.imgur.com/UgaB4.jpg, which is what the login page should be like.
So the screenshot taken by PhantomJSDriver has wrong color (I have no idea about this), wrong size (seems that I can handle this) and most importantly, no authentication code. I have checked the html source code returned by both drivers and found that the div of auth code in PhantomJS has a class of "ui-form-item fn-hide" while the counterpart has a class of "ui-form-item". Is it because the server of Alipay examines what browser I'm using and returns different pages accordingly?
Also I cannot login with only username and password using PhantomJS so I guess I do need an auth code.
Sorry for this long question and thanks in advance for any help!
I am trying to implement SSE client in java from this tutorial.
It works fine when implemented as a servlet client using the POST method, but it does not work when I implement the same thing in a plain Java project with a main method and the same jar files as in the servlet. Here is the code I am using, along with the target URI:
import javax.ws.rs.Consumes;
import javax.ws.rs.ProcessingException;
import javax.ws.rs.client.Client;
import javax.ws.rs.client.ClientBuilder;
import javax.ws.rs.client.WebTarget;
import javax.ws.rs.core.MediaType;
import org.glassfish.jersey.media.sse.EventListener;
import org.glassfish.jersey.media.sse.EventSource;
import org.glassfish.jersey.media.sse.InboundEvent;
import org.glassfish.jersey.media.sse.SseFeature;
public class SSEreceive {
public static void main(String[] args) {
// TODO Auto-generated method stub
try {
Client client = ClientBuilder.newBuilder().register(SseFeature.class).build();
WebTarget target = ((Client)client).target("http://www.w3schools.com/html/demo_sse.php");
EventSource eventSource = (EventSource)EventSource.target(target).build();
EventListener listener = new EventListener() {
#Override
//#Consumes(MediaType.APPLICATION_JSON)
public void onEvent(InboundEvent inboundEvent) {
// System.out.println(inboundEvent.getName() + "; " + inboundEvent.readData(String.class));
System.out.println(inboundEvent.readData(String.class));
}
};
//eventSource.register(listener, "message-to-client");
eventSource.register(listener);
eventSource.open();
System.out.println("Connection tried");
eventSource.close();
} catch (ProcessingException pe) {
pe.printStackTrace();
System.out.println(pe.getMessage());
} catch (Exception e) {
e.printStackTrace();
System.out.println(e.getMessage());
}
}
}
Can some please help me why this is not working in Java project ?
I am working a project in MAVEN using Java.
I have to get a URL, scroll them down ,and get all the links of other items in this given web page.
So far, I load the page dynamically using Selenium, scroll it down, and fetch the links as well, but it takes too much time. Please help me optimize that.
Example:-, I am working on a page , whose link is here.
My Questions :-
Scrolling web page using selenium is very slow. How can I optimize this? (Suggest any other method
to do the same or help me to optimize this one)
Thanks in advance. Looking for your kind response.
Code to dynamically get and scroll the page:-
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import com.google.common.collect.*;
import java.io.File;
import java.util.ArrayList;
import java.util.Date;
import org.apache.commons.io.FileUtils;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxProfile;
/**
 * Opens a product listing page in Firefox, scrolls it down step by step and prints the
 * {@code data-url} attribute of the page's {@code div} elements.
 *
 * @author jhamb
 */
public class Scroll_down {

    /** Number of scroll steps performed after the initial scroll. */
    private static final int SCROLL_STEPS = 60;

    /** Pause after each scroll step, in milliseconds. */
    private static final long SCROLL_PAUSE_MILLIS = 1000L;

    /**
     * Returns a Firefox profile backed by {@code /tmp/firefox-profile-dir} when that
     * directory exists; otherwise creates a fresh profile and copies its on-disk layout
     * into that directory.
     *
     * @return the profile to start Firefox with
     */
    private static FirefoxProfile createFirefoxProfile() {
        File profileDir = new File("/tmp/firefox-profile-dir");
        if (profileDir.exists()) {
            return new FirefoxProfile(profileDir);
        }
        FirefoxProfile firefoxProfile = new FirefoxProfile();
        File dir = firefoxProfile.layoutOnDisk();
        try {
            profileDir.mkdirs();
            FileUtils.copyDirectory(dir, profileDir);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return firefoxProfile;
    }

    /**
     * Loads the page, scrolls it, then parses the resulting source with jsoup and prints
     * each {@code data-url} value.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while pausing between scrolls
     */
    public static void main(String[] args) throws InterruptedException {
        String url1 = "http://www.jabong.com/men/shoes/men-sports-shoes/?source=home-leftnav";
        // Plain concatenation: the previous "%s" placeholder was never formatted and was
        // printed literally.
        System.out.println("Fetching " + url1 + "...");

        WebDriver driver = new FirefoxDriver(createFirefoxProfile());
        String html;
        try {
            driver.get(url1);
            JavascriptExecutor jse = (JavascriptExecutor) driver;
            jse.executeScript("window.scrollBy(0,250)", "");
            for (int step = 0; step < SCROLL_STEPS; step++) {
                jse.executeScript("window.scrollBy(0,200)", "");
                Thread.sleep(SCROLL_PAUSE_MILLIS);
            }
            html = driver.getPageSource();
        } finally {
            // End the browser session even when loading or scrolling fails.
            driver.quit();
        }

        Document document = Jsoup.parse(html);
        // Only divs that carry a data-url attribute, so no blank line is printed for the rest.
        Elements links = document.select("div[data-url]");
        for (Element link : links) {
            System.out.println(link.attr("data-url"));
        }
    }
}
Well, Selenium scrolling is based on JavaScript. I don't know what your goal with Selenium is, though: you have no assertion in your code to compare anything against.
If you are sure that your data loads fast enough, then don't use any sleep method.
Sleep methods make Selenium slower, but they do give the element time to load properly.
It's up to you, what to test though
How about page down?
ele.sendKeys(Keys.PAGE_DOWN); //WebElement ele = <Any existing element>
Repeat this till you find that particular item.