Google Search with JSoup - java

i trie to search in google with JSoup. The problem that i have is, that the variable query shows not the URL i want when i start searching.
Also, how does Jsoup search ? Looking for Title or URL or what ?
public class Start {
public static void main(String[] args) {
try {
new Google().Searching("Möbel Beck GmbH & Co.KG");
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
}
public class Google implements Serializable {
private static final long serialVersionUID = 1L;
private static Pattern patternDomainName;
private Matcher matcher;
private static final String DOMAIN_NAME_PATTERN = "([a-zA-Z0-9]([a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]{2,6}";
static {
patternDomainName = Pattern.compile(DOMAIN_NAME_PATTERN);
}
public void Searching(String searchstring) throws IOException {
Google obj = new Google();
Set<String> result = obj.getDataFromGoogle(searchstring);
for (String temp : result) {
if (temp.contains(searchstring)) {
System.out.println(temp + " ----> CONTAINS");
} else {
System.out.println(temp);
}
}
System.out.println(result.size());
}
public String getDomainName(String url) {
String domainName = "";
matcher = patternDomainName.matcher(url);
if (matcher.find()) {
domainName = matcher.group(0).toLowerCase().trim();
}
return domainName;
}
private Set<String> getDataFromGoogle(String query) {
Set<String> result = new HashSet<String>();
String request = "https://www.google.com/search?q=" + query;
System.out.println("Sending request..." + request);
try {
// need http protocol, set this as a Google bot agent :)
Document doc = Jsoup.connect(request)
.userAgent("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)").timeout(6000)
.get();
// get all links
Elements links = doc.select("a[href]");
for (Element link : links) {
String temp = link.attr("href");
if (temp.startsWith("/url?q=")) {
// use regex to get domain name
result.add(getDomainName(temp));
}
}
} catch (IOException e) {
e.printStackTrace();
}
return result;
}
}

Parsing google sites directly is not a good idea. You can try Google API
https://developers.google.com/web-search/docs/#java-access

Related

WebScript for uploading files: during the first upload one extra version created

To upload files to the repository, I use the following Java-backed WebScript:
public class CustomFileUploader extends DeclarativeWebScript {
private static final String FIRM_DOC = "{http://www.firm.com/model/content/1.0}doc";
private static final String FIRM_DOC_FOLDER = "workspace://SpacesStore/8caf07c3-6aa9-4a41-bd63-404cb3e3ef0f";
private FileFolderService fileFolderService;
private ContentService contentService;
private NodeService nodeService;
private SearchService searchService;
protected Map<String, Object> executeImpl(WebScriptRequest req, Status status) {
processUpload(req);
return null;
}
private void writeContent(NodeRef node, FirmFile firmFile) {
try {
ContentWriter contentWriter = contentService.getWriter(node, ContentModel.PROP_CONTENT, true);
contentWriter.setMimetype(firmFile.getFileMimetype());
contentWriter.putContent(firmFile.getFileContent().getInputStream());
} catch (Exception e) {
e.printStackTrace();
}
}
private NodeRef checkIfNodeExists(String fileName) {
StoreRef storeRef = new StoreRef(StoreRef.PROTOCOL_WORKSPACE, "SpacesStore");
ResultSet resultSet = searchService.query(storeRef, SearchService.LANGUAGE_LUCENE/*LANGUAGE_FTS_ALFRESCO*/,
"TYPE:\"firm:doc\" AND #cm\\:name:" + fileName.replaceAll(" ", "\\ ")+ "");
int len = resultSet.length();
if(len == 0) {
return null;
}
NodeRef node = resultSet.getNodeRef(0);
return node;
}
private NodeRef createNewNode(FirmFile firmFile) {
NodeRef parent = new NodeRef(FIRM_DOC_FOLDER);
NodeRef node = createFileNode(parent, firmFile.getFileName());
return node;
}
private void processUpload(WebScriptRequest req) {
FormData formData = (FormData) req.parseContent();
FormData.FormField[] fields = formData.getFields();
for(FormData.FormField field : fields) {
String fieldName = field.getName();
if(fieldName.equalsIgnoreCase("firm_file") && field.getIsFile()) {
String fileName = field.getFilename();
Content fileContent = field.getContent();
String fileMimetype = field.getMimetype();
NodeRef node = checkIfNodeExists(fileName);
// POJO
FirmFile firm = new FirmFile(fileName, fileContent, fileMimetype, FIRM_DOC);
if(node == null) {
node = createNewNode(firmFile);
}
writeContent(node, firmFile);
}
}
}
private NodeRef createFileNode(NodeRef parentNode, String fileName) {
try {
QName contentQName = QName.createQName(FIRM_DOC);
FileInfo fileInfo = fileFolderService.create(parentNode, fileName, contentQName);
return fileInfo.getNodeRef();
} catch (Exception e) {
e.printStackTrace();
}
}
public FileFolderService getFileFolderService() {
return fileFolderService;
}
public void setFileFolderService(FileFolderService fileFolderService) {
this.fileFolderService = fileFolderService;
}
public ContentService getContentService() {
return contentService;
}
public void setContentService(ContentService contentService) {
this.contentService = contentService;
}
public NodeService getNodeService() {
return nodeService;
}
public void setNodeService(NodeService nodeService) {
this.nodeService = nodeService;
}
public SearchService getSearchService() {
return searchService;
}
public void setSearchService(SearchService searchService) {
this.searchService = searchService;
}
}
This WebScript works.
There is one problem: if the uploaded file does not exist in the repository, then two versions are created at once: 1.0 and 1.1. Version 1.0 has a size of 0 bytes and has no content.
I think version 1.0 is created here:
FileInfo fileInfo = fileFolderService.create(parentNode, fileName, contentQName);
Version 1.1, perhaps, is created here, when writing the content:
ContentWriter contentWriter = contentService.getWriter(node, ContentModel.PROP_CONTENT, true);
contentWriter.setMimetype(firmFile.getFileMimetype());
contentWriter.putContent(firmFile.getFileContent().getInputStream());
How to prevent the creation of "empty" version 1.0?
Axel Faust gave an excellent answer to this question here.
The problem was in the fileuploader.post.desc.xml descriptor, I didn't set transaction level required to run the web script:
<transaction>none</transaction>
Thus, methods writeContent() and createFileNode() were transactional by default and each of them created its own version of the content.
I set the following:
<transaction>requiresnew</transaction>
The problem is solved. Thank you, #Axel Faust!

jsoup to get div elements with classes

I am new to Jsoup parsing and I want to get the list of all the companies on this page: https://angel.co/companies?company_types[]=Startup
Now, a way to do this is actually to inspect the page with the div tags relevant to what I need.
However, when I call the method :
Document doc = Jsoup.connect("https://angel.co/companies?company_types[]=Startup").get();
System.out.println(doc.html());
Firstly I cannot even find those DIV tags in my consol html output, (the ones which are supposed to give a list of the companies)
Secondly, even if I did find it, how can I find a certain Div element with class name :
div class=" dc59 frw44 _a _jm"
Pardon the jargon, I have no idea how to go through this.
The data are not embedded in the page but they are retrieved using subsequent API calls :
a POST https://angel.co/company_filters/search_data to get an ids array & a token named hexdigest
a GET https://angel.co/companies/startups to retrieve company data using the output from the previous request
The above is repeated for each page (thus a new token & a list of ids are needed for each page). This process can be seen using Chrome dev console in Network tabs.
The first POST request gives JSON output but the second request (GET) gives HTML data in a property of a JSON object.
The following extracts the company filter :
private static CompanyFilter getCompanyFilter(final String filter, final int page) throws IOException {
String response = Jsoup.connect("https://angel.co/company_filters/search_data")
.header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
.header("X-Requested-With", "XMLHttpRequest")
.data("filter_data[company_types][]=", filter)
.data("sort", "signal")
.data("page", String.valueOf(page))
.userAgent("Mozilla")
.ignoreContentType(true)
.post().body().text();
GsonBuilder gsonBuilder = new GsonBuilder();
Gson gson = gsonBuilder.create();
return gson.fromJson(response, CompanyFilter.class);
}
Then the following extracts companies :
private static List<Company> getCompanies(final CompanyFilter companyFilter) throws IOException {
List<Company> companies = new ArrayList<>();
URLConnection urlConn = new URL("https://angel.co/companies/startups?" + companyFilter.buildRequest()).openConnection();
urlConn.setRequestProperty("User-Agent", "Mozilla");
urlConn.connect();
BufferedReader reader = new BufferedReader(new InputStreamReader(urlConn.getInputStream(), "UTF-8"));
HtmlContainer htmlObj = new Gson().fromJson(reader, HtmlContainer.class);
Element doc = Jsoup.parse(htmlObj.getHtml());
Elements data = doc.select("div[data-_tn]");
if (data.size() > 0) {
for (int i = 2; i < data.size(); i++) {
companies.add(new Company(data.get(i).select("a").first().attr("title"),
data.get(i).select("a").first().attr("href"),
data.get(i).select("div.pitch").first().text()));
}
} else {
System.out.println("no data");
}
return companies;
}
The main function :
public static void main(String[] args) throws IOException {
int pageCount = 1;
List<Company> companies = new ArrayList<>();
for (int i = 0; i < 10; i++) {
System.out.println("get page n°" + pageCount);
CompanyFilter companyFilter = getCompanyFilter("Startup", pageCount);
pageCount++;
System.out.println("digest : " + companyFilter.getDigest());
System.out.println("count : " + companyFilter.getTotalCount());
System.out.println("array size : " + companyFilter.getIds().size());
System.out.println("page : " + companyFilter.getpage());
companies.addAll(getCompanies(companyFilter));
if (companies.size() == 0) {
break;
} else {
System.out.println("size : " + companies.size());
}
}
}
Company, CompanyFilter & HtmlContainer are model class :
class CompanyFilter {
#SerializedName("ids")
private List<Integer> mIds;
#SerializedName("hexdigest")
private String mDigest;
#SerializedName("total")
private String mTotalCount;
#SerializedName("page")
private int mPage;
#SerializedName("sort")
private String mSort;
#SerializedName("new")
private boolean mNew;
public List<Integer> getIds() {
return mIds;
}
public String getDigest() {
return mDigest;
}
public String getTotalCount() {
return mTotalCount;
}
public int getpage() {
return mPage;
}
private String buildRequest() {
String out = "total=" + mTotalCount + "&";
out += "sort=" + mSort + "&";
out += "page=" + mPage + "&";
out += "new=" + mNew + "&";
for (int i = 0; i < mIds.size(); i++) {
out += "ids[]=" + mIds.get(i) + "&";
}
out += "hexdigest=" + mDigest + "&";
return out;
}
}
private static class Company {
private String mLink;
private String mName;
private String mDescription;
public Company(String name, String link, String description) {
mLink = link;
mName = name;
mDescription = description;
}
public String getLink() {
return mLink;
}
public String getName() {
return mName;
}
public String getDescription() {
return mDescription;
}
}
private static class HtmlContainer {
#SerializedName("html")
private String mHtml;
public String getHtml() {
return mHtml;
}
}
The full code is also available here

Is there a way to retrieve Twitter statuses based on a list of status id:s

I'm currently trying to get a set of statuses based on a long list (>50k) of Twitter status id:s. Currently I'm using Java with twitter4j. I would really like to avoid making individual calls for each id if possible, but any approach that is faster than my current code (see below) would be much appreciated.
public class TweetById {
private static String consumerKey = ...;
private static String consumerSecret = ...;
private static String accessToken = ...;
private static String accessTokenSecret = ...;
public static void main(String[] args) {
// connect to Twitter
ConfigurationBuilder cb = new ConfigurationBuilder();
cb.setDebugEnabled(true);
cb.setOAuthConsumerKey(consumerKey);
cb.setOAuthConsumerSecret(consumerSecret);
cb.setOAuthAccessToken(accessToken);
cb.setOAuthAccessTokenSecret(accessTokenSecret);
Twitter twitter = new TwitterFactory(cb.build()).getInstance();
String[] ids = new String[]{"327664925705109505", "327665399644696576","327666368516341761"};
for(String id : ids){
try {
Status status = twitter.showStatus(Long.parseLong(id));
System.out.println("Time: " + status.getCreatedAt() + ": " + status.getText());
} catch (NumberFormatException e) {
e.printStackTrace();
} catch (TwitterException e) {
e.printStackTrace();
}
}
}
}
Edit:
I have now attempted to use the asynchronous methods in twitter4j but I can't seem to get it working. Is there something I'm missing?
public class TweetById {
private static String consumerKey = ...;
private static String consumerSecret = ...;
private static String accessToken = ...;
private static String accessTokenSecret = ...;
public static void main(String[] args) {
// connect to Twitter
ConfigurationBuilder cb = new ConfigurationBuilder();
cb.setDebugEnabled(true);
cb.setOAuthConsumerKey(consumerKey);
cb.setOAuthConsumerSecret(consumerSecret);
cb.setOAuthAccessToken(accessToken);
cb.setOAuthAccessTokenSecret(accessTokenSecret);
// The factory instance is re-useable and thread safe.
AsyncTwitterFactory factory = new AsyncTwitterFactory(cb.build());
AsyncTwitter asyncTwitter = factory.getInstance();
asyncTwitter.addListener(new TwitterAdapter() {
#Override
public void gotShowStatus(Status status) {
System.out.println("Time: " + status.getCreatedAt() + ": " + status.getText());
}
});
String[] ids = new String[] { "327664925705109505", "327665399644696576", "327666368516341761" };
for (String id : ids) {
asyncTwitter.showStatus(Long.parseLong(id));
}
}
}
It's not ideal but you can use lookup(long[] ids) to retrieve up to 100 statues at time.

C# dll method call from Java

Has anyone an idea about what is wrong with my attempt to call a method from a C# dll in my Java code?
Here is my example:
Java code:
public class CsDllHandler {
public interface IKeywordRun extends Library {
public String KeywordRun(String action, String xpath, String inputData,
String verifyData);
}
private static IKeywordRun jnaInstance = null;
public void runDllMethod(String action, String xpath, String inputData,
String verifyData) {
NativeLibrary.addSearchPath(${projectDllName},
"${projectPath}/bin/x64/Debug");
jnaInstance = (IKeywordRun) Native.loadLibrary(
${projectDllName}, IKeywordRun.class);
String csResult = jnaInstance.KeywordRun(action, xpath, inputData,
verifyData);
System.out.println(csResult);
}
}
And in C#:
[RGiesecke.DllExport.DllExport]
public static string KeywordRun(string action, string xpath, string inputData, string verifyData) {
return "C# here";
}
The Unmanaged Exports nuget should be enough for me to call this method (in theory) but I have some strange error:
Exception in thread "main" java.lang.Error: Invalid memory access
at com.sun.jna.Native.invokePointer(Native Method)
at com.sun.jna.Function.invokePointer(Function.java:470)
at com.sun.jna.Function.invokeString(Function.java:651)
at com.sun.jna.Function.invoke(Function.java:395)
at com.sun.jna.Function.invoke(Function.java:315)
at com.sun.jna.Library$Handler.invoke(Library.java:212)
at com.sun.proxy.$Proxy0.KeywordRun(Unknown Source)
at auto.test.keywords.utils.CsDllHandler.runDllMethod(CsDllHandler.java:34)
at auto.test.keywords.runner.MainClass.main(MainClass.java:24)
Well, after another day of research and "trial and error" I have found the cause of my problem and a solution.
The cause was that my C# dll had a dependency on log4net.dll. For running a static method from a standalone C# dll the code from the question is all you need.
The solution for using C# dll with dependencies is to create another dll with no dependency and to load the original dll in this adapter with reflection. In Java you should load the adapter dll with jna and call any exported method. I was able not only to execute methods from the adapter but also to configure log4net with reflection and Java
Here is my code:
(C#)
public class CSharpDllHandler {
private static Logger log = Logger.getLogger(CSharpDllHandler.class);
public interface IFrameworkAdapter extends Library {
public String runKeyword(String action, String xpath, String inputData,
String verifyData);
public String configureLog4net(String log4netConfigPath);
public String loadAssemblies(String frameworkDllPath,
String log4netDllPath);
}
private static IFrameworkAdapter jnaAdapterInstance = null;
private String jnaSearchPath = null;
public CSharpDllHandler(String searchPath) {
this.jnaSearchPath = searchPath;
// add to JNA search path
System.setProperty("jna.library.path", jnaSearchPath);
// load attempt
jnaAdapterInstance = (IFrameworkAdapter) Native.loadLibrary(
"FrameworkAdapter", IFrameworkAdapter.class);
}
public String loadAssemblies(String frameworkDllPath, String log4netDllPath) {
String csResult = jnaAdapterInstance.loadAssemblies(frameworkDllPath,
log4netDllPath);
log.debug(csResult);
return csResult;
}
public String runKeyword(String action, String xpath, String inputData,
String verifyData) {
String csResult = jnaAdapterInstance.runKeyword(action, xpath,
inputData, verifyData);
log.debug(csResult);
return csResult;
}
public String configureLogging(String log4netConfigPath) {
String csResult = jnaAdapterInstance
.configureLog4net(log4netConfigPath);
log.debug(csResult);
return csResult;
}
public String getJnaSearchPath() {
return jnaSearchPath;
}
}
In the main method just use something like this:
CSharpDllHandler dllHandler = new CSharpDllHandler(
${yourFrameworkAdapterDllLocation});
dllHandler.loadAssemblies(
${yourOriginalDllPath},${pathToTheUsedLog4netDllFile});
dllHandler.configureLogging(${log4net.config file path});
dllHandler.runKeyword("JAVA Action", "JAVA Xpath", "JAVA INPUT",
"JAVA VERIFY");
dllHandler.runKeyword("JAVA Action2", "JAVA Xpath2", "JAVA INPUT2",
"JAVA VERIFY2");
In C# I have the desired methods on the original dll:
public static string KeywordRun(string action, string xpath, string inputData, string verifyData) {
log.Debug("Action = " + action);
log.Debug("Xpath = " + xpath);
log.Debug("InputData = " + inputData);
log.Debug("VerifyData = " + verifyData);
return "C# UserActions result: "+ action+" "+xpath+" "+inputData+" "+verifyData;
}
and all the magic is in the DLL Adapter:
namespace FrameworkAdapter {
[ComVisible(true)]
public class FwAdapter {
private const String OK="OK";
private const String frameworkEntryClassName = "${nameOfTheDllClass with method to run }";
private const String log4netConfiguratorClassName = "log4net.Config.XmlConfigurator";
private static Assembly frameworkDll = null;
private static Type frameworkEntryClass = null;
private static MethodInfo keywordRunMethod = null;
private static Assembly logDll = null;
private static Type logEntryClass = null;
private static MethodInfo logConfigureMethod = null;
private static String errorMessage = "OK";
[RGiesecke.DllExport.DllExport]
public static string loadAssemblies(string frameworkDllPath, string log4netDllPath) {
try {
errorMessage = LoadFrameworkDll(frameworkDllPath, frameworkEntryClassName);
LoadFrameworkMethods("KeywordRun", "Setup", "TearDown");
errorMessage = LoadLogAssembly(log4netDllPath, log4netConfiguratorClassName);
if (errorMessage.CompareTo(OK) == 0)
errorMessage = LoadLogMethods("Configure");
}
catch (Exception e) {
return e.Message;
}
return errorMessage;
}
[RGiesecke.DllExport.DllExport]
public static string configureLog4net(string log4netConfigPath) {
if (errorMessage.CompareTo("OK") == 0) {
StringBuilder sb = new StringBuilder();
sb.AppendLine("Try to configure Log4Net");
try {
FileInfo logConfig = new FileInfo(log4netConfigPath);
logConfigureMethod.Invoke(null, new object[] { logConfig });
sb.AppendLine("Log4Net configured");
}
catch (Exception e) {
sb.AppendLine(e.InnerException.Message);
}
return sb.ToString();
}
return errorMessage;
}
[RGiesecke.DllExport.DllExport]
public static string runKeyword(string action, string xpath, string inputData, string verifyData) {
StringBuilder sb = new StringBuilder();
object result = null;
try {
result = keywordRunMethod.Invoke(null, new object[] { action, xpath, inputData, verifyData });
sb.AppendLine(result.ToString());
}
catch (Exception e) {
sb.AppendLine(e.InnerException.Message);
}
return sb.ToString();
}
private static String LoadFrameworkDll(String dllFolderPath, String entryClassName) {
try {
frameworkDll = Assembly.LoadFrom(dllFolderPath);
Type[] dllTypes = frameworkDll.GetExportedTypes();
foreach (Type t in dllTypes)
if (t.FullName.Equals(entryClassName)) {
frameworkEntryClass = t;
break;
}
}
catch (Exception e) {
return e.InnerException.Message;
}
return OK;
}
private static String LoadLogAssembly(String dllFolderPath, String entryClassName) {
try {
logDll = Assembly.LoadFrom(dllFolderPath);
Type[] dllTypes = logDll.GetExportedTypes();
foreach (Type t in dllTypes)
if (t.FullName.Equals(entryClassName)) {
logEntryClass = t;
break;
}
}
catch (Exception e) {
return e.InnerException.Message;
}
return OK;
}
private static String LoadLogMethods(String logMethodName) {
try {
logConfigureMethod = logEntryClass.GetMethod(logMethodName, new Type[] { typeof(FileInfo) });
}
catch (Exception e) {
return e.Message;
}
return OK;
}
private static void LoadFrameworkMethods(String keywordRunName, String scenarioSetupName, String scenarioTearDownName) {
///TODO load the rest of the desired methods here
keywordRunMethod = frameworkEntryClass.GetMethod(keywordRunName);
}
}
}
Running this code will provide all the logged messages from the original C# DLL to the Java console output (and to a file if configured). In a similar way, we can load any other needed dll files for runtime.
Please forgive my [very probable wrong] way of doing things in C# with reflection, I'm new to this language.

How t get specific value from html in java?

I am developing one Application which show Gold rate and create graph for this.
I find one website which provide me this gold rate regularly.My question is how to extract this specific value from html page.
Here is link which i need to extract = http://www.todaysgoldrate.co.in/todays-gold-rate-in-pune/ and this html page have following tag and content.
<p><em>10 gram gold Rate in pune = Rs.31150.00</em></p>
Here is my code which i use for extracting but i didn't find way to extract specific content.
public class URLExtractor {
private static class HTMLPaserCallBack extends HTMLEditorKit.ParserCallback {
private Set<String> urls;
public HTMLPaserCallBack() {
urls = new LinkedHashSet<String>();
}
public Set<String> getUrls() {
return urls;
}
#Override
public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
handleTag(t, a, pos);
}
#Override
public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
handleTag(t, a, pos);
}
private void handleTag(Tag t, MutableAttributeSet a, int pos) {
if (t == Tag.A) {
Object href = a.getAttribute(HTML.Attribute.HREF);
if (href != null) {
String url = href.toString();
if (!urls.contains(url)) {
urls.add(url);
}
}
}
}
}
public static void main(String[] args) throws IOException {
InputStream is = null;
try {
String u = "http://www.todaysgoldrate.co.in/todays-gold-rate-in-pune/";
//Here i need to extract this content by tag wise or content wise....
Thanks in Advance.......
You can use library like Jsoup
You can get it from here --> Download Jsoup
Here is its API reference --> Jsoup API Reference
Its really very easy to parse HTML content using Jsoup.
Below is a sample code which might be helpful to you..
public class GetPTags {
public static void main(String[] args){
Document doc = Jsoup.parse(readURL("http://www.todaysgoldrate.co.intodays-gold-rate-in-pune/"));
Elements p_tags = doc.select("p");
for(Element p : p_tags)
{
System.out.println("P tag is "+p.text());
}
}
public static String readURL(String url) {
String fileContents = "";
String currentLine = "";
try {
BufferedReader reader = new BufferedReader(new InputStreamReader(new URL(url).openStream()));
fileContents = reader.readLine();
while (currentLine != null) {
currentLine = reader.readLine();
fileContents += "\n" + currentLine;
}
reader.close();
reader = null;
} catch (Exception e) {
JOptionPane.showMessageDialog(null, e.getMessage(), "Error Message", JOptionPane.OK_OPTION);
e.printStackTrace();
}
return fileContents;
}
}
http://java-source.net/open-source/crawlers
You can use any of that's apis, but don't parse the HTML with the pure JDK, because it's too painfull.

Categories