Jsoup Reddit Image Scraper Over 18 issue - java

I'm working on an image scraper that scrapes the first page of various subreddits using Jsoup. The issue is that when attempting to scrape an NSFW subreddit, Reddit redirects to an over-18 confirmation page, and the scraper scrapes that confirmation page instead. I'm new to scraping and understand this is a noob question, but any help would be much appreciated as I am totally lost.
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class javascraper {
    public static final String USER_AGENT = "<User-Agent: github.com/dabeermasood:v1.2.3 (by /u/swedenotswiss)>";

    public static void main(String[] args) throws MalformedURLException {
        Scanner scan = new Scanner(System.in);
        System.out.println("Where do you want to store the files?");
        String folderpath = scan.next();
        System.out.println("What subreddit do you want to scrape?");
        String subreddit = scan.next();
        // create the destination folder from the subreddit name before turning it into a URL
        new File(folderpath + "/" + subreddit).mkdir();
        subreddit = "http://reddit.com/r/" + subreddit;
        try {
            // fetch the subreddit front page
            Document doc = Jsoup.connect(subreddit).userAgent(USER_AGENT).timeout(0).get();
            // get page title
            String title = doc.title();
            System.out.println("title : " + title);
            // get all links
            Elements links = doc.select("a[href]");
            for (Element link : links) {
                // get value from href attribute
                String checkLink = link.attr("href");
                Elements images = doc.select("img[src~=(?i)\\.(png|jpe?g|gif)]"); // currently unused
                if (imgCheck(checkLink)) { // check whether the link points to an image
                    System.out.println("link : " + link.attr("href"));
                    downloadImages(checkLink, folderpath);
                }
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }
    public static boolean imgCheck(String http) {
        String png = ".png";
        String jpg = ".jpg";
        String jpeg = "jpeg"; // no leading period so ".jpeg" links still match
        String gif = ".gif";
        if (http.contains(png) || http.contains("gfycat") || http.contains(jpg) || http.contains(jpeg) || http.contains(gif)) {
            return true;
        }
        else {
            return false;
        }
    }
    private static void downloadImages(String src, String folderpath) throws IOException {
        // extract the name of the image from the src attribute
        int indexname = src.lastIndexOf("/");
        if (indexname == src.length() - 1) {
            // strip a trailing slash before extracting the file name
            src = src.substring(0, indexname);
        }
        indexname = src.lastIndexOf("/");
        String name = src.substring(indexname, src.length());
        System.out.println(name);
        // open a URL stream
        URLConnection connection = (new URL(src)).openConnection();
        try {
            Thread.sleep(2000); // delay to comply with rate limiting
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        connection.setRequestProperty("User-Agent", USER_AGENT);
        InputStream in = connection.getInputStream();
        OutputStream out = new BufferedOutputStream(new FileOutputStream(folderpath + name));
        for (int b; (b = in.read()) != -1;) {
            out.write(b);
        }
        out.close();
        in.close();
    }
}

I've posted an answer describing how to authenticate against the server with Jsoup in this link. Basically, you need to POST your login ID, password, and the other required form data to the server using
Connection.Response res = Jsoup.connect(url).data(...).method(Connection.Method.POST).execute();
and then save the response cookies from the server and send them with later requests to keep your session authenticated.
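Below is a minimal sketch of that pattern. The login URL and the form field names ("user", "passwd") are assumptions and should be taken from the actual login form; for Reddit's over-18 interstitial specifically, sending an over18=1 cookie with the request has traditionally been enough to skip the confirmation page.

import java.util.Map;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class AuthenticatedScrape {
    public static void main(String[] args) throws Exception {
        // POST the credentials; the URL and field names are assumptions, check the real form.
        Connection.Response res = Jsoup.connect("https://www.reddit.com/post/login")
                .data("user", "yourUsername")
                .data("passwd", "yourPassword")
                .method(Connection.Method.POST)
                .execute();

        // Keep the session cookies returned by the server.
        Map<String, String> cookies = res.cookies();

        // Send the cookies (plus the over-18 opt-in) with every later request.
        Document doc = Jsoup.connect("http://reddit.com/r/somesubreddit")
                .cookies(cookies)
                .cookie("over18", "1")
                .userAgent("my-scraper/1.0")
                .get();
        System.out.println(doc.title());
    }
}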

Related

how to reconstruct an org.archive.io.warc.WARCRecordInfo from an org.archive.io.ArchiveRecord?

Using Java, I need to read a WARC archive file, filter it depending on the content of the HTML page, and write a new archive file.
The following code reads the archive. How do I reconstruct an org.archive.io.warc.WARCRecordInfo from an org.archive.io.ArchiveRecord?
import org.apache.commons.io.IOUtils;
import org.archive.io.ArchiveRecord;
import org.archive.io.warc.*;
import org.archive.wayback.resourcestore.resourcefile.WarcResource;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;
public class Test126b {
    public static void main(String[] args) throws Exception {
        File out = new File("out.warc.gz");
        OutputStream bos = new BufferedOutputStream(new FileOutputStream(out));
        WARCWriterPoolSettings settings = ...
        WARCWriter writer = new WARCWriter(new AtomicInteger(), bos, out, settings);
        File in = new File("in.warc.gz");
        WARCReader reader = WARCReaderFactory.get(in);
        Iterator<ArchiveRecord> it = reader.iterator();
        while (it.hasNext()) {
            ArchiveRecord archiveRecord = it.next();
            if ("response".equals(archiveRecord.getHeader().getHeaderValue("WARC-Type"))) {
                WARCRecord warcRecord = (WARCRecord) archiveRecord;
                WarcResource warcResource = new WarcResource(warcRecord, reader);
                warcResource.parseHeaders();
                String url = warcResource.getWarcHeaders().getUrl();
                System.out.println("+++ url: " + url);
                byte[] content = IOUtils.toByteArray(warcResource);
                String htmlPage = new String(content);
                if (htmlPage.contains("hello world")) {
                    writer.writeRecord(warcRecordInfo); // how to reconstruct the WARCRecordInfo?
                }
            }
        }
        reader.close();
        writer.close();
    }
}
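One way to fill that gap is to copy the fields of the record that was just parsed into a new record-info bean. The fragment below is a sketch under the assumption that the webarchive-commons WARCRecordInfo bean with setters such as setType, setUrl, setRecordId, setMimetype, setCreate14DigitDate, setContentLength and setContentStream is on the classpath; the exact names should be checked against the library version in use. It would replace the writer.writeRecord(...) line above.

WARCRecordInfo recordInfo = new WARCRecordInfo();
recordInfo.setType(WARCRecordType.response);                       // same record type as the source
recordInfo.setUrl(warcResource.getWarcHeaders().getUrl());         // target URI
recordInfo.setMimetype(warcRecord.getHeader().getMimetype());      // copied from the source header
recordInfo.setRecordId(new java.net.URI(warcRecord.getHeader().getRecordIdentifier()));
recordInfo.setCreate14DigitDate(warcRecord.getHeader().getDate()); // reuse the original timestamp
recordInfo.setContentLength(content.length);                       // body already read into content
recordInfo.setContentStream(new java.io.ByteArrayInputStream(content));
writer.writeRecord(recordInfo);

Note that content here is the payload as exposed by WarcResource, so the HTTP headers of the original response record are not preserved; if the record must be copied verbatim, the raw WARCRecord bytes would have to be used instead.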

How to get values from grid using regex in java

I'm working on a resume parser. I can extract some data (e.g. company details) from plain text, but not when it is kept in a grid or table.
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.JOptionPane;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
public class CmpnyNameex {
    public static void main(String[] args) throws IOException {
        String text = "";
        String name = "";
        try {
            XWPFDocument msDocx = new XWPFDocument(new FileInputStream("A:\\Resumes\\Anwesh.docx"));
            XWPFWordExtractor extractor = new XWPFWordExtractor(msDocx);
            text = extractor.getText();
        }
        catch (FileNotFoundException ex) {
            ex.printStackTrace();
            JOptionPane.showMessageDialog(null, "The system cannot find the specified file; it may be an old file format", "Error", JOptionPane.ERROR_MESSAGE);
        }
        String rx13 = "(?<=Have been associated with).*.(.*Ltd?)";
        Pattern p1 = Pattern.compile(rx13);
        Matcher found1 = p1.matcher(text);
        while (found1.find()) {
            name = found1.group(0);
        }
    }
}
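XWPFWordExtractor.getText() should return table text as well, but when the company details sit in a grid it can be easier to walk the tables directly and run the regex on each cell. A minimal sketch using Apache POI's XWPFTable API follows; the file path is a placeholder.

import java.io.FileInputStream;
import java.io.IOException;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;

public class TableTextExample {
    public static void main(String[] args) throws IOException {
        // Path is a placeholder; point it at the resume being parsed.
        try (FileInputStream in = new FileInputStream("A:\\Resumes\\Anwesh.docx")) {
            XWPFDocument doc = new XWPFDocument(in);
            // Walk every table, row and cell so grid content becomes plain text
            // that the existing regex can be applied to.
            for (XWPFTable table : doc.getTables()) {
                for (XWPFTableRow row : table.getRows()) {
                    for (XWPFTableCell cell : row.getTableCells()) {
                        System.out.println(cell.getText());
                    }
                }
            }
        }
    }
}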

How can I get all the hyperlinks and their paragraphs from a website?

I want to get all the hyperlinks, and for each link store the paragraphs it contains in a text file named after the article title.
I have the code below and have been trying to fix it for two months, but I cannot get the crawling/scraping logic right.
Could anyone please help me fix it?
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.BadLocationException;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class App {
    public static void main(String[] args) throws URISyntaxException,
            IOException, BadLocationException {
        HTMLDocument doc = new HTMLDocument() {
            public HTMLEditorKit.ParserCallback getReader(int pos) {
                return new HTMLEditorKit.ParserCallback() {
                    public void handleText(char[] data, int pos) {
                        System.out.println(data);
                    }
                };
            }
        };
        URL url = new URI("http://tamilblog.ishafoundation.org/").toURL();
        URLConnection conn = url.openConnection();
        Reader rd = new InputStreamReader(conn.getInputStream());
        OutputStreamWriter writer = new OutputStreamWriter(
                new FileOutputStream("ram.txt"), "UTF-8");
        EditorKit kit = new HTMLEditorKit();
        kit.read(rd, doc, 0);
        try {
            Document docs = Jsoup.connect(
                    "http://tamilblog.ishafoundation.org/").get();
            Elements links = docs.select("a[href]");
            Elements elements = docs.select("*");
            System.out.println("Total Links :" + links.size());
            for (Element element : elements) {
                System.out.println(element.ownText());
            }
            for (Element link : links) {
                String hrefUrl = link.attr("href");
                if (!"#".equals(hrefUrl) && !hrefUrl.isEmpty()) {
                    System.out.println(" * a: link :" + hrefUrl);
                    System.out.println(" * a: text :" + link.text());
                    Document document = Jsoup.connect(hrefUrl)
                            .timeout(0) // infinite timeout
                            .get();
                    String html = document.toString();
                    writer.write(html);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            writer.close();
        }
    }
}
Try something like this:
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class NewClass {
    public static void main(String[] args) throws IOException {
        Document doc = Jsoup.connect("http://tamilblog.ishafoundation.org").get();
        Elements section = doc.select("section#content");
        Elements article = section.select("article");
        for (Element a : article) {
            System.out.println("Title : \n" + a.select("a").text());
            System.out.println("Article summary: \n" + a.select("div.entry-summary").text());
        }
    }
}
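To go from printing titles to the files the question asks for, each article link can be followed and its paragraphs written to a text file named after the title. The sketch below assumes the same section#content / article structure and that the body text lives in p elements; the title is sanitised before being used as a file name.

import java.io.IOException;
import java.io.PrintWriter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class SaveArticles {
    public static void main(String[] args) throws IOException {
        Document doc = Jsoup.connect("http://tamilblog.ishafoundation.org").get();
        Elements articles = doc.select("section#content article");
        for (Element article : articles) {
            Element link = article.select("a[href]").first();
            if (link == null) {
                continue;
            }
            String title = link.text();
            // Strip characters that are not allowed in file names.
            String fileName = title.replaceAll("[^a-zA-Z0-9\\- ]", "_") + ".txt";
            // Follow the article link and save every paragraph under its title.
            Document page = Jsoup.connect(link.absUrl("href")).timeout(0).get();
            try (PrintWriter out = new PrintWriter(fileName, "UTF-8")) {
                for (Element p : page.select("p")) {
                    out.println(p.text());
                }
            }
        }
    }
}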

Want to know the size of a folder on a remote Windows PC

I want to know the size of a folder on a remote Windows PC, for which I also have credentials.
So far I have tested the program below. It works on my local system but not on the remote PC.
package ext.Size;
import java.io.File;
import org.apache.commons.io.FileUtils;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPClient;
public class FileSize {
    public static void main(String[] args) {
        FileSize fs = new FileSize();
        Float size = fs.ReturnSize("\\\\199.258.63.85\\D:\\test_folder");
        if (size != 07)
            System.out.println(size);
    }
    public static float ReturnSize(String args) {
        String server = "\\\\199.258.63.85";
        int port = 22;
        String user = "test75";
        String pass = "testpass75";
        System.out.println("server==" + server);
        try {
            FTPClient ftpClient = new FTPClient();
            ftpClient.connect(server, port);
            ftpClient.login(user, pass);
            //ftpClient.enterLocalPassiveMode();
            //ftpClient.setFileType(FTP.BINARY_FILE_TYPE);
            float size = FileUtils.sizeOfDirectory(new File(args));
            System.out.println("Size: " + size + " bytes " + size / 1073741824 + " GB");
            return size;
        }
        catch (Exception e) {
            System.out.println(e);
            return 07;
        }
        /*float size = FileUtils.sizeOfDirectory(new File(args));
        System.out.println("Size: " + size + " bytes " + size/1073741824 + " GB");*/
    }
}
Error:- java.net.UnknownHostException: \199.258.63.85
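FileUtils.sizeOfDirectory only works on paths the local JVM can reach, and logging in over FTP does not make \\199.258.63.85\D:\test_folder a local path, which is why the UnknownHostException appears. One alternative is to talk SMB/CIFS to the Windows share directly, for example with the JCIFS library. A minimal sketch follows; the host, share name (the administrative D$ share is just one possibility) and credentials are placeholders.

import jcifs.smb.NtlmPasswordAuthentication;
import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;

public class RemoteFolderSize {
    public static void main(String[] args) throws Exception {
        // Host, share name and credentials are placeholders; any shared folder works.
        NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication(null, "test75", "testpass75");
        SmbFile dir = new SmbFile("smb://199.258.63.85/D$/test_folder/", auth);
        System.out.println("Size: " + sizeOf(dir) + " bytes");
    }

    // Recursively sum the size of every file under the given SMB directory.
    private static long sizeOf(SmbFile dir) throws SmbException {
        long total = 0;
        for (SmbFile f : dir.listFiles()) {
            total += f.isDirectory() ? sizeOf(f) : f.length();
        }
        return total;
    }
}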

JSONException Twitter client side application

I am currently trying to build a client application for Twitter. One of the functionalities of the app is searching tweets (including historical tweets). I tried to modify code that I got from GitHub. However, when I debug it, I get a JSONException caused by a null value. Here is my code:
package thematicanalysis;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Date;
import twitter4j.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import twitter4j.JSONException;
/**
 *
 * @author adichris
 */
public class TweetManager {
    private static String getURLResponse(String since, String until, String querySearch, String scrollCursor, int counter) throws Exception {
        String appendQuery = " ";
        if (since != null)
            appendQuery += " since:" + since;
        if (until != null)
            appendQuery += " until:" + until;
        if (querySearch != null)
            appendQuery += " " + querySearch;
        String url = String.format("https://twitter.com/search?src=typd&q=%s&scroll_cursor=%s", URLEncoder.encode(appendQuery, "UTF-8"), scrollCursor);
        System.out.println("URL: " + url);
        URL obj = new URL(url);
        HttpURLConnection con = (HttpURLConnection) obj.openConnection();
        con.setRequestMethod("GET");
        BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream()));
        String inputLine;
        StringBuffer response = new StringBuffer();
        while ((inputLine = in.readLine()) != null)
            response.append(inputLine);
        in.close();
        con.disconnect();
        //System.out.println(response.toString());
        saveToFile(response.toString(), counter);
        return response.toString();
    }

    private static void saveToFile(String content, int counter) throws IOException {
        try (PrintWriter pw = new PrintWriter("newOutput" + counter + ".txt", "UTF-8")) {
            pw.printf("%s\n", content);
            pw.close();
        }
    }

    public static void getTweets(String since, String until, String querySearch) throws JSONException, Exception {
        try {
            String refreshCursor = null;
            int counter = 1;
            while (true) {
                String response = getURLResponse(since, until, querySearch, refreshCursor, counter);
                JSONObject json = new JSONObject(response);
                if (json.equals(null))
                    System.out.println("hereeee");
                counter++;
                System.out.println(counter);
                refreshCursor = json.getString("scroll_cursor");
                Document doc = Jsoup.parse((String) json.get("items_html"));
                Elements tweets = doc.select("div.js-stream-tweet");
                System.out.println(tweets.size());
                if (tweets.isEmpty()) {
                    break;
                }
                for (Element tweet : tweets) {
                    String userName = tweet.select("span.username.js-action-profile-name b").text();
                    String text = tweet.select("p.js-tweet-text").text().replaceAll("[^\\u0000-\\uFFFF]", "");
                    long dateMs = Long.valueOf(tweet.select("small.time span.js-short-timestamp").attr("data-time-ms"));
                    Date date = new Date(dateMs);
                    System.out.println(userName);
                    //saveToFile(text);
                }
            }
        } catch (JSONException e) {
            e.printStackTrace();
        }
    }
}
