java web crawler downloads too many GB data - java

I have coded a web crawler. But when crawling it downloads too many GBs of data.
I want to read only the text (avoiding images ...etc).
I use Boilerpipe to extract the content from html
Here is how I find the final redirected url
public String getFinalRedirectedUrl(String url) throws IOException{
HttpURLConnection connection;
String finalUrl = url;
int redirectCount = 0;
do {
connection = (HttpURLConnection) new URL(finalUrl)
.openConnection();
connection.setConnectTimeout(Config.HTTP_CONNECTION_TIMEOUT_TIME);
connection.setReadTimeout(Config.HTTP_READ_TIMEOUT_TIME);
connection.setInstanceFollowRedirects(false);
connection.setUseCaches(false);
connection.setRequestMethod("GET");
connection.connect();
int responseCode = connection.getResponseCode();
if (responseCode >= 300 && responseCode < 400) {
String redirectedUrl = connection.getHeaderField("Location");
if (null == redirectedUrl)
break;
finalUrl = redirectedUrl;
redirectCount++;
if(redirectCount > Config.MAX_REDIRECT_COUNT){
throw new java.net.ProtocolException("Server redirected too many times ("+Config.MAX_REDIRECT_COUNT+")");
}
} else{
break;
}
} while (connection.getResponseCode() != HttpURLConnection.HTTP_OK);
connection.disconnect();
return finalUrl;
}
This is how I fetch the url
private HTMLDocument fetch(URL url) throws IOException{
final HttpURLConnection httpcon = (HttpURLConnection) url.openConnection();
httpcon.setFollowRedirects(true);
httpcon.setConnectTimeout(Config.HTTP_CONNECTION_TIMEOUT_TIME);
httpcon.setReadTimeout(Config.HTTP_READ_TIMEOUT_TIME);
httpcon.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2) Gecko/20100101 Firefox/10.0.2");
final String ct = httpcon.getContentType();
Charset cs = Charset.forName("Cp1252");
if (ct != null) {
if(!ct.contains("text/html")){
System.err.println("Content type is:"+ct);
return new HTMLDocument("");
}
Matcher m = PAT_CHARSET.matcher(ct);
if(m.find()) {
final String charset = m.group(1);
try {
cs = Charset.forName(charset);
} catch (UnsupportedCharsetException | IllegalCharsetNameException e) {
// keep default
}
}
}
InputStream in = httpcon.getInputStream();
final String encoding = httpcon.getContentEncoding();
if(encoding != null) {
if("gzip".equalsIgnoreCase(encoding)) {
in = new GZIPInputStream(in);
} else {
System.err.println("WARN: unsupported Content-Encoding: "+encoding);
}
}
ByteArrayOutputStream bos = new ByteArrayOutputStream();
byte[] buf = new byte[4096];
int r;
while ((r = in.read(buf)) != -1) {
bos.write(buf, 0, r);
}
in.close();
final byte[] data = bos.toByteArray();
return new HTMLDocument(data, cs);
}
And to get the body using Boilerpipe
HTMLDocument htmlDoc = fetch(new URL(url));
String body = ArticleExtractor.INSTANCE.getText(htmlDoc.toInputSource());
How to reduce the amount of data downloaded?

Reduced the GB downloaded and increased the efficiency by using JSoup
public HashMap<String, String> fetchWithJsoup(String url, String iniUrl, int redirCount)
throws IOException
{
HashMap<String, String> returnObj = new HashMap<>();
Connection con;
try{
con = Jsoup.connect(url);
}catch(IllegalArgumentException ex){
if(ex.getMessage().contains("Malformed URL")){
System.err.println("Malformed URL:: "
+ex.getClass().getName()+": "+ex.getMessage()+" > "+iniUrl);
}else{
Logger.getLogger(ContentGetter.class.getName()).log(Level.SEVERE, null, ex);
}
returnObj.put(RETURN_FINAL_URL, url);
returnObj.put(RETURN_BODY, "");
return returnObj;
}
con.userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2) Gecko/20100101 Firefox/10.0.2");
con.timeout(Config.HTTP_CONNECTION_TIMEOUT_TIME);
Document doc = con.get();
String uri = doc.baseUri();
returnObj.put(RETURN_FINAL_URL, uri);
Elements redirEle = doc.head().select("meta[http-equiv=refresh]");
if(redirEle.size() > 0){
String content = redirEle.get(0).attr("content");
Pattern pattern = Pattern.compile("^.*URL=(.+)$", Pattern.CASE_INSENSITIVE);
Matcher matcher = pattern.matcher(content);
if (matcher.matches() && matcher.groupCount() > 0) {
String redirectUrl = matcher.group(1);
if(redirectUrl.startsWith("'")){
/*removes single quotes of urls within single quotes*/
redirectUrl = redirectUrl.replaceAll("(^')|('$)","");
}
if(redirectUrl.startsWith("/")){
String[] splitedUrl = url.split("/");
redirectUrl = splitedUrl[0]+"//"+splitedUrl[2]+redirectUrl;
}
if(!redirectUrl.equals(url)){
redirCount++;
if(redirCount < Config.MAX_REDIRECT_COUNT){
return fetchWithJsoup(redirectUrl, iniUrl, redirCount);
}
}
}
}
HTMLDocument htmlDoc = new HTMLDocument(doc.html());
String body = "";
try{
if(htmlDoc != null){
body = ArticleExtractor.INSTANCE.getText(htmlDoc.toInputSource());
}
}catch(OutOfMemoryError ex){
System.err.println("OutOfMemoryError while extracting text !!!!!!!!");
System.gc();
} catch (BoilerpipeProcessingException ex) {
Logger.getLogger(ContentGetter.class.getName()).log(Level.SEVERE, null, ex);
}
returnObj.put(RETURN_BODY, body);
return returnObj;
}

Related

Java HttpURLConnection http timeout

In my local it works perfectly, but when I deploy it gives me this error
nested exception is java.net.ConnectException: Connection timed out (Connection timed out)
with https everything works normal, but http does not work and it gives me the timeout error.
I also just did the tests with restTemplate, OkHttpClient and I get the same result
What am I doing wrong or what should I configure to work, I hope your help, I would be too grateful
public String getFile(String baseName, String extensioFile) {
String rpt;
int BUFFER_SIZE = 4096;
String urlDonwload = "http://datos.susalud.gob.pe/node/223/download";
try {
URL url = new URL(urlDonwload);
HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
httpConn.setRequestMethod("GET");
httpConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729)");
httpConn.setConnectTimeout(900000);
httpConn.setReadTimeout(7200000);
int responseCode = httpConn.getResponseCode();
if (responseCode == HttpURLConnection.HTTP_OK) {
String fileName = "";
String disposition = httpConn.getHeaderField("Content-Disposition");
if (disposition != null) {
// extracts file name from header field
int index = disposition.indexOf("filename=");
if (index > 0) {
fileName = disposition.substring(index + 10, disposition.length() - 1);
}
} else {
// extracts file name from URL
// fileName = urlCamaUci.substring(urlCamaUci.lastIndexOf("/") + 1,
// urlCamaUci.length());
LocalDateTime currentDate = LocalDateTime.now(ZoneOffset.of("-05:00"));
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
String formatDateTime = currentDate.format(formatter);
System.out.println();
fileName = baseName + "_" + formatDateTime.replace(" ", "_").replace(":", "-") + "." + extensioFile;
}
InputStream inputStream = httpConn.getInputStream();
// String saveFilePath = PATH + File.separator + fileName;
File pathSave = new File(fileName);
FileOutputStream outputStream = new FileOutputStream(pathSave.getCanonicalPath());
int bytesRead = -1;
byte[] buffer = new byte[BUFFER_SIZE];
while ((bytesRead = inputStream.read(buffer)) != -1) {
outputStream.write(buffer, 0, bytesRead);
}
outputStream.close();
inputStream.close();
rpt = pathSave.getCanonicalPath();
} else {
rpt = "FAILED";
}
httpConn.disconnect();
} catch (Exception e) {
System.out.println("error search path");
System.out.println(e.getMessage());
rpt = "FAILED";
}
return rpt;
}

Uploading file to server with Java and PHP

I'm making a Java application, and need the user to be able to upload a file to a server through PHP. The problem is that when the user uploads the file, the PHP script doesn't seem to "catch" the file.
This is the code I have so far.
PHP:
<?php
$target_path = "uploads/";
$target_path = $target_path . basename($_FILES['uploadedfile']['name']);
if (move_uploaded_file($_FILES['uploadedfile']['tmp_name'], $target_path)) {
echo "1";
exit();
}
echo "0";
?>
Java:
String filename = "C:\Users\XXX\Pictures\Capture.PNG";
public void uploadFile() {
text = "";
String CrLf = "\r\n";
String filename = filepath.split("/")[filepath.split("/").length-1];
URLConnection conn = null;
OutputStream os = null;
InputStream is = null;
try {
URL con = new URL(connection);
conn = con.openConnection();
conn.setDoOutput(true);
InputStream imgIS = new FileInputStream(filepath);
byte[] imgData = new byte[imgIS.available()];
imgIS.read(imgData);
String message1 = "";
message1 += "-----------------------------4664151417711" + CrLf;
message1 += "Content-Disposition: form-data; name=\"uploadedfile\"; filename=\"Image0001.png\""
+ CrLf;
message1 += "Content-Type: image/png" + CrLf;
message1 += CrLf;
// the image is sent between the messages in the multipart message.
String message2 = "";
message2 += CrLf + "-----------------------------4664151417711--"
+ CrLf;
conn.setRequestProperty("Content-Type",
"multipart/form-data; boundary=---------------------------4664151417711");
// might not need to specify the content-length when sending chunked
// data.
conn.setRequestProperty("Content-Length", String.valueOf((message1
.length() + message2.length() + imgData.length)));
System.out.println("open os");
os = conn.getOutputStream();
System.out.println(message1);
os.write(message1.getBytes());
// SEND THE IMAGE
int index = 0;
int size = 1024;
do {
System.out.println("write:" + index);
if ((index + size) > imgData.length) {
size = imgData.length - index;
}
os.write(imgData, index, size);
index += size;
} while (index < imgData.length);
System.out.println("written:" + index);
System.out.println(message2);
os.write(message2.getBytes());
os.flush();
System.out.println("open is");
is = conn.getInputStream();
char buff = 512;
int len;
byte[] data = new byte[buff];
do {
System.out.println("READ");
len = is.read(data);
if (len > 0) {
System.out.println(new String(data, 0, len));
}
} while (len > 0);
System.out.println("DONE");
} catch (Exception e) {
e.printStackTrace();
} finally {
System.out.println("Close connection");
try {
os.close();
} catch (Exception e) {
}
try {
is.close();
} catch (Exception e) {
}
try {
} catch (Exception e) {
}
}
}
When getting the output from the PHP script, it always returns a "0".
I've tried a lot of different things, but nothing seems to work.

why I can't encode and decode base64 with java and php?

I encode Images on server with php and send the encoded strings to android via get request and decode it with java but It outputs bad base64 .
so I decided to check the base64 string on online checker but the image doesn't appear is the problem with the php encoding ?
here is the encoded image string :
aW1hZ2VzL21haW4vd3d3L25ldCAtIENvcHkucG5n
PHP::
<?php
require_once("config.php");
if(isset($_GET["m"])) {
$dirname = "images/main/";
$arr = array();
$conn = new mysqli(HOST, USERNAME, PASSWORD, DATABASE);
if(!$conn) {
echo "Error connecting to database";
exit();
}
if($stmt = $conn->prepare("SELECT name_ FROM projects")) {
$stmt->execute();
$stmt->bind_result($n);
//$stmt->store_result();
$result = $stmt->get_result();
if($result->num_rows == 0) {
echo "No Projects";
$stmt->close();
$conn->close();
exit();
}else {
while ($row = $result->fetch_assoc()) {
$dirname = $dirname . $row["name_"] . "/";
$images = glob($dirname . "*.*", GLOB_BRACE);
foreach($images as $key => $image) {
$image = base64_encode($image);
//array_push($arr, $image);
$dirname = "images/main/";
echo $image;
echo "/n";
$image = "";
}
}
//echo "hi";//json_encode($arr);
}
}
$stmt->close();
$conn->close();
exit();
}
?>
ANDROID::
#Override
protected String doInBackground(String[] params) {
String add = "http://10.0.2.2/wael/getimages.php?m=all";
byte[] image = null;
Bitmap real = null;
String parsedString = "";
BufferedReader bufferedReader = null;
InputStream is = null;
StringBuilder sb = null;
HttpURLConnection httpConn = null;
URLConnection conn = null;
try {
URL url = new URL(add);
conn = url.openConnection();
httpConn = (HttpURLConnection) conn;
httpConn.setAllowUserInteraction(false);
httpConn.setInstanceFollowRedirects(true);
httpConn.setRequestMethod("GET");
httpConn.connect();
is = httpConn.getInputStream();
sb = new StringBuilder();
String line;
bufferedReader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
bitmaps = new ArrayList<>();
while ((line = bufferedReader.readLine()) != null) {
sb.append(line).append("\\n");
//image = Base64.decode(line, Base64.NO_PADDING);
//real = BitmapFactory.decodeByteArray(image, 0, image.length);
//bitmaps.add(real);
//image = null;
//real = null;
}
String[] lines = sb.toString().split("\\n");
for(String s: lines){
image = Base64.decode(s, Base64.URL_SAFE);
real = BitmapFactory.decodeByteArray(image, 0, image.length);
bitmaps.add(real);
image = null;
real = null;
}
} catch (IOException e) {
e.printStackTrace();
}
finally {
try {
if (is != null) {
is.close();
}
if (bufferedReader != null) {
bufferedReader.close();
}
if (httpConn != null) {
httpConn.disconnect();
}
} catch (IOException e) {
e.printStackTrace();
}
}
return "done";
}
HINT::
all types of modes in decoding using java are not working . for exmaple URL_SAFE , DEFAULT and etc arenot working all output the same bad base64
You aren't base64 encoding an image, you are base64 encoding the path to an image.
It's like if someone said "Send me a picture of the empire state building", and then you wrote them a letter that said "A picture of the empire state building".
For example, in your comment you said
the following is still not outputing an image aW1hZ2VzL21haW4vd3d3L25ldCAtIENvcHkucG5n
But if I do the following:
console.log(atob("aW1hZ2VzL21haW4vd3d3L25ldCAtIENvcHkucG5n"))
when you hit run you will see
images/main/www/net - Copy.png
Clearly, not the intended result.
In PHP, you would instead do something like this:
$image = file_get_contents($filename);
$b64_image_raw = base64_encode($im);
$mime = mime_content_type($filetype)
$b64_image = "data:" . $mime . ";base64," . $b56_image_raw;
which will give you something like:

var image = document.createElement("img")
image.src="";
document.getElementById("image").appendChild(image)
<div id="image"></div>

Why doesn't it get the full source code?

private static String[] getUrlSource2(String site) throws IOException {
List<String> myList = new ArrayList<String>();
URL url = new URL(site);
HttpURLConnection conn = (HttpURLConnection) url.openConnection(); // Cast shouldn't fail
HttpURLConnection.setFollowRedirects(true);
conn.setRequestProperty("Accept-Encoding", "gzip, deflate");
String encoding = conn.getContentEncoding();
InputStream inStr = null;
if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
inStr = new GZIPInputStream(conn.getInputStream());
} else {
inStr = conn.getInputStream();
}
BufferedReader in = new BufferedReader(new InputStreamReader(inStr,"UTF-8"));
String inputLine;
while ((inputLine = in.readLine()) != null)
myList.add(inputLine);
in.close();
String[] arr = myList.toArray(new String[myList.size()]);
return arr;
}
This is my getSource method, for some reason it just gives me part of the source code of a url page, I can't figure out why..
If yo could help, I would be deeply apreciatted.
For example if you run this:
public class Main {
public static void main(String[] args){
try {
String [] A =getUrlSource2("https://www.google.pt/");
for(int i=0;i<A.length;i++){
System.out.print(String.valueOf(i)+" ");
System.out.println(A[i]);
}
}catch(IOException e){
}
}
You will get 5 lines of source code when you should get about 300/400

Android java.lang.IllegalArgumentException: byteCount

I get random error (not all connection) java.lang.IllegalArgumentException: byteCount < 0: -291674 from this code
URL connurl = new URL(newUrl);
conn = (HttpURLConnection) connurl.openConnection();
conn.setRequestProperty("User-Agent", "Android");
conn.setConnectTimeout(5000);
conn.setRequestMethod("GET");
if (finish != 0L)
conn.setRequestProperty("Range", "bytes=" + finish + "-" + size);
conn.connect();
int getResponseCode = conn.getResponseCode();
String getContenttype = conn.getContentType();
int contentLenght = conn.getContentLength();
if (!StringUtils.startsWith(String.valueOf(getResponseCode), "20")) {
}
else if(StringUtils.startsWithIgnoreCase(getContenttype, "text/html")) {
}
else if(contentLenght < (size-finish)) {
}
else {
ReadableByteChannel channel = Channels.newChannel(conn.getInputStream());
accessFile = new RandomAccessFile(filePath, "rwd");
accessFile.seek(finish);
fileStream = new FileOutputStream(filePath, true);
ByteBuffer buffer = ByteBuffer.allocate(1024);
byte[] data;
int temp;
while ((temp = channel.read(buffer)) > 0) { <-- error in this line
}
Error :
java.lang.IllegalArgumentException: byteCount < 0: -1647333
at com.android.okhttp.okio.RealBufferedSource.read(RealBufferedSource.java:46)
at com.android.okhttp.internal.http.HttpConnection$FixedLengthSource.read(HttpConnection.java:418)
at com.android.okhttp.okio.RealBufferedSource$1.read(RealBufferedSource.java:349)
at java.io.InputStream.read(InputStream.java:162)
at java.nio.channels.Channels$InputStreamChannel.read(Channels.java:306)
How to fix it?
Thank you
This is probably linked to that issue: https://github.com/square/okhttp/issues/3104
As suggested, consider using okhttp-urlconnection like:
OkHttpClient okHttpClient = new OkHttpClient();
URL.setURLStreamHandlerFactory(new OkUrlFactory(okHttpClient));

Categories