java.io.UTFDataFormatException while reading file entry name - java

Im trying to "pack" several files (previously inside a jar archive) in another single non-jar file by using DataInputStream / DataOutputStream.
The idea was:
First int = number of entries
First UTF is the first entry name
Second Int is entry byte array length (entry size)
Then repeat for every entry.
The code:
public static void main(String[] args) throws Throwable {
test();
System.out.println("========================================================================================");
final DataInputStream dataInputStream = new DataInputStream(new FileInputStream(new File("C:\\Users\\Admin\\Desktop\\randomJarOut")));
for (int int1 = dataInputStream.readInt(), i = 0; i < int1; ++i) {
final String utf = dataInputStream.readUTF();
System.out.println("Entry name: " + utf);
final byte[] array = new byte[dataInputStream.readInt()];
for (int j = 0; j < array.length; ++j) {
array[j] = dataInputStream.readByte();
}
System.out.println("Entry bytes length: " + array.length);
}
}
Unpacking original & packing to new one:
private static void test() throws Throwable {
JarInputStream stream = new JarInputStream(new FileInputStream(new File("C:\\Users\\Admin\\Desktop\\randomJar.jar")));
JarInputStream stream1 = new JarInputStream(new FileInputStream(new File("C:\\Users\\Admin\\Desktop\\randomJar.jar")));
final byte[] buffer = new byte[2048];
final DataOutputStream outputStream = new DataOutputStream(new FileOutputStream(new File("C:\\Users\\Admin\\Desktop\\randomJarOut")));
int entryCount = 0;
for (ZipEntry entry; (entry = stream.getNextJarEntry()) != null; ) {
entryCount++;
}
outputStream.writeInt(entryCount);
for (JarEntry entry; (entry = stream1.getNextJarEntry()) != null; ) {
int entryRealSize = stream1.read(buffer);
if (!(entryRealSize == -1)) {
System.out.println("Writing: " + entry.getName() + " Length: " + entryRealSize);
outputStream.writeUTF(entry.getName());
outputStream.writeInt(entryRealSize);
for (int len = stream1.read(buffer); len != -1; len = stream1.read(buffer)) {
outputStream.write(buffer, 0, len);
}
}
}
outputStream.flush();
outputStream.close();
}
Apparently im able to unpack the first entry without any problems, the second one and others:
Entry name: META-INF/services/org.jd.gui.spi.ContainerFactory
Entry bytes length: 434
Exception in thread "main" java.io.UTFDataFormatException: malformed input around byte 279
at java.io.DataInputStream.readUTF(DataInputStream.java:656)
at java.io.DataInputStream.readUTF(DataInputStream.java:564)
at it.princekin.esercizio.Bootstrap.main(Bootstrap.java:29)
Disconnected from the target VM, address: '127.0.0.1:54384', transport: 'socket'
Process finished with exit code 1
Does anyone knows how to fix this? Why is this working for the first entry but not the others?

My take on this is that the jar file (which in fact is a zip file) has a Central Directory which is only read with the ZipFile (or JarFile) class.
The Central Directory contains some data about the entries such as the size.
I think the ZipInputStream will not read the Central Directory and thus the ZipEntry will not contain the size (returning -1 as it is unknown) whereas reading ZipEntry from ZipFile class will.
So if you first read the size of each entry using a ZipFile and store that in a map, you can easily get it when reading the data with the ZipInputStream.
This page includes some good examples as well.
So my version of your code would be:
import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
public class JarRepacker {
public static void main(String[] args) throws Throwable {
JarRepacker repacker = new JarRepacker();
repacker.repackJarToMyFileFormat("commons-cli-1.3.1.jar", "randomJarOut.bin");
repacker.readMyFileFormat("randomJarOut.bin");
}
private void repackJarToMyFileFormat(String inputJar, String outputFile) throws Throwable {
int entryCount;
Map<String, Integer> sizeMap = new HashMap<>();
try (ZipFile zipFile = new ZipFile(inputJar)) {
entryCount = zipFile.size();
zipFile.entries().asIterator().forEachRemaining(e -> sizeMap.put(e.getName(), (int) e.getSize()));
}
try (final DataOutputStream outputStream = new DataOutputStream(new FileOutputStream(outputFile))) {
outputStream.writeInt(entryCount);
try (ZipInputStream stream = new ZipInputStream(new BufferedInputStream(new FileInputStream(inputJar)))) {
ZipEntry entry;
final byte[] buffer = new byte[2048];
while ((entry = stream.getNextEntry()) != null) {
final String name = entry.getName();
outputStream.writeUTF(name);
final Integer size = sizeMap.get(name);
outputStream.writeInt(size);
//System.out.println("Writing: " + name + " Size: " + size);
int len;
while ((len = stream.read(buffer)) > 0) {
outputStream.write(buffer, 0, len);
}
}
}
outputStream.flush();
}
}
private void readMyFileFormat(String fileToRead) throws IOException {
try (DataInputStream dataInputStream
= new DataInputStream(new BufferedInputStream(new FileInputStream(fileToRead)))) {
int entries = dataInputStream.readInt();
System.out.println("Entries in file: " + entries);
for (int i = 1; i <= entries; i++) {
final String name = dataInputStream.readUTF();
final int size = dataInputStream.readInt();
System.out.printf("[%3d] Reading: %s of size: %d%n", i, name, size);
final byte[] array = new byte[size];
for (int j = 0; j < array.length; ++j) {
array[j] = dataInputStream.readByte();
}
// Still need to do something with this array...
}
}
}
}

The problem, probably, lies in that you are mixing not reciprocal read/write methods:
The writer method writes with outputStream.writeInt(entryCount) and the main method reads with dataInputStream.readInt(). That is OK.
The writer method writes with outputStream.writeUTF(entry.getName()) and the main method reads with dataInputStream.readUTF(). That is OK.
The writer method writes with outputStream.writeInt(entryRealSize) and the main method reads with dataInputStream.readInt(). That is OK.
The writer method writes with outputStream.write(buffer, 0, len) and the main method reads with dataInputStream.readByte() several times. WRONG.
If you write an array of bytes with write(buffer, offset, len), you must read it with read(buffer, offset, len), because write(buffer, offset, len) writes exactly len physical bytes onto the output stream, while writeByte (the counterpart of readByte) writes a lot of metadata overhead about the object type, and then its state variables.
Bugs in the writer method
There is also a mayor bug in the writer method: It invokes up to three times stream1.read(buffer), but it just uses once the buffer contents. The result is that the real size of file is actually written onto the output stream metadata, but it is followed by just a small part of the data.
If you need to know the input file size before writing it in the output stream, you have two choices:
Either chose a large enough buffer size (like 204800) which will allow you to read the whole file in just one read and write it in just one write.
Or either separate read from write algorithms: First a method to read the whole file and store it in memory (a byte[], for example), and then another method to write the byte[] onto the output stream.
Full fixed solution
I've fixed your program, with specific, decoupled methods for each task. The process consists in parsing the input file to a memory model, write it to an intermediate file according to your custom definition, and then read it back.
public static void main(String[] args)
throws Throwable
{
File inputJarFile=new File(args[0]);
File intermediateFile=new File(args[1]);
List<FileData> fileDataEntries=parse(inputJarFile);
write(fileDataEntries, intermediateFile);
read(intermediateFile);
}
public static List<FileData> parse(File inputJarFile)
throws IOException
{
List<FileData> list=new ArrayList<>();
try (JarInputStream stream=new JarInputStream(new FileInputStream(inputJarFile)))
{
for (ZipEntry entry; (entry=stream.getNextJarEntry()) != null;)
{
byte[] data=readAllBytes(stream);
if (data.length > 0)
{
list.add(new FileData(entry.getName(), data));
}
stream.closeEntry();
}
}
return list;
}
public static void write(List<FileData> fileDataEntries, File output)
throws Throwable
{
try (DataOutputStream outputStream=new DataOutputStream(new FileOutputStream(output)))
{
int entryCount=fileDataEntries.size();
outputStream.writeInt(entryCount);
for (FileData fileData : fileDataEntries)
{
int entryRealSize=fileData.getData().length;
{
System.out.println("Writing: " + fileData.getName() + " Length: " + entryRealSize);
outputStream.writeUTF(fileData.getName());
outputStream.writeInt(entryRealSize);
outputStream.write(fileData.getData());
}
}
outputStream.flush();
}
}
public static void read(File intermediateFile)
throws IOException
{
try (DataInputStream dataInputStream=new DataInputStream(new FileInputStream(intermediateFile)))
{
for (int entryCount=dataInputStream.readInt(), i=0; i < entryCount; i++)
{
String utf=dataInputStream.readUTF();
int entrySize=dataInputStream.readInt();
System.out.println("Entry name: " + utf + " size: " + entrySize);
byte[] data=readFixedLengthBuffer(dataInputStream, entrySize);
System.out.println("Entry bytes length: " + data.length);
}
}
}
private static byte[] readAllBytes(InputStream input)
throws IOException
{
byte[] buffer=new byte[4096];
byte[] total=new byte[0];
int len;
do
{
len=input.read(buffer);
if (len > 0)
{
byte[] total0=total;
total=new byte[total0.length + len];
System.arraycopy(total0, 0, total, 0, total0.length);
System.arraycopy(buffer, 0, total, total0.length, len);
}
}
while (len >= 0);
return total;
}
private static byte[] readFixedLengthBuffer(InputStream input, int size)
throws IOException
{
byte[] buffer=new byte[size];
int pos=0;
int len;
do
{
len=input.read(buffer, pos, size - pos);
if (len > 0)
{
pos+=len;
}
}
while (pos < size);
return buffer;
}
private static class FileData
{
private final String name;
private final byte[] data;
public FileData(String name, byte[] data)
{
super();
this.name=name;
this.data=data;
}
public String getName()
{
return this.name;
}
public byte[] getData()
{
return this.data;
}
}

Related

Java FileInputStream FileOutputStream difference in the run

Could someone tell me why the 1. run is wrong? (The return code is 0, but the file written is only half of the original one.
Thanks in advance!
public class FileCopyFisFos {
public static void main(String[] args) throws IOException {
FileInputStream fis = new FileInputStream("d:/Test1/OrigFile.MP4");
FileOutputStream fos = new FileOutputStream("d:/Test2/DestFile.mp4");
// 1. run
// while (fis.read() != -1){
// int len = fis.read();
// fos.write(len);
// }
// 2. run
// int len;
// while ((len = fis.read()) != -1){
// fos.write(len);
// }
fis.close();
fos.close();
}
}
FileInputStream 's read() method follows this logic:
Reads a byte of data from this input stream. This method blocks if no input is yet available.
So assigning the value of its return to a variable, such as:
while((len = fis.read())!= -1)
Is avoiding the byte of data just read from the stream to be forgotten, as every read() call will be assigned to your len variable.
Instead, this code bypasses one of every two bytes from the stream, as the read() executed in the while condition is never assigned to a variable. So the stream advances without half of the bytes being read (assigned to len):
while (fis.read() != -1) { // reads a byte of data (but not saved)
int len = fis.read(); // next byte of data saved
fos.write(len); // possible -1 written here
}
#aran and others already pointed out the solution to your problem.
However there are more sides to this, so I extended your example:
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
public class FileCopyFisFos {
public static void main(final String[] args) throws IOException {
final File src = new File("d:/Test1/OrigFile.MP4");
final File sink = new File("d:/Test2/DestFile.mp4");
{
final long startMS = System.currentTimeMillis();
final long bytesCopied = copyFileSimple(src, sink);
System.out.println("Simple copy transferred " + bytesCopied + " bytes in " + (System.currentTimeMillis() - startMS) + "ms");
}
{
final long startMS = System.currentTimeMillis();
final long bytesCopied = copyFileSimpleFaster(src, sink);
System.out.println("Simple+Fast copy transferred " + bytesCopied + " bytes in " + (System.currentTimeMillis() - startMS) + "ms");
}
{
final long startMS = System.currentTimeMillis();
final long bytesCopied = copyFileFast(src, sink);
System.out.println("Fast copy transferred " + bytesCopied + " bytes in " + (System.currentTimeMillis() - startMS) + "ms");
}
System.out.println("Test completed.");
}
static public long copyFileSimple(final File pSourceFile, final File pSinkFile) throws IOException {
try (
final FileInputStream fis = new FileInputStream(pSourceFile);
final FileOutputStream fos = new FileOutputStream(pSinkFile);) {
long totalBytesTransferred = 0;
while (true) {
final int readByte = fis.read();
if (readByte < 0) break;
fos.write(readByte);
++totalBytesTransferred;
}
return totalBytesTransferred;
}
}
static public long copyFileSimpleFaster(final File pSourceFile, final File pSinkFile) throws IOException {
try (
final FileInputStream fis = new FileInputStream(pSourceFile);
final FileOutputStream fos = new FileOutputStream(pSinkFile);
BufferedInputStream bis = new BufferedInputStream(fis);
BufferedOutputStream bos = new BufferedOutputStream(fos);) {
long totalBytesTransferred = 0;
while (true) {
final int readByte = bis.read();
if (readByte < 0) break;
bos.write(readByte);
++totalBytesTransferred;
}
return totalBytesTransferred;
}
}
static public long copyFileFast(final File pSourceFile, final File pSinkFile) throws IOException {
try (
final FileInputStream fis = new FileInputStream(pSourceFile);
final FileOutputStream fos = new FileOutputStream(pSinkFile);) {
long totalBytesTransferred = 0;
final byte[] buffer = new byte[20 * 1024];
while (true) {
final int bytesRead = fis.read(buffer);
if (bytesRead < 0) break;
fos.write(buffer, 0, bytesRead);
totalBytesTransferred += bytesRead;
}
return totalBytesTransferred;
}
}
}
The hints that come along with that code:
There is the java.nio package that usualy does those things a lot faster and in less code.
Copying single bytes is 1'000-40'000 times slower that bulk copy.
Using try/resource/catch is the best way to avoid problems with reserved/locked resources like files etc.
If you solve something that is quite commonplace, I suggest you put it in a utility class of your own or even your own library.
There are helper classes like BufferedInputStream and BufferedOutputStream that take care of efficiency greatly; see example copyFileSimpleFaster().
But as usual, it is the quality of the concept that has the most impact on the implementation; see example copyFileFast().
There are even more advanced concepts (similar to java.nio), that take into account concepts like OS caching behaviour etc, which will give performance another kick.
Check my outputs, or run it on your own, to see the differences in performance:
Simple copy transferred 1608799 bytes in 12709ms
Simple+Fast copy transferred 1608799 bytes in 51ms
Fast copy transferred 1608799 bytes in 4ms
Test completed.

Copying Zip File While Reading It

I am trying to copy a zip file while is being read but the problem is that the copy is not the same as the source, even if the number of bytes are the same. Anyone can see/explain what I'm doing wrong? Thanks!
File fileIn = new File("In.zip");
File fileOut = new File("Out.zip");
final OutputStream out = new FileOutputStream(fileOut);
final AtomicInteger totalBytesRead = new AtomicInteger();
BufferedInputStream copy = new BufferedInputStream(new FileInputStream(fileIn)) {
#Override
public synchronized int read(byte[] b, int off, int len) throws IOException {
int total = super.read(b, off, len);
if (total != -1) {
totalBytesRead.addAndGet(total);
out.write(b, 0, total);
}
return total;
}
};
ZipInputStream zipIn = new ZipInputStream(copy);
ZipEntry zipEntry = null;
while ((zipEntry = zipIn.getNextEntry()) != null) {
zipIn.closeEntry();
}
IOUtils.copy(copy, new OutputStream() {
#Override
public void write(int b) throws IOException {
}
});
zipIn.close();
out.close();
System.out.println("Expected: " + fileIn.length() + ", Actual: " + totalBytesRead);
System.out.println(FileUtils.contentEquals(fileIn, fileOut));
The output is:
Expected: 3695, Actual: 3695
false
You aren't reading the ZIP at all, you're only enumerating its entries, and you therefore aren't copying during reading either, and you are doing another copy after all this. Try actually reading the zip entries, and remove the subsequent IOUtils.copy() call.

java: Length of data changes after write in stream

I have saved a binary data in FileOutputStream but when I check the length of the data before and after I found that it changes from 72 to 106.
This is my method:
inputStream = new FileInputStream(certificate_file);
/*Certificate file is a Path of a binary file */
pubkey = readFromStream(inputStream, 0, 71);
System.out.println("length of pubkey: "+pubkey.length());
/* This return : length of pubkey: 72 */
writeToStream(path + "pubkey.bin", pubkey);
inputStream = new FileInputStream(path + "pubkey.bin");
pubkey = readFromStream(inputStream);
System.out.println("length of pubkey: "+pubkey.length());
/* This return : length of pubkey: 106 */
writeToStream method to write data into outputstream:
public void writeToStream(String path, String data)
throws FileNotFoundException {
OutputStream os = new FileOutputStream(path);
PrintStream printStream = new PrintStream(os);
printStream.print(data);
}
readFromStream method to read data from stream:
public static String readFromStream(InputStream inputStream, int begin, int end) throws Exception {
int i = 0;
int data = inputStream.read();
String out = "";
while (data != -1) {
if (i >= begin && i <= end) {
out += (char) data;
}
data = inputStream.read();
i++;
}
return out;
}
public static String readFromStream(InputStream inputStream) throws Exception {
int i = 0;
int data = inputStream.read();
String out = "";
while (data != -1) {
out += (char) data;
data = inputStream.read();
i++;
}
return out;
}
Why I have this problem?
I have solved the problem, I transformed the data from String to bytes[] and I changed the read in readFromStream to readAllBytes.

How to read the large text files efficiently in java

Here, I am reading the 18 MB file and store it in a two dimensional array. But this program takes almost 15 minutes to run. Is there anyway to optimize the running time of the program. The file contains only binary values. Thanks in advanceā€¦
public class test
{
public static void main(String[] args) throws FileNotFoundException, IOException
{
BufferedReader br;
FileReader fr=null;
int m = 2160;
int n = 4320;
int[][] lof = new int[n][m];
String filename = "D:/New Folder/ETOPOCHAR";
try {
Scanner input = new Scanner(new File("D:/New Folder/ETOPOCHAR"));
double range_km=1.0;
double alonn=-57.07; //180 to 180
double alat=38.53;
while (input.hasNextLine()) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
try
{
lof[j][i] = input.nextInt();
System.out.println("value[" + j + "][" + i + "] = "+ lof[j][i]);
}
catch (java.util.NoSuchElementException e) {
// e.printStackTrace();
}
}
} //print the input matrix
}
I have also tried with byte array but i can not save it in twoD array...
public class FileToArrayOfBytes
{
public static void main( String[] args )
{
FileInputStream fileInputStream=null;
File file = new File("name of file");
byte[] bFile = new byte[(int) file.length()];
try {
//convert file into array of bytes
fileInputStream = new FileInputStream(file);
fileInputStream.read(bFile);
fileInputStream.close();
for (int i = 0; i < bFile.length; i++) {
System.out.print((char)bFile[i]);
}
System.out.println("Done");
}catch(Exception e){
e.printStackTrace();
}
}
}
You can read the file into a byte array first, then deserialize these bytes. Start with 2048 bytes buffer (as input buffer), then experiment by increasing/decreasing its size, but the experimental buffer size values should be a power of two (512, 1024, 2048, etc).
As far as I rememenber, there are good chances that the best performance can be achived with a buffer of size 2048 bytes, but it is OS dependent and should be verified.
Code sample (here you can try different values of BUFFER_SIZE variable, in my case I've read a test file of size 7.5M in less then one second):
public static void main(String... args) throws IOException {
File f = new File(args[0]);
byte[] buffer = new byte[BUFFER_SIZE];
ByteBuffer result = ByteBuffer.allocateDirect((int) f.length());
try (FileInputStream fos = new FileInputStream(f)) {
int bytesRead;
int totalBytesRead = 0;
while ((bytesRead = fos.read(buffer, 0, BUFFER_SIZE)) != -1) {
result.put(buffer, 0, bytesRead);
totalBytesRead += bytesRead;
}
// debug info
System.out.printf("Read %d bytes\n", totalBytesRead);
// Here you can do whatever you want with the result, including creation of a 2D array...
int pos = result.position();
result.rewind();
for (int i = 0; i < pos / 4; i++) {
System.out.println(result.getInt());
}
}
}
Take your time and read docs for java.io, java.nio packages as well as Scanner class, just to improve understanding.

Java - Read file and split into multiple files

I have a file which I would like to read in Java and split this file into n (user input) output files. Here is how I read the file:
int n = 4;
BufferedReader br = new BufferedReader(new FileReader("file.csv"));
try {
String line = br.readLine();
while (line != null) {
line = br.readLine();
}
} finally {
br.close();
}
How do I split the file - file.csv into n files?
Note - Since the number of entries in the file are of the order of 100k, I can't store the file content into an array and then split it and save into multiple files.
Since one file can be very large, each split file could be large as well.
Example:
Source File Size: 5GB
Num Splits: 5: Destination
File Size: 1GB each (5 files)
There is no way to read this large split chunk in one go, even if we have such a memory. Basically for each split we can read a fix size byte-array which we know should be feasible in terms of performance as well memory.
NumSplits: 10 MaxReadBytes: 8KB
public static void main(String[] args) throws Exception
{
RandomAccessFile raf = new RandomAccessFile("test.csv", "r");
long numSplits = 10; //from user input, extract it from args
long sourceSize = raf.length();
long bytesPerSplit = sourceSize/numSplits ;
long remainingBytes = sourceSize % numSplits;
int maxReadBufferSize = 8 * 1024; //8KB
for(int destIx=1; destIx <= numSplits; destIx++) {
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream("split."+destIx));
if(bytesPerSplit > maxReadBufferSize) {
long numReads = bytesPerSplit/maxReadBufferSize;
long numRemainingRead = bytesPerSplit % maxReadBufferSize;
for(int i=0; i<numReads; i++) {
readWrite(raf, bw, maxReadBufferSize);
}
if(numRemainingRead > 0) {
readWrite(raf, bw, numRemainingRead);
}
}else {
readWrite(raf, bw, bytesPerSplit);
}
bw.close();
}
if(remainingBytes > 0) {
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream("split."+(numSplits+1)));
readWrite(raf, bw, remainingBytes);
bw.close();
}
raf.close();
}
static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
byte[] buf = new byte[(int) numBytes];
int val = raf.read(buf);
if(val != -1) {
bw.write(buf);
}
}
import java.io.*;
import java.util.Scanner;
public class split {
public static void main(String args[])
{
try{
// Reading file and getting no. of files to be generated
String inputfile = "C:/test.txt"; // Source File Name.
double nol = 2000.0; // No. of lines to be split and saved in each output file.
File file = new File(inputfile);
Scanner scanner = new Scanner(file);
int count = 0;
while (scanner.hasNextLine())
{
scanner.nextLine();
count++;
}
System.out.println("Lines in the file: " + count); // Displays no. of lines in the input file.
double temp = (count/nol);
int temp1=(int)temp;
int nof=0;
if(temp1==temp)
{
nof=temp1;
}
else
{
nof=temp1+1;
}
System.out.println("No. of files to be generated :"+nof); // Displays no. of files to be generated.
//---------------------------------------------------------------------------------------------------------
// Actual splitting of file into smaller files
FileInputStream fstream = new FileInputStream(inputfile); DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in)); String strLine;
for (int j=1;j<=nof;j++)
{
FileWriter fstream1 = new FileWriter("C:/New Folder/File"+j+".txt"); // Destination File Location
BufferedWriter out = new BufferedWriter(fstream1);
for (int i=1;i<=nol;i++)
{
strLine = br.readLine();
if (strLine!= null)
{
out.write(strLine);
if(i!=nol)
{
out.newLine();
}
}
}
out.close();
}
in.close();
}catch (Exception e)
{
System.err.println("Error: " + e.getMessage());
}
}
}
Though its a old question but for reference I am listing out the code which I used to split large files to any sizes and it works with any Java versions above 1.4 .
Sample Split and Join blocks were like below:
public void join(String FilePath) {
long leninfile = 0, leng = 0;
int count = 1, data = 0;
try {
File filename = new File(FilePath);
//RandomAccessFile outfile = new RandomAccessFile(filename,"rw");
OutputStream outfile = new BufferedOutputStream(new FileOutputStream(filename));
while (true) {
filename = new File(FilePath + count + ".sp");
if (filename.exists()) {
//RandomAccessFile infile = new RandomAccessFile(filename,"r");
InputStream infile = new BufferedInputStream(new FileInputStream(filename));
data = infile.read();
while (data != -1) {
outfile.write(data);
data = infile.read();
}
leng++;
infile.close();
count++;
} else {
break;
}
}
outfile.close();
} catch (Exception e) {
e.printStackTrace();
}
}
public void split(String FilePath, long splitlen) {
long leninfile = 0, leng = 0;
int count = 1, data;
try {
File filename = new File(FilePath);
//RandomAccessFile infile = new RandomAccessFile(filename, "r");
InputStream infile = new BufferedInputStream(new FileInputStream(filename));
data = infile.read();
while (data != -1) {
filename = new File(FilePath + count + ".sp");
//RandomAccessFile outfile = new RandomAccessFile(filename, "rw");
OutputStream outfile = new BufferedOutputStream(new FileOutputStream(filename));
while (data != -1 && leng < splitlen) {
outfile.write(data);
leng++;
data = infile.read();
}
leninfile += leng;
leng = 0;
outfile.close();
count++;
}
} catch (Exception e) {
e.printStackTrace();
}
}
Complete java code available here in File Split in Java Program link.
a clean solution to edit.
this solution involves loading the entire file into memory.
set all line of a file in List<String> rowsOfFile;
edit maxSizeFile to choice max size of a single file splitted
public void splitFile(File fileToSplit) throws IOException {
long maxSizeFile = 10000000 // 10mb
StringBuilder buffer = new StringBuilder((int) maxSizeFile);
int sizeOfRows = 0;
int recurrence = 0;
String fileName;
List<String> rowsOfFile;
rowsOfFile = Files.readAllLines(fileToSplit.toPath(), Charset.defaultCharset());
for (String row : rowsOfFile) {
buffer.append(row);
numOfRow++;
sizeOfRows += row.getBytes(StandardCharsets.UTF_8).length;
if (sizeOfRows >= maxSizeFile) {
fileName = generateFileName(recurrence);
File newFile = new File(fileName);
try (PrintWriter writer = new PrintWriter(newFile)) {
writer.println(buffer.toString());
}
recurrence++;
sizeOfRows = 0;
buffer = new StringBuilder();
}
}
// last rows
if (sizeOfRows > 0) {
fileName = generateFileName(recurrence);
File newFile = createFile(fileName);
try (PrintWriter writer = new PrintWriter(newFile)) {
writer.println(buffer.toString());
}
}
Files.delete(fileToSplit.toPath());
}
method to generate Name of file:
public String generateFileName(int numFile) {
String extension = ".txt";
return "myFile" + numFile + extension;
}
Have a counter to count no of entries. Let's say one entry per line.
step1: Initially create new subfile, set counter=0;
step2: increment counter as you read each entry from source file to buffer
step3: when counter reaches limit to number of entries that you want to write in each sub file, flush contents of buffer to subfile. close the subfile
step4 : jump to step1 till you have data in source file to read from
There's no need to loop twice through the file. You could estimate the size of each chunk as the source file size divided by number of chunks needed. Then you just stop filling each cunk with data as it's size exceeds estimated.
Here is one that worked for me and I used it to split 10GB file. it also enables you to add a header and a footer. very useful when splitting document based format such as XML and JSON because you need to add document wrapper in the new split files.
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
public class FileSpliter
{
public static void main(String[] args) throws IOException
{
splitTextFiles("D:\\xref.csx", 750000, "", "", null);
}
public static void splitTextFiles(String fileName, int maxRows, String header, String footer, String targetDir) throws IOException
{
File bigFile = new File(fileName);
int i = 1;
String ext = fileName.substring(fileName.lastIndexOf("."));
String fileNoExt = bigFile.getName().replace(ext, "");
File newDir = null;
if(targetDir != null)
{
newDir = new File(targetDir);
}
else
{
newDir = new File(bigFile.getParent() + "\\" + fileNoExt + "_split");
}
newDir.mkdirs();
try (BufferedReader reader = Files.newBufferedReader(Paths.get(fileName)))
{
String line = null;
int lineNum = 1;
Path splitFile = Paths.get(newDir.getPath() + "\\" + fileNoExt + "_" + String.format("%02d", i) + ext);
BufferedWriter writer = Files.newBufferedWriter(splitFile, StandardOpenOption.CREATE);
while ((line = reader.readLine()) != null)
{
if(lineNum == 1)
{
System.out.print("new file created '" + splitFile.toString());
if(header != null && header.length() > 0)
{
writer.append(header);
writer.newLine();
}
}
writer.append(line);
if (lineNum >= maxRows)
{
if(footer != null && footer.length() > 0)
{
writer.newLine();
writer.append(footer);
}
writer.close();
System.out.println(", " + lineNum + " lines written to file");
lineNum = 1;
i++;
splitFile = Paths.get(newDir.getPath() + "\\" + fileNoExt + "_" + String.format("%02d", i) + ext);
writer = Files.newBufferedWriter(splitFile, StandardOpenOption.CREATE);
}
else
{
writer.newLine();
lineNum++;
}
}
if(lineNum <= maxRows) // early exit
{
if(footer != null && footer.length() > 0)
{
writer.newLine();
lineNum++;
writer.append(footer);
}
}
writer.close();
System.out.println(", " + lineNum + " lines written to file");
}
System.out.println("file '" + bigFile.getName() + "' split into " + i + " files");
}
}
Below code used to split a big file into small files with lesser lines.
long linesWritten = 0;
int count = 1;
try {
File inputFile = new File(inputFilePath);
InputStream inputFileStream = new BufferedInputStream(new FileInputStream(inputFile));
BufferedReader reader = new BufferedReader(new InputStreamReader(inputFileStream));
String line = reader.readLine();
String fileName = inputFile.getName();
String outfileName = outputFolderPath + "\\" + fileName;
while (line != null) {
File outFile = new File(outfileName + "_" + count + ".split");
Writer writer = new OutputStreamWriter(new FileOutputStream(outFile));
while (line != null && linesWritten < linesPerSplit) {
writer.write(line);
line = reader.readLine();
linesWritten++;
}
writer.close();
linesWritten = 0;//next file
count++;//nect file count
}
reader.close();
} catch (Exception e) {
e.printStackTrace();
}
Split a file to multiple chunks (in memory operation), here I'm splitting any file to a size of 500kb(500000 bytes) :
public static List<ByteArrayOutputStream> splitFile(File f) {
List<ByteArrayOutputStream> datalist = new ArrayList<>();
try {
int sizeOfFiles = 500000;
byte[] buffer = new byte[sizeOfFiles];
try (FileInputStream fis = new FileInputStream(f); BufferedInputStream bis = new BufferedInputStream(fis)) {
int bytesAmount = 0;
while ((bytesAmount = bis.read(buffer)) > 0) {
try (OutputStream out = new ByteArrayOutputStream()) {
out.write(buffer, 0, bytesAmount);
out.flush();
datalist.add((ByteArrayOutputStream) out);
}
}
}
} catch (Exception e) {
//get the error
}
return datalist; }
I am a bit late to answer, But here's how I did it:
Approach:
First I determine how many bytes each of the individual files should contain then I split the large file by bytes. Only one file chunk worth of data is loaded into memory at a time.
Example:- if a 5 GB file is split into 10 files then only 500MB worth of bytes are loaded into memory at a time which are held in the buffer variable in the splitBySize method below.
Code Explaination:
The method splitFile first gets the number of bytes each of the individual file chunks should contain by calling the getSizeInBytes method, then it calls the splitBySize method which splits the large file by size (i..e maxChunkSize represents the number of bytes each of file chunks will contain).
public static List<File> splitFile(File largeFile, int noOfFiles) throws IOException {
return splitBySize(largeFile, getSizeInBytes(largeFile.length(), noOfFiles));
}
public static List<File> splitBySize(File largeFile, int maxChunkSize) throws IOException {
List<File> list = new ArrayList<>();
int numberOfFiles = 0;
try (InputStream in = Files.newInputStream(largeFile.toPath())) {
final byte[] buffer = new byte[maxChunkSize];
int dataRead = in.read(buffer);
while (dataRead > -1) {
list.add(stageLocally(buffer, dataRead));
numberOfFiles++;
dataRead = in.read(buffer);
}
}
System.out.println("Number of files generated: " + numberOfFiles);
return list;
}
private static int getSizeInBytes(long totalBytes, int numberOfFiles) {
if (totalBytes % numberOfFiles != 0) {
totalBytes = ((totalBytes / numberOfFiles) + 1)*numberOfFiles;
}
long x = totalBytes / numberOfFiles;
if (x > Integer.MAX_VALUE){
throw new NumberFormatException("Byte chunk too large");
}
return (int) x;
}
Full Code:
public class StackOverflow {
private static final String INPUT_FILE_PATH = "/Users/malkesingh/Downloads/5MB.zip";
private static final String TEMP_DIRECTORY = "/Users/malkesingh/temp";
public static void main(String[] args) throws IOException {
File input = new File(INPUT_FILE_PATH);
File outPut = fileJoin2(splitFile(input, 5));
try (InputStream in = Files.newInputStream(input.toPath()); InputStream out = Files.newInputStream(outPut.toPath())) {
System.out.println(IOUtils.contentEquals(in, out));
}
}
public static List<File> splitFile(File largeFile, int noOfFiles) throws IOException {
return splitBySize(largeFile, getSizeInBytes(largeFile.length(), noOfFiles));
}
public static List<File> splitBySize(File largeFile, int maxChunkSize) throws IOException {
List<File> list = new ArrayList<>();
int numberOfFiles = 0;
try (InputStream in = Files.newInputStream(largeFile.toPath())) {
final byte[] buffer = new byte[maxChunkSize];
int dataRead = in.read(buffer);
while (dataRead > -1) {
list.add(stageLocally(buffer, dataRead));
numberOfFiles++;
dataRead = in.read(buffer);
}
}
System.out.println("Number of files generated: " + numberOfFiles);
return list;
}
private static int getSizeInBytes(long totalBytes, int numberOfFiles) {
if (totalBytes % numberOfFiles != 0) {
totalBytes = ((totalBytes / numberOfFiles) + 1)*numberOfFiles;
}
long x = totalBytes / numberOfFiles;
if (x > Integer.MAX_VALUE){
throw new NumberFormatException("Byte chunk too large");
}
return (int) x;
}
private static File stageLocally(byte[] buffer, int length) throws IOException {
File outPutFile = File.createTempFile("temp-", "split", new File(TEMP_DIRECTORY));
try(FileOutputStream fos = new FileOutputStream(outPutFile)) {
fos.write(buffer, 0, length);
}
return outPutFile;
}
public static File fileJoin2(List<File> list) throws IOException {
File outPutFile = File.createTempFile("temp-", "unsplit", new File(TEMP_DIRECTORY));
FileOutputStream fos = new FileOutputStream(outPutFile);
for (File file : list) {
Files.copy(file.toPath(), fos);
}
fos.close();
return outPutFile;
}}
import java.util.*;
import java.io.*;
public class task13 {
public static void main(String[] args)throws IOException{
Scanner s =new Scanner(System.in);
System.out.print("Enter path:");
String a=s.next();
File f=new File(a+".txt");
Scanner st=new Scanner(f);
System.out.println(f.canRead()+"\n"+f.canWrite());
long l=f.length();
System.out.println("Length is:"+l);
System.out.print("Enter no.of partitions:");
int p=s.nextInt();
long x=l/p;
st.useDelimiter("\\Z");
String t=st.next();
int j=0;
System.out.println("Each File Length is:"+x);
for(int i=1;i<=p;i++){
File ft=new File(a+"-"+i+".txt");
ft.createNewFile();
int g=(j*(int)x);
int h=(j+1)*(int)x;
if(g<=l&&h<=l){
FileWriter fw=new FileWriter(a+"-"+i+".txt");
String v=t.substring(g,h);
fw.write(v);
j++;
fw.close();
}}
}}

Categories