pdfbox writing compressed object streams - java

I'm merging multiple files that together are originally 19 MB, but the result is a total of 56 MB. How can I bring this final size closer to the 19 MB?
[EDIT]
public void concatena(InputStream anterior, InputStream novo, OutputStream saida, List<String> marcadores)
throws IOException {
PDFMergerUtility pdfMerger = new PDFMergerUtility();
pdfMerger.setDestinationStream(saida);
PDDocument dest;
PDDocument src;
MemoryUsageSetting setupMainMemoryOnly = MemoryUsageSetting.setupMainMemoryOnly();
if (anterior != null) {
dest = PDDocument.load(anterior, setupMainMemoryOnly);
src = PDDocument.load(novo, setupMainMemoryOnly);
} else {
dest = PDDocument.load(novo, setupMainMemoryOnly);
src = new PDDocument();
}
int totalPages = dest.getNumberOfPages();
pdfMerger.appendDocument(dest, src);
criaMarcador(dest, totalPages, marcadores);
saida = pdfMerger.getDestinationStream();
dest.save(saida);
dest.close();
src.close();
}
Sorry, I still don't know how to use Stack Overflow very well. I'm trying to post the rest of the code, but I'm getting an error.
[Edit 2 - add criaMarcador method]
private void criaMarcador(PDDocument src, int numPaginas, List<String> marcadores) {
if (marcadores != null && !marcadores.isEmpty()) {
PDDocumentOutline documentOutline = src.getDocumentCatalog().getDocumentOutline();
if (documentOutline == null) {
documentOutline = new PDDocumentOutline();
}
PDPage page;
if (src.getNumberOfPages() == numPaginas) {
page = src.getPage(0);
} else {
page = src.getPage(numPaginas);
}
PDOutlineItem bookmark = null;
PDOutlineItem pai = null;
String etiquetaAnterior = null;
for (String etiqueta : marcadores) {
bookmark = bookmark(pai != null ? pai : documentOutline, etiqueta);
if (bookmark == null) {
if (etiquetaAnterior != null && !etiquetaAnterior.equals(etiqueta) && pai == null) {
pai = bookmark(documentOutline, etiquetaAnterior);
}
bookmark = new PDOutlineItem();
bookmark.setTitle(etiqueta);
if (marcadores.indexOf(etiqueta) == marcadores.size() - 1) {
bookmark.setDestination(page);
}
if (pai != null) {
pai.addLast(bookmark);
pai.openNode();
} else {
documentOutline.addLast(bookmark);
}
} else {
pai = bookmark;
}
etiquetaAnterior = etiqueta;
}
src.getDocumentCatalog().setDocumentOutline(documentOutline);
}
}
private PDOutlineItem bookmark(PDOutlineNode outline, String etiqueta) {
PDOutlineItem current = outline.getFirstChild();
while (current != null) {
if (current.getTitle().equals(etiqueta)) {
return current;
}
bookmark(current, etiqueta);
current = current.getNextSibling();
}
return current;
}
[Edit 3] Here is the code used for testing:
public class PDFMergeTeste {
public static void main(String[] args) throws IOException {
if (args.length == 1) {
PDFMergeTeste teste = new PDFMergeTeste();
teste.executa(args[0]);
} else {
System.err.println("Argumento tem que ser diretorio contendo arquivos .pdf com nomeclatura no padrão Autos");
}
}
private void executa(String diretorioArquivos) throws IOException {
File[] listFiles = new File(diretorioArquivos).listFiles((pathname) ->
pathname.getName().endsWith(".pdf") || pathname.getName().endsWith(".PDF"));
List<File> lista = Arrays.asList(listFiles);
lista.sort(Comparator.comparing(File::lastModified));
PDFMerge merge = new PDFMerge();
InputStream anterior = null;
ByteArrayOutputStream saida = new ByteArrayOutputStream();
for (File file : lista) {
List<String> marcadores = marcadores(file.getName());
InputStream novo = new FileInputStream(file);
merge.concatena(anterior, novo, saida, marcadores);
anterior = new ByteArrayInputStream(saida.toByteArray());
}
try (OutputStream pdf = new FileOutputStream(pathDestFile)) {
saida.writeTo(pdf);
}
}
private List<String> marcadores(String name) {
String semExtensao = name.substring(0, name.indexOf(".pdf"));
return Arrays.asList(semExtensao.split("_"));
}
}

The error is in the executa method:
InputStream anterior = null;
ByteArrayOutputStream saida = new ByteArrayOutputStream();
for (File file : lista) {
List<String> marcadores = marcadores(file.getName());
InputStream novo = new FileInputStream(file);
merge.concatena(anterior, novo, saida, marcadores);
anterior = new ByteArrayInputStream(saida.toByteArray());
}
Your ByteArrayOutputStream saida is re-used in each loop iteration but it is not cleared in between. Thus, it contains:
after processing file 1:
file 1
after processing file 2:
file 1
concatenation of file 1 and file 2
after processing file 3:
file 1
concatenation of file 1 and file 2
concatenation of file 1 and file 2 and file 3
after processing file 4:
file 1
concatenation of file 1 and file 2
concatenation of file 1 and file 2 and file 3
concatenation of file 1 and file 2 and file 3 and file 4
(Actually, this only works because PDFBox tries to be nice and fixes broken input files under the hood; these concatenations of files are, strictly speaking, broken PDFs, and PDFBox isn't required to be able to parse them.)
You can fix this by clearing saida at the start of each iteration:
InputStream anterior = null;
ByteArrayOutputStream saida = new ByteArrayOutputStream();
for (File file : lista) {
saida.reset();
List<String> marcadores = marcadores(file.getName());
InputStream novo = new FileInputStream(file);
merge.concatena(anterior, novo, saida, marcadores);
anterior = new ByteArrayInputStream(saida.toByteArray());
}
With your original method the result size for your inputs is nearly 26 MB, with the fixed method it is about 5 MB, and that latter size approximately represents the sum of the sizes of the input files.
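Incidentally, if the bookmarks didn't have to be injected between merges, the whole loop could be replaced by a single-pass merge, which avoids re-parsing the accumulated result on every iteration. A minimal sketch, assuming PDFBox 2.x and its PDFMergerUtility API (the criaMarcador bookmarks would still have to be added to the final document afterwards):
PDFMergerUtility merger = new PDFMergerUtility();
for (File file : lista) {
    merger.addSource(file); // queue each input instead of re-merging the result
}
merger.setDestinationFileName(pathDestFile);
// temp-file-backed memory setting keeps heap usage low for large merges
merger.mergeDocuments(MemoryUsageSetting.setupTempFileOnly());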

Related

Counting the number of attachments on a PDF using iText

I am trying to count the number of attachments on a PDF to verify our attachment code. The code I have works most of the time, but recently it started failing as the number of attachments went up, as well as the size of the attachments. Example: I have a PDF with 700 attachments totaling 1.6 GB, and another with 65 attachments of around 10 MB. The 65-attachment file was built up incrementally, file by file. At 64 files (about 9.8 MB) the routine counted fine. Add file 65 (about 0.5 MB) and the routine failed.
This is on itextpdf-5.5.9.jar under jre1.8.0_162.
We are still testing different combinations of file counts and sizes to see where it breaks.
private static String CountFiles() throws IOException, DocumentException {
Boolean errorFound = new Boolean(true);
PdfDictionary root;
PdfDictionary names;
PdfDictionary embeddedFiles;
PdfReader reader = null;
String theResult = "unknown";
try {
if (!theBaseFile.toLowerCase().endsWith(".pdf"))
theResult = "file not PDF";
else {
reader = new PdfReader(theBaseFile);
root = reader.getCatalog();
names = root.getAsDict(PdfName.NAMES);
if (names == null)
theResult = "0";
else {
embeddedFiles = names.getAsDict(PdfName.EMBEDDEDFILES);
PdfArray namesArray = embeddedFiles.getAsArray(PdfName.NAMES);
theResult = String.format("%d", namesArray.size() / 2);
}
reader.close();
errorFound = false;
}
}
catch (Exception e) {
theResult = "unknown";
}
finally {
if (reader != null)
reader.close();
}
if (errorFound)
sendError(theResult);
return theResult;
}
private static String AttachFileInDir() throws IOException, DocumentException {
String theResult = "unknown";
String outputFile = theBaseFile.replaceFirst("(?i).pdf$", ".attach.pdf");
int maxFiles = 1000;
int fileCount = 1;
PdfReader reader = null;
PdfStamper stamper = null;
try {
if (!theBaseFile.toLowerCase().endsWith(".pdf"))
theResult = "basefile not PDF";
else if (theFileDir.length() == 0)
theResult = "no attach directory";
else if (!Files.isDirectory(Paths.get(theFileDir)))
theResult = "invalid attach directory";
else {
reader = new PdfReader(theBaseFile);
stamper = new PdfStamper(reader, new FileOutputStream(outputFile));
stamper.getWriter().setPdfVersion(PdfWriter.VERSION_1_7);
Path dir = FileSystems.getDefault().getPath(theFileDir);
DirectoryStream<Path> stream = Files.newDirectoryStream(dir);
for (Path path : stream) {
stamper.addFileAttachment(null, null, path.toFile().toString(), path.toFile().getName());
if (++fileCount > maxFiles) {
theResult = "maxfiles exceeded";
break;
}
}
stream.close();
stamper.close();
reader.close();
theResult = "SUCCESS";
}
}
catch (Exception e) {
theResult = "unknown";
}
finally {
if (stamper != null)
stamper.close();
if (reader != null)
reader.close();
}
if (theResult != "SUCCESS")
sendError(theResult);
return theResult;
}
I expect a simple count of attachments back. What seems to be happening is that namesArray comes back null and the result stays "unknown". I suspect namesArray is trying to hold all the files and choking on the size.
Note: The files are being attached using the AttachFileInDir procedure: dump all the files in a directory and run AttachFileInDir. And yes, the error trapping in AttachFileInDir needs work.
Any help would be appreciated, or another method welcome.
I finally got it. It turns out each KID is a dictionary of NAMES.
Each NAMES array holds 64 file references (name/filespec pairs). At 65 files and up, a KIDS array of such dictionaries is created. So 279 files = (8*64 + 46)/2 entries, across 9 KIDS array elements.
One thing I had to compensate for: if one deletes all the attachments from a PDF, it leaves artifacts behind, as opposed to a PDF that never had an attachment.
private static String CountFiles() throws IOException, DocumentException {
Boolean errorFound = new Boolean(true);
int totalFiles = 0;
PdfArray filesArray;
PdfDictionary root;
PdfDictionary names;
PdfDictionary embeddedFiles;
PdfReader reader = null;
String theResult = "unknown";
try {
if (!theBaseFile.toLowerCase().endsWith(".pdf"))
theResult = "file not PDF";
else {
reader = new PdfReader(theBaseFile);
root = reader.getCatalog();
names = root.getAsDict(PdfName.NAMES);
if (names == null){
theResult = "0";
errorFound = false;
}
else {
embeddedFiles = names.getAsDict(PdfName.EMBEDDEDFILES);
filesArray = embeddedFiles.getAsArray(PdfName.NAMES);
if (filesArray != null)
totalFiles = filesArray.size();
else {
filesArray = embeddedFiles.getAsArray(PdfName.KIDS);
if (filesArray != null){
for (int i = 0; i < filesArray.size(); i++)
totalFiles += filesArray.getAsDict(i).getAsArray(PdfName.NAMES).size();
}
}
theResult = String.format("%d", totalFiles / 2);
reader.close();
errorFound = false;
}
}
}
catch (Exception e) {
theResult = "unknown" + e.getMessage();
}
finally {
if (reader != null)
reader.close();
}
if (errorFound)
sendError(theResult);
return theResult;
}
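Note that the code above only descends one level of KIDS. A PDF name tree may nest intermediate nodes arbitrarily deep, so a recursive walk is safer; here is a sketch under that assumption, using the same iText 5 dictionary API (the helper name countNames is mine):
private static int countNames(PdfDictionary node) {
    int total = 0;
    // leaf node: NAMES holds alternating name/filespec entries
    PdfArray names = node.getAsArray(PdfName.NAMES);
    if (names != null)
        total += names.size();
    // intermediate node: recurse into each kid
    PdfArray kids = node.getAsArray(PdfName.KIDS);
    if (kids != null)
        for (int i = 0; i < kids.size(); i++)
            total += countNames(kids.getAsDict(i));
    return total;
}
With that, the count becomes countNames(embeddedFiles) / 2, regardless of tree depth.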

How to get SdCard path in android? [duplicate]

This question already has answers here:
Find location of a removable SD card
(24 answers)
Closed 4 years ago.
I need the SD card path to save files to it. I have tried some code, but it didn't work on some devices or Android versions. Now I need code that gets the SD card path on all devices and all Android versions.
To get the SD card path, try the following code:
public static String getExternalSDCardRootDirectory() {
String cmdMOUNT = "cat /proc/mounts";
Runtime run = Runtime.getRuntime();
List<String> paths = new ArrayList<>();
try {
Process p = run.exec(cmdMOUNT);
BufferedInputStream in = new BufferedInputStream(p.getInputStream());
BufferedReader inBr = new BufferedReader(new InputStreamReader(in));
String lineStr;
while ((lineStr = inBr.readLine()) != null) {
Log.d(TAG, lineStr);
if (lineStr.toLowerCase().contains("sdcard") || lineStr.toLowerCase().contains("ext") ) {
String[] strArray = lineStr.split(" ");
if (strArray.length >= 3 &&
(!strArray[1].contains("/system") &&
!strArray[1].contains("/data") &&
!strArray[1].contains("/cache") &&
!strArray[1].contains("/persist")
)) {
String result = strArray[1].trim();
if((result.contains("ext") || result.contains("1")) && result.contains("storage")) {
paths.add(result);
}
//return result;
}
}
if (p.waitFor() != 0 && p.exitValue() == 1) {
Log.e(TAG, "check mount info failed");
return null;
}
}
inBr.close();
in.close();
} catch (Exception e) {
e.printStackTrace();
return null;
}
if (paths.size() > 0) {
return paths.get(0);
}
else {
return null;
}
}
To get the path, you need to call Environment.getExternalStorageState().
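On newer Android versions there is also a way to locate a removable card without parsing mount tables. A sketch, assuming API 21+ for Environment.isExternalStorageRemovable(File) (the helper name getRemovableSdCardDir is mine):
public static File getRemovableSdCardDir(Context context) {
    // app-specific dirs on every storage volume, primary volume first (API 19+)
    for (File dir : context.getExternalFilesDirs(null)) {
        if (dir != null && Environment.isExternalStorageRemovable(dir)) {
            return dir; // e.g. /storage/XXXX-XXXX/Android/data/<pkg>/files
        }
    }
    return null; // no removable card mounted
}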
I've found an existing post.
Simply change it to...
public static HashSet<String> getExternalMounts() {
final HashSet<String> out = new HashSet<String>();
String reg = "(?i).*vold.*(vfat|ntfs|exfat|fat32|ext3|ext4).*rw.*";
String s = "";
try {
final Process process = new ProcessBuilder().command("mount")
.redirectErrorStream(true).start();
process.waitFor();
final InputStream is = process.getInputStream();
final byte[] buffer = new byte[1024];
while (is.read(buffer) != -1) {
s = s + new String(buffer);
}
is.close();
} catch (final Exception e) {
e.printStackTrace();
}
// parse output
final String[] lines = s.split("\n");
for (String line : lines) {
if (!line.toLowerCase(Locale.US).contains("asec")) {
if (line.matches(reg)) {
String[] parts = line.split(" ");
for (String part : parts) {
if (part.startsWith("/"))
if (!part.toLowerCase(Locale.US).contains("vold"))
out.add(part);
}
}
}
}
return out;
}
The tested solution on different platforms can be found here.

Removing a blank page from a pdf with itextpdf corrupts my file

I am basing my code on this:
https://github.com/Betel-Flowers/BetelFlowers/blob/master/BetelFlowers-ejb/src/main/java/com/betel/flowers/pdf/util/RemoveBlankPageFromPDF.java
or this:
http://www.rgagnon.com/javadetails/java-detect-and-remove-blank-page-in-pdf.html
I am trying to use a byte array as input and a byte array as output.
This is my code:
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import com.lowagie.text.Document;
import com.lowagie.text.DocumentException;
import com.lowagie.text.pdf.PdfCopy;
import com.lowagie.text.pdf.PdfDictionary;
import com.lowagie.text.pdf.PdfImportedPage;
import com.lowagie.text.pdf.PdfName;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.RandomAccessFileOrArray;
public class RemoveBlankPageFromPDF {
// value where we can consider that this is a blank image
// can be much higher or lower depending of what is considered as a blank page
public static final int BLANK_THRESHOLD = 160;
public static byte[] removeBlankPdfPages(byte[] fuente) throws IOException, DocumentException{
PdfReader r = null;
RandomAccessFileOrArray raf = null;
Document document = null;
PdfCopy writer = null;
ByteArrayOutputStream archivoFinal = new ByteArrayOutputStream();
try {
r = new PdfReader(fuente);
raf = new RandomAccessFileOrArray(fuente);
document = new Document(r.getPageSizeWithRotation(1));
writer = new PdfCopy(document,archivoFinal);
document.open();
PdfImportedPage page = null;
for (int i = 1; i <= r.getNumberOfPages(); i++) {
PdfDictionary pageDict = r.getPageN(i);
PdfDictionary resDict = (PdfDictionary) pageDict.get(PdfName.RESOURCES);
boolean noFontsOrImages = true;
if (resDict != null) {
noFontsOrImages = resDict.get(PdfName.FONT) == null
&& resDict.get(PdfName.XOBJECT) == null;
}
if (!noFontsOrImages) {
byte bContent[] = r.getPageContent(i, raf);
ByteArrayOutputStream bs = new ByteArrayOutputStream();
bs.write(bContent);
System.out.println("bs size: " + bs.size());
if (bs.size() > BLANK_THRESHOLD) {
page = writer.getImportedPage(r, i);
writer.addPage(page);
}
}
}
System.out.println("Original: " + fuente.length+ " new: " + archivoFinal.toByteArray().length);
return archivoFinal.toByteArray();
} finally {
if (document != null) {
document.close();
}
if (writer != null) {
writer.close();
}
if (raf != null) {
raf.close();
}
if (r != null) {
r.close();
}
}
}
}
My PDF gets corrupted; I cannot open it afterwards.
Even with a PDF without blank pages I get different sizes, and they should be the same:
Original: 95089 New: 88129
That is the output from my last sysout.
I am using iText 2.1.5 and Java 1.5, by the way; I cannot upgrade.
Anyway, I found an answer. Just in case somebody needs it for an older version of iText:
public static void removeBlankPdfPages(PdfReader r) throws IOException{
PdfTextExtractor extractor = new PdfTextExtractor(r);
List<Integer> paginas = new ArrayList<Integer>();
for (int i = 1; i <= r.getNumberOfPages(); i++) {
PdfDictionary pageDict = r.getPageN(i);
PdfDictionary resDict = (PdfDictionary) pageDict.get(PdfName.RESOURCES);
boolean noFontsOrImages = true;
if (resDict != null) {
noFontsOrImages = resDict.get(PdfName.FONT) == null
&& resDict.get(PdfName.XOBJECT) == null;
}
if (!noFontsOrImages) {
String textFromPage = extractor.getTextFromPage(i);
if(textFromPage.length() >50 ){
paginas.add(i);
}
}
}
r.selectPages(paginas);
}
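As an aside, the corruption in the first attempt is consistent with archivoFinal.toByteArray() being evaluated before document.close() runs in the finally block, so the returned bytes lack the finalized trailer. A minimal usage sketch for the method above (iText 2.1.x classes; the variable names salida and resultado are mine), closing the stamper before reading the bytes:
PdfReader reader = new PdfReader(fuente); // fuente: the source PDF bytes
removeBlankPdfPages(reader);              // keeps only the non-blank pages
ByteArrayOutputStream salida = new ByteArrayOutputStream();
PdfStamper stamper = new PdfStamper(reader, salida);
stamper.close();                          // finalizes the PDF trailer
reader.close();
byte[] resultado = salida.toByteArray();  // safe to read only after close()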

Performance in merging pdf file

I want to merge PDF files: around 20 files of 60 MB each. I am using the iText API to merge them.
The problem is, I have to complete it within 2 seconds, but my code is taking 8 seconds.
Is there any solution to speed up merging the PDF files?
private void mergeFiles(List<String> filesToBeMerged, String mergedFilePath) throws Exception {
Document document = null;
PdfCopy copy = null;
PdfReader reader = null;
BufferedOutputStream bos = null;
int bufferSize = 8 * 1024 * 1024;
String pdfLocation = "C:\\application\\projectone-working\\projectone\\web\\pdf\\";
try {
int fileIndex = 0;
for (String file : filesToBeMerged)
{
reader = new PdfReader(pdfLocation+"/"+file);
reader.consolidateNamedDestinations();
int totalPages = reader.getNumberOfPages();
if (fileIndex == 0) {
document = new Document(reader.getPageSizeWithRotation(1));
bos = new BufferedOutputStream(new FileOutputStream(mergedFilePath), bufferSize);
copy = new PdfCopy(document, bos);
document.open();
}
PdfImportedPage page;
for (int currentPage = 1; currentPage <= totalPages; currentPage++) {
page = copy.getImportedPage(reader, currentPage);
copy.addPage(page);
}
PRAcroForm form = reader.getAcroForm();
if (form != null) {
copy.copyAcroForm(reader);
}
}
document.close();
} finally {
if (reader != null) {
reader.close();
}
if (bos != null) {
bos.flush();
bos.close();
}
if (copy != null) {
copy.close();
}
}
}

Java - Read file and split into multiple files

I have a file which I would like to read in Java and split this file into n (user input) output files. Here is how I read the file:
int n = 4;
BufferedReader br = new BufferedReader(new FileReader("file.csv"));
try {
String line = br.readLine();
while (line != null) {
line = br.readLine();
}
} finally {
br.close();
}
How do I split the file file.csv into n files?
Note - Since the number of entries in the file is of the order of 100k, I can't store the file content in an array, split it, and then save it into multiple files.
Since the source file can be very large, each split file could be large as well.
Example:
Source File Size: 5 GB
Num Splits: 5
Destination File Size: 1 GB each (5 files)
There is no way to read such a large split chunk in one go, even if we had that much memory. Instead, for each split we can read a fixed-size byte array, which should be feasible in terms of both performance and memory.
NumSplits: 10 MaxReadBytes: 8KB
public static void main(String[] args) throws Exception
{
RandomAccessFile raf = new RandomAccessFile("test.csv", "r");
long numSplits = 10; //from user input, extract it from args
long sourceSize = raf.length();
long bytesPerSplit = sourceSize/numSplits ;
long remainingBytes = sourceSize % numSplits;
int maxReadBufferSize = 8 * 1024; //8KB
for(int destIx=1; destIx <= numSplits; destIx++) {
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream("split."+destIx));
if(bytesPerSplit > maxReadBufferSize) {
long numReads = bytesPerSplit/maxReadBufferSize;
long numRemainingRead = bytesPerSplit % maxReadBufferSize;
for(int i=0; i<numReads; i++) {
readWrite(raf, bw, maxReadBufferSize);
}
if(numRemainingRead > 0) {
readWrite(raf, bw, numRemainingRead);
}
}else {
readWrite(raf, bw, bytesPerSplit);
}
bw.close();
}
if(remainingBytes > 0) {
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream("split."+(numSplits+1)));
readWrite(raf, bw, remainingBytes);
bw.close();
}
raf.close();
}
static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
byte[] buf = new byte[(int) numBytes];
int val = raf.read(buf);
if(val != -1) {
bw.write(buf);
}
}
import java.io.*;
import java.util.Scanner;
public class split {
public static void main(String args[])
{
try{
// Reading file and getting no. of files to be generated
String inputfile = "C:/test.txt"; // Source File Name.
double nol = 2000.0; // No. of lines to be split and saved in each output file.
File file = new File(inputfile);
Scanner scanner = new Scanner(file);
int count = 0;
while (scanner.hasNextLine())
{
scanner.nextLine();
count++;
}
System.out.println("Lines in the file: " + count); // Displays no. of lines in the input file.
double temp = (count/nol);
int temp1=(int)temp;
int nof=0;
if(temp1==temp)
{
nof=temp1;
}
else
{
nof=temp1+1;
}
System.out.println("No. of files to be generated :"+nof); // Displays no. of files to be generated.
//---------------------------------------------------------------------------------------------------------
// Actual splitting of file into smaller files
FileInputStream fstream = new FileInputStream(inputfile);
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine;
for (int j=1;j<=nof;j++)
{
FileWriter fstream1 = new FileWriter("C:/New Folder/File"+j+".txt"); // Destination File Location
BufferedWriter out = new BufferedWriter(fstream1);
for (int i=1;i<=nol;i++)
{
strLine = br.readLine();
if (strLine!= null)
{
out.write(strLine);
if(i!=nol)
{
out.newLine();
}
}
}
out.close();
}
in.close();
}catch (Exception e)
{
System.err.println("Error: " + e.getMessage());
}
}
}
Though it's an old question, for reference I am listing the code I used to split large files to any size; it works with any Java version above 1.4.
Sample split and join blocks look like this:
public void join(String FilePath) {
long leninfile = 0, leng = 0;
int count = 1, data = 0;
try {
File filename = new File(FilePath);
//RandomAccessFile outfile = new RandomAccessFile(filename,"rw");
OutputStream outfile = new BufferedOutputStream(new FileOutputStream(filename));
while (true) {
filename = new File(FilePath + count + ".sp");
if (filename.exists()) {
//RandomAccessFile infile = new RandomAccessFile(filename,"r");
InputStream infile = new BufferedInputStream(new FileInputStream(filename));
data = infile.read();
while (data != -1) {
outfile.write(data);
data = infile.read();
}
leng++;
infile.close();
count++;
} else {
break;
}
}
outfile.close();
} catch (Exception e) {
e.printStackTrace();
}
}
public void split(String FilePath, long splitlen) {
long leninfile = 0, leng = 0;
int count = 1, data;
try {
File filename = new File(FilePath);
//RandomAccessFile infile = new RandomAccessFile(filename, "r");
InputStream infile = new BufferedInputStream(new FileInputStream(filename));
data = infile.read();
while (data != -1) {
filename = new File(FilePath + count + ".sp");
//RandomAccessFile outfile = new RandomAccessFile(filename, "rw");
OutputStream outfile = new BufferedOutputStream(new FileOutputStream(filename));
while (data != -1 && leng < splitlen) {
outfile.write(data);
leng++;
data = infile.read();
}
leninfile += leng;
leng = 0;
outfile.close();
count++;
}
} catch (Exception e) {
e.printStackTrace();
}
}
The complete Java code is available at the File Split in Java Program link.
A clean solution to edit.
Note that this solution loads the entire file into memory:
it reads all lines of the file into List<String> rowsOfFile;
edit maxSizeFile to choose the maximum size of a single split file.
public void splitFile(File fileToSplit) throws IOException {
    long maxSizeFile = 10000000; // 10 MB
    StringBuilder buffer = new StringBuilder((int) maxSizeFile);
    int sizeOfRows = 0;
    int recurrence = 0;
    String fileName;
    List<String> rowsOfFile = Files.readAllLines(fileToSplit.toPath(), Charset.defaultCharset());
    for (String row : rowsOfFile) {
        buffer.append(row).append(System.lineSeparator()); // keep the line breaks in the output
        sizeOfRows += row.getBytes(StandardCharsets.UTF_8).length;
        if (sizeOfRows >= maxSizeFile) {
            fileName = generateFileName(recurrence);
            File newFile = new File(fileName);
            try (PrintWriter writer = new PrintWriter(newFile)) {
                writer.print(buffer.toString());
            }
            recurrence++;
            sizeOfRows = 0;
            buffer = new StringBuilder();
        }
    }
    // last rows
    if (sizeOfRows > 0) {
        fileName = generateFileName(recurrence);
        File newFile = new File(fileName);
        try (PrintWriter writer = new PrintWriter(newFile)) {
            writer.print(buffer.toString());
        }
    }
    Files.delete(fileToSplit.toPath());
}
Method to generate the file name:
public String generateFileName(int numFile) {
String extension = ".txt";
return "myFile" + numFile + extension;
}
Keep a counter to count the number of entries. Let's say one entry per line.
Step 1: Initially, create a new subfile and set counter = 0.
Step 2: Increment the counter as you read each entry from the source file into a buffer.
Step 3: When the counter reaches the limit of entries you want in each subfile, flush the buffer's contents to the subfile and close it.
Step 4: Jump to step 1 until you run out of data in the source file. A sketch of these steps follows below.
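A minimal sketch of the counter approach (the names splitByEntries and entriesPerFile are mine, not from the answer):
// Split source into subfiles of at most entriesPerFile lines each.
static void splitByEntries(File source, int entriesPerFile) throws IOException {
    try (BufferedReader reader = new BufferedReader(new FileReader(source))) {
        BufferedWriter writer = null;
        int counter = 0, part = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            if (writer == null) { // step 1: open a fresh subfile
                part++;
                writer = new BufferedWriter(new FileWriter(source.getName() + ".part" + part));
            }
            writer.write(line);   // step 2: copy one entry, count it
            writer.newLine();
            if (++counter == entriesPerFile) { // step 3: limit reached
                writer.close();   // flush and close the subfile
                writer = null;    // step 4: next entry starts a new subfile
                counter = 0;
            }
        }
        if (writer != null) {
            writer.close();
        }
    }
}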
There's no need to loop through the file twice. You could estimate the size of each chunk as the source file size divided by the number of chunks needed. Then you just stop filling each chunk with data once its size exceeds the estimate (see the sketch after this paragraph).
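A sketch of that single-pass estimate (the helper name splitByEstimatedSize is mine):
// Target roughly sourceSize / numChunks bytes per chunk, rolling over to
// the next file once the running byte count exceeds the estimate.
static void splitByEstimatedSize(File source, int numChunks) throws IOException {
    long estimate = source.length() / numChunks;
    try (BufferedReader reader = new BufferedReader(new FileReader(source))) {
        int part = 1;
        long written = 0;
        Writer writer = new BufferedWriter(new FileWriter(source.getName() + ".chunk" + part));
        String line;
        while ((line = reader.readLine()) != null) {
            writer.write(line + System.lineSeparator());
            written += line.getBytes().length + 1; // rough running size in bytes
            if (written >= estimate && part < numChunks) { // stop filling this chunk
                writer.close();
                part++;
                written = 0;
                writer = new BufferedWriter(new FileWriter(source.getName() + ".chunk" + part));
            }
        }
        writer.close();
    }
}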
Here is one that worked for me; I used it to split a 10 GB file. It also lets you add a header and a footer, which is very useful when splitting document-based formats such as XML and JSON, because you need to add the document wrapper to the new split files.
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
public class FileSpliter
{
public static void main(String[] args) throws IOException
{
splitTextFiles("D:\\xref.csx", 750000, "", "", null);
}
public static void splitTextFiles(String fileName, int maxRows, String header, String footer, String targetDir) throws IOException
{
File bigFile = new File(fileName);
int i = 1;
String ext = fileName.substring(fileName.lastIndexOf("."));
String fileNoExt = bigFile.getName().replace(ext, "");
File newDir = null;
if(targetDir != null)
{
newDir = new File(targetDir);
}
else
{
newDir = new File(bigFile.getParent() + "\\" + fileNoExt + "_split");
}
newDir.mkdirs();
try (BufferedReader reader = Files.newBufferedReader(Paths.get(fileName)))
{
String line = null;
int lineNum = 1;
Path splitFile = Paths.get(newDir.getPath() + "\\" + fileNoExt + "_" + String.format("%02d", i) + ext);
BufferedWriter writer = Files.newBufferedWriter(splitFile, StandardOpenOption.CREATE);
while ((line = reader.readLine()) != null)
{
if(lineNum == 1)
{
System.out.print("new file created '" + splitFile.toString());
if(header != null && header.length() > 0)
{
writer.append(header);
writer.newLine();
}
}
writer.append(line);
if (lineNum >= maxRows)
{
if(footer != null && footer.length() > 0)
{
writer.newLine();
writer.append(footer);
}
writer.close();
System.out.println(", " + lineNum + " lines written to file");
lineNum = 1;
i++;
splitFile = Paths.get(newDir.getPath() + "\\" + fileNoExt + "_" + String.format("%02d", i) + ext);
writer = Files.newBufferedWriter(splitFile, StandardOpenOption.CREATE);
}
else
{
writer.newLine();
lineNum++;
}
}
if(lineNum <= maxRows) // early exit
{
if(footer != null && footer.length() > 0)
{
writer.newLine();
lineNum++;
writer.append(footer);
}
}
writer.close();
System.out.println(", " + lineNum + " lines written to file");
}
System.out.println("file '" + bigFile.getName() + "' split into " + i + " files");
}
}
The code below splits a big file into smaller files with fewer lines each.
long linesWritten = 0;
int count = 1;
try {
File inputFile = new File(inputFilePath);
InputStream inputFileStream = new BufferedInputStream(new FileInputStream(inputFile));
BufferedReader reader = new BufferedReader(new InputStreamReader(inputFileStream));
String line = reader.readLine();
String fileName = inputFile.getName();
String outfileName = outputFolderPath + "\\" + fileName;
while (line != null) {
File outFile = new File(outfileName + "_" + count + ".split");
Writer writer = new OutputStreamWriter(new FileOutputStream(outFile));
while (line != null && linesWritten < linesPerSplit) {
writer.write(line);
line = reader.readLine();
linesWritten++;
}
writer.close();
linesWritten = 0;//next file
count++;//next file count
}
reader.close();
} catch (Exception e) {
e.printStackTrace();
}
Split a file into multiple chunks (an in-memory operation); here I'm splitting any file into chunks of 500 KB (500,000 bytes):
public static List<ByteArrayOutputStream> splitFile(File f) {
List<ByteArrayOutputStream> datalist = new ArrayList<>();
try {
int sizeOfFiles = 500000;
byte[] buffer = new byte[sizeOfFiles];
try (FileInputStream fis = new FileInputStream(f); BufferedInputStream bis = new BufferedInputStream(fis)) {
int bytesAmount = 0;
while ((bytesAmount = bis.read(buffer)) > 0) {
try (OutputStream out = new ByteArrayOutputStream()) {
out.write(buffer, 0, bytesAmount);
out.flush();
datalist.add((ByteArrayOutputStream) out);
}
}
}
} catch (Exception e) {
//get the error
}
return datalist;
}
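A hypothetical usage example (the file names are mine), persisting each in-memory chunk to its own file:
List<ByteArrayOutputStream> chunks = splitFile(new File("data.bin"));
for (int i = 0; i < chunks.size(); i++) {
    try (FileOutputStream fos = new FileOutputStream("data.part" + i)) {
        chunks.get(i).writeTo(fos); // write this chunk's bytes to disk
    }
}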
I am a bit late to answer, but here's how I did it:
Approach:
First I determine how many bytes each of the individual files should contain, then I split the large file by bytes. Only one file chunk's worth of data is loaded into memory at a time.
Example: if a 5 GB file is split into 10 files, then only 500 MB worth of bytes is loaded into memory at a time, held in the buffer variable in the splitBySize method below.
Code Explanation:
The method splitFile first gets the number of bytes each individual file chunk should contain by calling the getSizeInBytes method, then it calls the splitBySize method, which splits the large file by size (i.e. maxChunkSize represents the number of bytes each file chunk will contain).
public static List<File> splitFile(File largeFile, int noOfFiles) throws IOException {
return splitBySize(largeFile, getSizeInBytes(largeFile.length(), noOfFiles));
}
public static List<File> splitBySize(File largeFile, int maxChunkSize) throws IOException {
List<File> list = new ArrayList<>();
int numberOfFiles = 0;
try (InputStream in = Files.newInputStream(largeFile.toPath())) {
final byte[] buffer = new byte[maxChunkSize];
int dataRead = in.read(buffer);
while (dataRead > -1) {
list.add(stageLocally(buffer, dataRead));
numberOfFiles++;
dataRead = in.read(buffer);
}
}
System.out.println("Number of files generated: " + numberOfFiles);
return list;
}
private static int getSizeInBytes(long totalBytes, int numberOfFiles) {
if (totalBytes % numberOfFiles != 0) {
totalBytes = ((totalBytes / numberOfFiles) + 1)*numberOfFiles;
}
long x = totalBytes / numberOfFiles;
if (x > Integer.MAX_VALUE){
throw new NumberFormatException("Byte chunk too large");
}
return (int) x;
}
Full Code:
public class StackOverflow {
private static final String INPUT_FILE_PATH = "/Users/malkesingh/Downloads/5MB.zip";
private static final String TEMP_DIRECTORY = "/Users/malkesingh/temp";
public static void main(String[] args) throws IOException {
File input = new File(INPUT_FILE_PATH);
File outPut = fileJoin2(splitFile(input, 5));
try (InputStream in = Files.newInputStream(input.toPath()); InputStream out = Files.newInputStream(outPut.toPath())) {
System.out.println(IOUtils.contentEquals(in, out));
}
}
public static List<File> splitFile(File largeFile, int noOfFiles) throws IOException {
return splitBySize(largeFile, getSizeInBytes(largeFile.length(), noOfFiles));
}
public static List<File> splitBySize(File largeFile, int maxChunkSize) throws IOException {
List<File> list = new ArrayList<>();
int numberOfFiles = 0;
try (InputStream in = Files.newInputStream(largeFile.toPath())) {
final byte[] buffer = new byte[maxChunkSize];
int dataRead = in.read(buffer);
while (dataRead > -1) {
list.add(stageLocally(buffer, dataRead));
numberOfFiles++;
dataRead = in.read(buffer);
}
}
System.out.println("Number of files generated: " + numberOfFiles);
return list;
}
private static int getSizeInBytes(long totalBytes, int numberOfFiles) {
if (totalBytes % numberOfFiles != 0) {
totalBytes = ((totalBytes / numberOfFiles) + 1)*numberOfFiles;
}
long x = totalBytes / numberOfFiles;
if (x > Integer.MAX_VALUE){
throw new NumberFormatException("Byte chunk too large");
}
return (int) x;
}
private static File stageLocally(byte[] buffer, int length) throws IOException {
File outPutFile = File.createTempFile("temp-", "split", new File(TEMP_DIRECTORY));
try(FileOutputStream fos = new FileOutputStream(outPutFile)) {
fos.write(buffer, 0, length);
}
return outPutFile;
}
public static File fileJoin2(List<File> list) throws IOException {
File outPutFile = File.createTempFile("temp-", "unsplit", new File(TEMP_DIRECTORY));
FileOutputStream fos = new FileOutputStream(outPutFile);
for (File file : list) {
Files.copy(file.toPath(), fos);
}
fos.close();
return outPutFile;
}
}
import java.util.*;
import java.io.*;

public class task13 {
    public static void main(String[] args) throws IOException {
        Scanner s = new Scanner(System.in);
        System.out.print("Enter path:");
        String a = s.next();
        File f = new File(a + ".txt");
        Scanner st = new Scanner(f);
        System.out.println(f.canRead() + "\n" + f.canWrite());
        long l = f.length();
        System.out.println("Length is:" + l);
        System.out.print("Enter no.of partitions:");
        int p = s.nextInt();
        long x = l / p;
        st.useDelimiter("\\Z"); // read the whole file as a single token
        String t = st.next();
        int j = 0;
        System.out.println("Each File Length is:" + x);
        for (int i = 1; i <= p; i++) {
            File ft = new File(a + "-" + i + ".txt");
            ft.createNewFile();
            int g = j * (int) x;
            int h = (j + 1) * (int) x;
            if (g <= l && h <= l) {
                FileWriter fw = new FileWriter(a + "-" + i + ".txt");
                String v = t.substring(g, h);
                fw.write(v);
                j++;
                fw.close();
            }
        }
    }
}
