Struggling to write 300k rows to csv file through Apache POI java. I have been trying to generate a csv file from an excel file with 300k rows. Everytime, I get GCOutMemory error when it tries to write to output csv file. I even tried splitting the write for every 100k rows. The output file size keeps on growing but I don't see system.println statement isnt getting printed.
import javafx.beans.binding.StringBinding;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryPoolMXBean;
import java.lang.management.MemoryType;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Timestamp;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ReadWrite {
private static Logger logger= LoggerFactory.getLogger(ReadWrite.class);
public static void main(String[] args) {
try {
long startReading = System.currentTimeMillis();
Path path = Paths.get("/Users/venkatesh/Documents/Citiout_files/citiout300k_2sheets.xlsx");
byte[] result = new byte[0];
try {
result = Files.readAllBytes(path);
} catch (IOException e) {
e.printStackTrace();
}
InputStream is = new ByteArrayInputStream(result);
Workbook workbook = WorkbookFactory.create(is);
long readDone = System.currentTimeMillis() - startReading;
logger.info("read time " + readDone);
Sheet sheet = workbook.getSheetAt(1);
Row firstRow = sheet.getRow(0);
int headcol = firstRow.getLastCellNum();
long startTransform = System.currentTimeMillis();
firstRow.createCell(headcol++).setCellValue("Sold Amount1");
firstRow.createCell(headcol++).setCellValue("CF_Quantity1");
firstRow.createCell(headcol++).setCellValue("CF_Quantity2");
firstRow.createCell(headcol++).setCellValue("CF_TradePrice");
firstRow.createCell(headcol++).setCellValue("CF_ForwardPrice");
firstRow.createCell(headcol++).setCellValue("CF_UnrealizedPL");
firstRow.createCell(headcol++).setCellValue("CF_Quantity1Round");
firstRow.createCell(headcol++).setCellValue("CF_Quantity2Round");
firstRow.createCell(headcol++).setCellValue("CF_FXLotKeyNoTradeDate");
firstRow.createCell(headcol++).setCellValue("CF_FXRoundedKeyNoTradeDate");
firstRow.createCell(headcol++).setCellValue("CF_SettlementDate");
for (int i = 1; i <=sheet.getLastRowNum()+1; i++) {
String jj="";
Row nRow = sheet.getRow(i-1);
for(Cell c:nRow) {
if (c.getColumnIndex()==3 && i!=1) {
Calendar cal = Calendar.getInstance();
Date date1 = new SimpleDateFormat("dd-MMM-yyyy").parse(c.getStringCellValue());
cal.setTime(date1);
jj = String.valueOf(cal.get(Calendar.MONTH)+1) + "/" + String.valueOf(cal.get(Calendar.DAY_OF_MONTH)) + "/" + String.valueOf(cal.get(Calendar.YEAR));
}
}
int count = nRow.getLastCellNum();
//System.out.println(nRow.getCell(3).getClass());
nRow.createCell(count++).setCellFormula("G" + i + "*-1");
nRow.createCell(count++).setCellFormula("E" + i + "/" + "G" + i);
nRow.createCell(count++).setCellFormula("G" + i + "/E" + i);
nRow.createCell(count++).setCellFormula("ROUND(ABS(T" + i + "/S" + i + "),6)");
nRow.createCell(count++).setCellFormula("ROUND(K" + i + ",6)");
nRow.createCell(count++).setCellFormula("ROUND(N" + i + ",2)");
nRow.createCell(count++).setCellFormula("ROUND(S" + i + ",0)");
nRow.createCell(count++).setCellFormula("ROUND(T" + i + ",0)");
nRow.createCell(count++).setCellFormula("CONCATENATE(T" + i + "," + "\"~\"" + ",S" + i + ")");
nRow.createCell(count++).setCellFormula("CONCATENATE(X" + i + "," + "\"~\"" + ",Y" + i + ")");
nRow.createCell(count++).setCellValue(jj);
c.setCellValue(DateUtil.getExcelDate(calendar.getTime()));
}
long endTransform = System.currentTimeMillis() - startTransform;
System.out.println("Transformations time " + endTransform);
final FormulaEvaluator evaluator = workbook.getCreationHelper().createFormulaEvaluator();
FileWriter writer= new FileWriter(new enter code hereFile("/Users/venkatesh/Documents/cit300k.csv"));
StringBuilder data = new StringBuilder();
Iterator<Row> rowIterator = workbook.getSheetAt(1).iterator();
try {
while (rowIterator.hasNext()) {
Row row = rowIterator.next();
Iterator<Cell> cellIterator = row.cellIterator();
while (cellIterator.hasNext()) {
Cell cell = cellIterator.next();
CellType type = cell.getCellType();
if (type == CellType.BOOLEAN) {
data.append(cell.getBooleanCellValue());
} else if (type == CellType.NUMERIC) {
data.append(cell.getNumericCellValue());
} else if (type == CellType.STRING) {
data.append(cell.getStringCellValue());
} else if (type == CellType.FORMULA) {
switch (evaluator.evaluateFormulaCell(cell)) {
case STRING:
data.append(cell.getStringCellValue());
break;
case NUMERIC:
data.append(cell.getNumericCellValue());
break;
}
} else if (type == CellType.BLANK) {
} else {
data.append(cell + "");
}
data.append(",");
}
writer.append(data.toString());
writer.append('\n');
}
} catch(Exception e){
e.printStackTrace();
}
finally{
if(writer!=null){
writer.flush();
writer.close();
}
}
for (MemoryPoolMXBean mpBean: ManagementFactory.getMemoryPoolMXBeans()) {
if (mpBean.getType() == MemoryType.HEAP) {
System.out.printf(
"Name: %s: %s\n",
mpBean.getName(), mpBean.getUsage()
);
}
}
try {
workbook.close();
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
catch (Exception e){
e.printStackTrace();
}
}
}
20-01-12 19:52:49:267 INFO main ReadWrite:64 - read time 11354
Transformations time 38659
Exception in thread "main" java.lang.OutOfMemoryError: GC overhead limit exceeded
at java.util.TreeMap$Values.iterator(TreeMap.java:1031)
at org.apache.poi.xssf.usermodel.XSSFRow.cellIterator(XSSFRow.java:117)
at org.apache.poi.xssf.usermodel.XSSFRow.iterator(XSSFRow.java:132)
at org.apache.poi.xssf.usermodel.XSSFEvaluationSheet.getCell(XSSFEvaluationSheet.java:86)
at org.apache.poi.ss.formula.WorkbookEvaluator.evaluateFormula(WorkbookEvaluator.java:402)
at org.apache.poi.ss.formula.WorkbookEvaluator.evaluateAny(WorkbookEvaluator.java:275)
at org.apache.poi.ss.formula.WorkbookEvaluator.evaluate(WorkbookEvaluator.java:216)
at org.apache.poi.xssf.usermodel.BaseXSSFFormulaEvaluator.evaluateFormulaCellValue(BaseXSSFFormulaEvaluator.java:56)
at org.apache.poi.ss.formula.BaseFormulaEvaluator.evaluateFormulaCell(BaseFormulaEvaluator.java:185)
at ReadWrite.main(ReadWrite.java:150)
So now that we have a usable stacktrace, it is clear that the problem is NOT happening while writing the CSV file. It is actually happening while you are evaluating a spreadsheet formula. My guess is that the formula is summing across all rows in a sheet ... or something like that.
This is a problem, and there is probably no simple solution.
Here's what the POI documentation says:
File sizes/Memory usage
There are some inherent limits in the Excel file formats. These are defined in class SpreadsheetVersion. As long as you have enough
main-memory, you should be able to handle files up to these limits.
For huge files using the default POI classes you will likely need a
very large amount of memory.
There are ways to overcome the main-memory limitations if needed:
For writing very huge files, there is SXSSFWorkbook which allows to do a streaming write of data out to files (with certain
limitations on what you can do as only parts of the file are held in
memory).
For reading very huge files, take a look at the sample XLSX2CSV which shows how you can read a file in streaming
fashion (again with some limitations on what information you can read
out of the file, but there are ways to get at most of it if
necessary).
You are clearly running into these memory limitations. Basically, POI is trying to load too much of the spreadsheet into memory ... while you are evaluating the spreadsheet formulae ... and you are filling the heap.
One solution would be to increase the Java heap size. Or if you are already using all available RAM for your heap, run the conversion on a machine with more RAM. A lot of standard PCs have 16GB RAM these days. Maybe it is time for a hardware upgrade? But I'm guessing you have already thought of this.
If increasing the heap size is not viable, then you will need to rewrite your application to use SXSSFWorkbook. Furthermore, you may need to replace your approach of using formula evaluation with doing the calculations in native Java in a way that is compatible with row-by-row streaming of the spreadsheet. (It will depend on what the formulae do.)
Look at the linked example from the POI documentation for ideas.
Related
I tested the program I developed.
However, there is a significant difference in performance for each operating system of Java.
I've been trying to find the cause all day, but haven't found it.
The logic I developed is as follows:
A user uploads an excel in a browser.
Read excel using poi library.
Converts the read string data into an object through parsing.
Save the converted object to Cassandra database.
In the demo source, only the part that parses the data is attached at the bottom.
The data parsing logic also has a serious performance difference.
The environment configuration is as follows.
SpringBoot-2.2.2.Release
JVMOptions is -Xms16g -Xmx16g
Cassandra-3.11.7
OpenJDK1.8
I ran 3 tests, and the results are:
Test 1
SpringBoot-2.2.2.Release
JVMOptions is -Xms16g -Xmx16g
Cassandra-3.11.7
OpenJDK1.8
Windows OS
poi excel read: 2 seconds
Data parsing: 42 seconds
Database save: 31 seconds
Linux OS
poi excel read: 2 seconds
Data parsing: 4 seconds
Database save: 5 seconds
Test 2 (Save logic removed)
SpringBoot-2.2.2.Release
JVMOptions is -Xms16g -Xmx16g
Cassandra-3.11.7
OpenJDK1.8
Windows OS
poi excel read: 2 seconds
Data parsing: 40 seconds
Linux OS
poi excel read: 1 seconds
Data parsing: 6 seconds
Test 3
This test was conducted because it was determined that there was a performance degradation in the process of parsing data.
JVMOptions is -Xms16g -Xmx16g
OpenJDK1.8
Windows OS
poi excel read: 2 seconds
Data parsing: 2 seconds
Linux OS
poi excel read: 2 seconds
Data parsing: 1 seconds
However, the results were unexpected.
By removing all previously developed logic and performing data parsing, the speed has been greatly increased.
I am doing it through a simple restAPI request in the browser.
I don't know which point to do cause analysis. Help.
Excel sample data to parse DOWNLOAD
package com.example.demo;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
public class DemoApplication {
public static final List<String> TABLE_USERNOUN_EXCEL_FIELD = Arrays.asList("keyword", "separation", "foreign");
public static final List<String> TITLE_USERNOUN_EXCEL_FIELD = Arrays.asList("검색어", "분리정보", "외래어");
public static void main(String[] args) {
try {
Long start = new Date().getTime();
List<Map<String, Object>> data = readFileData();
Long end = new Date().getTime();
System.out.println(data.toString());
System.out.println("finish time : " +(end-start)/1000+ "seconds");
} catch (IOException e) {
e.printStackTrace();
}
}
public static List<Map<String, Object>> readFileData() throws IOException {
List<String> tableColumns = TABLE_USERNOUN_EXCEL_FIELD;
List<String> columnNames = TITLE_USERNOUN_EXCEL_FIELD;
List<Map<String, Object>> contents = new ArrayList(); // contents array
List<Integer> requiredList = Arrays.asList(0,1);
Path targetLocation = Paths.get("D:","usernoun_dic_10000.xlsx");
InputStream inputStream = new FileInputStream(new File(targetLocation.toString()));
SXSSFWorkbook workbook = new SXSSFWorkbook(new XSSFWorkbook(inputStream));
Sheet sheet = workbook.getXSSFWorkbook().getSheetAt(0);
int rows = sheet.getPhysicalNumberOfRows();
if (rows<2){
workbook.close();
inputStream.close();
return contents;
}
Row checkRow = sheet.getRow(0);
int cellCnt = checkRow.getPhysicalNumberOfCells();
if (cellCnt<columnNames.size()){
workbook.close();
inputStream.close();
return contents;
}else if (cellCnt>columnNames.size()){
Object check = getCellValue(checkRow.getCell(tableColumns.size()));
if (check!=null){
if (!check.toString().equals("")){
workbook.close();
inputStream.close();
return contents;
}
}
}else{
for (int i=0; i<cellCnt; i++){
Object cellVal = getCellValue(checkRow.getCell(i));
String columnName = "";
if (cellVal!=null){
columnName = cellVal.toString();
}
if (columnName.equals("")||!columnName.equals(columnNames.get(i))){
workbook.close();
inputStream.close();
return contents;
}
}
}
sheet.removeRow(checkRow);
List<String> finalTableColumns = tableColumns;
List<Integer> finalRequiredList = requiredList;
sheet.forEach(row -> {
Map<String, Object> content = new HashMap<>(); //contents object
AtomicReference<Boolean> skip = new AtomicReference<>(false);
row.forEach(cell -> {
int cellIdx = cell.getColumnIndex();
if (cellIdx< finalTableColumns.size()){
Object value = getCellValue(cell);
boolean require = finalRequiredList.stream().anyMatch(integer -> integer==cellIdx);
if (require){
if (value!=null){
if (value.toString().equals("")){
skip.set(true);
}
}else{
skip.set(true);
}
}
content.put(finalTableColumns.get(content.size()), value);
}
});
if (!skip.get()){
contents.add(content);
}
});
workbook.close();
inputStream.close();
return contents;
}
public static Object getCellValue(Cell cell) {
switch (cell.getCellType()) {
case BLANK: //Null exception
return "";
case ERROR:
return cell.getErrorCellValue();
case STRING:
return cell.getStringCellValue();
case BOOLEAN:
return cell.getBooleanCellValue();
case NUMERIC:
return cell.getNumericCellValue();
case FORMULA:
return cell.getCellFormula();
}
return null;
}
}
I have been trying to edit my code to allow a XLSX file to be uploaded and be able to be read on the website. But after countless tries, the data I typed into the XLSX File is unable to be captured on the website. (Eg: After downloading the XLSX Template from the website, I am able to type in anything that I want in the XLSX file and able to upload it again to the website so I do not need to keep on adding new data by clicking "new" every single time. I can just type in everything in that XLSX File all at once and upload it right away)
I was told to use hashmap but I am unsure of the way it works. The codes I have currently only enables the website to capture the header title and I am not suppose to use jxl.
While removing those codes that has jxl, I encounter some errors (being underline in red).
public HashMap getConstructJXLList_xlsx(UploadedFile File, int Sheetindex) {
String _LOC = "[PageCodeBase: getConstructJXLList]";
HashMap _m = new HashMap();
InputStream _is = null;
try {
_is = File.getInputstream();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
XSSFWorkbook workbook;
XSSFSheet s;
try {
workbook = new XSSFWorkbook(_is);
s = workbook.getSheetAt(Sheetindex);
} catch (Exception e) {
System.out.println(_LOC + "1.0 " + " Test:");
int _totalc = getColumns(); //getColumns is being underline in red
int _totalr = getRows(); //getRows is being underline in red
// Header r=0
String[] _st = new String[_totalc];
//XSSFSheet sheet = null;
for (int _c = 0; _c < _totalc; _c++) {
_st[_c] = getCell(_c, 0); //getCell is being underline in red
}
_m.put("HEADER", _st);
System.out.println(_LOC + "1.0 " + " _m:" + _m);
// Data r=1 thereafter
List _l = new ArrayList();
for (int _r = 1; _r < _totalr; _r++) {
Object[] _o = new Object[_totalc];
String _s_r = null;
for (int _c = 0; _c < _totalc; _c++) {
_o[_c] = getCell(_c, _r);
String _cn = _o[_c].getClass().getName();
String _s_c = null;
if (!isEmptyNull(_s_c)) {
_s_r = "record_available";
}
}
if ((_o != null) && (_o.length != 0)) {
_l.add(_o);
}
}
_m.put("DATA", _l);
System.out.println(_LOC + "1.0 " + " _m:" + _m);
}
return _m;
}
Do you mind helping me to solve this? Why there isn't any data being capture in the website? The error shown is "The method getColumns/getCell/getRows is undefined for the type PageCodeBase." And the help/quick fix given is to create a new method. But after creating the new method, I am unsure of what to add in the methods. Have tried various example (http://snippetjournal.wordpress.com/2014/02/05/read-xlsx-using-poi/) but I stil can't seem to get it work out.
I would recommend you to manage de excel file using this classes from the apache POI api
org.apache.poi.ss.usermodel.Cell;
org.apache.poi.ss.usermodel.Row;
org.apache.poi.ss.usermodel.Sheet;
org.apache.poi.ss.usermodel.Workbook;
org.apache.poi.ss.usermodel.WorkbookFactory;
instead of those XSSFWorkbook, XSSFSheet...
And also when accessing the file input stream try doing it this way:
FileInputStream input = new FileInputStream(new File("C:\\Users\\admin\\Desktop\\Load_AcctCntr_Template.xlsx"));
Workbook workBook = WorkbookFactory.create(stream);
workBook.getSheetAt(0);
use this.
FileInputStream input = new FileInputStream(new File("C:/Users/admin/Desktop/Load_AcctCntr_Template.xlsx"));
Workbook wb = WorkbookFactory.create(input);
as mentioned in user3661357 answer. use
Workbook instead of XSSFWorkbook.
Sheet instead of XSSFSheet.
etc..
Also read this
Getting Exception(org.apache.poi.openxml4j.exception - no content type [M1.13]) when reading xlsx file using Apache POI?
*HINT > use ALT+SHIFT+I in netbeans to load the necessary packages.
A working example
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
public class POITest {
public static void test() {
try {
FileInputStream input = new FileInputStream(new File("C:/Users/kingslayer/Desktop/test/a.xlsx"));
Workbook wb = WorkbookFactory.create(input);
Sheet s = wb.getSheetAt(0);
Iterator<Row> rows = s.rowIterator();
while (rows.hasNext()) {
Row row = rows.next();
Iterator cells = row.cellIterator();
while (cells.hasNext()) {
Cell cell = (Cell) cells.next();
if (cell.getCellType() == Cell.CELL_TYPE_STRING) {
System.out.print(cell.getStringCellValue() + "t");
} else if (cell.getCellType() == Cell.CELL_TYPE_NUMERIC) {
System.out.print(cell.getNumericCellValue() + "t");
} else if (cell.CELL_TYPE_BLANK == cell.getCellType()) {
System.out.print("BLANK ");
} else {
System.out.print("Unknown cell type");
}
}
input.close();
}
} catch (IOException | InvalidFormatException ex) {
Logger.getLogger(POITest.class.getName()).log(Level.SEVERE, null, ex);
}
}
public static void main(String[] args) {
test();
}
}
All the libraries you must have on the project path.
commons-codec-1.5.jar ,
commons-logging-1.1.jar ,
dom4j-1.6.1.jar ,
junit-3.8.1.jar ,
log4j-1.2.13.jar ,
poi-3.9-20121203.jar ,
poi-excelant-3.9-20121203.jar ,
poi-ooxml-3.9-20121203.jar ,
poi-ooxml-schemas-3.9-20121203.jar ,
poi-scratchpad-3.9-20121203.jar ,
stax-api-1.0.1.jar ,
xmlbeans-2.3.0.jar ,
1) get rid of POIFSFileSystem fs = new POIFSFileSystem(input); as you are not using it
2) input.close(); is called after first iteration of row
I am trying to read this file I created as sample made up from 4 columns and 1 row.
The code below was taken to test the API i am using i.e. Apache POI..
package testjavaexcel;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Date;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
/**
*
*/
public class TestJavaExcel {
/**
* #param args the command line arguments
*/
public static void main(String[] args) {
try {
FileInputStream fileInputStream = new FileInputStream("poi-test.xls");
HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream);
HSSFSheet worksheet = workbook.getSheet("POI Worksheet");
HSSFRow row1 = worksheet.getRow(0);
HSSFCell cellA1 = row1.getCell((short) 0);
String a1Val = cellA1.getStringCellValue();
HSSFCell cellB1 = row1.getCell((short) 1);
String b1Val = cellB1.getStringCellValue();
HSSFCell cellC1 = row1.getCell((short) 2);
boolean c1Val = cellC1.getBooleanCellValue();
HSSFCell cellD1 = row1.getCell((short) 3);
Date d1Val = cellD1.getDateCellValue();
System.out.println("A1: " + a1Val);
System.out.println("B1: " + b1Val);
System.out.println("C1: " + c1Val);
System.out.println("D1: " + d1Val);
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
The error coming from line "HSSFRow row1 = worksheet.getRow(0);" output is:
Exception in thread "main" java.lang.NullPointerException at
testjavaexcel.TestJavaExcel.main(TestJavaExcel.java:28) Java Result: 1
Not sure why this is happening...it seems like straight forward.note that the .getCell() methods invoked are all striked indicating deprecated methods but not sure how can I replace them given the API.
Thanks,
UPDATE: I figured out that the new method if getCell takes int instead of older version using short type. that fixed the deprecated warning. The rest remains unsolved. Also i am using poi version 3.8
Check that your excel book must have a sheet with the name "POI Worksheet" (exact name).
Does anyone here know of any quick, clean way to convert csv files to xls or xlsx files in java?
I have something to manage csv files already in place and I need the extra compatibility for other programs.
Sample code in addition to package names is always well appreciated.
Many thanks,
Justian
Here's my code thus far. I need to remove the returns ("\n") from the lines. Some of my cells contain multiple lines of information (a list), so I can use "\n" in csv to indicate multiple lines within a cell, but xls treats these as if I mean to put them on a new line.
The code is modified from the internet and a little messy at the moment. You might notice some deprecated methods, as it was written in 2004, and be sure to ignore the terrible return statements. I'm just using S.o.p at the moment for testing and I'll clean that up later.
package jab.jm.io;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
public class FileConverter {
public static String ConvertCSVToXLS(String file) throws IOException {
if (file.indexOf(".csv") < 0)
return "Error converting file: .csv file not given.";
String name = FileManager.getFileNameFromPath(file, false);
ArrayList<ArrayList<String>> arList = new ArrayList<ArrayList<String>>();
ArrayList<String> al = null;
String thisLine;
DataInputStream myInput = new DataInputStream(new FileInputStream(file));
while ((thisLine = myInput.readLine()) != null) {
al = new ArrayList<String>();
String strar[] = thisLine.split(",");
for (int j = 0; j < strar.length; j++) {
// My Attempt (BELOW)
String edit = strar[j].replace('\n', ' ');
al.add(edit);
}
arList.add(al);
System.out.println();
}
try {
HSSFWorkbook hwb = new HSSFWorkbook();
HSSFSheet sheet = hwb.createSheet("new sheet");
for (int k = 0; k < arList.size(); k++) {
ArrayList<String> ardata = (ArrayList<String>) arList.get(k);
HSSFRow row = sheet.createRow((short) 0 + k);
for (int p = 0; p < ardata.size(); p++) {
System.out.print(ardata.get(p));
HSSFCell cell = row.createCell((short) p);
cell.setCellValue(ardata.get(p).toString());
}
}
FileOutputStream fileOut = new FileOutputStream(
FileManager.getCleanPath() + "/converted files/" + name
+ ".xls");
hwb.write(fileOut);
fileOut.close();
System.out.println(name + ".xls has been generated");
} catch (Exception ex) {
}
return "";
}
}
Don't know if you know this already, but:
Excel (if that's your real target) is easily able to read .csv files directly, so any conversion you'd do would only be a courtesy to your less "gifted" users.
CSV is a lowest-common-denominator format. It's unlikely for any converter to add information to that found in a .csv file that will make it more useful. In other words, CSV is a "dumb" format and converting it to .xls will (probably) increase file size but not make the format any smarter.
Curtis' suggestion of POI is the first thing that would come to my mind too.
If you're doing this conversion on a Windows machine, another alternative could be Jacob, a Java-COM bridge that would allow you to effectively remote control Excel from a Java program so as to do things like open a file and save in a different format, perhaps even applying some formatting changes or such.
Finally, I've also had some success doing SQL INSERTs (via JDBC) into an Excel worksheet accessed via the JDBC-ODBC bridge. i.e. ODBC can make an Excel file look like a database. It's not very flexible though, you can't ask the DB to create arbitrarily named .XLS files.
EDIT:
It looks to me like readLine() is already not giving you whole lines. How is it to know that carriage return is not a line terminator? You should be able to verify this with debug print statements right after the readLine().
If this is indeed so, it would suck because the way forward would be for you to
either recognize incomplete lines and paste them together after the fact,
or write your own substitute for readLine(). A simple approach would be to read character by character, replacing CRs within a CSV string and accumulating text in a StringBuilder until you feel you have a complete line.
Both alternatives are work you probably weren't looking forward to.
If you want to read or write XLS or XLSX files in Java, Apache POI is a good bet: http://poi.apache.org/
Copy paste the below program,I ran the program and it is working fine,Let me know if you have any concerns on this program.(You need Apache POI Jar to run this program)
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
public class CSVToExcelConverter {
public static void main(String args[]) throws IOException
{
ArrayList arList=null;
ArrayList al=null;
String fName = "test.csv";
String thisLine;
int count=0;
FileInputStream fis = new FileInputStream(fName);
DataInputStream myInput = new DataInputStream(fis);
int i=0;
arList = new ArrayList();
while ((thisLine = myInput.readLine()) != null)
{
al = new ArrayList();
String strar[] = thisLine.split(",");
for(int j=0;j<strar.length;j++)
{
al.add(strar[j]);
}
arList.add(al);
System.out.println();
i++;
}
try
{
HSSFWorkbook hwb = new HSSFWorkbook();
HSSFSheet sheet = hwb.createSheet("new sheet");
for(int k=0;k<arList.size();k++)
{
ArrayList ardata = (ArrayList)arList.get(k);
HSSFRow row = sheet.createRow((short) 0+k);
for(int p=0;p<ardata.size();p++)
{
HSSFCell cell = row.createCell((short) p);
String data = ardata.get(p).toString();
if(data.startsWith("=")){
cell.setCellType(Cell.CELL_TYPE_STRING);
data=data.replaceAll("\"", "");
data=data.replaceAll("=", "");
cell.setCellValue(data);
}else if(data.startsWith("\"")){
data=data.replaceAll("\"", "");
cell.setCellType(Cell.CELL_TYPE_STRING);
cell.setCellValue(data);
}else{
data=data.replaceAll("\"", "");
cell.setCellType(Cell.CELL_TYPE_NUMERIC);
cell.setCellValue(data);
}
//*/
// cell.setCellValue(ardata.get(p).toString());
}
System.out.println();
}
FileOutputStream fileOut = new FileOutputStream("test.xls");
hwb.write(fileOut);
fileOut.close();
System.out.println("Your excel file has been generated");
} catch ( Exception ex ) {
ex.printStackTrace();
} //main method ends
}
}
The tools in Excel are not adequate for what the OP wants to do. He's on the right track there. Excel cannot import multiple CSV files into different worksheets in the same file, which is why you'd want to do it in code. My suggestion is to use OpenCSV to read the CSV, as it can automatically correct for newlines in data and missing columns, and it's free and open source. It's actually very, very robust and can handle all sorts of different non-standard CSV files.
You wrote:
I have something to manage csv files
already in place and I need the extra
compatibility for other programs.
What are those other programs? Are they required to access your data through Excel files, or could they work with an JDBC or ODBC connection to a database? Using a database as the central location, you could extract the data into CSV files or other formats as needed.
I created a small software called csv2xls. It needs Java.
For this project I'm working on, I want to take multiple excel sheets and then merge them into one, manipulating the data as I please to make everything a little more readable.
What would be the best way to open files, read their contents, store that content, create a new file (.csv), then paste the information in the organization of my choosing?
I definitely need to stick to java, as this will be part of a pre-existing automated process and I don't want to have to change everything to another language.
Is there a useful package out there that I should know about?
Many thanks
Justian
I think any serious work in Excel should consider Joel's solution of letting Office do it for you on a Windows machine you call remotely if necessary. If, however, your needs are simple enough or you really need a pure Java solution, Apache's POI library does a good enough job.
As far as I know, csv is not excel-specific, but rather just a "comma-separated values"-file.
So this might help you.
Writing CSV files is usually very simple, for obvious reasons. You can write your own helper class to do it. The caveat is to ensure that you do not have your delimeter in any of the outputs.
Reading CSV is trickier. There isn't a standard library like there is in Python (a much better language, IMHO, for doing CSV processing), but if you search for it there are a lot of decent free implementations around.
The biggest question is the internal representation in your program: Depending on the size of your inputs and outputs, keeping everything in memory may be out of the question. Can you do everything in one pass? (I mean, read some, write some, etc.)
You may also want to use sparse representations rather than just represent all the spreadsheets in an array.
Maybe you should try this one:
Jxcell,it is a java spreadsheet component,and can read/write/edit all xls/xlsx/csv files.
Try this code
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.TimeoutException;
import java.util.logging.Logger;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbookFactory;
public class App {
public void convertExcelToCSV(Sheet sheet, String sheetName) {
StringBuffer data = new StringBuffer();
try {
FileOutputStream fos = new FileOutputStream("C:\\Users\\" + sheetName + ".csv");
Cell cell;
Row row;
Iterator<Row> rowIterator = sheet.iterator();
while (rowIterator.hasNext()) {
row = rowIterator.next();
Iterator<Cell> cellIterator = row.cellIterator();
while (cellIterator.hasNext()) {
cell = cellIterator.next();
CellType type = cell.getCellTypeEnum();
if (type == CellType.BOOLEAN) {
data.append(cell.getBooleanCellValue() + ",");
} else if (type == CellType.NUMERIC) {
data.append(cell.getNumericCellValue() + ",");
} else if (type == CellType.STRING) {
data.append(cell.getStringCellValue() + ",");
} else if (type == CellType.BLANK) {
data.append("" + ",");
} else {
data.append(cell + ",");
}
}
data.append('\n');
}
fos.write(data.toString().getBytes());
fos.close();
}
catch (FileNotFoundException e)
{
e.printStackTrace();
}
catch (IOException e)
{
e.printStackTrace();
}
}
public static void main(String [] args)
{
App app = new App();
String path = "C:\\Users\\myFile.xlsx";
InputStream inp = null;
try {
inp = new FileInputStream(path);
Workbook wb = WorkbookFactory.create(inp);
for(int i=0;i<wb.getNumberOfSheets();i++) {
System.out.println(wb.getSheetAt(i).getSheetName());
app.convertExcelToCSV(wb.getSheetAt(i),wb.getSheetAt(i).getSheetName());
}
} catch (Exception ex) {
System.out.println(ex.getMessage());
}
finally {
try {
inp.close();
} catch (Exception ex) {
System.out.println(ex.getMessage());
}
}
}
}