How to print one confusion matrix instead of multiple matrices from each mapper - Java

I am trying to print a confusion matrix for the Weka J48 algorithm, but I am getting multiple matrices as output.
This is the class that runs the whole program. It is responsible for getting input from the user, setting up the mapper and reducer, organizing the Weka input, etc.
public class WekDoop {
/**
* The main method of this program.
* Precondition: arff file is uploaded into HDFS and the correct
* number of parameters were passed into the JAR file when it was run
*
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// Make sure we have the correct number of arguments passed into the program
if (args.length != 4) {
System.err.println("Usage: WekDoop <# of splits> <classifier> <input file> <output file>");
System.exit(1);
}
// configure the job using the command line args
conf.setInt("Run-num.splits", Integer.parseInt(args[0]));
conf.setStrings("Run.classify", args[1]);
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
// Configure the jobs main class, mapper and reducer
// TODO: Make the Job name print the name of the currently running classifier
Job job = new Job(conf, "WekDoop");
job.setJarByClass(WekDoop.class);
job.setMapperClass(WekaMap.class);
job.setReducerClass(WekaReducer.class);
// Start with 1
job.setNumReduceTasks(1);
// This section sets the values of the <K2, V2>
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(weka.classifiers.bayes.NaiveBayes.class);
job.setOutputValueClass(AggregateableEvaluation.class);
// Set the input and output directories based on command line args
FileInputFormat.addInputPath(job, new Path(args[2]));
FileOutputFormat.setOutputPath(job, new Path(args[3]));
// Set the input type of the environment
// (In this case we are overriding TextInputFormat)
job.setInputFormatClass(WekaInputFormat.class);
// wait until the job is complete to exit
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
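A side note, purely an assumption on my part (the post does not comment on it): the second setOutputValueClass call simply replaces the first, and in the stock Hadoop Job API a mapper whose output types differ from the job's final output types declares them with setMapOutputKeyClass/setMapOutputValueClass. A minimal sketch of that configuration, with the import path of AggregateableEvaluation assumed:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import weka.classifiers.evaluation.AggregateableEvaluation;

public class JobTypeConfigSketch {
    public static Job configure(Configuration conf) throws Exception {
        Job job = new Job(conf, "WekDoop");
        // Types emitted by the mapper (key/value sent to the reducer).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(AggregateableEvaluation.class);
        // Types written by the reducer, i.e. the job's final output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        return job;
    }
}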
Mapper Class
This class is a mapper for the Weka classifiers. It is given a chunk of data and sets up a classifier to run on that data. There is a lot of other handling that occurs in the method as well.
public class WekaMap extends Mapper<Object, Text, Text, AggregateableEvaluation> {
private Instances randData = null;
private Classifier cls = null;
private AggregateableEvaluation eval = null;
private Classifier clsCopy = null;
// Run 10 mappers
private String numMaps = "10";
// TODO: Make sure this is not hard-coded -- preferably a command line arg
// Set the classifier
private String classname = "weka.classifiers.bayes.NaiveBayes";
private int seed = 20;
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
System.out.println("CURRENT LINE: " + line);
//line = "/home/ubuntu/Workspace/hadoop-1.1.0/hadoop-data/spambase_processed.arff";
Configuration conf = new Configuration();
FileSystem fileSystem = FileSystem.get(conf);
Path path = new Path("/home/hduser/very_small_spam.arff");
// Make sure the file exists...
if (!fileSystem.exists(path)) {
System.out.println("File does not exists");
return;
}
JobID test = context.getJobID();
TaskAttemptID tid = context.getTaskAttemptID();
// Set up the weka configuration
Configuration wekaConfig = context.getConfiguration();
numMaps = wekaConfig.get("Run-num.splits");
classname = wekaConfig.get("Run.classify");
String[] splitter = tid.toString().split("_");
String jobNumber = "";
int n = 0;
if (splitter[4].length() > 0) {
jobNumber = splitter[4].substring(splitter[4].length() - 1);
n = Integer.parseInt(jobNumber);
}
FileSystem fs = FileSystem.get(context.getConfiguration());
System.out.println("PATH: " + path);
// Read in the data set
context.setStatus("Reading in the arff file...");
readArff(fs, path.toString());
context.setStatus("Done reading arff! Initializing aggregateable eval...");
try {
eval = new AggregateableEvaluation(randData);
}
catch (Exception e1) {
e1.printStackTrace();
}
// Split the data into two sets: Training set and a testing set
// this will allow us to use a little bit of data to train the classifier
// before running the classifier on the rest of the dataset
Instances trainInstance = randData.trainCV(Integer.parseInt(numMaps), n);
Instances testInstance = randData.testCV(Integer.parseInt(numMaps), n);
// Set parameters to be passed to the classifiers
String[] opts = new String[3];
if (classname.equals("weka.classifiers.lazy.IBk")) {
opts[0] = "";
opts[1] = "-K";
opts[2] = "1";
}
else if (classname.equals("weka.classifiers.trees.J48")) {
opts[0] = "";
opts[1] = "-C";
opts[2] = "0.25";
}
else if (classname.equals("weka.classifiers.bayes.NaiveBayes")) {
opts[0] = "";
opts[1] = "";
opts[2] = "";
}
else {
opts[0] = "";
opts[1] = "";
opts[2] = "";
}
// Start setting up the classifier and its various options
try {
cls = (Classifier) Utils.forName(Classifier.class, classname, opts);
}
catch (Exception e) {
e.printStackTrace();
}
// These are all used for timing different processes
long beforeAbstract = 0;
long beforeBuildClass = 0;
long afterBuildClass = 0;
long beforeEvalClass = 0;
long afterEvalClass = 0;
try {
// Create the classifier and record how long it takes to set up
context.setStatus("Creating the classifier...");
System.out.println(new Timestamp(System.currentTimeMillis()));
beforeAbstract = System.currentTimeMillis();
clsCopy = AbstractClassifier.makeCopy(cls);
beforeBuildClass = System.currentTimeMillis();
System.out.println(new Timestamp(System.currentTimeMillis()));
// Train the classifier on the training set and record how long this takes
context.setStatus("Training the classifier...");
clsCopy.buildClassifier(trainInstance);
afterBuildClass = System.currentTimeMillis();
System.out.println(new Timestamp(System.currentTimeMillis()));
beforeEvalClass = System.currentTimeMillis();
// Run the classifier on the rest of the data set and record its duration as well
context.setStatus("Evaluating the model...");
eval.evaluateModel(clsCopy, testInstance);
afterEvalClass = System.currentTimeMillis();
System.out.println(new Timestamp(System.currentTimeMillis()));
// We are done this iteration!
context.setStatus("Complete");
}
catch (Exception e) {
System.out.println("Debugging strarts here!");
e.printStackTrace();
}
// calculate the total times for each section
long abstractTime = beforeBuildClass - beforeAbstract;
long buildTime = afterBuildClass - beforeBuildClass;
long evalTime = afterEvalClass - beforeEvalClass;
// Print out the times
System.out.println("The value of creation time: " + abstractTime);
System.out.println("The value of Build time: " + buildTime);
System.out.println("The value of Eval time: " + evalTime);
context.write(new Text(line), eval);
}
/**
* This can be used to write out the results on HDFS, but it is not essential
* to the success of this project. If time allows, we can implement it.
*/
public void writeResult() {
}
/**
* This method reads in the arff file that is provided to the program.
* Nothing really special about the way the data is handled.
*
* @param fs
* @param filePath
* @throws IOException
* @throws InterruptedException
*/
public void readArff(FileSystem fs, String filePath) throws IOException, InterruptedException {
BufferedReader reader;
DataInputStream d;
ArffReader arff;
Instance inst;
Instances data;
try {
// Read in the data using a ton of wrappers
d = new DataInputStream(fs.open(new Path(filePath)));
reader = new BufferedReader(new InputStreamReader(d));
arff = new ArffReader(reader, 100000);
data = arff.getStructure();
data.setClassIndex(data.numAttributes() - 1);
// Add each line to the input stream
while ((inst = arff.readInstance(data)) != null) {
data.add(inst);
}
reader.close();
Random rand = new Random(seed);
randData = new Instances(data);
randData.randomize(rand);
// This is how weka handles the sampling of the data
// the stratify method splits up the data to cross validate it
if (randData.classAttribute().isNominal()) {
randData.stratify(Integer.parseInt(numMaps));
}
}
catch (IOException e) {
e.printStackTrace();
}
}
}
Reducer Class
This class is a reducer for the output from the Weka classifiers. It is given a bunch of cross-validated data chunks from the mappers, and its job is to aggregate the data into one solution.
public class WekaReducer extends Reducer<Text, AggregateableEvaluation, Text, IntWritable> {
Text result = new Text();
Evaluation evalAll = null;
IntWritable test = new IntWritable();
AggregateableEvaluation aggEval;
/**
* The reducer method takes all the stratified, cross-validated
* values from the mappers in a list and uses an aggregatable evaluation to consolidate
* them.
*/
public void reduce(Text key, Iterable<AggregateableEvaluation> values, Context context) throws IOException, InterruptedException {
int sum = 0;
// record how long it takes to run the aggregation
System.out.println(new Timestamp(System.currentTimeMillis()));
long beforeReduceTime = System.currentTimeMillis();
// loop through each of the values and "aggregate"
// which basically means to consolidate the values
for (AggregateableEvaluation val : values) {
System.out.println("IN THE REDUCER!");
// The first time through, give aggEval a value
if (sum == 0) {
try {
aggEval = val;
}
catch (Exception e) {
e.printStackTrace();
}
}
else {
// combine the values
aggEval.aggregate(val);
}
try {
// This is what is taken from the mapper to be aggregated
System.out.println("This is the map result");
System.out.println(aggEval.toMatrixString());
}
catch (Exception e) {
e.printStackTrace();
}
sum += 1;
}
// Here is where the typical weka matrix output is generated
try {
System.out.println("This is reduce matrix");
System.out.println(aggEval.toMatrixString());
}
catch (Exception e) {
e.printStackTrace();
}
// calculate the duration of the aggregation
context.write(key, new IntWritable(sum));
long afterReduceTime = System.currentTimeMillis();
long reduceTime = afterReduceTime - beforeReduceTime;
// display the output
System.out.println("The value of reduce time is: " + reduceTime);
System.out.println(new Timestamp(System.currentTimeMillis()));
}
}
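A minimal sketch of one way to get a single matrix (this is my reading of the goal, not code from the post; the AggregateableEvaluation import path is an assumption): keep aggregating inside reduce(), but print toMatrixString() exactly once from the reducer's cleanup() hook instead of inside the per-value loop:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import weka.classifiers.evaluation.AggregateableEvaluation;

public class SingleMatrixReducer extends Reducer<Text, AggregateableEvaluation, Text, IntWritable> {

    private AggregateableEvaluation aggEval = null;

    @Override
    public void reduce(Text key, Iterable<AggregateableEvaluation> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (AggregateableEvaluation val : values) {
            if (aggEval == null) {
                aggEval = val;              // first evaluation becomes the accumulator
            } else {
                aggEval.aggregate(val);     // fold every further evaluation into it
            }
            count++;
        }
        context.write(key, new IntWritable(count));
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // cleanup() runs once per reducer task, after all reduce() calls.
        try {
            if (aggEval != null) {
                System.out.println(aggEval.toMatrixString());   // printed a single time
            }
        } catch (Exception e) {
            e.printStackTrace();            // toMatrixString() declares throws Exception
        }
    }
}
With a single reduce task (setNumReduceTasks(1)), this prints the confusion matrix once for the whole job instead of once per aggregated value.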
And lastly, the InputFormat class.
It takes a JobContext and returns a list of the data split into pieces. Basically this is a way of handling large data sets: it allows us to split a large data set into smaller chunks to pass across worker nodes (or, in our case, to make life a little easier and pass the chunks to a single node so that it is not overwhelmed by one large data set).
public class WekaInputFormat extends TextInputFormat {
public List<InputSplit> getSplits(JobContext job) throws IOException {
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
long maxSize = getMaxSplitSize(job);
List<InputSplit> splits = new ArrayList<InputSplit>();
for (FileStatus file: listStatus(job)) {
Path path = file.getPath();
FileSystem fs = path.getFileSystem(job.getConfiguration());
//number of bytes in this file
long length = file.getLen();
BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
// make sure this is actually a valid file
if(length != 0) {
// set the number of splits to make. NOTE: the value can be changed to anything
int count = job.getConfiguration().getInt("Run-num.splits", 1);
for(int t = 0; t < count; t++) {
//split the file and add each chunk to the list
splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
}
}
else {
// Create empty array for zero length files
splits.add(new FileSplit(path, 0, length, new String[0]));
}
}
return splits;
}
}

Related

Is there a better way to generate 5 million CSV files quickly?

I would like to create 5 million CSV files. I have been waiting for almost 3 hours, but the program is still running. Can somebody give me some advice on how to speed up the file generation?
After the generation of these 5 million files completes, I have to upload them to an S3 bucket.
It would be even better if someone knows how to generate these files on AWS, so we could move the files into the S3 bucket directly and avoid the network speed issue. (I have just started learning AWS; there is a lot to learn.)
The following is my code.
public class ParallelCsvGenerate implements Runnable {
private static AtomicLong baseID = new AtomicLong(8160123456L);
private static ThreadLocalRandom random = ThreadLocalRandom.current();
private static ThreadLocalRandom random2 = ThreadLocalRandom.current();
private static String filePath = "C:\\5millionfiles\\";
private static List<String> headList = null;
private static String csvHeader = null;
public ParallelCsvGenerate() {
headList = generateHeadList();
csvHeader = String.join(",", headList);
}
@Override
public void run() {
for(int i = 0; i < 1000000; i++) {
generateCSV();
}
}
private void generateCSV() {
StringBuilder builder = new StringBuilder();
builder.append(csvHeader).append(System.lineSeparator());
for (int i = 0; i < headList.size(); i++) {
if(i < headList.size() - 1) {
builder.append(i % 2 == 0 ? generateRandomInteger() : generateRandomStr()).append(",");
} else {
builder.append(i % 2 == 0 ? generateRandomInteger() : generateRandomStr());
}
}
String fileName = String.valueOf(baseID.addAndGet(1));
File csvFile = new File(filePath + fileName + ".csv");
FileWriter fileWriter = null;
try {
fileWriter = new FileWriter(csvFile);
fileWriter.write(builder.toString());
fileWriter.flush();
} catch (Exception e) {
System.err.println(e);
} finally {
try {
if(fileWriter != null) {
fileWriter.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
private static List<String> generateHeadList() {
List<String> headList = new ArrayList<>(20);
String baseFiledName = "Field";
for(int i = 1; i <=20; i++) {
headList.add(baseFiledName + i);
}
return headList;
}
/**
* generate a number in range of 0-50000
* @return
*/
private Integer generateRandomInteger() {
return random.nextInt(0,50000);
}
/**
* generate a string with length 5 - 8
* @return
*/
private String generateRandomStr() {
int strLength = random2.nextInt(5, 8);
String str="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
int length = str.length();
StringBuilder builder = new StringBuilder();
for (int i = 0; i < strLength; i++) {
builder.append(str.charAt(random.nextInt(length)));
}
return builder.toString();
}
Main
ParallelCsvGenerate generate = new ParallelCsvGenerate();
Thread a = new Thread(generate, "A");
Thread b = new Thread(generate, "B");
Thread c = new Thread(generate, "C");
Thread d = new Thread(generate, "D");
Thread e = new Thread(generate, "E");
a.run();
b.run();
c.run();
d.run();
e.run();
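One observation of my own about the snippet above (not from the original answers): calling run() directly executes each generator on the main thread, one after another, so the five threads never actually run in parallel; Thread.start() is what launches them concurrently. A minimal sketch, continuing from the threads a to e created above:
Thread[] workers = { a, b, c, d, e };
for (Thread t : workers) {
    t.start();                       // start() runs each generator on its own thread
}
for (Thread t : workers) {
    try {
        t.join();                    // wait for every generator to finish
    } catch (InterruptedException ie) {
        Thread.currentThread().interrupt();
    }
}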
Thanks for your advice, guys. I just refactored the code and generated 3.8 million files in 2.8 hours, which is much better.
Refactored code:
public class ParallelCsvGenerate implements Callable<Integer> {
private static String filePath = "C:\\5millionfiles\\";
private static String[] header = new String[]{
"FIELD1","FIELD2","FIELD3","FIELD4","FIELD5",
"FIELD6","FIELD7","FIELD8","FIELD9","FIELD10",
"FIELD11","FIELD12","FIELD13","FIELD14","FIELD15",
"FIELD16","FIELD17","FIELD18","FIELD19","FIELD20",
};
private String fileName;
public ParallelCsvGenerate(String fileName) {
this.fileName = fileName;
}
@Override
public Integer call() throws Exception {
try {
generateCSV();
} catch (IOException e) {
e.printStackTrace();
}
return 0;
}
private void generateCSV() throws IOException {
CSVWriter writer = new CSVWriter(new FileWriter(filePath + fileName + ".csv"), CSVWriter.DEFAULT_SEPARATOR, CSVWriter.NO_QUOTE_CHARACTER);
String[] content = new String[]{
RandomGenerator.generateRandomInteger(),
RandomGenerator.generateRandomStr(),
RandomGenerator.generateRandomInteger(),
RandomGenerator.generateRandomStr(),
RandomGenerator.generateRandomInteger(),
RandomGenerator.generateRandomStr(),
RandomGenerator.generateRandomInteger(),
RandomGenerator.generateRandomStr(),
RandomGenerator.generateRandomInteger(),
RandomGenerator.generateRandomStr(),
RandomGenerator.generateRandomInteger(),
RandomGenerator.generateRandomStr(),
RandomGenerator.generateRandomInteger(),
RandomGenerator.generateRandomStr(),
RandomGenerator.generateRandomInteger(),
RandomGenerator.generateRandomStr(),
RandomGenerator.generateRandomInteger(),
RandomGenerator.generateRandomStr(),
RandomGenerator.generateRandomInteger(),
RandomGenerator.generateRandomStr()
};
writer.writeNext(header);
writer.writeNext(content);
writer.close();
}
}
Main
public static void main(String[] args) {
System.out.println("Start generate");
long start = System.currentTimeMillis();
ThreadPoolExecutor threadPoolExecutor = new ThreadPoolExecutor(8, 8,
0L, TimeUnit.MILLISECONDS,
new LinkedBlockingQueue<Runnable>());
List<ParallelCsvGenerate> taskList = new ArrayList<>(3800000);
for(int i = 0; i < 3800000; i++) {
taskList.add(new ParallelCsvGenerate(i+""));
}
try {
List<Future<Integer>> futures = threadPoolExecutor.invokeAll(taskList);
} catch (InterruptedException e) {
e.printStackTrace();
}
System.out.println("Success");
long end = System.currentTimeMillis();
System.out.println("Using time: " + (end-start));
}
You could write directly into the file (without allocating the whole file in one StringBuilder). (I think this is the biggest time+memory bottleneck here: builder.toString())
You could generate each file in parallel.
(Little tweaks, sketched after this list:) Omit the ifs inside the loop.
if (i < headList.size() - 1) is not needed when you use a slightly cleverer loop plus one extra iteration.
The i % 2 == 0 test can be eliminated by a better iteration (i += 2) and a bit more work inside the loop (i -> int, i + 1 -> string).
If applicable, prefer append(char) to append(String). (Better append(',') than append(",")!)
...
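A minimal sketch combining those tweaks (helper names such as randomInt() and randomStr() are hypothetical stand-ins for the post's generators): each row goes straight through a BufferedWriter, the loop steps two columns at a time so the i % 2 test disappears, and the separator is appended as a char:
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;

public class DirectCsvWriteSketch {

    // Hypothetical stand-ins for generateRandomInteger() / generateRandomStr().
    private static String randomInt() { return "42"; }
    private static String randomStr() { return "abcde"; }

    // Write one CSV file directly: no large StringBuilder, no branch on the last column.
    static void writeCsv(String path, String csvHeader, int fieldCount) throws IOException {
        try (BufferedWriter out = new BufferedWriter(new FileWriter(path))) {
            out.write(csvHeader);
            out.newLine();
            for (int i = 0; i < fieldCount; i += 2) {   // two columns per iteration
                if (i > 0) {
                    out.write(',');                     // append(char), not append(",")
                }
                out.write(randomInt());                 // even column: integer value
                if (i + 1 < fieldCount) {
                    out.write(',');
                    out.write(randomStr());             // odd column: string value
                }
            }
            out.newLine();
        }
    }
}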
You can use the Fork/Join framework (Java 7 and above) to run your process in parallel and use the multiple cores of your CPU.
I'll give you an example.
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
import java.util.concurrent.RecursiveTask;
import java.util.stream.LongStream;
public class ForkJoinAdd extends RecursiveTask<Long> {
private final long[] numbers;
private final int start;
private final int end;
public static final long threshold = 10_000;
public ForkJoinAdd(long[] numbers) {
this(numbers, 0, numbers.length);
}
private ForkJoinAdd(long[] numbers, int start, int end) {
this.numbers = numbers;
this.start = start;
this.end = end;
}
@Override
protected Long compute() {
int length = end - start;
if (length <= threshold) {
return add();
}
ForkJoinAdd firstTask = new ForkJoinAdd(numbers, start, start + length / 2);
firstTask.fork(); //start asynchronously
ForkJoinAdd secondTask = new ForkJoinAdd(numbers, start + length / 2, end);
Long secondTaskResult = secondTask.compute();
Long firstTaskResult = firstTask.join();
return firstTaskResult + secondTaskResult;
}
private long add() {
long result = 0;
for (int i = start; i < end; i++) {
result += numbers[i];
}
return result;
}
public static long startForkJoinSum(long n) {
long[] numbers = LongStream.rangeClosed(1, n).toArray();
ForkJoinTask<Long> task = new ForkJoinAdd(numbers);
return new ForkJoinPool().invoke(task);
}
}
Use this example.
If you want to read more about it, Guide to the Fork/Join Framework in Java | Baeldung
and Fork/Join (The Java™ Tutorials)
can help you to better understand and better design your app.
Good luck.
Remove the for (int i = 0; i < 1000000; i++) loop from the run method (leave a single generateCSV() call).
Create 5 million ParallelCsvGenerate objects.
Submit them to a ThreadPoolExecutor.
Converted main:
public static void main(String[] args) {
ThreadPoolExecutor ex = (ThreadPoolExecutor) Executors.newFixedThreadPool(8);
for(int i = 0; i < 5000000; i++) {
ParallelCsvGenerate generate = new ParallelCsvGenerate();
ex.submit(generate);
}
ex.shutdown();
}
It takes roughly 5 minutes to complete on my laptop (4 physical cores with hyperthreading, SSD drive).
EDIT:
I've replaced FileWriter with AsynchronousFileChannel using the following code:
Path file = Paths.get(filePath + fileName + ".csv");
try(AsynchronousFileChannel asyncFile = AsynchronousFileChannel.open(file,
StandardOpenOption.WRITE,
StandardOpenOption.CREATE)) {
asyncFile.write(ByteBuffer.wrap(builder.toString().getBytes()), 0);
} catch (IOException e) {
e.printStackTrace();
}
to achieve 30% speedup.
I believe that the main bottleneck is the hard drive and filesystem itself. Not much more can be achieved here.

java.lang.IllegalArgumentException: SQL array must not be empty

I have the DBImporter class below, which has been working fine and inserting data correctly into the database table. I am fetching data from a .CSV file and inserting it into an Oracle table.
Until now I was processing only one file in my directory, and that worked fine. Now I want to process more than one file. During the run, the first file is processed and its data inserted correctly; on the second file it starts reading data and then throws this error:
java.lang.IllegalArgumentException: SQL array must not be empty
Below is my DBImporter class. I think the error happens during the final batch commit, somewhere around this line, but I am not sure:
jdbcTemplate.batchUpdate(sqlBatch.toArray(new String[sqlBatch.size()]));
@Service
public class DBImporter {
private final static Logger log = LoggerFactory.getLogger(DBImporter.class);
private static final List<String> NULL_VALUES = Arrays.asList("", "N.A", "N.A", "UNKNOWN");
private static final List<String> COL_HEADERS = Arrays.asList("ID", "NM", "TYE", "SA");
private static final int BATCH_SIZE = 50;
private boolean eof = false;
private String tableName;
@Autowired
private JdbcTemplate jdbcTemplate;
public void setTableName(String tableName) {
this.tableName = tableName;
}
@Transactional(rollbackFor = IOException.class)
public void processFile(BufferedReader reader, String tableName) {
this.tableName = tableName;
List<String> sqlBatch = new ArrayList<String>(BATCH_SIZE);
log.info("Starte auslesen der Daten");
long t1 = System.currentTimeMillis();
log.info("Start time: " + t1);
jdbcTemplate.execute("DELETE FROM " + tableName);
while (!eof) {
try {
Map<String, ColumnData> dbColumns = getDBColumns();
// Get a list of db column data related to the column headers.
List<ColumnData> columnData = COL_HEADERS.stream().map(dbColumns::get).collect(toList());
// Get the next valid data row if it starts with "R" or "T".
List<String> dataRow = findNextLineStartingWith(reader, "R", "T");
String query = createSql(columnData, dataRow);
sqlBatch.add(query);
// Process batch.
if (sqlBatch.size() >= BATCH_SIZE) {
jdbcTemplate.batchUpdate(sqlBatch.toArray(new String[sqlBatch.size()]));
sqlBatch.clear();
}
} catch (IllegalStateException e) {
break;
} catch (IOException e) {
log.error(e.getLocalizedMessage());
}
}
// Commit the final batch.
jdbcTemplate.batchUpdate(sqlBatch.toArray(new String[sqlBatch.size()]));
sqlBatch.clear();
long delta = System.currentTimeMillis() - t1;
log.info("Total runtime : " + delta / 1000 + " seconds");
}
/**
* Create a SQL insert query using the data row.
*
* @param tableName Name of the table.
* @param columnData Column data list.
* @param dataRow Data row to be inserted.
* @return Generated SQL query string.
*/
private String createSql(List<ColumnData> columnData, List<String> dataRow) {
List<String> values = new ArrayList<>(columnData.size());
for (int i = 0; i < columnData.size(); i++) {
if (NULL_VALUES.contains(dataRow.get(i))) {
values.add("NULL");
} else if (columnData.get(i).getType() >= Types.NUMERIC && columnData.get(i).getType() <= Types.DOUBLE) {
values.add(dataRow.get(i));
} else {
values.add("'" + dataRow.get(i).replace("'", "''") + "'");
}
}
return "INSERT INTO " + tableName + " (" +
columnData.stream().filter(Objects::nonNull).map(ColumnData::getName).collect(joining(", ")) +
", SYSTEM_INSERTED_AT) VALUES (" +
values.stream().collect(joining(", ")) +
", CURRENT_TIMESTAMP)";
}
/**
* Find the next line starting with the given string and split it into columns.
*
* @param reader BufferedReader object to be used.
* @param prefixes A list of prefixes to look for in the string.
* @return List of data objects.
* @throws IOException
*/
private List<String> findNextLineStartingWith(BufferedReader reader, String... prefixes) throws IOException {
while (true) {
String line = readLineOrThrow(reader);
for (String prefix : prefixes)
if (line.startsWith(prefix)) {
ArrayList<String> data = new ArrayList<>();
// Split the line using the delimiter.
data.addAll(Arrays.asList(line.split(";")));
// Build the row to be inserted.
List<String> row = Arrays.asList(data.get(1), data.get(2).trim(), "", "");
return row;
}
}
}
/**
* Read a single line in the file.
*
* @param reader BufferedReader object to be used.
* @return
* @throws IOException
*/
private String readLineOrThrow(BufferedReader reader) throws IOException {
String line = reader.readLine();
if (line == null) {
this.eof = true;
throw new IllegalStateException("Unexpected EOF");
}
return line.trim();
}
/**
* Read database column metadata.
*
* @param tableName Name of the table to process.
* @return A map containing column information.
*/
private Map<String, ColumnData> getDBColumns() {
Map<String, ColumnData> result = new HashMap<>();
try (Connection connection = jdbcTemplate.getDataSource().getConnection()) {
ResultSet rs = connection.getMetaData().getColumns(null, null, tableName, null);
while (rs.next()) {
String columnName = rs.getString(4).toUpperCase();
int type = rs.getInt(5);
result.put(columnName, new ColumnData(columnName, type));
}
return result;
} catch (SQLException e) {
throw new RuntimeException(e);
}
}
}
Please try the changes below:
// Commit the final batch.
if (sqlBatch.size() > 0){
jdbcTemplate.batchUpdate(sqlBatch.toArray(new String[sqlBatch.size()]));
sqlBatch.clear();
}
And
@Transactional(rollbackFor = IOException.class)
public void processFile(BufferedReader reader, String tableName) {
eof = false;
...
But if you want a cleaner and safer solution, change your code as below:
public class DBImporter {
private final static Logger log = LoggerFactory.getLogger(DBImporter.class);
private static final List<String> NULL_VALUES = Arrays.asList("", "N.A", "N.A", "UNKNOWN");
private static final List<String> COL_HEADERS = Arrays.asList("USER_ID", "NAME", "TYPE", "SRC_DATA");
private static final int BATCH_SIZE = 50;
@Autowired
private JdbcTemplate jdbcTemplate;
@Transactional(rollbackFor = IOException.class)
public void processFile(BufferedReader reader, String tableName) {
AtomicBoolean eof = new AtomicBoolean(false);
List<String> sqlBatch = new ArrayList<String>(BATCH_SIZE);
log.info("Starte auslesen der Daten");
long t1 = System.currentTimeMillis();
log.info("Start time: " + t1);
jdbcTemplate.execute("DELETE FROM " + tableName);
while (!eof.get()) {
try {
Map<String, ColumnData> dbColumns = getDBColumns(tableName);
// Get a list of db column data related to the column headers.
List<ColumnData> columnData = COL_HEADERS.stream().map(dbColumns::get).collect(toList());
// Get the next valid data row if it starts with "R" or "T".
List<String> dataRow = findNextLineStartingWith(reader, eof, "R", "T");
String query = createSql(tableName, columnData, dataRow);
sqlBatch.add(query);
// Process batch.
if (sqlBatch.size() >= BATCH_SIZE) {
jdbcTemplate.batchUpdate(sqlBatch.toArray(new String[sqlBatch.size()]));
sqlBatch.clear();
}
} catch (IllegalStateException e) {
break;
} catch (IOException e) {
log.error(e.getLocalizedMessage());
}
}
// Commit the final batch.
jdbcTemplate.batchUpdate(sqlBatch.toArray(new String[sqlBatch.size()]));
sqlBatch.clear();
long delta = System.currentTimeMillis() - t1;
log.info("Total runtime : " + delta / 1000 + " seconds");
}
/**
* Create a SQL insert query using the data row.
*
* @param tableName Name of the table.
* @param columnData Column data list.
* @param dataRow Data row to be inserted.
* @return Generated SQL query string.
*/
private String createSql(String tableName, List<ColumnData> columnData, List<String> dataRow) {
List<String> values = new ArrayList<>(columnData.size());
for (int i = 0; i < columnData.size(); i++) {
if (NULL_VALUES.contains(dataRow.get(i))) {
values.add("NULL");
} else if (columnData.get(i).getType() >= Types.NUMERIC && columnData.get(i).getType() <= Types.DOUBLE) {
values.add(dataRow.get(i));
} else {
values.add("'" + dataRow.get(i).replace("'", "''") + "'");
}
}
return "INSERT INTO " + tableName + " (" +
columnData.stream().filter(Objects::nonNull).map(ColumnData::getName).collect(joining(", ")) +
", SYSTEM_INSERTED_AT) VALUES (" +
values.stream().collect(joining(", ")) +
", CURRENT_TIMESTAMP)";
}
/**
* Find the next line starting with the given string and split it into columns.
*
* @param reader BufferedReader object to be used.
* @param prefixes A list of prefixes to look for in the string.
* @return List of data objects.
* @throws IOException
*/
private List<String> findNextLineStartingWith(BufferedReader reader, AtomicBoolean eof, String... prefixes) throws IOException {
while (true) {
String line = readLineOrThrow(reader, eof);
for (String prefix : prefixes)
if (line.startsWith(prefix)) {
ArrayList<String> data = new ArrayList<>();
// Split the line using the delimiter.
data.addAll(Arrays.asList(line.split(";")));
// Build the row to be inserted.
List<String> row = Arrays.asList(data.get(1), data.get(2).trim(), "", "");
// Insert type depending on the prefix.
if (prefix.equals("R"))
row.set(2, "USER");
else if (prefix.equals("T"))
row.set(2, "PERM");
row.set(3, String.join(";", row.subList(0, 3)));
return row;
}
}
}
/**
* Read a single line in the file.
*
* @param reader BufferedReader object to be used.
* @return
* @throws IOException
*/
private String readLineOrThrow(BufferedReader reader, AtomicBoolean eof) throws IOException {
String line = reader.readLine();
if (line == null) {
eof.set(true);
throw new IllegalStateException("Unexpected EOF");
}
return line.trim();
}
/**
* Read database column metadata.
*
* @param tableName Name of the table to process.
* @return A map containing column information.
*/
private Map<String, ColumnData> getDBColumns(String tableName) {
Map<String, ColumnData> result = new HashMap<>();
try (Connection connection = jdbcTemplate.getDataSource().getConnection()) {
ResultSet rs = connection.getMetaData().getColumns(null, null, tableName, null);
while (rs.next()) {
String columnName = rs.getString(4).toUpperCase();
int type = rs.getInt(5);
result.put(columnName, new ColumnData(columnName, type));
}
return result;
} catch (SQLException e) {
throw new RuntimeException(e);
}
}
}
There is the possibility that your final batch is empty.
This can happen when you have just committed BATCH_SIZE entries and cleared the sqlBatch; if your while loop exits at that point, there are no elements left to commit.
You'll want to fix that by adding a size check, for example:
// Commit the final batch (only if there is something left)
if (sqlBatch.isEmpty() == false) {
jdbcTemplate.batchUpdate(sqlBatch.toArray(new String[sqlBatch.size()]));
sqlBatch.clear();
}
Edit:
As @Vasif pointed out, you'll need to reset eof between different calls of the method.
A simple solution (albeit somewhat hacky) would be
boolean eof = false;
while (!eof) {
try {
} catch (IllegalStateException e) {
eof = true;
break;
} catch (IOException e) {
log.error(e.getLocalizedMessage());
}
}
A proper solution would be to refactor your code so that it does not rely on these exceptions being thrown.
Some tips:
Get rid of readLineOrThrow.
Remove the while(true) in findNextLineStartingWith and instead return an empty list if the next line is null.
Adjust the outside loop to handle this return value appropriately (see the sketch after this list).
(Note: you might also need to break the loop if you get an IOException).
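A minimal sketch of those tips (method and variable names reuse the ones from the question; the surrounding class and its imports are assumed): findNextLineStartingWith returns an empty list when the reader is exhausted, and the caller stops on that instead of relying on an IllegalStateException:
// Sketch: no readLineOrThrow, no eof field; an empty list signals end of file.
private List<String> findNextLineStartingWith(BufferedReader reader, String... prefixes) throws IOException {
    String line;
    while ((line = reader.readLine()) != null) {
        line = line.trim();
        for (String prefix : prefixes) {
            if (line.startsWith(prefix)) {
                String[] data = line.split(";");
                return Arrays.asList(data[1], data[2].trim(), "", "");
            }
        }
    }
    return Collections.emptyList();
}

// Caller side, inside processFile: stop cleanly when an empty list comes back.
// while (true) {
//     List<String> dataRow = findNextLineStartingWith(reader, "R", "T");
//     if (dataRow.isEmpty()) {
//         break;                    // replaces the IllegalStateException-driven exit
//     }
//     sqlBatch.add(createSql(columnData, dataRow));
//     // ... flush the batch as before ...
// }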

Hadoop stdout is always empty and bytes written is zero

I am trying to execute Weka on MapReduce and the stdout is always empty
The driver, mapper, reducer, and input format classes are the same WekDoop, WekaMap, WekaReducer, and WekaInputFormat classes shown in the first question above.
For each of the mapper, reducer and overall job, there is an stderr file, an stdout file and a syslog file.
You are printing to stdout in the mapper and the reducer, so you should check the stdout file of the mapper and the reducer not that of the overall job.
Best of luck

Scanner from file doesn't seem to be reading file

I'm doing a Phone Directory project and we have to read from a directory file telnos.txt
I'm using a Scanner to load the data from the file telnos.txt, using a loadData method from a previous question I asked here on StackOverflow.
I noticed attempts to find a user always returned Not Found, so I added a few System.out.printlns in the methods to help me see what was going on. It looks like the scanner isn't reading anything from the file. Weirdly, it is printing the name of the file as what should be the first line read, which makes me think I've missed something very very simple here.
Console
run:
telnos.txt
null
loadData tested successfully
Please enter a name to look up: John
-1
Not found
BUILD SUCCESSFUL (total time: 6 seconds)
ArrayPhoneDirectory.java
import java.util.*;
import java.io.*;
public class ArrayPhoneDirectory implements PhoneDirectory {
private static final int INIT_CAPACITY = 100;
private int capacity = INIT_CAPACITY;
// holds the number of directory entries
private int size = 0;
// Array to contain directory entries
private DirectoryEntry[] theDirectory = new DirectoryEntry[capacity];
// Holds name of data file
private final String sourceName = "telnos.txt";
File telnos = new File(sourceName);
// Flag to indicate whether directory was modified since it was last loaded or saved
private boolean modified = false;
// add method stubs as specified in interface to compile
public void loadData(String sourceName) {
Scanner read = new Scanner("telnos.txt").useDelimiter("\\Z");
int i = 1;
String name = null;
String telno = null;
while (read.hasNextLine()) {
if (i % 2 != 0)
name = read.nextLine();
else
telno = read.nextLine();
add(name, telno);
i++;
}
}
public String lookUpEntry(String name) {
int i = find(name);
String a = null;
if (i >= 0) {
a = name + (" is at position " + i + " in the directory");
} else {
a = ("Not found");
}
return a;
}
public String addChangeEntry(String name, String telno) {
for (DirectoryEntry i : theDirectory) {
if (i.getName().equals(name)) {
i.setNumber(telno);
} else {
add(name, telno);
}
}
return null;
}
public String removeEntry(String name) {
for (DirectoryEntry i : theDirectory) {
if (i.getName().equals(name)) {
i.setName(null);
i.setNumber(null);
}
}
return null;
}
public void save() {
PrintWriter writer = null;
// writer = new PrintWriter(FileWriter(sourceName));
}
public String format() {
String a;
a = null;
for (DirectoryEntry i : theDirectory) {
String b;
b = i.getName() + "/n";
String c;
c = i.getNumber() + "/n";
a = a + b + c;
}
return a;
}
// add private methods
// Adds a new entry with the given name and telno to the array of
// directory entries
private void add(String name, String telno) {
System.out.println(name);
System.out.println(telno);
theDirectory[size] = new DirectoryEntry(name, telno);
size = size + 1;
}
// Searches the array of directory entries for a specific name
private int find(String name) {
int result = -1;
for (int count = 0; count < size; count++) {
if (theDirectory[count].getName().equals(name)) {
result = count;
}
System.out.println(result);
}
return result;
}
// Creates a new array of directory entries with twice the capacity
// of the previous one
private void reallocate() {
capacity = capacity * 2;
DirectoryEntry[] newDirectory = new DirectoryEntry[capacity];
System.arraycopy(theDirectory, 0, newDirectory,
0, theDirectory.length);
theDirectory = newDirectory;
}
}
ArrayPhoneDirectoryTester.java
import java.util.Scanner;
public class ArrayPhoneDirectoryTester {
public static void main(String[] args) {
//create a new ArrayPhoneDirectory
PhoneDirectory newTest = new ArrayPhoneDirectory();
newTest.loadData("telnos.txt");
System.out.println("loadData tested successfully");
System.out.print("Please enter a name to look up: ");
Scanner in = new Scanner(System.in);
String name = in.next();
String entryNo = newTest.lookUpEntry(name);
System.out.println(entryNo);
}
}
telnos.txt
John
123
Bill
23
Hello
23455
Frank
12345
Dkddd
31231
In your code:
Scanner read = new Scanner("telnos.txt");
is not going to load the file telnos.txt. It is instead going to create a Scanner object that scans the String "telnos.txt".
To make the Scanner understand that it has to scan a file you have to either:
Scanner read = new Scanner(new File("telnos.txt"));
or create a File object first and pass it to the Scanner constructor.
In case you are getting "File not found" errors you need to check the current working directory. You could run the following lines and see if you are indeed in the right directory in which the file is:
String workingDir = System.getProperty("user.dir");
System.out.println("Current working directory : " + workingDir);
You also need to catch the FileNotFoundException in the function, as follows:
public void loadData(String sourceName) {
try {
Scanner read = new Scanner(new File("telnos.txt")).useDelimiter("\\Z");
int i = 1;
String name = null;
String telno = null;
while (read.hasNextLine()) {
if (i % 2 != 0)
name = read.nextLine();
else {
telno = read.nextLine();
add(name, telno);
}
i++;
}
}catch(FileNotFoundException ex) {
System.out.println("File not found:"+ex.getMessage);
}
}
You are actually parsing the filename not the actual file contents.
Instead of:
new Scanner("telnos.txt")
you need
new Scanner( new File( "telnos.txt" ) )
http://docs.oracle.com/javase/7/docs/api/java/util/Scanner.html

How can I get the correct amount of internal and external space on a device while factoring in the edge cases of Galaxy devices?

I'd like to get the amount of internal and external space on a device, and after going through a couple of posts on StackOverflow, I found that this is easy. I can get the amount of internal space using this:
StatFs sfsInternal = new StatFs(Environment.getRootDirectory().getAbsolutePath());
return Long.valueOf(sfsInternal.getBlockCount()) * Long.valueOf(sfsInternal.getBlockSize());
...and I can get the amount of external space using this:
StatFs sfsExternal = new StatFs(Environment.getExternalStorageDirectory().getAbsolutePath());
return Long.valueOf(sfsExternal.getBlockCount()) * Long.valueOf(sfsExternal.getBlockSize());
When I read about "internal" storage, I assumed that it would be the non-removable onboard storage on the device and "external" would be the removable flash card storage, but this hasn't been the case entirely.
I found that Samsung devices, e.g. the Galaxy Note 2, show a large chunk of the internal storage as external. Here's an answer that discusses the same thing. https://stackoverflow.com/a/12087556/304151
How can I get the amount of internal storage (on-board and non-removable) and the amount of external storage (flash and removable) while factoring in the edge cases of Samsung's Galaxy devices. I'm yet to find an answer on StackOverflow that provides a complete working solution for this scenario. My code is for API level 17.
Thanks.
Here is the code to get the available free space on different devices. I have tested this code on a Samsung Galaxy Tab 7 (2.2, Froyo) and a Nexus 7 (4.2.2, Jelly Bean).
// calculate free space on external storage
public static int getExternalStorageFreeSpace(String storagePath)
{
try
{
File file = new File(storagePath);
StatFs stat = new StatFs(file.getPath());
double sdAvailSize = (double) stat.getAvailableBlocks() * (double) stat.getBlockSize();
int valueinmb = (int) (sdAvailSize / 1024) / 1024;
return valueinmb;
}
catch (Exception e)
{
System.out.println("Message//////" + e.getMessage() + "Cause555555555555" + e.getCause());
}
return 0;
}
To differentiate between internal and external storage I have used this class and some logic.
public class GetRemoveableDevices
{
private final static String TAG = "GetRemoveableDevice";
public GetRemoveableDevices()
{
}
public static String[] getDirectories()
{
Log.d(TAG, "getStorageDirectories");
File tempFile;
String[] directories = null;
String[] splits;
ArrayList<String> arrayList = new ArrayList<String>();
BufferedReader bufferedReader = null;
String lineRead;
try
{
arrayList.clear(); // redundant, but what the hey
bufferedReader = new BufferedReader(new FileReader("/proc/mounts"));
while ((lineRead = bufferedReader.readLine()) != null)
{
Log.d(TAG, "lineRead: " + lineRead);
splits = lineRead.split(" ");
// System external storage
if (splits[1].equals(Environment.getExternalStorageDirectory().getPath()))
{
arrayList.add(splits[1]);
Log.d(TAG, "gesd split 1: " + splits[1]);
continue;
}
// skip if not external storage device
if (!splits[0].contains("/dev/block/"))
{
continue;
}
// skip if mtdblock device
if (splits[0].contains("/dev/block/mtdblock"))
{
continue;
}
// skip if not in /mnt node
if (!splits[1].contains("/mnt"))
{
continue;
}
// skip these names
if (splits[1].contains("/secure"))
{
continue;
}
if (splits[1].contains("/mnt/asec"))
{
continue;
}
// Eliminate if not a directory or fully accessible
tempFile = new File(splits[1]);
if (!tempFile.exists())
{
continue;
}
if (!tempFile.isDirectory())
{
continue;
}
if (!tempFile.canRead())
{
continue;
}
if (!tempFile.canWrite())
{
continue;
}
// Met all the criteria, assume sdcard
arrayList.add(splits[1]);
}
}
catch (FileNotFoundException e)
{
}
catch (IOException e)
{
}
finally
{
if (bufferedReader != null)
{
try
{
bufferedReader.close();
}
catch (IOException e)
{
}
}
}
// Send list back to caller
if (arrayList.size() == 0)
{
arrayList.add("sdcard not found");
}
directories = new String[arrayList.size()];
for (int i = 0; i < arrayList.size(); i++)
{
directories[i] = arrayList.get(i);
}
return directories;
}
}
Now I am showing you my logic:
String[] dirs = GetRemoveableDevices.getDirectories();
ArrayList<String> directories=new ArrayList<String>();
for(String directory:dirs)
{
if(!directory.contains("."))
directories.add(directory);
}
String externalStorage = "";
String internalStorage = "";
if (directories.size()>= 2)
{
internalStorage = directories.get(0).toString();
externalStorage = directories.get(1).toString();
}
else if (directories.size() < 2)
{
internalStorage = directories.get(0).toString();
externalStorage = null;
}
hope it will be helpful
"Internal storage" is for privately held data. It's called internal because it's relative to the application itself. It's for sandboxing the application's data and keeping it private.
Environment.getRootDirectory() gets the phone's system folder, which is not internal storage, but external storage.
External storage is for publicly shared data, external to the application.
Since mounting naming conventions vary greatly between phones, it can be difficult to differentiate from an SD card and normal onboard directories. But generally, the sd card is mounted to the directory /sdcard/.
I found some code that does this on this blog post.
package com.sapien.music.importer.util;
import java.io.File;
import java.util.ArrayList;
import java.util.Scanner;
import android.annotation.SuppressLint;
import android.content.Context;
import android.os.Build;
import android.os.Environment;
@SuppressLint("NewApi")
public class StorageOptions {
public static String[] labels;
public static String[] paths;
public static int count = 0;
private static Context sContext;
private static ArrayList<String> sVold = new ArrayList<String>();
public static void determineStorageOptions(Context context) {
sContext = context.getApplicationContext();
readVoldFile();
testAndCleanList();
setProperties();
}
private static void readVoldFile() {
/*
* Scan the /system/etc/vold.fstab file and look for lines like this:
* dev_mount sdcard /mnt/sdcard 1
* /devices/platform/s3c-sdhci.0/mmc_host/mmc0
*
* When one is found, split it into its elements and then pull out the
* path to the that mount point and add it to the arraylist
*
* some devices are missing the vold file entirely so we add a path here
* to make sure the list always includes the path to the first sdcard,
* whether real or emulated.
*/
sVold.add("/mnt/sdcard");
try {
Scanner scanner = new Scanner(new File("/system/etc/vold.fstab"));
while (scanner.hasNext()) {
String line = scanner.nextLine();
if (line.startsWith("dev_mount")) {
String[] lineElements = line.split(" ");
String element = lineElements[2];
if (element.contains(":"))
element = element.substring(0, element.indexOf(":"));
if (element.contains("usb"))
continue;
// don't add the default vold path
// it's already in the list.
if (!sVold.contains(element))
sVold.add(element);
}
}
} catch (Exception e) {
// swallow - don't care
e.printStackTrace();
}
}
private static void testAndCleanList() {
/*
* Now that we have a cleaned list of mount paths, test each one to make
* sure it's a valid and available path. If it is not, remove it from
* the list.
*/
for (int i = 0; i < sVold.size(); i++) {
String voldPath = sVold.get(i);
File path = new File(voldPath);
if (!path.exists() || !path.isDirectory() || !path.canWrite())
sVold.remove(i--);
}
}
private static void setProperties() {
/*
* At this point all the paths in the list should be valid. Build the
* public properties.
*/
ArrayList<String> labelList = new ArrayList<String>();
int j = 0;
if (sVold.size() > 0) {
if (Build.VERSION.SDK_INT < Build.VERSION_CODES.GINGERBREAD)
labelList.add("Auto");
else if (Build.VERSION.SDK_INT < Build.VERSION_CODES.HONEYCOMB) {
if (Environment.isExternalStorageRemovable()) {
labelList.add(sContext
.getString(R.string.text_external_storage) + " 1");
j = 1;
} else
labelList.add(sContext
.getString(R.string.text_internal_storage));
} else {
if (!Environment.isExternalStorageRemovable()
|| Environment.isExternalStorageEmulated())
labelList.add(sContext
.getString(R.string.text_internal_storage));
else {
labelList.add(sContext
.getString(R.string.text_external_storage) + " 1");
j = 1;
}
}
if (sVold.size() > 1) {
for (int i = 1; i < sVold.size(); i++) {
labelList.add(sContext
.getString(R.string.text_external_storage)
+ " " + (i + j));
}
}
}
labels = new String[labelList.size()];
labelList.toArray(labels);
paths = new String[sVold.size()];
sVold.toArray(paths);
count = Math.min(labels.length, paths.length);
/*
* don't need these anymore, clear the lists to reduce memory use and to
* prepare them for the next time they're needed.
*/
sVold.clear();
}
try this out:
private boolean is_sdCardSaveToUse(){
/**default disk cache size in bytes*/
final int DEFAULT_DISK_CACHE_SIZE = 1024 * 1024 * 10; //10 MB
/**get sdCard state*/
String sdCardState = Environment.getExternalStorageState();
/**check if the sdCard is mounted*/
/**check if we can write to sdCard*/if (Environment.MEDIA_MOUNTED.equals(sdCardState)) {
if (Environment.MEDIA_MOUNTED_READ_ONLY.equals(sdCardState)) {
Log.d("sdCard", "mounted readOnly");
} else {
Log.d("sdCard", "mounted readWrite");
/**get free usable space in bytes */
long freeUsableSpace = Environment.getExternalStorageDirectory().getUsableSpace();
int temp = Math.round(((float) freeUsableSpace / 1024) / 1024); //convert from bytes to MB.
Log.d("usableSpace= ", Integer.toString(temp) + " MB");
if (freeUsableSpace > DEFAULT_DISK_CACHE_SIZE){
return true;
} else {
Log.d("sdCard","not enough space");
return false;
}
}
} else{
Log.d("sdCard","not mounted");
return false;
}
return false;
}
