prediction method is giving an error in java-weka integration - java

package demo_thesis;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.evaluation.NominalPrediction;
import weka.classifiers.rules.DecisionTable;
import weka.classifiers.rules.PART;
import weka.classifiers.trees.DecisionStump;
import weka.classifiers.trees.J48;
import weka.core.FastVector;
import weka.core.Instances;
public class WekaTest {
public static BufferedReader readDataFile(String filename) {
BufferedReader inputReader = null;
try {
inputReader = new BufferedReader(new FileReader(filename));
} catch (FileNotFoundException ex) {
System.err.println("File not found: " + filename);
}
return inputReader;
}
public static Evaluation classify(Classifier model,
Instances trainingSet, Instances testingSet) throws Exception {
Evaluation evaluation = new Evaluation(trainingSet);
model.buildClassifier(trainingSet);
evaluation.evaluateModel(model, testingSet);
return evaluation;
}
public static double calculateAccuracy(FastVector predictions) {
double correct = 0;
for (int i = 0; i < predictions.size(); i++) {
NominalPrediction np = (NominalPrediction) predictions.elementAt(i);
if (np.predicted() == np.actual()) {
correct++;
}
}
return 100 * correct / predictions.size();
}
public static Instances[][] crossValidationSplit(Instances data, int numberOfFolds) {
Instances[][] split = new Instances[2][numberOfFolds];
for (int i = 0; i < numberOfFolds; i++) {
split[0][i] = data.trainCV(numberOfFolds, i);
split[1][i] = data.testCV(numberOfFolds, i);
}
return split;
}
public static void main(String[] args) throws Exception {
BufferedReader datafile = readDataFile("C:\\Users\\user\\Desktop\\demo_thesis\\src\\input_file\\weather.txt");
Instances data = new Instances(datafile);
data.setClassIndex(data.numAttributes() - 1);
// Do 10-split cross validation
Instances[][] split = crossValidationSplit(data, 10);
// Separate split into training and testing arrays
Instances[] trainingSplits = split[0];
Instances[] testingSplits = split[1];
// Use a set of classifiers
Classifier[] models = {
new J48(), // a decision tree
new PART(),
new DecisionTable(),//decision table majority classifier
new DecisionStump() //one-level decision tree
};
// Run for each model
for (int j = 0; j < models.length; j++) {
// Collect every group of predictions for current model in a FastVector
FastVector predictions = new FastVector();
// For each training-testing split pair, train and test the classifier
for (int i = 0; i < trainingSplits.length; i++) {
Evaluation validation = classify(models[j], trainingSplits[i], testingSplits[i]);
predictions.appendElements(validation.predictions());
// Uncomment to see the summary for each training-testing pair.
System.out.println(models[j].toString());
}
// Calculate overall accuracy of current classifier on all splits
double accuracy = calculateAccuracy(predictions);
// Print current classifier's name and accuracy in a complicated,
// but nice-looking way.
System.out.println("Accuracy of " + models[j].getClass().getSimpleName() + ": "
+ String.format("%.2f%%", accuracy)
+ "\n---------------------------------");
}
}
}
I have integrated the Weka jar file into a Java package and used the code above, but there is an error on the line predictions.appendElements(validation.predictions()); that says "cannot find symbol  symbol: method predictions()". I am using NetBeans IDE 8.2 and JDK 1.8. I have tried many things but cannot solve it. How should I fix this error?
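One thing worth checking is whether the weka.jar on the classpath matches the API this code was written against: the snippet uses the old FastVector-based API, while recent Weka 3.8.x jars deprecate FastVector and have Evaluation.predictions() return an ArrayList<Prediction>. A minimal sketch of the collection loop against that newer API (an assumption about the jar version, not a confirmed diagnosis of the error) could look like this:

import java.util.ArrayList;
import weka.classifiers.evaluation.Prediction;

// inside the loop over models, assuming a Weka 3.8.x jar
ArrayList<Prediction> predictions = new ArrayList<Prediction>();
for (int i = 0; i < trainingSplits.length; i++) {
    Evaluation validation = classify(models[j], trainingSplits[i], testingSplits[i]);
    predictions.addAll(validation.predictions()); // predictions() returns ArrayList<Prediction> in 3.8.x
}

// accuracy over the collected predictions
double correct = 0;
for (Prediction p : predictions) {
    NominalPrediction np = (NominalPrediction) p;
    if (np.predicted() == np.actual()) {
        correct++;
    }
}
double accuracy = 100 * correct / predictions.size();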

Related

Hadoop mapreduce job creates too big intermediate files

I wrote a mapreduce program, but when I try to run it on Hadoop it cannot succeed because it generates so much intermediate data that I get an error message saying the node has no more space on it. It then tries the second node, but the result is the same. I would like to process two text files of approximately ~60k lines.
I have tried:
- enabling snappy compression, but it didn't help.
- adding more space, so the two nodes have 50 GB of storage each.
Since neither of these helped, maybe the problem is with the code rather than with the setup.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class FirstMapper extends Mapper<LongWritable, Text, Text, Text> {
enum POS_TAG {
CC, CD, DT, EX,
FW, IN, JJ, JJR,
JJS, LS, MD, NN,
NNS, NNP, NNPS, PDT,
WDT, WP, POS, PRP,
PRP$, RB, RBR, RBS,
RP, SYM, TO, UH,
VB, VBD, VBG, VBN,
VBP, VBZ, WP$, WRB
}
private static final List<String> tags = Stream.of(POS_TAG.values())
.map(Enum::name)
.collect(Collectors.toList());
private static final int MAX_NGRAM = 5;
private static String[][] cands = {
new String[3],
new String[10],
new String[32],
new String[10]
};
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
String location = conf.get("job.cands.path");
if (location != null) {
BufferedReader br = null;
try {
FileSystem fs = FileSystem.get(conf);
Path path = new Path(location);
if (fs.exists(path)) {
FSDataInputStream fis = fs.open(path);
br = new BufferedReader(new InputStreamReader(fis));
String line;
int i = 0;
while ((line = br.readLine()) != null) {
String[] splitted = line.split(" ");
cands[i] = splitted;
i++;
}
}
} catch (IOException e) {
//
} finally {
br.close();
}
}
}
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] tokens = value.toString().split(" ");
int m = tokens.length;
for (int n = 2; n <= MAX_NGRAM; n++) {
for (int s = 0; s <= m - n; s++) {
for (int i = 0; i < cands[n - 2].length; i++) {
List<String> pattern = new ArrayList<>();
List<String> metWords = new ArrayList<>();
for (int j = 0; j <= n - 1; j++) {
String[] pair = tokens[s + j].split("/");
String word = pair[0];
String pos = pair[1];
char c = cands[n - 2][i].charAt(j);
addToPattern(word, pos, c, pattern);
if (c > 0 && tags.contains(pos)) {
metWords.add(word);
}
}
if (metWords.isEmpty()) {
metWords.add("_NONE");
}
Text resultKey = new Text(pattern.toString() + ";" + metWords.toString());
context.write(resultKey, new Text(key.toString()));
}
}
}
}
public void addToPattern(String word, String pos, char c, List<String> pattern) {
switch (c) {
case 'w':
pattern.add(word);
break;
case 'p':
pattern.add(pos);
break;
default:
pattern.add("_WC_");
break;
}
}
}
// imports needed by the driver class (not shown in the original snippet)
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Main {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("job.cands.path", "/user/thelfter/pwp");
Job job1 = Job.getInstance(conf, "word pattern1");
job1.setJarByClass(Main.class);
job1.setMapperClass(FirstMapper.class);
job1.setCombinerClass(FirstReducer.class);
job1.setReducerClass(FirstReducer.class);
job1.setMapOutputKeyClass(Text.class);
job1.setMapOutputValueClass(Text.class);
FileInputFormat.addInputPath(job1, new Path(args[0]));
FileOutputFormat.setOutputPath(job1, new Path("/user/thelfter/output"));
System.exit(job1.waitForCompletion(true) ? 0 : 1);
}
}
If you're using YARN, then the NodeManager's disk space is controlled by yarn.nodemanager.local-dirs in your yarn-site.xml file, so whatever that points to needs to have enough disk space.
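For reference, that setting lives in yarn-site.xml on each NodeManager host; the directory values below are purely illustrative:

<property>
  <name>yarn.nodemanager.local-dirs</name>
  <value>/data1/yarn/local,/data2/yarn/local</value>
</property>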

How to correctly use ZipfDistribution from the Apache Commons Math library in Java?

I want to create a source of data (in Java) based on words (from a dictionary) that follow a Zipf distribution, so I came across ZipfDistribution and NormalDistribution in the Apache Commons Math library. Unfortunately, information about how to use these classes is scarce. I tried some tests, but I am not sure I am using them the right way. I am following only what is written in the documentation of each constructor, but the results don't seem to be "well-distributed".
import org.apache.commons.math3.distribution.NormalDistribution;
import org.apache.commons.math3.distribution.ZipfDistribution;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import org.apache.flink.streaming.api.functions.source.RichSourceFunction; // RichSourceFunction comes from Apache Flink (missing from the original imports)
public class ZipfDistributionDataSource extends RichSourceFunction<String> {
private static final String DISTINCT_WORDS_URL = "https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt";
public static void main(String[] args) throws Exception {
ZipfDistributionDataSource zipfDistributionDataSource = new ZipfDistributionDataSource();
StringBuffer stringBuffer = new StringBuffer(zipfDistributionDataSource.readDataFromResource());
String[] words = stringBuffer.toString().split("\n");
System.out.println("size: " + words.length);
System.out.println("Normal Distribution");
NormalDistribution normalDistribution = new NormalDistribution(words.length / 2, 1);
for (int i = 0; i < 10; i++) {
int sample = (int) normalDistribution.sample();
System.out.print("sample[" + sample + "]: ");
System.out.println(words[sample]);
}
System.out.println();
System.out.println("Zipf Distribution");
ZipfDistribution zipfDistribution = new ZipfDistribution(words.length - 1, 1);
for (int i = 0; i < 10; i++) {
int sample = zipfDistribution.sample();
System.out.print("sample[" + sample + "]: ");
System.out.println(words[sample]);
}
}
private String readDataFromResource() throws Exception {
URL url = new URL(DISTINCT_WORDS_URL);
InputStream in = url.openStream();
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(in));
StringBuilder builder = new StringBuilder();
String line;
try {
while ((line = bufferedReader.readLine()) != null) {
builder.append(line + "\n");
}
bufferedReader.close();
} catch (IOException ioe) {
ioe.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
return builder.toString();
}
}
output
size: 370103
Normal Distribution
sample[185049]: metathesize
sample[185052]: metathetically
sample[185051]: metathetical
sample[185050]: metathetic
sample[185049]: metathesize
sample[185050]: metathetic
sample[185052]: metathetically
sample[185050]: metathetic
sample[185052]: metathetically
sample[185050]: metathetic
Zipf Distribution
sample[11891]: anaphasic
sample[314]: abegge
sample[92]: abandoner
sample[3]: aah
sample[36131]: blepharosynechia
sample[218]: abbozzo
sample[8]: aalii
sample[5382]: affing
sample[6394]: agoraphobia
sample[4360]: adossed
You are using it just fine from a code perspective :) The problem is in assuming the source material is ordered by Zipf frequency when it is clearly alphabetical. The whole point of using ZipfDistribution is that words[0] must be the most common word (hint: it's 'the') with roughly twice the frequency of words[1], and so on.
https://en.wikipedia.org/wiki/Word_lists_by_frequency
https://en.wikipedia.org/wiki/Most_common_words_in_English
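A minimal sketch of sampling against a frequency-ordered list (assuming a local file words_by_frequency.txt sorted by descending frequency, one word per line; the file name is hypothetical):

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import org.apache.commons.math3.distribution.ZipfDistribution;

public class ZipfWordSampler {
    public static void main(String[] args) throws Exception {
        // assumed: a word list already sorted by descending frequency, one word per line
        List<String> wordsByFreq = Files.readAllLines(Paths.get("words_by_frequency.txt"));
        ZipfDistribution zipf = new ZipfDistribution(wordsByFreq.size(), 1.0);
        for (int i = 0; i < 10; i++) {
            int rank = zipf.sample();                      // sample() returns a 1-based rank
            System.out.println(wordsByFreq.get(rank - 1)); // rank 1 maps to the most frequent word
        }
    }
}

With that ordering, rank 1 comes back far more often than rank 1000, which is the behaviour a Zipf word source is supposed to show.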

Trained SVM only ever outputs 1.0 as a result despite 0.0 training error

I am trying to classify a dataset. In this dataset, the first column is the ideal outcome and the other 20 columns are the inputs.
The problem for me is that the SVM trained on the dataset (in this case 80% is used for training) shows a training error of 0.0, but it always predicts 1.0 as the outcome.
I have divided the set into two parts: 80% of the data for training and 20% for classification. The data is a concatenation of two short time series of RSI values (one 2-period and one 14-period).
Why does the SVM behave this way? And can I do something to avoid it? I thought a training error of 0.0 would mean that the SVM makes no errors on the training set, but judging from the results that seems to be false.
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.encog.Encog;
import org.encog.ml.data.MLData;
import org.encog.ml.data.MLDataPair;
import org.encog.ml.data.MLDataSet;
import org.encog.ml.data.basic.BasicMLDataSet;
import org.encog.ml.svm.SVM;
import org.encog.ml.svm.training.SVMTrain;
public class SVMTest {
public static void main(String[] args) {
List<String> lines = readFile("/home/wens/mlDataSet.csv");
double[][] trainingSetData = getInputData(lines, 0, lines.size()/10*8);
double[][] trainingIdeal = getIdeal(lines, 0, lines.size()/10*8);
MLDataSet trainingSet = new BasicMLDataSet(trainingSetData, trainingIdeal);
double[][] classificationSetData = getInputData(lines, lines.size()/10*8, lines.size());
double[][] classificationIdeal = getIdeal(lines, lines.size()/10*8, lines.size());
MLDataSet classificationSet = new BasicMLDataSet(classificationSetData, classificationIdeal);
SVM svm = new SVM(20,false);
final SVMTrain train = new SVMTrain(svm, trainingSet);
train.iteration();
train.finishTraining();
System.out.println("training error: " + train.getError());
System.out.println("SVM Results:");
for(MLDataPair pair: classificationSet ) {
final MLData output = svm.compute(pair.getInput());
System.out.println("actual: " + output.getData(0) + "\tideal=" + pair.getIdeal().getData(0));
}
Encog.getInstance().shutdown();
}
private static List<String> readFile(String filepath){
List<String> res = new ArrayList<>();
try {
File f = new File(filepath);
BufferedReader b = new BufferedReader(new FileReader(f));
String readLine = "";
while ((readLine = b.readLine()) != null) {
res.add(readLine);
}
} catch (IOException e) {
e.printStackTrace();
}
return res;
}
private static double[][] getInputData(List<String> lines, int start, int end){
double[][] res = new double[end-start][20];
int cnt = 0;
for(int i=start; i<end; i++){
String[] tmp = lines.get(i).split("\t");
for(int j=1; j<tmp.length; j++){
res[cnt][j-1] = Double.parseDouble(tmp[j]);
}
cnt++;
}
return res;
}
private static double[][] getIdeal(List<String> lines, int start, int end){
double[][] res = new double[end-start][1];
int cnt = 0;
for(int i=start; i<end; i++){
String[] tmp = lines.get(i).split("\t");
res[cnt][0] = Double.parseDouble(tmp[0]);
cnt++;
}
return res;
}
}
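Before blaming the SVM itself, it may be worth checking whether the 20% classification slice actually contains more than one label. A small diagnostic sketch using the helper methods above (added here for illustration, not part of the original program):

// inside main, after building classificationIdeal
java.util.Map<Double, Integer> labelCounts = new java.util.HashMap<>();
for (double[] row : classificationIdeal) {
    labelCounts.merge(row[0], 1, Integer::sum);  // count occurrences of each ideal label
}
System.out.println("label distribution in classification slice: " + labelCounts);

If one label dominates both slices, a constant prediction of 1.0 can coexist with a very low reported training error.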

Java: extract a count based on column data from a CSV file

I have my Java code below and TestData.csv (input file),
and my expected output is like below, but it shows the actual count instead.
I have tried a lot. Does anyone have any idea on this? Any help is valuable. Based on the column data, I want the count for a particular value.
package com;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import com.opencsv.CSVWriter;
import com.opencsv.CSVReader;
import java.time.format.DateTimeFormatter;
import java.time.LocalDateTime;
public class TestDataProcess {
public static void main(String args[]) throws IOException {
processData();
}
public static void processData() {
String[] trafficDetails;
int locColumnPosition, subCcolumnPosition, j, i, msgTypePosition, k, m, trafficLevelPosition;
String masterCSVFile, dayFolderPath;
String[] countryID = { "LOC1" };
String[] subID = { "S1" };
String[] mType = { "MSG1" };
String[] trafficLevel = { "1", "2", "3" };
String columnNameLocation = "CountryID";
String columnNameSubsystem = "SubID";
String columnNameMsgType = "Type";
String columnNameAlrmLevel = "TrafficLevel";
masterCSVFile = "D:\\TestData.csv";
dayFolderPath = "D:\\output\\";
DateTimeFormatter dtf = DateTimeFormatter.ofPattern("dd_MM_yyyy");
LocalDateTime now = LocalDateTime.now();
System.out.println(dtf.format(now));
int count = 0;
for (i = 0; i < countryID.length; i++) {
count = 0;
for (j = 0; j < subID.length; j++) {
count = 0;
String locaIdSubsysId = dtf.format(now) + "_" + countryID[i] + "_" + subID[j] + ".csv";
try (CSVWriter csvWriter = new CSVWriter(new FileWriter(dayFolderPath + locaIdSubsysId, true));
CSVReader csvReader = new CSVReader(new FileReader(masterCSVFile));) {
trafficDetails = csvReader.readNext();
csvWriter.writeNext(trafficDetails);
locColumnPosition = getHeaderLocation(trafficDetails, columnNameLocation);
subCcolumnPosition = getHeaderLocation(trafficDetails, columnNameSubsystem);
msgTypePosition = getHeaderLocation(trafficDetails, columnNameMsgType);
trafficLevelPosition = getHeaderLocation(trafficDetails, columnNameAlrmLevel);
while ((trafficDetails = csvReader.readNext()) != null && locColumnPosition > -1
&& subCcolumnPosition > -1) {
for (k = 0; k < mType.length; k++) {
for (m = 0; m < trafficLevel.length; m++) {
if (trafficDetails[locColumnPosition].matches(countryID[i])
& trafficDetails[subCcolumnPosition].matches(subID[j])
& trafficDetails[trafficLevelPosition].matches(trafficLevel[m])
& trafficDetails[msgTypePosition].matches(mType[k]))
{
count = count + 1;
csvWriter.writeNext(trafficDetails);
}
}
}
}
} catch (Exception ee) {
ee.printStackTrace();
}
}
}
}
public static int getHeaderLocation(String[] headers, String columnName) {
return Arrays.asList(headers).indexOf(columnName);
}
}
You can do that using a Map that stores the traffic level as the key and all matching rows from your csv file in a List as the value. Then just print the size of each List.
See the following example and have a look at the code comments:
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
public class ExampleMain {
public static void main(String[] args) {
// create a Path object from the path to your file
Path csvFilePath = Paths.get("Y:\\our\\path\\to\\file.csv");
// create a data structure that stores data rows per traffic level
Map<Integer, List<DataRow>> dataRowsPerTrafficLevel = new TreeMap<Integer, List<DataRow>>();
try {
// read all the lines of the file
List<String> lines = Files.readAllLines(csvFilePath);
// iterate all the lines, skipping the header line
for (int i = 1; i < lines.size(); i++) {
// split the lines by the separator (WHICH MAY DIFFER FROM THE ONE USED HERE)
String[] lineValues = lines.get(i).split(",");
// store the value from column 6 (index 5) as the traffic level
int trafficLevel = Integer.valueOf(lineValues[5]);
// if the map already contains this key, just add the next data row
if (dataRowsPerTrafficLevel.containsKey(trafficLevel)) {
DataRow dataRow = new DataRow();
dataRow.subId = lineValues[1];
dataRow.countryId = lineValues[2];
dataRow.type = lineValues[3];
dataRowsPerTrafficLevel.get(trafficLevel).add(dataRow);
} else {
/* otherwise create a list, then a data row, add it to the list and put it in
* the map along with the new key
*/
List<DataRow> dataRows = new ArrayList<DataRow>();
DataRow dataRow = new DataRow();
dataRow.subId = lineValues[1];
dataRow.countryId = lineValues[2];
dataRow.type = lineValues[3];
dataRows.add(dataRow);
dataRowsPerTrafficLevel.put(trafficLevel, dataRows);
}
}
// print the result
dataRowsPerTrafficLevel.forEach((trafficLevel, dataRows) -> {
System.out.println("For TrafficLevel " + trafficLevel + " there are " + dataRows.size()
+ " data rows in the csv file");
});
} catch (IOException e) {
e.printStackTrace();
}
}
/*
* small holder class that just holds the values of columns 3, 4 and 5.
* If you want to have distinct values, make this one a full POJO implementing Comparable
*/
static class DataRow {
String subId;
String countryId;
String type;
}
}

How do I create a method to create serial numbers based on an input parameter?

I am creating a delimited text string from a data source that contains non-delimited document metadata. All of the data is sorted by index, then subindex, and one of the first things I want to do is create a serial number for each record. The first characters of each line dictate whether this is an index or subindex record, and I use these to increment the data as noted in the logic below, which works as expected.
import java.util.*;
import java.io.*;
import java.nio.file.*;
import java.util.regex.*;
import java.lang.StringBuilder;
//
public class mdata
{
public static void main(String[] args)
{
double indexNo = 0;
double subIndexNo = 0;
double recNo = 0 ;
try
{
FileInputStream inputStream = new FileInputStream("whidata0.htm");
BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
String inputLine = null;
while((inputLine=br.readLine())!=null)
{
String recordNumber = "";
if (inputLine.trim().startsWith("aIE(2")) {
indexNo = indexNo + 1;
subIndexNo = .00;
} else
if (inputLine.trim().startsWith("aIE(3")) {
subIndexNo = subIndexNo + .01;
}
recNo = indexNo + subIndexNo;
System.out.println(recNo);
}
}
//
catch (Exception e)
{
System.err.println("Error: " + e.getMessage());
}
}
}
I have other applications that require me to serialize data, and I want to create a standalone method that assigns the serial number. I'm having some issues which may be scope-related, and I need a few extra sets of eyes to help me understand what's happening.
Here's where I am so far with creating a serialization method:
import java.util.*;
import java.io.*;
import java.nio.file.*;
import java.util.regex.*;
import java.lang.StringBuilder;
//
public class mdata2
{
public static void main(String[] args)
{
try
{
FileInputStream inputStream = new FileInputStream("whidata0.htm");
BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
String inputLine = null;
while((inputLine=br.readLine())!=null)
{
recNo = generateSerial(inputLine.trim());
System.out.println(recNo);
}
}
//
catch (Exception e)
{
System.err.println("Error: " + e.getMessage());
}
}
//
public static double generateSerial(String inputLine)
{
double indexNo = 0; // where do
double subIndexNo = 0; // these have
double recNo = 0 ; // to go?
String recordNumber = "";
if (inputLine.trim().startsWith("aIE(2")) {
indexNo = indexNo + 1;
subIndexNo = .00;
} else
if (inputLine.trim().startsWith("aIE(3")) {
subIndexNo = subIndexNo + .01;
}
recNo = indexNo + subIndexNo;
System.out.println(recNo);
return recNo;
}
}
In the first block of code, my recNo prints as the sequence 1.00, 2.00, 2.01, 2.02, 2.03, 3.00, etc. In the second, that same sequence comes back as 1.00, 1.00, 1.01, 1.01, 1.01, 1.00, etc. Looking at it, that makes sense; the first thing I'm doing in the method is resetting the variables to 0. Initializing the variables in main gives me scope issues: generateSerial doesn't recognize the variables.
I played around with using combinations of this.[variableName], but that didn't seem to have any effect. What's the best way to handle this?
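One common way to keep the counters alive across calls, sketched below under the assumption that a single generator instance is created once before the read loop (the class and method names are made up for illustration), is to move them out of the method and into instance fields of a small helper class:

public class SerialGenerator {
    private double indexNo = 0;
    private double subIndexNo = 0;

    public double next(String inputLine) {
        if (inputLine.trim().startsWith("aIE(2")) {
            indexNo = indexNo + 1;      // new index record: bump the whole number
            subIndexNo = 0.00;          // and reset the fractional part
        } else if (inputLine.trim().startsWith("aIE(3")) {
            subIndexNo = subIndexNo + 0.01;  // subindex record: bump the fractional part
        }
        return indexNo + subIndexNo;
    }
}

In the read loop, create the generator once before the while and call generator.next(inputLine) for each line; the fields survive between calls, so the sequence increments the same way it did when the variables lived in main. (Making the three variables static fields of mdata2 would also work, at the cost of shared global state.)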
