Hadoop - MapReduce - Java

I've been trying to solve a simple Map/Reduce problem: counting words from some input files and then emitting their frequency as one key and their word length as the other. The mapper would emit a one every time a new word is read from the file, and the reducer would then group all identical words together to get their final count. As output, I'd like to see, for each word length, which word is the most frequent.
This is as far as we've gotten (my team and I).
This is the WordCountMapper class:
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class WordCountMapper extends MapReduceBase implements
        Mapper<LongWritable, Text, Text, CompositeGroupKey> {

    private final IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value,
            OutputCollector<Text, CompositeGroupKey> output, Reporter reporter)
            throws IOException {
        String line = value.toString();
        StringTokenizer itr = new StringTokenizer(line.toLowerCase());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            CompositeGroupKey gky = new CompositeGroupKey(1, word.getLength());
            output.collect(word, gky);
        }
    }
}
This is the WordCountReducer class:
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class WordCountReducer extends MapReduceBase
        implements Reducer<Text, CompositeGroupKey, Text, CompositeGroupKey> {

    @Override
    public void reduce(Text key, Iterator<CompositeGroupKey> values,
            OutputCollector<Text, CompositeGroupKey> output, Reporter reporter)
            throws IOException {
        int sum = 0;
        int length = 0;
        while (values.hasNext()) {
            CompositeGroupKey value = (CompositeGroupKey) values.next();
            sum += value.getCount(); // process value
            length = key.getLength();
        }
        CompositeGroupKey cgk = new CompositeGroupKey(sum, length);
        output.collect(key, cgk);
    }
}
This is the WordCount driver class:
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobStatus;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;

public class WordCount {

    public static void main(String[] args) {
        JobClient client = new JobClient();
        JobConf conf = new JobConf(WordCount.class);

        // specify output types
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(CompositeGroupKey.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(CompositeGroupKey.class);

        // specify input and output dirs
        FileInputFormat.addInputPath(conf, new Path("input"));
        FileOutputFormat.setOutputPath(conf, new Path("output16"));

        // specify a mapper
        conf.setMapperClass(WordCountMapper.class);

        // specify a reducer
        conf.setReducerClass(WordCountReducer.class);
        conf.setCombinerClass(WordCountReducer.class);

        client.setConf(conf);
        try {
            JobClient.runJob(conf);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
And this is the CompositeGroupKey class:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;

public class CompositeGroupKey implements WritableComparable<CompositeGroupKey> {

    int count;
    int length;

    public CompositeGroupKey(int c, int l) {
        this.count = c;
        this.length = l;
    }

    public void write(DataOutput out) throws IOException {
        WritableUtils.writeVInt(out, count);
        WritableUtils.writeVInt(out, length);
    }

    public void readFields(DataInput in) throws IOException {
        this.count = WritableUtils.readVInt(in);
        this.length = WritableUtils.readVInt(in);
    }

    public int compareTo(CompositeGroupKey pop) {
        return 0;
    }

    public int getCount() {
        return this.count;
    }

    public int getLength() {
        return this.length;
    }
}
Right now I get this error:
java.lang.RuntimeException: java.lang.NoSuchMethodException: CompositeGroupKey.<init>()
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:80)
at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:62)
at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:40)
at org.apache.hadoop.mapred.Task$ValuesIterator.readNextValue(Task.java:738)
at org.apache.hadoop.mapred.Task$ValuesIterator.next(Task.java:678)
at org.apache.hadoop.mapred.Task$CombineValuesIterator.next(Task.java:757)
at WordCountReducer.reduce(WordCountReducer.java:24)
at WordCountReducer.reduce(WordCountReducer.java:1)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.combineAndSpill(MapTask.java:904)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:785)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.flush(MapTask.java:698)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:228)
at org.apache.hadoop.mapred.TaskTracker$Child.main(TaskTracker.java:2209)
Caused by: java.lang.NoSuchMethodException: CompositeGroupKey.<init>()
at java.lang.Class.getConstructor0(Unknown Source)
at java.lang.Class.getDeclaredConstructor(Unknown Source)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:74)
I know the coding's not that good, but right now we don't have any idea where we went wrong, so any help would be welcome!

You have to provide an empty default constructor in your key class CompositeGroupKey. It is used for serialization.
Just add:
public CompositeGroupKey() {
}

Whenever you see an exception like the one given below
java.lang.RuntimeException: java.lang.NoSuchMethodException: CompositeGroupKey.<init>()
it points to a problem with object instantiation: a constructor the framework needs, in this case the default (no-argument) constructor, is not present.
The moment you write a parameterized constructor, the compiler no longer generates the default constructor unless you declare it explicitly.
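For illustration, here is a tiny made-up class (not from the question) that shows the rule:

public class Sample {

    private int count;

    // Declaring this parameterized constructor means the compiler no longer
    // generates a no-argument constructor automatically.
    public Sample(int count) {
        this.count = count;
    }

    // Without this explicitly declared no-arg constructor, reflection-based
    // code such as Hadoop's WritableSerialization fails with
    // java.lang.NoSuchMethodException: Sample.<init>()
    public Sample() {
    }
}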
The answer given by Ruslan Ostafiichuk is enough to answer your query; I just added a few more points to make things clearer.

Related

"How to fix 'uses or overrides a deprecated API and Recompile with -Xlint:deprecation for details' in Java"

I want to take advantage of Linux to practice Hadoop MapReduce examples.
I have written the code for my project, but I get a warning message when I compile it, and I could not run it. Having tried many possible approaches, such as ignoring the warning, I am still unable to run it. Below you will find the code.
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;

public class Average {

    public static class Map extends Mapper<Object, Text, Text, IntWritable> {

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer script = new StringTokenizer(line, "\n");
            while (script.hasMoreTokens()) {
                StringTokenizer scriptLine = new StringTokenizer(script.nextToken());
                Text Name = new Text(scriptLine.nextToken());
                int Score = Integer.parseInt(scriptLine.nextToken());
                context.write(Name, new IntWritable(Score));
            }
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        public void reduce(Text key, Iterable<IntWritable> value, Context context) throws IOException, InterruptedException {
            int numerator = 0;
            int denominator = 0;
            int avg = 0;
            for (IntWritable score : value) {
                numerator += score.get();
                denominator++;
            }
            avg = numerator / denominator;
            context.write(key, new IntWritable(avg));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        Path dst_path = new Path(otherArgs[1]);
        FileSystem hdfs = dst_path.getFileSystem(conf);
        if (hdfs.exists(dst_path)) {
            hdfs.delete(dst_path, true);
        }
        Job job = new Job(conf, "Average");
        job.setJarByClass(Average.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

MapReduce with Phoenix: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.hadoop.io.NullWritable

I am trying to insert values into a table ("mea_interval") from data collected in another table ("mea_data"). The id is not unique; it identifies a data type. I use the MeasureWritable class to read from and write to the database; it implements DBWritable and Writable. When I run my jar I get the error:
15/12/15 10:13:38 WARN mapred.LocalJobRunner: job_local957174264_0001
java.lang.ClassCastException: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.hadoop.io.NullWritable
at org.apache.phoenix.mapreduce.PhoenixRecordWriter.write(PhoenixRecordWriter.java:39)
at org.apache.hadoop.mapred.ReduceTask$NewTrackingRecordWriter.write(ReduceTask.java:551)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:85)
at org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer$Context.write(WrappedReducer.java:99)
at org.apache.hadoop.mapreduce.Reducer.reduce(Reducer.java:144)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:164)
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:610)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:444)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:449)
I can read the values in the table mea_data. If I print them to the console, they look correct. I think the error occurs during the execution of context.write in the map, but I don't understand why.
I've attached the job configuration and my mapper class. If you want to see another part of my code, don't hesitate to ask.
Thank you in advance. :)
The job configuration:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.phoenix.mapreduce.PhoenixInputFormat;
import org.apache.phoenix.mapreduce.PhoenixOutputFormat;
import org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil;
import org.apache.phoenix.mapreduce.util.PhoenixMapReduceUtil;

public class Application {

    public static void main(String[] args) {
        final Configuration configuration = HBaseConfiguration.create();
        final Job job;
        try {
            job = Job.getInstance(configuration, "phoenix-mr-job");

            final String selectQuery = "SELECT * FROM \"mea_data\" where \"timestamp\" > 1450168200";
            PhoenixMapReduceUtil.setInput(job, MeasureWritable.class, "mea_data", selectQuery);

            // Set the target Phoenix table and the columns
            PhoenixMapReduceUtil.setOutput(job, "\"mea_interval\"", "id_collection,startDate,endDate,value");

            job.setMapperClass(MeasureMapper.class);
            job.setReducerClass(MeasureReducer.class);

            job.setOutputFormatClass(PhoenixOutputFormat.class);
            // job.setInputFormatClass(PhoenixInputFormat.class);
            job.setNumReduceTasks(10);

            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(MeasureWritable.class);

            // TableMapReduceUtil.addDependencyJars(job);
            job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
The mapper class:
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MeasureMapper extends Mapper<NullWritable, MeasureWritable, LongWritable, Text> {

    @Override
    protected void map(NullWritable key, MeasureWritable measureWritable, Context context) throws IOException, InterruptedException {
        final long timestamp = measureWritable.getTimestamp();
        double val = measureWritable.getValue();
        final long id = measureWritable.getId();

        System.out.print("id : " + new LongWritable(id));
        System.out.print(" timestamp : " + timestamp);
        System.out.println(" val : " + val);

        try {
            context.write(new LongWritable(id), new Text(timestamp + ";" + val));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
The reducer class:
import java.io.IOException;
import java.text.NumberFormat;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MeasureReducer extends Reducer<LongWritable, Iterable<Text>, NullWritable, MeasureWritable> {

    protected void reduce(LongWritable key, Iterable<Text> valeurs, Context context) throws IOException, InterruptedException {
        MeasureWritable interval = new MeasureWritable();
        interval.setId(Long.valueOf(key.toString()).longValue());

        NumberFormat nf = NumberFormat.getInstance();
        for (Text valeur : valeurs) {
            String[] array = valeur.toString().split(";", -1);

            interval.setStartingDate(Long.valueOf(array[0]).longValue());
            interval.setEndingDate(Long.valueOf(array[0]).longValue());
            try {
                interval.setValue(nf.parse(array[1]).doubleValue());
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        context.write(NullWritable.get(), interval);
    }
}
Use LongWritable instead of NullWritable as the Mapper's input key type, both in the class declaration and as the map method's first parameter.
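For illustration, a minimal sketch of what that suggestion would look like for the mapper above (untested against Phoenix, and with the body trimmed to the essential write):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Sketch only: the input key type is LongWritable, as the answer recommends,
// both in the class declaration and in the map method's first parameter.
public class MeasureMapper extends Mapper<LongWritable, MeasureWritable, LongWritable, Text> {

    @Override
    protected void map(LongWritable key, MeasureWritable measureWritable, Context context)
            throws IOException, InterruptedException {
        final long timestamp = measureWritable.getTimestamp();
        final double val = measureWritable.getValue();
        final long id = measureWritable.getId();
        context.write(new LongWritable(id), new Text(timestamp + ";" + val));
    }
}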

Unable to identify the errors in this program

I am doing MapReduce with the following code. When I run this job, everything works fine, but the output shows 0 0. I suspect this may be due to the TryParseInt() method, which I quick-fixed because it was undefined previously: initially there was no TryParseInt() method, so I created one. Can anyone check whether the code is correct, especially the TryParseInt method, and give me any suggestions for running this program successfully?
The input looks like:
Thanks in advance.
import java.io.IOException;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.io.LongWritable;

public class MaxPubYear {

    public static class MaxPubYearMapper extends Mapper<LongWritable, Text, IntWritable, Text> {

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String delim = "\t";
            Text valtosend = new Text();
            String tokens[] = value.toString().split(delim);
            if (tokens.length == 2) {
                valtosend.set(tokens[0] + ";" + tokens[1]);
                context.write(new IntWritable(1), valtosend);
            }
        }
    }

    public static class MaxPubYearReducer extends Reducer<IntWritable, Text, Text, IntWritable> {

        public void reduce(IntWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int maxiValue = Integer.MIN_VALUE;
            String maxiYear = "";
            for (Text value : values) {
                String token[] = value.toString().split(";");
                if (token.length == 2 && TryParseInt(token[1]).intValue() > maxiValue) {
                    maxiValue = TryParseInt(token[1]);
                    maxiYear = token[0];
                }
            }
            context.write(new Text(maxiYear), new IntWritable(maxiValue));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job Job = new Job(conf, "Maximum Publication year");
        Job.setJarByClass(MaxPubYear.class);
        Job.setOutputKeyClass(Text.class);
        Job.setOutputValueClass(IntWritable.class);
        Job.setMapOutputKeyClass(IntWritable.class);
        Job.setMapOutputValueClass(Text.class);
        Job.setMapperClass(MaxPubYearMapper.class);
        Job.setReducerClass(MaxPubYearReducer.class);
        FileInputFormat.addInputPath(Job, new Path(args[0]));
        FileOutputFormat.setOutputPath(Job, new Path(args[1]));
        System.exit(Job.waitForCompletion(true) ? 0 : 1);
    }

    public static Integer TryParseInt(String string) {
        // TODO Auto-generated method stub
        return (0);
    }
}
The errors mean exactly what they say: for the three 'could not be resolved to a type' errors, you probably forgot to import the right classes. Error 2 simply means there is no method TryParseInt(String) in the class MaxPubYear.MaxPubYearReducer; you have to create one there.
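As a rough sketch (not part of the original answer), a TryParseInt that actually parses its argument could look like the one below; falling back to 0 for unparseable tokens is an assumption, and you might prefer Integer.MIN_VALUE so that bad rows never win the max comparison:

public static Integer TryParseInt(String string) {
    try {
        // Parse the token (expected to be a publication year) as an integer.
        return Integer.parseInt(string.trim());
    } catch (NumberFormatException e) {
        // Assumption: treat unparseable tokens as 0.
        return 0;
    }
}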

Splitting Files w.r.t input file MapReduce

Can somebody tell me what's wrong with the following code?
Can you help me get the output below using this MapReduce program?
Actually, this code runs fine, but the output is not as expected: it is generated in two files, yet the contents that should go to the Name file and the Age file keep getting swapped between them.
Input File:
Name:A
Age:28
Name:B
Age:25
Name:K
Age:20
Name:P
Age:18
Name:Ak
Age:11
Name:N
Age:14
Name:Kr
Age:26
Name:Ra
Age:27
And my output should be split into Name and Age files:
Name File:
Name:A
Name:B
Name:K
Name:P
Name:Ak
Name:N
Name:Kr
Name:Ra
Age File:
Age:28
Age:25
Age:20
Age:18
Age:11
Age:14
Age:26
Age:27
My code:
MyMapper.java
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {

    public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
        String[] dall = value.toString().split(":");
        output.collect(new Text(dall[0]), new Text(dall[1]));
    }
}
MyReducer.java:
import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class MyReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {

    public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
        while (values.hasNext()) {
            output.collect(new Text(key), new Text(values.next()));
        }
    }
}
MultiFileOutput.java:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.lib.*;

public class MultiFileOutput extends MultipleTextOutputFormat<Text, Text> {

    protected String generateFileNameForKeyValue(Text key, Text value, String name) {
        //return new Path(key.toString(), name).toString();
        return key.toString();
    }

    protected Text generateActualKey(Text key, Text value) {
        //return new Text(key.toString());
        return null;
    }
}
MyDriver.java:
import java.io.IOException;
import java.lang.Exception;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;

public class MyDriver {

    public static void main(String[] args) throws Exception, IOException {
        Configuration mycon = new Configuration();
        JobConf conf = new JobConf(mycon, MyDriver.class);
        //JobConf conf = new JobConf(MyDriver.class);
        conf.setJobName("Splitting");

        conf.setMapperClass(MyMapper.class);
        conf.setReducerClass(MyReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(MultiFileOutput.class);

        conf.setOutputKeyClass(Text.class);
        conf.setMapOutputKeyClass(Text.class);
        //conf.setOutputValueClass(Text.class);
        conf.setMapOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
        //System.err.println(JobClient.runJob(conf));
    }
}
Thank you!
OK, this is a slightly more complicated use case than a simple word count. :)
What you need is a composite key and a partitioner, and you need to set the number of reducers to 2.
Your composite key could be a Text (a concatenation such as Name|A or Age|28) or a custom Writable that has two instance variables holding the type (Name or Age) and the value.
In the mapper you create that Text or custom Writable and set it as the output key; the value can be just the person's name or age.
Create a partitioner (which implements org.apache.hadoop.mapred.Partitioner). In its getPartition method you decide, based on the key, which reducer each record goes to, as in the sketch below.
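For illustration only, here is a minimal sketch of such a partitioner with the old mapred API. It assumes the map output key is the record type ("Name" or "Age"), as produced by the question's MyMapper; the class name TypePartitioner is made up:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

// Hypothetical partitioner: routes all "Name" records to reducer 0 and
// everything else (here, "Age") to reducer 1. The driver would also need
// conf.setNumReduceTasks(2) and conf.setPartitionerClass(TypePartitioner.class).
public class TypePartitioner implements Partitioner<Text, Text> {

    @Override
    public void configure(JobConf job) {
        // no configuration needed for this sketch
    }

    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        int partition = "Name".equals(key.toString()) ? 0 : 1;
        return partition % numPartitions;
    }
}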
Hope this helps.

My Input file is being read twice by the mapper in MapReduce of Hadoop

I am facing a problem while writing a MapReduce program: my input file is being read twice by the program. I have already gone through the answer to "why is my sequence file being read twice in my hadoop mapper class?", but unfortunately it did not help.
My Mapper class is:
package com.siddu.mapreduce.csv;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class SidduCSVMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {

    IntWritable one = new IntWritable(1);

    @Override
    public void map(LongWritable key, Text line,
            OutputCollector<Text, IntWritable> output, Reporter report)
            throws IOException {
        String lineCSV = line.toString();
        String[] tokens = lineCSV.split(";");
        output.collect(new Text(tokens[2]), one);
    }
}
And My Reducer class is:
package com.siddu.mapreduce.csv;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class SidduCSVReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterator<IntWritable> inputFrmMapper,
            OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        System.out.println("In reducer the key is:" + key.toString());
        int relationOccurance = 0;
        while (inputFrmMapper.hasNext()) {
            IntWritable intWriteOb = inputFrmMapper.next();
            int val = intWriteOb.get();
            relationOccurance += val;
        }
        output.collect(key, new IntWritable(relationOccurance));
    }
}
And finally My Driver class is:
package com.siddu.mapreduce.csv;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SidduCSVMapReduceDriver {

    public static void main(String[] args) {
        JobClient client = new JobClient();
        JobConf conf = new JobConf(com.siddu.mapreduce.csv.SidduCSVMapReduceDriver.class);
        conf.setJobName("Siddu CSV Reader 1.0");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(com.siddu.mapreduce.csv.SidduCSVMapper.class);
        conf.setReducerClass(com.siddu.mapreduce.csv.SidduCSVReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        client.setConf(conf);
        try {
            JobClient.runJob(conf);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
You should be aware that Hadoop can spawn multiple attempts of the same task, so a mapper may run more than once (for example when speculative execution is on). If you see the log output twice, that is probably the reason.
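If speculative execution turns out to be the cause and you want to rule it out, one option (a general JobConf setting, not something stated in the answer above) is to disable it in the driver:

// Assumption: duplicate log output comes from speculative (backup) task attempts.
// These old-API JobConf flags turn speculative execution off for maps and reduces.
JobConf conf = new JobConf(com.siddu.mapreduce.csv.SidduCSVMapReduceDriver.class);
conf.setMapSpeculativeExecution(false);
conf.setReduceSpeculativeExecution(false);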
