I am very new to Hadoop and I am getting this error while running a MapReduce job. I am trying to calculate the average marks for each person, then take the output of the first job and pass it to a second job that calculates grades. I understand the problem but I cannot figure out where I am going wrong.
Below is the exception:
15/07/02 23:53:36 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
15/07/02 23:53:36 INFO input.FileInputFormat: Total input paths to process : 1
15/07/02 23:53:38 INFO mapred.JobClient: Running job: job_201507022153_0026
15/07/02 23:53:39 INFO mapred.JobClient: map 0% reduce 0%
15/07/02 23:53:44 INFO mapred.JobClient: Task Id : attempt_201507022153_0026_m_000000_0, Status : FAILED
java.lang.ClassCastException: org.apache.hadoop.io.Text cannot be cast to org.apache.hadoop.io.DoubleWritable
at com.hadoop.mrchain.Driver$Mapper2.map(Driver.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:647)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:323)
at org.apache.hadoop.mapred.Child$4.run(Child.java:266)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1278)
at org.apache.hadoop.mapred.Child.main(Child.java:260)
My code:
package com.hadoop.mrchain;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driver {
/*
* Mapper1
*/
public static class Mapper1 extends
Mapper<Object, Text, Text, DoubleWritable> {
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
String studentName = itr.nextToken();
Double marks = Double.parseDouble(itr.nextToken());
context.write(new Text(studentName), new DoubleWritable(marks));
}
}
/*
* Mapper2
*/
public static class Mapper2 extends
Mapper<Object, DoubleWritable, Text, DoubleWritable> {
public void map(Object key, DoubleWritable value, Context context)
throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
context.write(new Text(itr.nextToken()), new DoubleWritable(Double
.parseDouble(itr.nextToken().toString())));
}
}
/*
* Reducer1
*/
public static class Reducer1 extends
Reducer<Text, DoubleWritable, Text, DoubleWritable> {
public void reduce(Text key, Iterable<DoubleWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
int count = 0;
for (DoubleWritable val : values) {
sum += val.get();
count++;
}
double avg = sum / count;
context.write(key, new DoubleWritable(avg));
}
}
/*
* Reducer2
*/
public static class Reducer2 extends
Reducer<Text, DoubleWritable, Text, Text> {
public void reduce(Text key, Iterable<DoubleWritable> values,
Context context) throws IOException, InterruptedException {
for (DoubleWritable val : values) {
// double marks = Double.parseDouble(val.toString());
int marks = ((Double) val.get()).intValue();
if (marks >= 70) {
context.write(key, new Text("GradeA"));
} else if (marks >= 60 && marks < 70) {
context.write(key, new Text("GradeB"));
} else if (marks < 60 && marks >= 40) {
context.write(key, new Text("GradeC"));
} else {
context.write(key, new Text("FAIL"));
}
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
cleanFileSystem(conf, args);
Job job1 = new Job(conf, "BATCH51-MRCHAIN-JOB1");
job1.setJarByClass(Driver.class);
job1.setMapperClass(Mapper1.class);
job1.setCombinerClass(Reducer1.class);
job1.setReducerClass(Reducer1.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(job1, new Path(args[0]));
FileOutputFormat.setOutputPath(job1, new Path(args[1]));
job1.waitForCompletion(true);
// Job2
Job job2 = new Job(conf, "BATCH51-MRCHAIN-JOB2");
job2.setJarByClass(Driver.class);
job2.setMapperClass(Mapper2.class);
job2.setCombinerClass(Reducer2.class);
job2.setReducerClass(Reducer2.class);
// job2.setMapOutputKeyClass(Text.class);
// job2.setMapOutputValueClass(DoubleWritable.class);
job2.setOutputKeyClass(Text.class);
job2.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job2, new Path(args[1]));
FileOutputFormat.setOutputPath(job2, new Path(args[2]));
System.exit(job2.waitForCompletion(true) ? 0 : 1);
}
private static void cleanFileSystem(Configuration conf, String[] args)
throws Exception {
FileSystem fs = FileSystem.get(conf);
if (fs.exists(new Path(args[1]))) {
fs.delete(new Path(args[1]), true);
}
if (fs.exists(new Path(args[2]))) {
fs.delete(new Path(args[2]), true);
}
// if (fs.exists(new Path(args[3]))) {
// fs.delete(new Path(args[3]), true);
// }
}
}
Sample Input:
hello 90
suresh 80
krishna 16
ramesh 55
santosh 82
anji 66
gopal 88
hello99
suresh 80
krishna 16
gopal 91
hello 91
suresh 80
krishna 86
ramesh 55
santosh 82
anji 66
gopal 95
It is not able to cast some strings into a double; for example, hello cannot be cast to a double. You need to change your logic in the mapper to fix this.
There are 2 issues to be addressed in the code posted in the question:
We need to ensure the second mapper can correctly read the output generated by the first map-reduce job. The input format in use is the default TextInputFormat, which reads and stores key-value pairs as LongWritable, Text; here the code is trying to fit a value of type Text into type DoubleWritable, hence the exception. To fix this, the second mapper must accept Text and parse the values out of it.
Because a combiner's output goes to the reducer, the given reducer class cannot be used as-is for the combiner. To explain: in the given scenario the combiner would emit Text, Text, which is NOT the key-value type the reducer expects.
Below are the changes required to make the code work:
Mapper<LongWritable, Text, Text, DoubleWritable> { //Changed in mapper2 defn
//Changes in Driver main method
job1.setInputFormatClass(TextInputFormat.class); //added
job1.setOutputFormatClass(TextOutputFormat.class); //added
//job2.setCombinerClass(Reducer2.class); //commented
job2.setMapOutputKeyClass(Text.class); //un-commented
job2.setMapOutputValueClass(DoubleWritable.class); //un-commented
job2.setInputFormatClass(TextInputFormat.class); //added
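For example, here is a minimal sketch of what Mapper2 could look like after those changes. It assumes job1's default TextOutputFormat layout, i.e. each line of the intermediate output is the student name and the average separated by a tab; org.apache.hadoop.io.LongWritable also needs to be imported:
public static class Mapper2 extends
        Mapper<LongWritable, Text, Text, DoubleWritable> {
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // job1 wrote "name<TAB>average" as plain text; parse it back out
        StringTokenizer itr = new StringTokenizer(value.toString());
        String studentName = itr.nextToken();
        double avg = Double.parseDouble(itr.nextToken());
        context.write(new Text(studentName), new DoubleWritable(avg));
    }
}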
Hope this helps.
Related
I am new to the MapReduce topic and still in the learning phase. I thank you in advance for the help and any further tips. In the context of a university exercise I have the following problem:
From a csv file (listed below as an example) I want to calculate the average order_demand for every single product_code.
The codes shown below, "FrequencyMapper" and "FrequencyReducer", are running on my server, and I think I currently have a display problem with the output.
Since I am taking my first steps with MapReduce, I am grateful for any help.
Listed below are the mapper, reducer and driver codes.
Example of the Dataset (csv-file)
Product_Code,Warehouse,Product_Category,Date,Order_Demand
Product_0993,Whse_J,Category_028,2012/7/27,100
Product_0979,Whse_J,Category_028,2012/6/5,500
Product_0979,Whse_E,Category_028,2012/11/29,500
Product_1157,Whse_E,Category_006,2012/6/4,160000
Product_1159,Whse_A,Category_006,2012/7/17,50000
My goal for example:
Product_0979 500
Product_1157 105000
...
FrequencyMapper.java:
package ma.test.a02;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class FrequencyMapper
extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
public void map(LongWritable offset, Text lineText, Context context)
throws IOException, InterruptedException {
String line = lineText.toString();
if(line.contains("Product")) {
String productcode = line.split(",")[0];
float orderDemand = Float.parseFloat(line.split(",")[4]);
context.write(new Text(productcode), new IntWritable((int) orderDemand));
}
}
}
FrequencyReducer.java:
package ma.test.a02;
import java.io.IOException;
import javax.xml.soap.Text;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class FrequencyReducer extends Reducer< Text , IntWritable , IntWritable , FloatWritable > {
public void reduce( IntWritable productcode, Iterable<IntWritable> orderDemands, Context context)
throws IOException, InterruptedException {
float averageDemand = 0;
float count = 0;
for ( IntWritable orderDemand : orderDemands) {
averageDemand +=orderDemand.get();
count +=1;
}
float result = averageDemand / count;
context.write(productcode, new FloatWritable (result));
}
}
Frequency.java (Driver):
package ma.test.a02;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Frequency {
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: Average <input path> <output path>");
System.exit(-1);
}
// create a Hadoop job and set the main class
Job job = Job.getInstance();
job.setJarByClass(Frequency.class);
job.setJobName("MA-Test Average");
// set the input and output path
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// set the Mapper and Reducer class
job.setMapperClass(FrequencyMapper.class);
job.setReducerClass(FrequencyReducer.class);
// specify the type of the output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FloatWritable.class);
// run the job
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Tip 1: In the mapper you have filtered lines that contain "VOLUME" in the following line:
if(line.contains("VOLUME")) {
}
But no line contains "VOLUME", so you have no input in the reducer!
Tip 2: Your reducer output value is FloatWritable, so you should use this line in your runner (Frequency class):
job.setOutputValueClass(FloatWritable.class);
instead of this one:
job.setOutputValueClass(IntWritable.class);
Tip 3: In the reducer change this line:
public class FrequencyReducer extends Reducer<Text, IntWritable, IntWritable, FloatWritable>
To this one:
public class FrequencyReducer extends Reducer<Text, IntWritable, Text, FloatWritable>
Also add these lines to the Frequency class:
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
Tip 4: The first line in your csv file, which describes the structure of the file, will cause a problem. Reject it by putting the following at the start of your map method:
if(line.contains("Product_Code,Warehouse")) {
return;
}
Tip 5: In a real program, make sure you have a plan for Order_Demand strings that cannot be parsed as an integer.
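One simple option (a sketch, assuming you just want to skip malformed rows) is to wrap the parse in a try/catch inside map():
int orderDemand;
try {
    orderDemand = Integer.parseInt(line.split(",")[4].trim());
} catch (NumberFormatException e) {
    return; // skip rows whose Order_Demand is not a valid integer
}
context.write(new Text(productcode), new IntWritable(orderDemand)); // productcode parsed as before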
At the end your mapper will be :
public class FrequencyMapper
extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
public void map(LongWritable offset, Text lineText, Context context)
throws IOException, InterruptedException {
String line = lineText.toString();
if (line.contains("Product_Code,Warehouse")) {
return;
}
if (line.contains("Product")) {
String productcode = line.split(",")[0].trim();
int orderDemand = Integer.valueOf(line.split(",")[4].trim());
context.write(new Text(productcode), new IntWritable(orderDemand));
}
}
}
And here is your reducer:
public class FrequencyReducer extends Reducer<Text, IntWritable , Text, FloatWritable > {
public void reduce( Text productcode, Iterable<IntWritable> orderDemands, Context context)
throws IOException, InterruptedException {
float averageDemand = 0;
float count = 0;
for ( IntWritable orderDemand : orderDemands) {
averageDemand +=orderDemand.get();
count +=1;
}
float result = averageDemand / count;
context.write(productcode, new FloatWritable (result));
}
}
And here is your runner:
public class Frequency {
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: Average <input path> <output path>");
System.exit(-1);
}
// create a Hadoop job and set the main class
Job job = Job.getInstance();
job.setJarByClass(Frequency.class);
job.setJobName("MA-Test Average");
// set the input and output path
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// set the Mapper and Reducer class
job.setMapperClass(FrequencyMapper.class);
job.setReducerClass(FrequencyReducer.class);
// specify the type of the output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FloatWritable.class);
// run the job
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Problem Statement : Find the max temperature of each city using MapReduce
Input:
Kolkata,56
Jaipur,45
Delhi,43
Mumbai,34
Goa,45
Kolkata,35
Jaipur,34
Delhi,32
Output:
Kolkata 56
Jaipur 45
Delhi 43
Mumbai 34
I have written the following code :
Map:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class Map
extends Mapper<LongWritable, Text, Text, IntWritable>{
private IntWritable max = new IntWritable();
private Text word = new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer line = new StringTokenizer(value.toString(),",\t");
word.set(line.nextToken());
max.set(Integer.parseInt(line.nextToken()));
context.write(word,max);
}
}
Reduce:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class Reduce
extends Reducer<Text, IntWritable, Text, IntWritable>{
private int max_temp = Integer.MIN_VALUE;
private int temp = 0;
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException {
Iterator<IntWritable> itr = values.iterator();
while (itr.hasNext()) {
temp = itr.next().get();
if( temp > max_temp)
{
max_temp = temp;
}
}
context.write(key, new IntWritable(max_temp));
}
}
Driver Class:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MaxTempDriver {
public static void main(String[] args) throws Exception {
// Create a new job
Job job = new Job();
// Set job name to locate it in the distributed environment
job.setJarByClass(MaxTempDriver.class);
job.setJobName("Max Temperature");
// Set input and output Path, note that we use the default input format
// which is TextInputFormat (each record is a line of input)
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// Set Mapper and Reducer class
job.setMapperClass(Map.class);
job.setCombinerClass(Reduce.class);
job.setReducerClass(Reduce.class);
// Set Output key and value
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
I am getting the following error:
17/06/15 10:44:17 INFO mapred.JobClient: Task Id :
attempt_201706151011_0002_m_000000_1, Status : FAILED
java.util.NoSuchElementException
at java.util.StringTokenizer.nextToken(StringTokenizer.java:349)
at Map.map(Map.java:23)
at Map.map(Map.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1121)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
As you can see, I am getting java.util.NoSuchElementException in the map function. Please help me with this exception and provide your suggestions to modify the map() code.
Check whether the next token exists:
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer line = new StringTokenizer(value.toString(), ",\t");
if (line.countTokens() > 0) {
word.set(line.nextToken());
if (line.hasMoreTokens())
max.set(Integer.parseInt(line.nextToken()));
context.write(word, max);
}
}
One thing I noticed when I tried out this particular MapReduce example is that the highest value gets cascaded to all the cities that follow the city with the highest temperature.
The output looked something like this:
Delhi 43
Goa 45
Jaipur 45
Kolkata 56
Mumbai 56
As opposed to this,
Delhi 43
Goa 45
Jaipur 45
Kolkata 56
Mumbai 34
You can see that the last entry, Mumbai, has a temperature of 56 (which is the highest temperature, belonging to Kolkata).
I noticed that this was because temp and max_temp are not reset for each call of the reduce function.
Adding the following two lines inside the reduce function of the Reduce class, just before the while loop, solves the issue:
temp = 0;
max_temp = Integer.MIN_VALUE;
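With those resets in place, the reduce method of the Reduce class above looks like this (a sketch):
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
                      Context context)
        throws IOException, InterruptedException {
    // reset per key, otherwise the previous key's maximum carries over
    temp = 0;
    max_temp = Integer.MIN_VALUE;
    Iterator<IntWritable> itr = values.iterator();
    while (itr.hasNext()) {
        temp = itr.next().get();
        if (temp > max_temp) {
            max_temp = temp;
        }
    }
    context.write(key, new IntWritable(max_temp));
}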
I use the code below to get output like (Key, Value):
Apple 12
Bee 345
Cat 123
What I want is the output sorted descending by the value (345) and with the value placed before the key (Value, Key):
345 Bee
123 Cat
12 Apple
I found there is something called "secondary sort"; not going to lie, I'm pretty lost - I tried to change context.write(key, result); but failed miserably. I'm new to Hadoop and not sure how to start tackling this problem. Any recommendation would be appreciated. Which function do I need to change, or which class do I need to modify?
Here are my classes:
package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
Job job = new Job(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job,
new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
You have been able to do the word count correctly.
You will need a second map-only job to perform the second requirement of descending sort and swapping of key and value; a sketch of such a job follows the list below.
Use DecreasingComparator as the sort comparator
Use InverseMapper to swap keys and values
Use an identity reducer, i.e. Reducer.class - with an identity reducer no aggregation happens (each value is output individually for its key)
Set the number of reduce tasks to 1, or use TotalOrderPartitioner
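Here is a minimal sketch of that second job, with two stated assumptions: the word-count job is switched to write a SequenceFile (job.setOutputFormatClass(SequenceFileOutputFormat.class)) so the (Text, IntWritable) types survive the round trip, and because IntWritable has no built-in DecreasingComparator, a small descending comparator is defined here (if you move the counts to LongWritable you can use LongWritable.DecreasingComparator directly):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class SortByCount {
    // Inverts the natural (ascending) IntWritable order so the largest counts come first.
    public static class DescendingIntComparator extends WritableComparator {
        public DescendingIntComparator() {
            super(IntWritable.class, true);
        }
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "sort word counts descending");
        job.setJarByClass(SortByCount.class);
        // read the (word, count) pairs written by the word-count job
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(InverseMapper.class);   // swaps (word, count) -> (count, word)
        job.setReducerClass(Reducer.class);        // identity reducer: no aggregation
        job.setNumReduceTasks(1);                  // single, globally sorted output file
        job.setSortComparatorClass(DescendingIntComparator.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // word-count output dir
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // sorted output dir
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}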
I am using Hadoop MapReduce and I want to combine two files. My first Map/Reduce iteration gives me a file with ID-number pairs like this:
A 30
D 20
My goal is to use the ID from that file to join against another file and produce output with a triple: ID, Name, Number, like this:
A ABC 30
D EFGH 20
But I am not sure whether using MapReduce is the best way to do this. Would it be better, for example, to use a file reader to read the second input file and get the name by ID? Or can I do it with MapReduce?
If so, I'm trying to find out how. I tried a MultipleInputs solution:
MultipleInputs.addInputPath(job2, new Path(args[1]+"-tmp"),
TextInputFormat.class, FlightsByCarrierMapper2.class);
MultipleInputs.addInputPath(job2, new Path("inputplanes"),
TextInputFormat.class, FlightsModeMapper.class);
But I can't come up with a solution to combine the two and get the output I want. What I have right now just gives me a list like this example:
A ABC
A 30
B ABCD
C ABCDEF
D EFGH
D 20
After my Last Reduce I am getting this:
N125DL 767-332
N125DL 7 ,
N126AT 737-76N
N126AT 19 ,
N126DL 767-332
N126DL 1 ,
N127DL 767-332
N127DL 7 ,
N128DL 767-332
N128DL 3
I want this: N127DL 7 767-332. Also, I don't want the ones that do not combine.
And this is my reduce class:
public class FlightsByCarrierReducer2 extends Reducer {
String merge = "";
protected void reduce(Text token, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
int i = 0;
for(Text value:values)
{
if(i == 0){
merge = value.toString()+",";
}
else{
merge += value.toString();
}
i++;
}
context.write(token, new Text(merge));
}
}
Update:
http://stat-computing.org/dataexpo/2009/the-data.html - this is the data set I'm using.
I'm working with TailNum and Cancelled (which is 1 or 0), and I want to get the model name that corresponds to each TailNum. My file with models has TailNum, Model and other fields. My current output is:
N193JB ERJ 190-100 IGW
N194DN 767-332
N19503 EMB-135ER
N19554 EMB-145LR
N195DN 767-332
N195DN 2
First comes the key, then the model; the keys that have cancelled flights appear below the model.
I would like a triple Key, Model, Number of Cancellations, because I want the number of cancellations per model.
You can join them using the ID as the key for both mappers.
You can write your map task something like this:
public void map(LongWritable k, Text value, Context context) throws IOException, InterruptedException
{
//Get the line
//split the line to get ID seperate
//word1 = A
//word2 = 30
//Likewise for A ABC
//word1 = A
//word2 = ABC
context.write(word1, word2);
}
I think you can reuse the same map task.
Then write a common reducer, where the Hadoop framework groups the data by key.
So you will be able to get the ID as the key.
And you can cache one of the values and then concatenate.
String merge = "";
public void reduce(Text key, Iterable<Text> values, Context context)
{
int i =0;
for(Text value:values)
{
if(i == 0){
merge = value.toString()+",";
}
else{
merge += value.toString();
}
i++;
}
valEmit.set(merge);
context.write(key, valEmit);
}
Finally you can write your Driver class
public int run(String[] args) throws Exception {
Configuration c=new Configuration();
String[] files=new GenericOptionsParser(c,args).getRemainingArgs();
Path p1=new Path(files[0]);
Path p2=new Path(files[1]);
Path p3=new Path(files[2]);
FileSystem fs = FileSystem.get(c);
if(fs.exists(p3)){
fs.delete(p3, true);
}
Job job = new Job(c,"Multiple Job");
job.setJarByClass(MultipleFiles.class);
MultipleInputs.addInputPath(job, p1, TextInputFormat.class, MultipleMap1.class);
MultipleInputs.addInputPath(job,p2, TextInputFormat.class, MultipleMap2.class);
job.setReducerClass(MultipleReducer.class);
.
.
}
You can find the example HERE
Hope this helps.
UPDATE
Input1
A 30
D 20
Input2
A ABC
D EFGH
Output
A ABC 30
D EFGH 20
Mapper.java
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
* @author sreeveni
*
*/
public class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
Text keyEmit = new Text();
Text valEmit = new Text();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String parts[] = line.split(" ");
keyEmit.set(parts[0]);
valEmit.set(parts[1]);
context.write(keyEmit, valEmit);
}
}
Reducer.java
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
* @author sreeveni
*
*/
public class ReducerJoin extends Reducer<Text, Text, Text, Text> {
Text valEmit = new Text();
String merge = "";
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
String character = "";
String number = "";
for (Text value : values) {
// ordering output
String val = value.toString();
char myChar = val.charAt(0);
if (Character.isDigit(myChar)) {
number = val;
} else {
character = val;
}
}
merge = character + " " + number;
valEmit.set(merge);
context.write(key, valEmit);
}
}
Driver class
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* @author sreeveni
*
*/
public class Driver extends Configured implements Tool {
public static void main(String[] args) throws Exception {
// TODO Auto-generated method stub
// checking the arguments count
if (args.length != 3) {
System.err
.println("Usage : <inputlocation> <inputlocation> <outputlocation> ");
System.exit(0);
}
int res = ToolRunner.run(new Configuration(), new Driver(), args);
System.exit(res);
}
@Override
public int run(String[] args) throws Exception {
// TODO Auto-generated method stub
String source1 = args[0];
String source2 = args[1];
String dest = args[2];
Configuration conf = new Configuration();
conf.set("mapred.textoutputformat.separator", " "); // changing default
// delimiter to user
// input delimiter
FileSystem fs = FileSystem.get(conf);
Job job = new Job(conf, "Multiple Jobs");
job.setJarByClass(Driver.class);
Path p1 = new Path(source1);
Path p2 = new Path(source2);
Path out = new Path(dest);
MultipleInputs.addInputPath(job, p1, TextInputFormat.class,
Mapper1.class);
MultipleInputs.addInputPath(job, p2, TextInputFormat.class,
Mapper1.class);
job.setReducerClass(ReducerJoin.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);
/*
* delete if exist
*/
if (fs.exists(out))
fs.delete(out, true);
TextOutputFormat.setOutputPath(job, out);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
}
Your reducer has a map method, but it should have a reduce method that takes an Iterable collection of values which you then merge. Because you don't have a reduce() method, you get the default behavior which is to just pass through all of the key/value pairs.
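For the Text keys and values in this job, the declaration below is a sketch of a reducer whose reduce() the framework will actually call; note the explicit generic parameters on Reducer and the @Override annotation (with the raw Reducer supertype, a reduce(Text, ...) method is only an overload and never gets invoked):
public class FlightsByCarrierReducer2 extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        StringBuilder merge = new StringBuilder();
        for (Text value : values) {
            if (merge.length() > 0) {
                merge.append(",");  // separate the joined fields
            }
            merge.append(value.toString());
        }
        context.write(key, new Text(merge.toString()));
    }
}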
I am using Hadoop Map/Reduce with Java.
Suppose I have completed a whole map/reduce job. Is there any way I can repeat just the map/reduce part, without ending the job? I mean, I DON'T want to use any chaining of different jobs; I only want the map/reduce part to repeat.
Thank you!
I am more familiar with the Hadoop streaming APIs, but the approach should translate to the native APIs.
In my understanding, what you are trying to do is run several iterations of the same map() and reduce() operations on the input data.
Let's say your initial map() input data comes from the file input.txt and the output file is output+{iteration}.txt (where iteration is the loop count, iteration = [0, number of iterations)).
In the second invocation of map()/reduce(), your input file is output+{iteration}.txt and the output file becomes output+{iteration+1}.txt.
Let me know if this is not clear, I can conjure up a quick example and post a link here.
EDIT: For Java, I modified the Hadoop wordcount example to run multiple times:
package com.rorlig;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountJob {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 3) {
System.err.println("Usage: wordcount <in> <out> <iterations>");
System.exit(2);
}
int iterations = new Integer(args[2]);
Path inPath = new Path(args[0]);
Path outPath = null;
for (int i = 0; i<iterations; ++i){
outPath = new Path(args[1]+i);
Job job = new Job(conf, "word count");
job.setJarByClass(WordCountJob.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, inPath);
FileOutputFormat.setOutputPath(job, outPath);
job.waitForCompletion(true);
inPath = outPath;
}
}
}
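As an illustration (jar and path names are made up), running hadoop jar wordcount.jar com.rorlig.WordCountJob input out 3 performs three passes: the first reads input and writes out0, the second reads out0 and writes out1, and the third reads out1 and writes out2.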
Hope this helps