Goal:
I want to be able to specify the number of mappers used on an input file
Equivalently, I want to specify the number of lines of a file each mapper will take
Simple example:
For an input file of 10 lines (of unequal length; example below), I want there to be 2 mappers -- each mapper will thus process 5 lines.
This is
an arbitrary example file
of 10 lines.
Each line does
not have to be
of
the same
length or contain
the same
number of words
This is what I have:
(I have it so that each mapper produces one "<map,1>" key-value pair ... so that it will then be summed in the reducer)
package org.myorg;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.InputFormat;
public class Test {
// produce one "<map,1>" pair per mapper
public static class Map extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
context.write(new Text("map"), one);
}
}
// reduce by taking a sum
public static class Red extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job1 = Job.getInstance(conf, "pass01");
job1.setJarByClass(Test.class);
job1.setMapperClass(Map.class);
job1.setCombinerClass(Red.class);
job1.setReducerClass(Red.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job1, new Path(args[0]));
FileOutputFormat.setOutputPath(job1, new Path(args[1]));
// // Attempt#1
// conf.setInt("mapreduce.input.lineinputformat.linespermap", 5);
// job1.setInputFormatClass(NLineInputFormat.class);
// // Attempt#2
// NLineInputFormat.setNumLinesPerSplit(job1, 5);
// job1.setInputFormatClass(NLineInputFormat.class);
// // Attempt#3
// conf.setInt(NLineInputFormat.LINES_PER_MAP, 5);
// job1.setInputFormatClass(NLineInputFormat.class);
// // Attempt#4
// conf.setInt("mapreduce.input.fileinputformat.split.minsize", 234);
// conf.setInt("mapreduce.input.fileinputformat.split.maxsize", 234);
System.exit(job1.waitForCompletion(true) ? 0 : 1);
}
}
The above code, using the above example data, will produce
map 10
I want the output to be
map 2
where the first mapper will do something with the first 5 lines, and the second mapper will do something with the second 5 lines.
You could use NLineInputFormat.
With NLineInputFormat, you can specify exactly how many lines should go to each mapper.
E.g. if your file has 500 lines and you set the number of lines per mapper to 10, you get 50 mappers
(instead of one, assuming the file is smaller than an HDFS block).
EDIT:
Here is an example for using NLineInputFormat:
Mapper Class:
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MapperNLine extends Mapper<LongWritable, Text, LongWritable, Text> {
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
context.write(key, value);
}
}
Driver class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class Driver extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.out.printf("Two parameters are required for DriverNLineInputFormat- <input dir> <output dir>\n");
return -1;
}
Job job = new Job(getConf());
job.setJobName("NLineInputFormat example");
job.setJarByClass(Driver.class);
job.setInputFormatClass(NLineInputFormat.class);
NLineInputFormat.addInputPath(job, new Path(args[0]));
job.getConfiguration().setInt("mapreduce.input.lineinputformat.linespermap", 5);
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(MapperNLine.class);
job.setNumReduceTasks(0);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new Configuration(), new Driver(), args);
System.exit(exitCode);
}
}
With the input you provided, the output from the above sample Mapper would be written to two files, as 2 Mappers get initialized:
part-m-00000
0 This is
8 an arbitrary example file
34 of 10 lines.
47 Each line does
62 not have to be
part-m-00001
77 of
80 the same
89 length or contain
107 the same
116 number of words
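Applied back to the original Test job, two details matter. First, Job.getInstance(conf, "pass01") takes a copy of conf, so conf.setInt(...) calls made after the job is created (as in Attempts #1, #3 and #4) are never seen by the job; configure the input format through the Job itself. Second, map() is called once per input record, so even with 2 mappers the original Map class still emits one <map,1> pair per line (10 in total); to get map 2, emit the pair once per map task, e.g. from cleanup(). The following is only a sketch of that idea (assuming the Hadoop 2.x mapreduce API), not tested code:
// Count map tasks by emitting a single <map,1> from cleanup(), which runs once
// per mapper, instead of from map(), which runs once per input line.
public static class Map extends Mapper<Object, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    @Override
    public void map(Object key, Text value, Context context) {
        // process the line here; do not write the <map,1> pair per record
    }
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        context.write(new Text("map"), one);
    }
}
// In main(), after Job.getInstance(conf, "pass01"), configure the split size on the
// Job (not on the original conf, whose later changes the job never sees):
job1.setInputFormatClass(NLineInputFormat.class);
NLineInputFormat.setNumLinesPerSplit(job1, 5); // 10 lines / 5 per split = 2 map tasks
With 5 lines per split, two map tasks run, each writes one <map,1> in cleanup(), and the reducer sums them to map 2.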
Related
I want to take advantage of Linux to practice Hadoop MapReduce examples.
I have written code for my project and I am getting some warning messages when I compile; I could not run it. Having tried many possible approaches, like ignoring the warnings, I am still unable to run it. Below you will find the code.
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;
public class Average{
public static class Map extends Mapper<Object, Text, Text, IntWritable> {
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer script = new StringTokenizer(line, "\n");
while (script.hasMoreTokens()) {
StringTokenizer scriptLine = new StringTokenizer(script.nextToken());
Text Name = new Text(scriptLine.nextToken());
int Score = Integer.parseInt(scriptLine.nextToken());
context.write(Name, new IntWritable(Score));
}
}
}
public static class Reduce extends Reducer<Text,IntWritable,Text,IntWritable> {
public void reduce(Text key, Iterable<IntWritable> value, Context context) throws IOException, InterruptedException{
int numerator = 0;
int denominator = 0;
int avg = 0;
for (IntWritable score : value) {
numerator += score.get();
denominator++;
}
avg = numerator/denominator;
context.write(key, new IntWritable(avg));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
Path dst_path = new Path (otherArgs[1]);
FileSystem hdfs = dst_path.getFileSystem(conf);
if (hdfs.exists(dst_path)){
hdfs.delete(dst_path, true);
};
Job job = new Job(conf, "Average");
job.setJarByClass(Average.class);
job.setMapperClass(Map.class);
job.setCombinerClass(Reduce.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
I am currently using Eclipse and Hadoop to create a mapper and reducer to find the maximum Total Cost of an airline data set.
The Total Cost is a decimal value and the Airline Carrier is text.
The dataset I used can be found in the following weblink:
https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/236265/dft-flights-data-2011.csv
When I export the jar file and run it in Hadoop,
I get the following message: ls: "output" : No such file or directory.
Can anyone help me correct the code please?
My code is below.
Mapper:
package org.myorg;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MaxTotalCostMapper extends Mapper<LongWritable, Text, Text, DoubleWritable>
{
private final static DoubleWritable totalcostWritable = new DoubleWritable(0);
private Text AirCarrier = new Text();
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException
{
String[] line = value.toString().split(",");
AirCarrier.set(line[8]);
double totalcost = Double.parseDouble(line[2].trim());
totalcostWritable.set(totalcost);
context.write(AirCarrier, totalcostWritable);
}
}
Reducer:
package org.myorg;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MaxTotalCostReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable>
{
ArrayList<Double> totalcostList = new ArrayList<Double>();
@Override
public void reduce(Text key, Iterable<DoubleWritable> values, Context context)
throws IOException, InterruptedException
{
double maxValue=0.0;
for (DoubleWritable value : values)
{
maxValue = Math.max(maxValue, value.get());
}
context.write(key, new DoubleWritable(maxValue));
}
}
Main:
package org.myorg;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MaxTotalCost
{
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
if (args.length != 2)
{
System.err.println("Usage: MaxTotalCost<input path><output path>");
System.exit(-1);
}
Job job = Job.getInstance(conf, "Max Total Cost");
job.setJarByClass(MaxTotalCost.class);
FileInputFormat.addInputPath(job, new Path(args[1]));
FileOutputFormat.setOutputPath(job, new Path(args[2]));
job.setMapperClass(MaxTotalCostMapper.class);
job.setReducerClass(MaxTotalCostReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
ls: "output" : No such file or directory
You have no HDFS user directory. Your code isn't making it into the Mapper or Reducer; that error typically arises when the Job resolves the output path:
FileOutputFormat.setOutputPath(job, new Path(args[2]));
Run an hdfs dfs -ls, see if you get any errors. If so, make a directory under /user that matches your current user.
Otherwise, change your output directory to something like /tmp/max
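If you prefer to do that check from code instead of the shell, here is a minimal sketch with the FileSystem API (the /user/<name> layout and the class name are illustrative assumptions, and creating the directory may need HDFS superuser rights, just like the equivalent hdfs dfs -mkdir):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class EnsureUserDir {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Typical home directory layout is /user/<current user>; adjust for your cluster
        Path home = new Path("/user/" + System.getProperty("user.name"));
        if (!fs.exists(home)) {
            fs.mkdirs(home); // create the missing user directory
        }
        System.out.println("HDFS home: " + fs.makeQualified(home));
    }
}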
Can someone please help me find out why I am not getting the average salary after running my MapReduce code?
Problem: Calculate the average salary of permanent and contract employees
Sample Input:
1 user1 permanent 100
2 user2 contract 500
3 user3 permanent 200
4 user4 contract 300
Expected Output:
permanent 285
contract 187
Output I got:
permanent 100
permanent 200
contract 500
contract 300
Run Job:
$ hadoop jar partition.jar com.hadoop.PartitionExample
input/partition_example.txt output
package com.hadoop;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class PartitionExample {
public static class MapClass extends Mapper<LongWritable, Text,
Text, IntWritable>{
Text outKey = new Text();
IntWritable outValue = new IntWritable();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException{
String[] colmn = value.toString().split(" ");
outKey.set(colmn[2]);
outValue.set(Integer.parseInt(colmn[3]));
context.write(outKey, outValue);
}
}
// permanent [100,300,200,400]
public static class ReduceClass extends Reducer<Text,IntWritable,Text,
IntWritable>{
IntWritable outValue = new IntWritable();
public void Reduce(Text key, Iterable<IntWritable> value, Context
context) throws IOException, InterruptedException{
int sum = 0; int count = 0; int avg ;
//outKey.set(key);
for (IntWritable sal:value){
sum = sum + sal.get();
count++;
}
avg = sum/count ;
outValue.set(avg);
context.write(key, outValue);
}
}
public static void main(String[] args) throws IOException,
ClassNotFoundException, InterruptedException{
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if(otherArgs.length != 2){
System.err.println("Number of argument passed is not 2");
System.exit(1);
}
Job job = new Job(conf, "My regular MapReduce job");
job.setJarByClass(PartitionExample.class);
job.setMapperClass(MapClass.class);
// job.setCombinerClass(ReduceClass.class);
job.setReducerClass(ReduceClass.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
I found my mistake in the code. It was a very silly mistake, I can say :(
It was in the overridden reduce function name: I changed it from "Reduce" to "reduce".
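For reference, a minimal sketch of the corrected reducer: adding @Override makes the compiler reject a method that does not actually override Reducer.reduce, which would have flagged the capitalized name immediately.
public static class ReduceClass extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable outValue = new IntWritable();
    @Override // compile error if the signature does not match Reducer.reduce
    public void reduce(Text key, Iterable<IntWritable> value, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        int count = 0;
        for (IntWritable sal : value) {
            sum += sal.get();
            count++;
        }
        outValue.set(sum / count); // integer average, as in the original code
        context.write(key, outValue);
    }
}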
Problem Statement : Find the max temperature of each city using MapReduce
Input:
Kolkata,56
Jaipur,45
Delhi,43
Mumbai,34
Goa,45
Kolkata,35
Jaipur,34
Delhi,32
Output:
Kolkata 56
Jaipur 45
Delhi 43
Mumbai 34
I have written the following code :
Map:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class Map
extends Mapper<LongWritable, Text, Text, IntWritable>{
private IntWritable max = new IntWritable();
private Text word = new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer line = new StringTokenizer(value.toString(),",\t");
word.set(line.nextToken());
max.set(Integer.parseInt(line.nextToken()));
context.write(word,max);
}
}
Reduce:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class Reduce
extends Reducer<Text, IntWritable, Text, IntWritable>{
private int max_temp = Integer.MIN_VALUE;
private int temp = 0;
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException {
Iterator<IntWritable> itr = values.iterator();
while (itr.hasNext()) {
temp = itr.next().get();
if( temp > max_temp)
{
max_temp = temp;
}
}
context.write(key, new IntWritable(max_temp));
}
}
Driver Class:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MaxTempDriver {
public static void main(String[] args) throws Exception {
// Create a new job
Job job = new Job();
// Set job name to locate it in the distributed environment
job.setJarByClass(MaxTempDriver.class);
job.setJobName("Max Temperature");
// Set input and output Path, note that we use the default input format
// which is TextInputFormat (each record is a line of input)
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// Set Mapper and Reducer class
job.setMapperClass(Map.class);
job.setCombinerClass(Reduce.class);
job.setReducerClass(Reduce.class);
// Set Output key and value
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
I am getting the following error:
17/06/15 10:44:17 INFO mapred.JobClient: Task Id :
attempt_201706151011_0002_m_000000_1, Status : FAILED
java.util.NoSuchElementException
at java.util.StringTokenizer.nextToken(StringTokenizer.java:349)
at Map.map(Map.java:23)
at Map.map(Map.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1121)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
As you can see, I am getting java.util.NoSuchElementException in the map function. Please help me with this exception and provide your suggestions to modify the map() code.
Check whether the next token exists:
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer line = new StringTokenizer(value.toString(), ",\t");
if (line.countTokens() > 0) {
word.set(line.nextToken());
if (line.hasMoreTokens())
max.set(Integer.parseInt(line.nextToken()));
context.write(word, max);
}
}
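A slightly stricter variant (a sketch, not the answer's exact code) writes only when both the city and the temperature are present, so a malformed line can never reuse the previous record's value of max:
@Override
protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    StringTokenizer line = new StringTokenizer(value.toString(), ",\t");
    if (line.countTokens() >= 2) { // require both city and temperature
        word.set(line.nextToken());
        max.set(Integer.parseInt(line.nextToken()));
        context.write(word, max);
    }
    // lines with fewer than two tokens are skipped entirely
}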
One thing that I noticed when I tried out this particular MapReduce example is that the highest value gets cascaded to all the places following the place with the highest temperature.
Output looked something similar to this,
Delhi 43
Goa 45
Jaipur 45
Kolkata 56
Mumbai 56
As opposed to this,
Delhi 43
Goa 45
Jaipur 45
Kolkata 56
Mumbai 34
You can see that the last entry, Mumbai, has a temperature of 56 (which is the highest temperature, belonging to Kolkata).
I noticed that this was because temp and max_temp are not reset for each call of the reduce function.
Adding the following two lines inside the reduce function within the Reduce class, just before the while loop, solves this issue:
temp = 0;
max_temp = Integer.MIN_VALUE;
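Equivalently (a sketch rather than the posted code), the two fields can be made local to reduce() so that every key starts from a fresh minimum:
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
    int maxTemp = Integer.MIN_VALUE; // local variable, reset for every key
    for (IntWritable value : values) {
        maxTemp = Math.max(maxTemp, value.get());
    }
    context.write(key, new IntWritable(maxTemp));
}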
I am doing MapReduce with the following code. When I run this job, everything works fine, but the output shows 0 0. I suspect this may be due to the TryParseInt() method, which I quick-fixed as it was undefined previously. Initially there was no TryParseInt() method, so I created one. Can anyone check whether the code is correct, especially the TryParseInt method, and suggest how to run this program successfully?
input looks like :
Thanks in Advance
import java.io.IOException;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.io.LongWritable;
public class MaxPubYear {
public static class MaxPubYearMapper extends Mapper<LongWritable , Text, IntWritable,Text>
{
public void map(LongWritable key, Text value , Context context)
throws IOException, InterruptedException
{
String delim = "\t";
Text valtosend = new Text();
String tokens[] = value.toString().split(delim);
if (tokens.length == 2)
{
valtosend.set(tokens[0] + ";"+ tokens[1]);
context.write(new IntWritable(1), valtosend);
}
}
}
public static class MaxPubYearReducer extends Reducer<IntWritable ,Text, Text, IntWritable>
{
public void reduce(IntWritable key, Iterable<Text> values , Context context) throws IOException, InterruptedException
{
int maxiValue = Integer.MIN_VALUE;
String maxiYear = "";
for(Text value:values) {
String token[] = value.toString().split(";");
if(token.length == 2 && TryParseInt(token[1]).intValue()> maxiValue)
{
maxiValue = TryParseInt(token[1]);
maxiYear = token[0];
}
}
context.write(new Text(maxiYear), new IntWritable(maxiValue));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job Job = new Job(conf, "Maximum Publication year");
Job.setJarByClass(MaxPubYear.class);
Job.setOutputKeyClass(Text.class);
Job.setOutputValueClass(IntWritable.class);
Job.setMapOutputKeyClass(IntWritable.class);
Job.setMapOutputValueClass(Text.class);
Job.setMapperClass(MaxPubYearMapper.class);
Job.setReducerClass(MaxPubYearReducer.class);
FileInputFormat.addInputPath(Job,new Path(args[0]));
FileOutputFormat.setOutputPath(Job,new Path(args[1]));
System.exit(Job.waitForCompletion(true)?0:1);
}
public static Integer TryParseInt(String string) {
// TODO Auto-generated method stub
return(0);
}
}
The errors mean exactly what they say: for the three 'could not be resolved to a type' errors, you probably forgot to import the right classes. Error 2 simply means there is no method TryParseInt(String) in the class MaxPubYear.MaxPubYearReducer; you have to create one there.
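A minimal sketch of such a helper (the name and return type are kept from the question; swallowing the NumberFormatException and returning Integer.MIN_VALUE is just one illustrative choice that keeps malformed rows out of the > maxiValue comparison):
public static Integer TryParseInt(String string) {
    try {
        return Integer.parseInt(string.trim());
    } catch (NumberFormatException e) {
        return Integer.MIN_VALUE; // never beats a real value in the max comparison
    }
}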