Unable to run Hadoop MapReduce Word Count on Hadoop 2.6 - java

I have set up Hadoop 2.6.0 on 2 Vm's each running CentOS 6.5(64 bit) ,Yarn as well as Hadoop is running fine. My host m/c is Windows 8.1 (64 bit) I am trying to run Word Count map reduce problem from Eclipse on my host machine while the cluster is running on VMN's but getting exception
package com.mapr.example;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path("D:\\MyworkSpace\\MapReducePractice\\resource\\Input.txt"));
FileOutputFormat.setOutputPath(job, new Path("D:\\MyworkSpace\\MapReducePractice\\resource\\Output"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Exception that i am getting
Exception in thread "main" java.lang.NullPointerException
at java.lang.ProcessBuilder.start(ProcessBuilder.java:1011)
at org.apache.hadoop.util.Shell.runCommand(Shell.java:482)
at org.apache.hadoop.util.Shell.run(Shell.java:455)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:715)
at org.apache.hadoop.util.Shell.execCommand(Shell.java:808)
at org.apache.hadoop.util.Shell.execCommand(Shell.java:791)
at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:656)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:444)
at org.apache.hadoop.fs.FilterFileSystem.mkdirs(FilterFileSystem.java:293)
at org.apache.hadoop.mapreduce.JobSubmissionFiles.getStagingDir(JobSubmissionFiles.java:133)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:437)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1296)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1293)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1628)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1293)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1314)
at com.mapr.example.WordCount.main(WordCount.java:72)
Any pointers will be of a great help.

Related

How can i fix InputBootStrapper error in eclipse for runnig Hadoop program?

How can i fix it ?
This code is Hadoop WordCount example in JAVA.
I have added the external JARs and this is not a compile time issue,
When i have tried o run the program this is happening.
The input and out put arguments are set.
All the mangers are up and running.
Hadoop version 2.9.0
Java version 1.8.0
Eclipse Oxygen
OS Ubuntu 14.04 64bit
What could be the issue?
wordcount.java
package mapreduce;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class wordcount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(wordcount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Console(Error)
Exception in thread "main" java.lang.NoClassDefFoundError: com/ctc/wstx/io/InputBootstrapper
at mapreduce.wordcount.main(wordcount.java:51)
Caused by: java.lang.ClassNotFoundException: com.ctc.wstx.io.InputBootstrapper
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 1 more
For me missing jar was this: hadoop-client-runtime.jar. Adding that additional dependency from Maven solved issue.
As a side note below is a list of my dependencies (I'm using SBT/Scala):
libraryDependencies ++= Seq(
"org.apache.hadoop" % "hadoop-client-api" % "3.3.1",
"commons-logging" % "commons-logging" % "1.2",
"org.slf4j" % "slf4j-api" % "1.7.35",
"org.apache.hadoop" % "hadoop-client-runtime" % "3.3.1"
)
This link helped me to look at right direction: https://issues.apache.org/jira/browse/SPARK-33343

How to make WordCount use new Java libraries in Cloudera?

I am new to Hadoop, and trying the example of WordCount V1.0 here:
https://www.cloudera.com/documentation/other/tutorial/CDH5/topics/ht_usage.html
However, when I compile the WordCount.java using this line:
javac -cp /usr/lib/hadoop/*:/usr/lib/hadoop-mapreduce/* WordCount.java -d build -Xlint
It seems like the code uses the old version of .jar files, and gives me the following warnings (as shown in the picture). However, when I check inside the classpath I declared, there are some .jar files which seems to be newer versions of those being required .jar files.
So my question is that how can I make my WordCount.java use the newer file instead? I tried looking inside the WordCount.java code to see which rows use those required .jar files but could not see them.
Thanks in advance for any help.
The code of the WordCount.java
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

class not found exception in Hadoop

I'm trying to run a hadoop single unit program for wordcount, I'm doing this on windows 10 64 bit and on Cygwin, this is the program I'm using:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens())
{
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable>
{
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
{
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
My classpath is as follows:
~/hadoop-1.2.1/bin/hadoop jar WordCount.jar hadoop.ProcessUnits input_dir output_dir
and I get the following error messages when I try to compile the program:
Exception in thread "main" java.lang.ClassNotFoundException: hadoop.ProcessUnits
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.hadoop.util.RunJar.main(RunJar.java:153)
The problem is that hadoop.ProcessUnits should point to the mainClass you want to run. Your class is called WordCount, so it should look something like:
$ hadoop jar WordCount.jar <PACKAGE>.WordCount input_dir output_dir
Your code doesn't include the package name so I've substituted <PACKAGE>

Mapreduce2 program error: Not a valid JAR

I have installed Hadoop version 2.2.0 on Ubuntu. When I run:
yarn jar
hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.2.0.jar
wordcount /user/ubuntu/wordcount/input/file01.txt /output
It runs fine.
When I run a sample program, make the JAR using Eclipse export utility, and run using:
yarn jar /user/ubuntu/WordCountNew.jar com.sample.WordCountNew
/user/ubuntu/wordcount/input/file01.txt /output9
It shows: Not a valid JAR: /user/ubuntu/WordCountNew.jar
When I compile the code in Eclipse, it also shows this error:
2015-02-02 16:08:53,077 WARN [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(62)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: 0
at com.kumar.WordCountNew.main(WordCountNew.java:62)
Code:
package com.sample;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class WordCountNew {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum = sum + val.get();
context.write(key, new IntWritable(sum));
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "wordcount");
job.setJarByClass(WordCountNew.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.waitForCompletion(true);
}
}

Hadoop - Exception in thread "main" java.lang.NullPointerException

I am trying to use Apache Hadoop for Windows Platform through this tutorial: http://www.codeproject.com/Articles/757934/Apache-Hadoop-for-Windows-Platform?fid=1858035, the eclipse part. Everything is going fine until the last step. When running the program I got:
log4j:WARN No appenders could be found for logger (org.apache.hadoop.metrics2.lib.MutableMetricsFactory).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Exception in thread "main" java.lang.NullPointerException
at java.lang.ProcessBuilder.start(Unknown Source)
at org.apache.hadoop.util.Shell.runCommand(Shell.java:445)
at org.apache.hadoop.util.Shell.run(Shell.java:418)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:650)
at org.apache.hadoop.util.Shell.execCommand(Shell.java:739)
at org.apache.hadoop.util.Shell.execCommand(Shell.java:722)
at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:631)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:421)
at org.apache.hadoop.fs.FilterFileSystem.mkdirs(FilterFileSystem.java:277)
at org.apache.hadoop.mapreduce.JobSubmissionFiles.getStagingDir(JobSubmissionFiles.java:125)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:348)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1285)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1282)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Unknown Source)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1548)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1282)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1303)
at Recipe.main(Recipe.java:82)
The code is:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import com.google.gson.Gson;
public class Recipe {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
Gson gson = new Gson();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
Roo roo=gson.fromJson(value.toString(),Roo.class);
if(roo.cookTime!=null)
{
word.set(roo.cookTime);
}
else
{
word.set("none");
}
context.write(word, one);
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
#SuppressWarnings("deprecation")
Job job = new Job(conf, "Recipe");
job.setJarByClass(Recipe.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
//FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
FileInputFormat.addInputPath(job, new Path("hdfs://127.0.0.1:9000/in"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://127.0.0.1:9000/output"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
// job.submit();
}
}
class Id
{
public String oid;
}
class Ts
{
public long date ;
}
class Roo
{
public Id _id ;
public String name ;
public String ingredients ;
public String url ;
public String image ;
public Ts ts ;
public String cookTime;
public String source ;
public String recipeYield ;
public String datePublished;
public String prepTime ;
public String description;
}
This happens only when I try to run it through Eclipse. Through CMD it went fine:
javac -classpath C:\hadoop-2.3\share\hadoop\common\hadoop-common-2.3.0.jar;C:\hadoop-2.3\share\hadoop\common\lib\gson-2.2.4.jar;C:\hadoop-2.3\share\hadoop\common\lib\commons-cli-1.2.jar;C:\hadoop-2.3\share\hadoop\mapreduce\hadoop-mapreduce-client-core-2.3.0.jar;Recipe.java
jar -cvf Recipe.jar *.class
hadoop jar c:\Hwork\Recipe.jar Recipe /in /out
Any idea how can I solve this?
I had the same problem and workaround from here http://qnalist.com/questions/4994960/run-spark-unit-test-on-windows-7 fixed it.
Workaround is:
download compiled winutils.exe from
https://codeload.github.com/srccodes/hadoop-common-2.2.0-bin/zip/master
extract this archive directly into d:\winutil, d:\winutil\bin should be created
add System.setProperty("hadoop.home.dir", "d:\\winutil\\"); to your code before instantiating the Job (for example as the 1st line of your main method)

Categories