Flume + Spark + HBase doesn't work - Java
I'm trying to use Flume to send data to Spark and then write the data to HBase.
I have already tried Flume + Spark + HDFS and that works.
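For context, the Flume agent delivers events to Spark through an Avro sink pointed at the receiver host and port (the same values passed to spark-submit below). A minimal sink section looks roughly like this; the agent and channel names are placeholders:
# assumed Flume agent sink section (agent/channel names are placeholders);
# hostname/port must match the receiver address passed to spark-submit
agent1.sinks.sparkSink.type = avro
agent1.sinks.sparkSink.hostname = elephant
agent1.sinks.sparkSink.port = 11000
agent1.sinks.sparkSink.channel = memoryChannel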
Here is the source code:
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.flume.source.avro.AvroFlumeEvent;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.*;
import org.apache.spark.streaming.api.java.*;
import org.apache.spark.streaming.flume.FlumeUtils;
import org.apache.spark.streaming.flume.SparkFlumeEvent;
import com.google.common.collect.Lists;
import scala.Tuple2;
import scala.Tuple4;
public class JavaFlumeEventTest {

    private static final Pattern SPACE = Pattern.compile(" ");
    private static Configuration conf = null;

    /**
     * Initialize the HBase configuration.
     */
    static {
        conf = HBaseConfiguration.create();
        conf.addResource(new Path("file:///etc/hbase/conf/hbase-site.xml"));
        conf.addResource(new Path("file:///etc/hadoop/conf/hdfs-site.xml"));
        conf.addResource(new Path("file:///etc/hadoop/conf/core-site.xml"));
        conf.addResource(new Path("file:///etc/hadoop/conf/mapred-site.xml"));
        conf.addResource(new Path("file:///etc/hadoop/conf/yarn-site.xml"));
        conf.set("hbase.zookeeper.quorum", "elephant,tiger,horse");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.master", "elephant" + ":60000");
        conf.set("hbase.cluster.distributed", "true");
        conf.set("hbase.rootdir", "hdfs://elephant:8020/hbase");
    }
    /**
     * Add a new record.
     * @param tableName
     * @param rowKey
     * @param family
     * @param qualifier
     * @param value
     */
    public static void addRecord(String tableName, String rowKey, String family, String qualifier, String value) {
        try {
            System.out.println("===========HTable ==========" + conf);
            HTable table = new HTable(conf, tableName);
            System.out.println("===========put ==========");
            Put put = new Put(Bytes.toBytes(rowKey));
            System.out.println("===========put Add==========");
            put.add(Bytes.toBytes(family), Bytes.toBytes(qualifier), Bytes.toBytes(value));
            System.out.println("===========table put ==========");
            table.put(put);
            System.out.println("insert record " + rowKey + " to table " + tableName + " ok.");
        } catch (IOException e) {
            System.out.println("===========IOException ==========");
            e.printStackTrace();
        }
    }
    private JavaFlumeEventTest() {
    }

    public static void main(String[] args) {
        String host = args[0];
        int port = Integer.parseInt(args[1]);
        Duration batchInterval = new Duration(Integer.parseInt(args[2]));
        final String tableName = args[3];
        final String columnFamily = args[4];

        SparkConf sparkConf = new SparkConf()
                .setAppName("JavaFlumeEventTest")
                .set("spark.executor.memory", "256m");
        JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, batchInterval);

        final Broadcast<String> broadcastTableName = ssc.sparkContext().broadcast(tableName);
        final Broadcast<String> broadcastColumnFamily = ssc.sparkContext().broadcast(columnFamily);

        JavaReceiverInputDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(ssc, host, port);

        JavaDStream<String> words = flumeStream.flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {
            @Override
            public Iterable<String> call(SparkFlumeEvent arg0) throws Exception {
                String body = new String(arg0.event().getBody().array(), Charset.forName("UTF-8"));
                return Lists.newArrayList(SPACE.split(body));
            }
        });
        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
                new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String s) {
                        return new Tuple2<String, Integer>(s, 1);
                    }
                }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer i1, Integer i2) {
                        return i1 + i2;
                    }
                });
        wordCounts.print();

        wordCounts.foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
            @Override
            public Void call(JavaPairRDD<String, Integer> values, Time time) throws Exception {
                values.foreach(new VoidFunction<Tuple2<String, Integer>>() {
                    @Override
                    public void call(Tuple2<String, Integer> tuple) {
                        System.out.println("===========insert record========" + tuple._1() + "==" + tuple._2().toString());
                        JavaFlumeEventTest.addRecord("mytable", "PutInpu", columnFamily, tuple._1(), tuple._2().toString());
                        System.out.println("===========Done record========" + tuple._1());
                    }
                });
                return null;
            }
        });
        flumeStream.count().map(new Function<Long, String>() {
            @Override
            public String call(Long in) {
                return "Received " + in + " flume events.";
            }
        }).print();

        ssc.start();
    }
}
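(Side note: the two broadcast variables are created above but never actually read; the intent was to look them up inside the worker-side closure instead of hard-coding the table name, so the call in the VoidFunction would become roughly this:)
// intended use of the broadcast values inside the VoidFunction above
JavaFlumeEventTest.addRecord(broadcastTableName.value(), "PutInpu",
        broadcastColumnFamily.value(), tuple._1(), tuple._2().toString());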
I exported that as a runnable JAR and started it with spark-submit:
./bin/spark-submit --class JavaFlumeEventTest --master local[15] /home/training/software/JavaFlumeEventTest3.jar elephant 11000 5000 mytable cf
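(If the HBase and Flume client classes were not packed into the runnable JAR, I assume they would have to be supplied explicitly with --jars, roughly like below; the jar paths are placeholders:)
./bin/spark-submit --class JavaFlumeEventTest --master local[15] \
  --jars /path/to/hbase-client.jar,/path/to/hbase-common.jar,/path/to/hbase-protocol.jar \
  /home/training/software/JavaFlumeEventTest3.jar elephant 11000 5000 mytable cf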
There is no exception, but no data is added to HBase.
I found that the thread stops at
HTable table = new HTable(conf, tableName);
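I have not tried it yet, but one variant I'm considering is opening and closing the HTable once per partition instead of once per record. This is only a sketch, reusing the same table, row key and column family as above (it would also need an import of java.util.Iterator):
        wordCounts.foreachRDD(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
            @Override
            public Void call(JavaPairRDD<String, Integer> values, Time time) throws Exception {
                values.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Integer>>>() {
                    @Override
                    public void call(Iterator<Tuple2<String, Integer>> it) throws Exception {
                        // rebuild the configuration on the worker side (Configuration is not serializable)
                        Configuration localConf = HBaseConfiguration.create();
                        localConf.set("hbase.zookeeper.quorum", "elephant,tiger,horse");
                        localConf.set("hbase.zookeeper.property.clientPort", "2181");
                        HTable table = new HTable(localConf, "mytable");
                        try {
                            while (it.hasNext()) {
                                Tuple2<String, Integer> tuple = it.next();
                                Put put = new Put(Bytes.toBytes("PutInpu"));
                                put.add(Bytes.toBytes(columnFamily), Bytes.toBytes(tuple._1()), Bytes.toBytes(tuple._2().toString()));
                                table.put(put);
                            }
                            table.flushCommits();
                        } finally {
                            table.close();
                        }
                    }
                });
                return null;
            }
        });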
Here are the Spark terminal logs:
15/02/04 21:36:05 INFO DAGScheduler: Job 72 finished: print at JavaFlumeEventTest.java:139, took 0.056793 s
-------------------------------------------
Time: 1423103765000 ms
-------------------------------------------
(have,3)
(example,,1)
(dependencies,1)
(linked,1)
(1111,28)
(non-Spark,1)
(do,1)
(some,1)
(Hence,,1)
(from,2)
...
15/02/04 21:36:05 INFO JobScheduler: Finished job streaming job 1423103765000 ms.0 from job set of time 1423103765000 ms
15/02/04 21:36:05 INFO JobScheduler: Starting job streaming job 1423103765000 ms.1 from job set of time 1423103765000 ms
15/02/04 21:36:05 INFO SparkContext: Starting job: foreach at JavaFlumeEventTest.java:141
15/02/04 21:36:05 INFO DAGScheduler: Got job 73 (foreach at JavaFlumeEventTest.java:141) with 15 output partitions (allowLocal=false)
15/02/04 21:36:05 INFO DAGScheduler: Final stage: Stage 146(foreach at JavaFlumeEventTest.java:141)
15/02/04 21:36:05 INFO DAGScheduler: Parents of final stage: List(Stage 145)
15/02/04 21:36:05 INFO DAGScheduler: Missing parents: List()
15/02/04 21:36:05 INFO DAGScheduler: Submitting Stage 146 (ShuffledRDD[114] at reduceByKey at JavaFlumeEventTest.java:132), which has no missing parents
15/02/04 21:36:05 INFO MemoryStore: ensureFreeSpace(2544) called with curMem=141969, maxMem=280248975
15/02/04 21:36:05 INFO MemoryStore: Block broadcast_86 stored as values in memory (estimated size 2.5 KB, free 267.1 MB)
15/02/04 21:36:05 INFO MemoryStore: ensureFreeSpace(1862) called with curMem=144513, maxMem=280248975
15/02/04 21:36:05 INFO MemoryStore: Block broadcast_86_piece0 stored as bytes in memory (estimated size 1862.0 B, free 267.1 MB)
15/02/04 21:36:05 INFO BlockManagerInfo: Added broadcast_86_piece0 in memory on localhost:41505 (size: 1862.0 B, free: 267.2 MB)
15/02/04 21:36:05 INFO BlockManagerMaster: Updated info of block broadcast_86_piece0
15/02/04 21:36:05 INFO SparkContext: Created broadcast 86 from getCallSite at DStream.scala:294
15/02/04 21:36:05 INFO DAGScheduler: Submitting 15 missing tasks from Stage 146 (ShuffledRDD[114] at reduceByKey at JavaFlumeEventTest.java:132)
15/02/04 21:36:05 INFO TaskSchedulerImpl: Adding task set 146.0 with 15 tasks
15/02/04 21:36:05 INFO TaskSetManager: Starting task 0.0 in stage 146.0 (TID 466, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 1.0 in stage 146.0 (TID 467, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 2.0 in stage 146.0 (TID 468, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 3.0 in stage 146.0 (TID 469, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 4.0 in stage 146.0 (TID 470, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 5.0 in stage 146.0 (TID 471, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 6.0 in stage 146.0 (TID 472, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 7.0 in stage 146.0 (TID 473, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 8.0 in stage 146.0 (TID 474, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 9.0 in stage 146.0 (TID 475, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 10.0 in stage 146.0 (TID 476, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 11.0 in stage 146.0 (TID 477, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 12.0 in stage 146.0 (TID 478, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO TaskSetManager: Starting task 13.0 in stage 146.0 (TID 479, localhost, PROCESS_LOCAL, 1122 bytes)
15/02/04 21:36:05 INFO Executor: Running task 0.0 in stage 146.0 (TID 466)
15/02/04 21:36:05 INFO ShuffleBlockFetcherIterator: Getting 1 non-empty blocks out of 1 blocks
15/02/04 21:36:05 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
===========insert record========have==3
===========HTable ==========Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hbase-default.xml, hbase-site.xml, file:/etc/hbase/conf/hbase-site.xml, file:/etc/hadoop/conf/hdfs-site.xml, file:/etc/hadoop/conf/core-site.xml, file:/etc/hadoop/conf/mapred-site.xml, file:/etc/hadoop/conf/yarn-site.xml
15/02/04 21:36:05 INFO Executor: Running task 1.0 in stage 146.0 (TID 467)
15/02/04 21:36:05 INFO ShuffleBlockFetcherIterator: Getting 1 non-empty blocks out of 1 blocks
15/02/04 21:36:05 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
===========insert record========1111==28
===========HTable ==========Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hbase-default.xml, hbase-site.xml, file:/etc/hbase/conf/hbase-site.xml, file:/etc/hadoop/conf/hdfs-site.xml, file:/etc/hadoop/conf/core-site.xml, file:/etc/hadoop/conf/mapred-site.xml, file:/etc/hadoop/conf/yarn-site.xml
15/02/04 21:36:05 INFO Executor: Running task 2.0 in stage 146.0 (TID 468)
...
...
15/02/04 21:36:05 INFO ContextCleaner: Cleaned shuffle 1
15/02/04 21:36:05 INFO ContextCleaner: Cleaned shuffle 0
15/02/04 21:36:05 INFO ZooKeeper: Client environment:zookeeper.version=3.4.5-1392090, built on 09/30/2012 17:52 GMT
15/02/04 21:36:05 INFO ZooKeeper: Client environment:host.name=elephant
15/02/04 21:36:05 INFO ZooKeeper: Client environment:java.version=1.7.0_45
15/02/04 21:36:05 INFO ZooKeeper: Client environment:java.vendor=Oracle Corporation
15/02/04 21:36:05 INFO ZooKeeper: Client environment:java.home=/usr/java/jdk1.7.0_45-cloudera/jre
15/02/04 21:36:05 INFO ZooKeeper: Client environment:java.class.path=::/home/training/software/spark-1.2.0-bin-hadoop2.3/conf:/home/training/software/spark-1.2.0-bin-hadoop2.3/lib/spark-assembly-1.2.0-hadoop2.3.0.jar:/home/training/software/spark-1.2.0-bin-hadoop2.3/lib/datanucleus-api-jdo-3.2.6.jar:/home/training/software/spark-1.2.0-bin-hadoop2.3/lib/datanucleus-core-3.2.10.jar:/home/training/software/spark-1.2.0-bin-hadoop2.3/lib/datanucleus-rdbms-3.2.9.jar
15/02/04 21:36:05 INFO ZooKeeper: Client environment:java.library.path=/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib
15/02/04 21:36:05 INFO ZooKeeper: Client environment:java.io.tmpdir=/tmp
15/02/04 21:36:05 INFO ZooKeeper: Client environment:java.compiler=<NA>
15/02/04 21:36:05 INFO ZooKeeper: Client environment:os.name=Linux
15/02/04 21:36:05 INFO ZooKeeper: Client environment:os.arch=amd64
15/02/04 21:36:05 INFO ZooKeeper: Client environment:os.version=2.6.32-279.el6.x86_64
15/02/04 21:36:05 INFO ZooKeeper: Client environment:user.name=training
15/02/04 21:36:05 INFO ZooKeeper: Client environment:user.home=/home/training
15/02/04 21:36:05 INFO ZooKeeper: Client environment:user.dir=/home/training/software/spark-1.2.0-bin-hadoop2.3
15/02/04 21:36:05 INFO ZooKeeper: Initiating client connection, connectString=tiger:2181,elephant:2181,horse:2181 sessionTimeout=90000 watcher=hconnection-0x575b43dd, quorum=tiger:2181,elephant:2181,horse:2181, baseZNode=/hbase
15/02/04 21:36:05 INFO RecoverableZooKeeper: Process identifier=hconnection-0x575b43dd connecting to ZooKeeper ensemble=tiger:2181,elephant:2181,horse:2181
15/02/04 21:36:05 INFO ClientCnxn: Opening socket connection to server tiger/192.168.137.12:2181. Will not attempt to authenticate using SASL (unknown error)
15/02/04 21:36:05 INFO ClientCnxn: Socket connection established to tiger/192.168.137.12:2181, initiating session
15/02/04 21:36:05 INFO ClientCnxn: Session establishment complete on server tiger/192.168.137.12:2181, sessionid = 0x24b573f71f00007, negotiated timeout = 40000
15/02/04 21:36:10 INFO JobScheduler: Added jobs for time 1423103770000 ms
15/02/04 21:36:15 INFO JobScheduler: Added jobs for time 1423103775000 ms
15/02/04 21:36:20 INFO JobScheduler: Added jobs for time 1423103780000 ms
By the way, I can add data to HBase with plain Java, just not through Flume and Spark.
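For what it's worth, the standalone test that does work is essentially just a plain main method calling the same addRecord with the same configuration, something like this (the qualifier and value are throwaway test strings):
public class HBasePutTest {
    public static void main(String[] args) {
        // reuses the static conf block and addRecord method from JavaFlumeEventTest
        JavaFlumeEventTest.addRecord("mytable", "PutInpu", "cf", "testQualifier", "testValue");
    }
}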
Can anyone help me solve this problem?
Thanks!