Spark: Two SparkContexts in a single Application Best Practice

Spark: Two SparkContexts in a single Application Best Practice - java

I think I have an interesting question for all of you today. In the code below you will notice I have two SparkContexts one for SparkStreaming and the other one which is a normal SparkContext. According to best practices you should only have one SparkContext in a Spark application even though its possible to circumvent this via allowMultipleContexts in the configuration.
Problem is, I need to retrieve data from hive and from a Kafka topic to do some logic, and whenever I submit my application it obviously returns "Cannot have 2 Spark Contexts Running on JVM".
My question is, is there a correct way to do this than how I am doing it right now?
public class MainApp {
private final String logFile= Properties.getString("SparkLogFileDir");
private static final String KAFKA_GROUPID = Properties.getString("KafkaGroupId");
private static final String ZOOKEEPER_URL = Properties.getString("ZookeeperURL");
private static final String KAFKA_BROKER = Properties.getString("KafkaBroker");
private static final String KAFKA_TOPIC = Properties.getString("KafkaTopic");
private static final String Database = Properties.getString("HiveDatabase");
private static final Integer KAFKA_PARA = Properties.getInt("KafkaParrallel");
public static void main(String[] args){
//set settings
String sql="";
//START APP
System.out.println("Starting NPI_TWITTERAPP...." + new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()));
System.out.println("Configuring Settings...."+ new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()));
SparkConf conf = new SparkConf()
.setAppName(Properties.getString("SparkAppName"))
.setMaster(Properties.getString("SparkMasterUrl"));
//Set Spark/hive/sql Context
JavaSparkContext sc = new JavaSparkContext(conf);
JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(5000));
JavaHiveContext HiveSqlContext = new JavaHiveContext(sc);
//Check if Twitter Hive Table Exists
try {
HiveSqlContext.sql("DROP TABLE IF EXISTS "+Database+"TWITTERSTORE");
HiveSqlContext.sql("CREATE TABLE IF NOT EXISTS "+Database+".TWITTERSTORE "
+" (created_at String, id String, id_str String, text String, source String, truncated String, in_reply_to_user_id String, processed_at String, lon String, lat String)"
+" STORED AS TEXTFILE");
}catch(Exception e){
System.out.println(e);
}
//Check if Ivapp Table Exists
sql ="CREATE TABLE IF NOT EXISTS "+Database+".IVAPPGEO AS SELECT DISTINCT a.LATITUDE, a.LONGITUDE, b.ODNCIRCUIT_OLT_CLLI, b.ODNCIRCUIT_OLT_TID, a.CITY, a.STATE, a.ZIP FROM "
+Database+".T_PONNMS_SERVICE B, "
+Database+".CLLI_LATLON_MSTR A WHERE a.BID_CLLI = substr(b.ODNCIRCUIT_OLT_CLLI,0,8)";
try {
System.out.println(sql + new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()));
HiveSqlContext.sql(sql);
sql = "SELECT LATITUDE, LONGITUDE, ODNCIRCUIT_OLT_CLLI, ODNCIRCUIT_OLT_TID, CITY, STATE, ZIP FROM "+Database+".IVAPPGEO";
JavaSchemaRDD RDD_IVAPPGEO = HiveSqlContext.sql(sql).cache();
}catch(Exception e){
System.out.println(sql + new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()));
}
//JavaHiveContext hc = new JavaHiveContext();
System.out.println("Retrieve Data from Kafka Topic: "+ new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()));
Map<String, Integer> topicMap = new HashMap<String, Integer>();
topicMap.put(KAFKA_TOPIC,KAFKA_PARA);
JavaPairReceiverInputDStream<String, String> messages = KafkaUtils.createStream(
jssc, KAFKA_GROUPID, ZOOKEEPER_URL, topicMap);
JavaDStream<String> json = messages.map(
new Function<Tuple2<String, String>, String>() {
private static final long serialVersionUID = 42l;
#Override
public String call(Tuple2<String, String> message) {
return message._2();
}
}
);
System.out.println("Completed Kafka Messages... "+ new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()));
System.out.println("Filtering Resultset... "+ new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()));
JavaPairDStream<Long, String> tweets = json.mapToPair(
new TwitterFilterFunction());
JavaPairDStream<Long, String> filtered = tweets.filter(
new Function<Tuple2<Long, String>, Boolean>() {
private static final long serialVersionUID = 42l;
#Override
public Boolean call(Tuple2<Long, String> tweet) {
return tweet != null;
}
}
);
JavaDStream<Tuple2<Long, String>> tweetsFiltered = filtered.map(
new TextFilterFunction());
tweetsFiltered = tweetsFiltered.map(
new StemmingFunction());
System.out.println("Finished Filtering Resultset... "+ new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()));
System.out.println("Processing Sentiment Data... "+ new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()));
//calculate postive tweets
JavaPairDStream<Tuple2<Long, String>, Float> positiveTweets =
tweetsFiltered.mapToPair(new PositiveScoreFunction());
//calculate negative tweets
JavaPairDStream<Tuple2<Long, String>, Float> negativeTweets =
tweetsFiltered.mapToPair(new NegativeScoreFunction());
JavaPairDStream<Tuple2<Long, String>, Tuple2<Float, Float>> joined =
positiveTweets.join(negativeTweets);
//Score tweets
JavaDStream<Tuple4<Long, String, Float, Float>> scoredTweets =
joined.map(new Function<Tuple2<Tuple2<Long, String>,
Tuple2<Float, Float>>,
Tuple4<Long, String, Float, Float>>() {
private static final long serialVersionUID = 42l;
#Override
public Tuple4<Long, String, Float, Float> call(
Tuple2<Tuple2<Long, String>, Tuple2<Float, Float>> tweet)
{
return new Tuple4<Long, String, Float, Float>(
tweet._1()._1(),
tweet._1()._2(),
tweet._2()._1(),
tweet._2()._2());
}
});
System.out.println("Finished Processing Sentiment Data... "+ new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()));
System.out.println("Outputting Tweets Data to flat file "+Properties.getString("HdfsOutput")+" ... "+ new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()));
JavaDStream<Tuple5<Long, String, Float, Float, String>> result =
scoredTweets.map(new ScoreTweetsFunction());
result.foreachRDD(new FileWriter());
System.out.println("Outputting Sentiment Data to Hive... "+ new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()));
jssc.start();
jssc.awaitTermination();
}
}

Creating SparkContext
You can create a SparkContext instance with or without creating a SparkConf object first.
Getting Existing or Creating New SparkContext (getOrCreate methods)
getOrCreate(): SparkContext
getOrCreate(conf: SparkConf): SparkContext
SparkContext.getOrCreate methods allow you to get the existing SparkContext or create a new one.
import org.apache.spark.SparkContext
val sc = SparkContext.getOrCreate()
// Using an explicit SparkConf object
import org.apache.spark.SparkConf
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("SparkMe App")
val sc = SparkContext.getOrCreate(conf)
Refer Here - https://jaceklaskowski.gitbooks.io/mastering-apache-spark/content/spark-sparkcontext.html

Apparently if I use sc.close() to close the original SparkContext before executing JavaStreaming Context it works perfectly, no errors or issues.

you can use a singleton object ContextManager which would handle which context to provide.
public class ContextManager {
private static JavaSparkContext context;
private static String currentType;
private ContextManager() {}
public static JavaSparkContext getContext(String type) {
if(type == currentType && context != null) {
return context;
}
else if (type == "streaming"){
.. clean up the current context ..
.. initialize the context to streaming context ..
currentType = type;
}
else {
..clean up the current context..
... initialize the context to normal context ..
currentType = type;
}
return context;
}
}
There are some issues like in projects where you switch context quite rapidly the overhead would be quite large.

You can access the SparkContext from your JavaStreamingSparkContext, and use that reference when creating additional contexts.
SparkConf sparkConfig = new SparkConf().setAppName("foo");
JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, Duration.seconds(30));
SqlContext sqlContext = new SqlContext(jssc.sparkContext());

Related

Use mapPartitionsWithIndex for DStream - Spark Streaming

I want to do something very simple: to check what is the content of each partition in the first RDD of my DStream.
This is what I'm doing now:
SparkConf sparkConfiguration= new SparkConf().setAppName("DataAnalysis").setMaster("local[*]");
JavaStreamingContext sparkStrContext=new JavaStreamingContext(sparkConfiguration, Durations.seconds(1));
JavaReceiverInputDStream<String> receiveParkingData=sparkStrContext.socketTextStream("localhost",5554);
Time time=new Time(1000);
JavaRDD<String>dataRDD= receiveParkingData.compute(time);
//I get an error in this RDD
JavaRDD<String>indexDataRDD=dataRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
#Override
public Iterator<String> call(Integer integer, Iterator<String> stringIterator) throws Exception {
return null;
}
});
indexDataRDD.collect();
So I want to print the content of each partition and its ID. However, on the indexDataRDD I get this message in my IntelliJ IDE: mapPartitionsWithIndex (Function2<Integer, Iterator<String>, Iterator<String>>, boolean) in AbstractJavaRDDLike cannot be applied to (Function2<Integer, Iterator<String>, Iterator<String>>)
Can someone help me with this issue? Is there another, easier way to get the content in each partition? I really want to know the specific content of each partition.
Thank you so much.

Here is sample program for mapPartitionsWithIndex for your reference.
public class SparkDemo {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("SparkDemo").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List<String> data = Arrays.asList("one","two","three","four","five");
JavaRDD<String> javaRDD = sc.parallelize(data, 2);
JavaRDD<String> mapPartitionsWithIndexRDD = javaRDD
.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
#Override
public Iterator<String> call(Integer index, Iterator<String> iterator) throws Exception {
LinkedList<String> linkedList = new LinkedList<String>();
while (iterator.hasNext()){
linkedList.add(Integer.toString(index) + "-" + iterator.next());
}
return linkedList.iterator();
}
}, false);
System.out.println("mapPartitionsWithIndexRDD " + mapPartitionsWithIndexRDD.collect());
sc.stop();
sc.close();
}
}

Getting error when invoking elasticSearch from spark

I have a use case, where I need to read messages from kafka and for each message, extract data and invoke elasticsearch Index. The response will be further used to do further processing.
I am getting below error when invoking JavaEsSpark.esJsonRDD
java.lang.ClassCastException: org.elasticsearch.spark.rdd.EsPartition incompatible with org.apache.spark.rdd.ParallelCollectionPartition
at org.apache.spark.rdd.ParallelCollectionRDD.compute(ParallelCollectionRDD.scala:102)
My code snippet is below
public static void main(String[] args) {
if (args.length < 4) {
System.err.println("Usage: JavaKafkaIntegration <zkQuorum> <group> <topics> <numThreads>");
System.exit(1);
}
SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaIntegration").setMaster("local[2]").set("spark.driver.allowMultipleContexts", "true");
//Setting when using JavaEsSpark.esJsonRDD
sparkConf.set("es.nodes",<NODE URL>);
sparkConf.set("es.nodes.wan.only","true");
context = new JavaSparkContext(sparkConf);
// Create the context with 2 seconds batch size
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));
int numThreads = Integer.parseInt(args[3]);
Map<String, Integer> topicMap = new HashMap<>();
String[] topics = args[2].split(",");
for (String topic: topics) {
topicMap.put(topic, numThreads);
}
//Receive Message From kafka
JavaPairReceiverInputDStream<String, String> messages =
KafkaUtils.createStream(jssc,args[0], args[1], topicMap);
JavaDStream<String> jsons = messages
.map(new Function<Tuple2<String, String>, String>() {
/**
*
*/
private static final long serialVersionUID = 1L;
#Override
public String call(Tuple2<String, String> tuple2){
JavaRDD<String> esRDD = JavaEsSpark.esJsonRDD(context, <index>,<search string> ).values() ;
return null;
}
});
jsons.print();
jssc.start();
jssc.awaitTermination();
}
I am getting error when invoking JavaEsSpark.esJsonRDD. Is it correct way to do it? How do I successfully invoke ES from spark?
I am running kafka and spark on windows and invoking external elastic search index.

Efficient Spark Cassandra Java join

I've got two tables:
my_keyspace.name with columns:
name (string) - partition key
timestamp (date) - second part of partition key
id (int) - third part of partition key
my_keyspace.data with columns:
timestamp (date) - partition key
id (int) - second part of partition key
data (string)
I'm trying to join on timestamp and id from a name table. I'm doing it by getting all timestamps and ids associated with a given name and retrieving data from data table for those entries.
It's really fast to do it in CQL. I expected Spark Cassandra to be equally fast at it, but instead it seems to be doing a full table scan. It might be due to not knowing which fields are partition/primary key. Though I don't seem to be able to find a way to tell it the mappings.
How can I make this join as efficient as it should be? Here's my code sample:
private static void notSoEfficientJoin() {
SparkConf conf = new SparkConf().setAppName("Simple Application")
.setMaster("local[*]")
.set("spark.cassandra.connection.host", "localhost")
.set("spark.driver.allowMultipleContexts", "true");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaPairRDD<DataKey, NameRow> nameIndexRDD = javaFunctions(sc).cassandraTable("my_keyspace", "name", mapRowTo(NameRow.class)).where("name = 'John'")
.keyBy(new Function<NameRow, DataKey>() {
#Override
public DataKey call(NameRow v1) throws Exception {
return new DataKey(v1.timestamp, v1.id);
}
});
JavaPairRDD<DataKey, DataRow> dataRDD = javaFunctions(sc).cassandraTable("my_keyspace", "data", mapRowTo(DataRow.class))
.keyBy(new Function<DataRow, DataKey>() {
#Override
public DataKey call(DataRow v1) throws Exception {
return new DataKey(v1.timestamp, v1.id);
}
});
JavaRDD<String> cassandraRowsRDD = nameIndexRDD.join(dataRDD)
.map(new Function<Tuple2<DataKey, Tuple2<NameRow, DataRow>>, String>() {
#Override
public String call(Tuple2<DataKey, Tuple2<NameRow, DataRow>> v1) throws Exception {
NameRow nameRow = v1._2()._1();
DataRow dataRow = v1._2()._2();
return nameRow + " " + dataRow;
}
});
List<String> collect = cassandraRowsRDD.collect();
}

The way to do this join more efficiently is to actually invoke joinWithCassandraTable this can be done by wrapping results with another javaFunctions call:
private static void moreEfficientJoin() {
SparkConf conf = new SparkConf().setAppName("Simple Application")
.setMaster("local[*]")
.set("spark.cassandra.connection.host", "localhost")
.set("spark.driver.allowMultipleContexts", "true");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<DataKey> nameIndexRDD = sc.parallelize(javaFunctions(sc).cassandraTable("my_keyspace", "name", mapRowTo(DataKey.class))
.where("name = 'John'")
.collect());
JavaRDD<Data> dataRDD = javaFunctions(nameIndexRDD).joinWithCassandraTable("my_keyspace", "data", allColumns, someColumns("timestamp", "id"), mapRowTo(Data.class), mapToRow(DataKey.class))
.map(new Function<Tuple2<DataKey, Data>, Data>() {
#Override
public Data call(Tuple2<DataKey, Data> v1) throws Exception {
return v1._2();
}
});
List<Data> data = dataRDD.collect();
}
The important thing is to wrap a JavaRDD with javaFunctions. So it is possible to not call collect and sc.parallelize on nameIndexRDD

apache ignite query

I have created a cache using Automatic Persistence, connecting to Mysql database.
1 million rows are populated on startup into that node. Node is in PARTITIONED mode
When I try retrieving data from that Cache using SQL queries, it always returns empty array.
I have indexed the cache using "CacheTypeMetadata".
Please could anyone point out what I have missed out, or done incorrectly. I have been following the tutorials, but I cannot figure out why my query is not working fine.
Thanks in advance!
Cache:
CacheConfiguration<Dataloadermd5Key, DataLoaderMd5> cfg =
CacheConfigMd5.cache("DataMd5Cache",
new JDBCFactory<Dataloadermd5Key, DataLoaderMd5>());
DataLoaderMd5Key:
public class Dataloadermd5Key implements Serializable {
/** */
private static final long serialVersionUID = 0L;
/** Value for idClient. */
private int idClient;
/** Value for clientPropId. */
private String clientPropId;
//...
}
DataLoaderMd5:
public class DataLoaderMd5 implements Serializable {
/** */
private static final long serialVersionUID = 0L;
/** Value for idClient. */
private int idClient;
/** Value for clientPropId. */
private String clientPropId;
/** Value for md5. */
private String md5;
//...
}
CacheConfigMd5:
public static <K, V> CacheConfiguration<K, V> cache(String name,
Factory<CacheStore<K, V>> storeFactory) {
if (storeFactory == null)
throw new IllegalArgumentException("Cache store factory cannot be null.");
CacheConfiguration<K, V> ccfg = new CacheConfiguration<>(name);
ccfg.setCacheStoreFactory(storeFactory);
ccfg.setReadThrough(true);
ccfg.setWriteThrough(true);
// Configure cache types.
Collection<CacheTypeMetadata> meta = new ArrayList<>();
// DataLoaderMd5.
CacheTypeMetadata type = new CacheTypeMetadata();
meta.add(type);
type.setDatabaseSchema("abc");
type.setDatabaseTable("dfg");
type.setKeyType(Dataloadermd5Key.class.getName());
type.setValueType(DataLoaderMd5.class.getName());
// Key fields for DataLoaderMd5.
Collection<CacheTypeFieldMetadata> keys = new ArrayList<>();
keys.add(new CacheTypeFieldMetadata("id_client", Types.INTEGER,
"idclient", int.class));
keys.add(new CacheTypeFieldMetadata("client_prop_id", Types.VARCHAR,
"clientPropId", String.class));
type.setKeyFields(keys);
// Value fields for DataLoaderMd5.
Collection<CacheTypeFieldMetadata> vals = new ArrayList<>();
vals.add(new CacheTypeFieldMetadata("id_client", Types.INTEGER,
"idclient", int.class));
vals.add(new CacheTypeFieldMetadata("client_prop_id", Types.VARCHAR,
"clientPropId", String.class));
vals.add(new CacheTypeFieldMetadata("Md5", Types.VARCHAR,
"md5", String.class));
type.setValueFields(vals);
// Query fields for DataLoaderMd5.
Map<String, Class<?>> qryFlds = new LinkedHashMap<>();
qryFlds.put("idclient", int.class);
qryFlds.put("clientPropId", String.class);
qryFlds.put("md5", String.class);
type.setQueryFields(qryFlds);
// Groups for DataLoaderMd5.
Map<String, LinkedHashMap<String,
IgniteBiTuple<Class<?>, Boolean>>> grps = new LinkedHashMap<>();
LinkedHashMap<String, IgniteBiTuple<Class<?>, Boolean>> grpItems =
new LinkedHashMap<>();
grpItems.put("idclient", new IgniteBiTuple<Class<?>,
Boolean>(int.class, false));
grpItems.put("clientPropId", new IgniteBiTuple<Class<?>,
Boolean>(String.class, false));
grps.put("PRIMARY", grpItems);
type.setGroups(grps);
ccfg.setTypeMetadata(meta);
return ccfg;
}
Query:
Ignite ignite = Ignition.start("examples/config/example-cache.xml");
IgniteCache<Dataloadermd5Key, DataLoaderMd5> cache = ignite.cache(CACHE_NAME);
Dataloadermd5Key key = new Dataloadermd5Key();
key.setIdClient(98255);
key.setClientPropId("1000008");
SqlQuery<Dataloadermd5Key, DataLoaderMd5> qry =
new SqlQuery<Dataloadermd5Key, DataLoaderMd5>(
DataLoaderMd5.class, "idClient = ? and clientPropId = ?");
//Excecute query
System.out.println("sqlQuery Lookup result :: " +
cache.query(qry).getAll());
System.out.println("sqlQuery Lookup result :: " +
cache.query(qry.setArgs(key.getIdClient(), key.getClientPropId())).getAll());

I found out the problem. It was because of my Ignite version.
I updated the version on my maven to 1.1.0, and the code started working fine.
<dependency>
<groupId>org.apache.ignite</groupId>
<artifactId>ignite-indexing</artifactId>
<version>1.1.0-incubating</version>
</dependency>

Hadoop 2.4.0 + HCatalog + Mapreduce

in Hadoop 2.4.0, I get the following error while executing below code sample. I think, there is mismatch hadoop version. Are you review the code? and How can I fix this codes?
I am trying to write map-reduce job that copying Hcatalog table.
thank you.
Exception in thread "main" java.lang.IncompatibleClassChangeError: Found interface org.apache.hadoop.mapreduce.JobContext, but class was expected
at org.apache.hcatalog.mapreduce.HCatBaseOutputFormat.getJobInfo(HCatBaseOutputFormat.java:94)
at org.apache.hcatalog.mapreduce.HCatBaseOutputFormat.getOutputFormat(HCatBaseOutputFormat.java:82)
at org.apache.hcatalog.mapreduce.HCatBaseOutputFormat.checkOutputSpecs(HCatBaseOutputFormat.java:72)
at org.apache.hadoop.mapreduce.JobSubmitter.checkSpecs(JobSubmitter.java:458)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:343)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1285)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1282)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1548)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1282)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1303)
at org.deneme.hadoop.UseHCat.run(UseHCat.java:102)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
at org.deneme.hadoop.UseHCat.main(UseHCat.java:107)
Code Sample
public class UseHCat extends Configured implements Tool{
public static class Map extends Mapper<WritableComparable, HCatRecord,Text,IntWritable> {
String groupname;
#Override
protected void map( WritableComparable key,
HCatRecord value,
org.apache.hadoop.mapreduce.Mapper<WritableComparable, HCatRecord,
Text, IntWritable>.Context context)
throws IOException, InterruptedException {
// The group table from /etc/group has name, 'x', id
groupname = (String) value.get(0);
int id = (Integer) value.get(2);
// Just select and emit the name and ID
context.write(new Text(groupname), new IntWritable(id));
}
}
public static class Reduce extends Reducer<Text, IntWritable,
WritableComparable, HCatRecord> {
protected void reduce( Text key,
java.lang.Iterable<IntWritable> values,
org.apache.hadoop.mapreduce.Reducer<Text, IntWritable,
WritableComparable, HCatRecord>.Context context)
throws IOException, InterruptedException {
// Only expecting one ID per group name
Iterator<IntWritable> iter = values.iterator();
IntWritable iw = iter.next();
int id = iw.get();
// Emit the group name and ID as a record
HCatRecord record = new DefaultHCatRecord(2);
record.set(0, key.toString());
record.set(1, id);
context.write(null, record);
}
}
public int run(String[] args) throws Exception {
Configuration conf = getConf(); //hdfs://sandbox.hortonworks.com:8020
//conf.set("fs.defaultFS", "hdfs://192.168.1.198:8020");
//conf.set("mapreduce.job.tracker", "192.168.1.115:50001");
//Configuration conf = new Configuration();
//conf.set("fs.defaultFS", "hdfs://192.168.1.198:8020/data");
args = new GenericOptionsParser(conf, args).getRemainingArgs();
// Get the input and output table names as arguments
String inputTableName = args[0];
String outputTableName = args[1];
// Assume the default database
String dbName = null;
String jobName = "UseHCat";
String userChosenName = getConf().get(JobContext.JOB_NAME);
if (userChosenName != null)
jobName += ": " + userChosenName;
Job job = Job.getInstance(getConf());
job.setJobName(jobName);
// Job job = new Job(conf, "UseHCat");
// HCatInputFormat.setInput(job, InputJobInfo.create(dbName,inputTableName, null));
HCatInputFormat.setInput(job, dbName, inputTableName);
job.setJarByClass(UseHCat.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
// An HCatalog record as input
job.setInputFormatClass(HCatInputFormat.class);
// Mapper emits a string as key and an integer as value
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// Ignore the key for the reducer output; emitting an HCatalog record as value
job.setOutputKeyClass(WritableComparable.class);
job.setOutputValueClass(DefaultHCatRecord.class);
job.setOutputFormatClass(HCatOutputFormat.class);
HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, null));
HCatSchema s = HCatOutputFormat.getTableSchema(job.getConfiguration());
System.err.println("INFO: output schema explicitly set for writing:" + s);
HCatOutputFormat.setSchema(job, s);
return (job.waitForCompletion(true) ? 0 : 1);
}
public static void main(String[] args) throws Exception {
// System.setProperty("hadoop.home.dir", "C:"+File.separator+"hadoop-2.4.0");
int exitCode = ToolRunner.run(new UseHCat(), args);
System.exit(exitCode);
}
}

In Hadoop 1.x.x JobContext is a Class where as in Hadoop 2.x.x, it is an interface and HCatalog-core APIs are not compatible with hadoop 2.x.x.
HCatalogBaseOutputFormat class needs the following code change to fix the issue:
//JobContext ctx = new JobContext(conf,jobContext.getJobID());
JobContext ctx = new Job(conf);

We Keep Coding

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

Spark: Two SparkContexts in a single Application Best Practice - java

Apparently if I use sc.close() to close the original SparkContext before executing JavaStreaming Context it works perfectly, no errors or issues.

Related

Use mapPartitionsWithIndex for DStream - Spark Streaming

Getting error when invoking elasticSearch from spark

Efficient Spark Cassandra Java join

apache ignite query

Hadoop 2.4.0 + HCatalog + Mapreduce

Categories

Resources