Kafka sending Streams - java

hello i want to send a message from producer and received in consumer as capitalized. using CONFLUENT and KAFKA with a build.gradle file and StreamingApp.java containing this codes:
ProcessingApp.java:
package kafka_stream;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.Topology;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.KStream;
import java.util.Properties;
public class StreamingApp {
public static void main(String[] args) throws Exception {
Properties props = new Properties();
props.put(StreamsConfig.APPLICATION_ID_CONFIG,"streaming_app_id");
props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG,"localhost:9092");
StreamsConfig config = new StreamsConfig(props);
StreamsBuilder builder = new StreamsBuilder();
Topology topology = builder.build();
KafkaStreams streams = new KafkaStreams(topology,config);
KStream<String, String> simpleFirstStream = builder.stream("src-topic");
KStream<String, String> upperCasedStream = simpleFirstStream.mapValues(String::toUpperCase);
upperCasedStream.to("out-topic");
System.out.println("Streaming App Started");
streams.start();
Thread.sleep(30000);
System.out.println("shutting downl the streaming app");
streams.close();
}
}
so hot to get this option solved?

Related

kafka streams abandoned cart development - session window

I am attempting to build out a kstreams app that takes in records from an input topic that is a simple json payload (id and timestamp included - the key is a simple 3 digit string) (there is also no schema required). for the output topic I wish to produce only the records in which have been abandoned for 30 minutes or more (session window). based on this link, I have begun to develop a kafka streams app:
package io.confluent.developer;
import org.apache.kafka.clients.admin.AdminClient;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.common.serialization.StringSerializer;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.Topology;
import org.apache.kafka.streams.kstream.Consumed;
import org.apache.kafka.streams.kstream.Produced;
import org.apache.kafka.streams.kstream.SessionWindows;
import java.io.FileInputStream;
import java.io.IOException;
import java.time.Duration;
import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.time.format.FormatStyle;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.CountDownLatch;
public class SessionWindow {
private final DateTimeFormatter timeFormatter = DateTimeFormatter.ofLocalizedTime(FormatStyle.LONG)
.withLocale(Locale.US)
.withZone(ZoneId.systemDefault());
public Topology buildTopology(Properties allProps) {
final StreamsBuilder builder = new StreamsBuilder();
final String inputTopic = allProps.getProperty("input.topic.name");
final String outputTopic = allProps.getProperty("output.topic.name");
builder.stream(inputTopic, Consumed.with(Serdes.String(), Serdes.String()))
.groupByKey()
.windowedBy(SessionWindows.ofInactivityGapAndGrace(Duration.ofMinutes(5), Duration.ofSeconds(10)))
.count()
.toStream()
.map((windowedKey, count) -> {
String start = timeFormatter.format(windowedKey.window().startTime());
String end = timeFormatter.format(windowedKey.window().endTime());
String sessionInfo = String.format("Session info started: %s ended: %s with count %s", start, end, count);
return KeyValue.pair(windowedKey.key(), sessionInfo);
})
.to(outputTopic, Produced.with(Serdes.String(), Serdes.String()));
return builder.build();
}
public Properties loadEnvProperties(String fileName) throws IOException {
Properties allProps = new Properties();
FileInputStream input = new FileInputStream(fileName);
allProps.load(input);
input.close();
return allProps;
}
public static void main(String[] args) throws Exception {
if (args.length < 1) {
throw new IllegalArgumentException("This program takes one argument: the path to an environment configuration file.");
}
SessionWindow tw = new SessionWindow();
Properties allProps = tw.loadEnvProperties(args[0]);
allProps.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
allProps.put(StreamsConfig.DEFAULT_TIMESTAMP_EXTRACTOR_CLASS_CONFIG, ClickEventTimestampExtractor.class);
Topology topology = tw.buildTopology(allProps);
ClicksDataGenerator dataGenerator = new ClicksDataGenerator(allProps);
dataGenerator.generate();
final KafkaStreams streams = new KafkaStreams(topology, allProps);
final CountDownLatch latch = new CountDownLatch(1);
// Attach shutdown handler to catch Control-C.
Runtime.getRuntime().addShutdownHook(new Thread("streams-shutdown-hook") {
#Override
public void run() {
streams.close(Duration.ofSeconds(5));
latch.countDown();
}
});
try {
streams.cleanUp();
streams.start();
latch.await();
} catch (Throwable e) {
System.exit(1);
}
System.exit(0);
}
static class ClicksDataGenerator {
final Properties properties;
public ClicksDataGenerator(final Properties properties) {
this.properties = properties;
}
public void generate() {
properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
}
}
}
package io.confluent.developer;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.streams.processor.TimestampExtractor;
public class ClickEventTimestampExtractor implements TimestampExtractor {
#Override
public long extract(ConsumerRecord<Object, Object> record, long previousTimestamp) {
System.out.println(record.value());
return record.getTimestamp();
}
}
i am having issues withe the following:
getting the code to compile - I keep getting this error (I am new to java so please bear with me). what is the correct way to call the getTimestamp?:
error: cannot find symbol
return record.getTimestamp();
^
symbol: method getTimestamp()
location: variable record of type ConsumerRecord<Object,Object>
1 error
not sure if the timestamp extractor will work for this particular scenario. I read here that 'The Timestamp extractor can only give you one timestamp'. does that mean that if there are multiple messages with different keys this wont work? some clarification or examples would help.
thanks!

Kafka consumer showing numbers in unreadable format

I am trying out the kafka streaming. I am reading messages from one topic and doing groupByKey and then doing the count of groups. But the problem is that the messages count is coming as unreadable "boxes".
If I run the console consumer these are coming as empty strings
This is the WordCount code I wrote
package streams;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.KStream;
import java.util.Arrays;
import java.util.Properties;
public class WordCount {
public static void main(String[] args) {
Properties properties = new Properties();
properties.setProperty(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
properties.setProperty(StreamsConfig.APPLICATION_ID_CONFIG, "streams-demo-2");
properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
properties.setProperty(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.StringSerde.class.getName());
properties.setProperty(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.StringSerde.class.getName());
// topology
StreamsBuilder builder = new StreamsBuilder();
KStream<String, String> input = builder.stream("temp-in");
KStream<String, Long> fil = input.flatMapValues(val -> Arrays.asList(val.split(" "))) // making stream of text line to stream of words
.selectKey((k, v) -> v) // changing the key
.groupByKey().count().toStream(); // getting count after groupBy
fil.to("temp-out");
KafkaStreams streams = new KafkaStreams(builder.build(), properties);
streams.start();
System.out.println(streams.toString());
Runtime.getRuntime().addShutdownHook(new Thread(streams::close));
}
}
This is the output I am getting in the consumer. It is there on the right side in image
I had tried casting the long to long again to see if it works. But it's not working
I am attaching the consumer code too if it helps.
package tutorial;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import java.time.Duration;
import java.util.Collections;
import java.util.Properties;
public class Consumer {
public static void main(String[] args) {
Properties properties = new Properties();
properties.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
properties.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
properties.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
// Once the consumer starts running it keeps running even after we stop in console
// We should create new consumer to read from earliest because the previous one had already consumed until certain offset
// when we run the same consumer in two consoles kafka detects it and re balances
// In this case the consoles split the partitions they consume forming a consumer group
properties.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "consumer-application-1"); // -> consumer id
properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); // -> From when consumer gets data
KafkaConsumer<String, String> consumer = new KafkaConsumer<>(properties);
consumer.subscribe(Collections.singleton("temp-out"));
while (true) {
ConsumerRecords<String, String> consumerRecords = consumer.poll(Duration.ofMillis(1000));
for (ConsumerRecord<String, String> record: consumerRecords) {
System.out.println(record.key() + " " + record.value());
System.out.println(record.partition() + " " + record.offset());
}
}
}
}
Any help is appreciated. Thanks in advance.
The message value you're writing with Kafka Streams is a Long, and you're consuming it as a String.
If you make the following changes to your Consumer class, you'll be able to see the count printed correctly to stdout:
// Change this from StringDeserializer to LongDeserializer.
properties.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, LongDeserializer.class.getName());
...
// The value you're consuming here is a Long, not a String.
KafkaConsumer<String, Long> consumer = new KafkaConsumer<>(properties);
consumer.subscribe(Collections.singleton("temp-out"));
while (true) {
ConsumerRecords<String, Long> consumerRecords = consumer.poll(Duration.ofMillis(1000));
for (ConsumerRecord<String, Long> record : consumerRecords) {
System.out.println(record.key() + " " + record.value());
System.out.println(record.partition() + " " + record.offset());
}
}

Write to kafka Topic based on the content on content of record using kafkastreams

I'm trying to write from a topic(parent) to another topic(child) in kafka based on the content of the records in the parent.
A sample record if i consume from the parent topic is {"date":{"string":"2017-03-20"},"time":{"string":"20:04:13:563"},"event_nr":1572470,"interface":"Transaction Manager","event_id":5001,"date_time":1490040253563,"entity":"Transaction Manager","state":0,"msg_param_1":{"string":"ISWSnk"},"msg_param_2":{"string":"Application startup"},"msg_param_3":null,"msg_param_4":null,"msg_param_5":null,"msg_param_6":null,"msg_param_7":null,"msg_param_8":null,"msg_param_9":null,"long_msg_param_1":null,"long_msg_param_2":null,"long_msg_param_3":null,"long_msg_param_4":null,"long_msg_param_5":null,"long_msg_param_6":null,"long_msg_param_7":null,"long_msg_param_8":null,"long_msg_param_9":null,"last_sent":{"long":1490040253563},"transmit_count":{"int":1},"team_id":null,"app_id":{"int":4},"logged_by_app_id":{"int":4},"entity_type":{"int":3},"binary_data":null}.
I'll like to use the value of entity to write to a topic of the same name as the value of entity(There's a fixed amount of values of entity so i can statically create that if it's difficult programmatically to dynamically create topics). I'm trying to use this
import org.apache.kafka.common.serialization.Serde;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.KStreamBuilder;
import java.util.Properties;
public class entityDataLoader {
public static void main(final String[] args) throws Exception {
final Properties streamsConfiguration = new Properties();
streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "map-function-lambda-example");
streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
streamsConfiguration.put(StreamsConfig.KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
streamsConfiguration.put(StreamsConfig.VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
// Set up serializers and deserializers, which we will use for overriding the default serdes
// specified above.
final Serde<String> stringSerde = Serdes.String();
final Serde<byte[]> byteArraySerde = Serdes.ByteArray();
// In the subsequent lines we define the processing topology of the Streams application.
final KStreamBuilder builder = new KStreamBuilder();
// Read the input Kafka topic into a KStream instance.
final KStream<byte[], String> textLines = builder.stream(byteArraySerde, stringSerde, "postilion-events");
String content = textLines.toString();
String entity = JSONExtractor.returnJSONValue(content, "entity");
System.out.println(entity);
textLines.to(entity);
final KafkaStreams streams = new KafkaStreams(builder, streamsConfiguration);
streams.cleanUp();
streams.start();
// Add shutdown hook to respond to SIGTERM and gracefully close Kafka Streams
Runtime.getRuntime().addShutdownHook(new Thread(streams::close));
}
}
The content of content is org.apache.kafka.streams.kstream.internals.KStreamImpl#568db2f2 making it obvious that #KStream.toString() isn't the correct method to use to attempt to get the value of entity.
P.S. The JSONExtractor class is defined as
import org.json.simple.JSONObject;
import org.json.simple.parser.ParseException;
import org.json.simple.parser.JSONParser;
class JSONExtractor {
public static String returnJSONValue(String args, String value){
JSONParser parser = new JSONParser();
String app= null;
System.out.println(args);
try{
Object obj = parser.parse(args);
JSONObject JObj = (JSONObject)obj;
app= (String) JObj.get(value);
return app;
}
catch(ParseException pe){
System.out.println("No Object found");
System.out.println(pe);
}
return app;
}
}
You can use branch() to split your parent stream into "sub streams" and write each "sub stream" to one output topic (cf. http://docs.confluent.io/current/streams/developer-guide.html#stateless-transformations)
Your branch() must create a single "sub stream" for all you output topic, but because you know all you topics, this should not be a problem.
Also, for Kafka Streams, it's recommended to create all output topic before you start you application (cf. http://docs.confluent.io/current/streams/developer-guide.html#user-topics)

Value not getting from Kafka topic using Java Spark - kafka direct stream

I am not getting any data from the queue using Kafka direct stream. In my code I put System.out.println() This statement not run that means I am not getting any data from that topic..
I am pretty sure data available in queue and since not getting in console.
I didn't see any error in console also.
Can anyone please suggest something?
Here is my Java code,
SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaWordCount11").setMaster("local[*]");
sparkConf.set("spark.streaming.concurrentJobs", "3");
// Create the context with 2 seconds batch size
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(3000));
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", "x.xx.xxx.xxx:9092");
kafkaParams.put("key.deserializer", StringDeserializer.class);
kafkaParams.put("value.deserializer", StringDeserializer.class);
kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
kafkaParams.put("auto.offset.reset", "latest");
kafkaParams.put("enable.auto.commit", true);
Collection<String> topics = Arrays.asList("topicName");
final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(jssc,
LocationStrategies.PreferConsistent(),
ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));
JavaPairDStream<String, String> lines = stream
.mapToPair(new PairFunction<ConsumerRecord<String, String>, String, String>() {
#Override
public Tuple2<String, String> call(ConsumerRecord<String, String> record) {
return new Tuple2<>(record.key(), record.value());
}
});
lines.print();
// System.out.println(lines.count());
lines.foreachRDD(rdd -> {
rdd.values().foreachPartition(p -> {
while (p.hasNext()) {
System.out.println("Value of Kafka queue" + p.next());
}
});
});
I am able to print string which fetch from the kafka queue using direct kafka stream..
Here is my code,
import java.util.HashMap;
import java.util.HashSet;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.Currency;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Pattern;
import scala.Tuple2;
import kafka.serializer.StringDecoder;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.api.java.*;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;
import org.json.JSONObject;
import org.omg.CORBA.Current;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Durations;
public final class KafkaConsumerDirectStream {
public static void main(String[] args) throws Exception {
try {
SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaWordCount11").setMaster("local[*]");
sparkConf.set("spark.streaming.concurrentJobs", "30");
// Create the context with 2 seconds batch size
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(200));
Map<String, String> kafkaParams = new HashMap<>();
kafkaParams.put("metadata.broker.list", "x.xx.xxx.xxx:9091");
Set<String> topics = new HashSet();
topics.add("PartWithTopic02Queue");
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class,
String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);
JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
#Override
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
});
lines.foreachRDD(rdd -> {
if (rdd.count() > 0) {
List<String> strArray = rdd.collect();
// Print string here
}
});
jssc.start();
jssc.awaitTermination();
}
}
catch (Exception e) {
e.printStackTrace();
}
}
#Vimal Here is a link to the working version of creating direct streams in Scala.
I believe after reviewing it in Scala, you must convert it easily.
Please make sure that you are turning off for reading the latest topics in Kafka. It might not pick any topic which was processed last time.

Sending Data to Kafka Producer

i am trying to read the 100k file and send it to kafka topic. Here is my Kafka Code Which sends data to Kafka-console-consumer. When i am sending data i am receiving the data like this
java.util.stream.ReferencePipeline$Head#e9e54c2
Here is the sample single record data what i am sending:
173|172686|548247079|837113012|0x548247079f|7|173|172686a|0|173|2059 22143|0|173|1|173|172686|||0|||7|0||7|||7|172686|allowAllServices|?20161231:22143|548247079||0|173||172686|5:2266490827:DCCInter;20160905152146;2784
Any suggestion to get the data which i had showned in above...Thanks
Code:
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Properties;
import java.util.Properties;
import java.util.concurrent.ExecutionException;
import java.util.stream.Stream;
import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;
#SuppressWarnings("unused")
public class HundredKRecords {
private static String sCurrentLine;
public static void main(String args[]) throws InterruptedException, ExecutionException{
String fileName = "/Users/sreeeedupuganti/Downloads/octfwriter.txt";
//read file into stream, try-with-resources
try (Stream<String> stream = Files.lines(Paths.get(fileName))) {
stream.forEach(System.out::println);
kafka(stream.toString());
} catch (IOException e) {
e.printStackTrace();
}
}
public static void kafka(String stream) {
Properties props = new Properties();
props.put("metadata.broker.list", "localhost:9092");
props.put("serializer.class", "kafka.serializer.StringEncoder");
props.put("partitioner.class","kafka.producer.DefaultPartitioner");
props.put("request.required.acks", "1");
ProducerConfig config = new ProducerConfig(props);
Producer<String, String> producer = new Producer<String, String>(config);
producer.send(new KeyedMessage<String, String>("test",stream));
producer.close();
}
}
Problem is in line kafka(stream.toString());
Java stream class doesn't override method toString. By default it returns getClass().getName() + '#' + Integer.toHexString(hashCode()). That's exactly that you recieve.
In order to receive in kafka the whole file, you have manually convert it to one String (array of bytes).
Please, note, that kafka has limit for message size.

Categories