I have a Kafka Streams application that receives records on an input topic, does some stream processing on them, and sends the processed records to multiple output topics. It was running perfectly fine until I stopped it.
After stopping the first Streams app, I created a different streams application that has different input and output topic.
Next, I started both applications. My second application worked perfectly, but my first application has stopped doing any processing at all.
When I start a console consumer on the input topics I can see records being produced, but when I start a console consumer on the output topics I do not receive any records. I don't know what could have gone wrong. I thought maybe it was because of the second streams application, so I stopped it, recreated all the topics and restarted the first application again, but it is still not doing anything.
Note: I am doing this on a local server.
Next, I built the streams application on my personal machine, and there it is working as expected.
What could have gone wrong that I am seeing this unexpectedly strange behavior?
Here is my code:
Main Class:
public class Pipe {
static Logger log = Logger.getLogger(Pipe.class.getName());
public static void main(String[] args) throws Exception {
PropertyConfigurator.configure("log4j.properties");
log.info("Starting application");
Map<String, String> env = System.getenv();
Properties props = new Properties();
String BROKER_URL = env.get("BROKER_URL");
String appId = "98aff1c5-7a69-46b7-899c-186851054b43";
String appSecret = "zVyS/V694ffWe99QpCvYqE1sqeqLo36uuvTL8gmZV0A=";
String appTenant = "2f6cb1a6-ecb8-4578-b680-bf84ded07ff4";
props.put(StreamsConfig.APPLICATION_ID_CONFIG, "streams-pipe");
props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // pass from env localhost:9092 | BROKER_URL + ":9092"
props.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
props.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
final StreamsBuilder builder = new StreamsBuilder();
log.info("Creating stream: o365_storage");
KStream<String, String> source_o365_storage = builder.stream("o365_storage");
log.info("Creating stream: scan_result_dlp");
KStream<String, String> source_scan_result_dlp = builder.stream("scan_result_dlp");
log.info("Creating stream: scan_result_malware");
KStream<String, String> source_scan_result_malware = builder.stream("scan_result_malware");
log.info("Creating stream: source_o365_user_contenturl");
//KStream<String, String> source_o365_contenturl = builder.stream("o365_activity_contenturl");
KStream<String, String> source_o365_user_contenturl = builder.stream("o365_activity_contenturl");
log.info("Creating stream: source_o365_contenturl_result");
KStream<String, String> source_o365_contenturl_result = source_o365_user_contenturl.flatMapValues(new ValueMapper<String, Iterable<String>>() {
#Override
public Iterable<String> apply(String value) {
ArrayList<String> keywords = new ArrayList<String>();
ExecutorService executor = new ThreadPoolExecutor(4, 4, 1, TimeUnit.MINUTES, new LinkedBlockingQueue<Runnable>());
try {
String accessToken = O365Util.getAccessToken(appId, appSecret, appTenant);
System.out.println("accessToken : " + accessToken);
System.out.println("Creating futures..");
List<Future<?>> futures = new ArrayList<Future<?>>();
JSONArray contentUrlList = new JSONArray(value);
for (int i = 0; i < contentUrlList.length(); i++) {
JSONObject contentUri = contentUrlList.getJSONObject(i);
//futures.add(executor.submit(new FetchLogService(accessToken, contentUri.getString("contentUri"))));
futures.add(executor.submit(new FetchLogService(accessToken, contentUri, appTenant)));
}
System.out.println("futures size is : " + futures.size());
for (Future<?> f : futures) {
if (f.get() != null) {
//System.out.println("Executing contentUri parallel....................... ");
String futureResult = f.get().toString();
if (String.valueOf(futureResult.charAt(0)).equalsIgnoreCase("[")) {
//System.out.println("futureResult is JSONArray");
JSONArray logList = new JSONArray(futureResult);
for (int k = 0; k < logList.length(); k++) {
JSONObject log = logList.getJSONObject(k);
//System.out.println("Added logs into Events for action : " + log.getString("Operation"));
keywords.add(log.toString());
}
} else {
System.out.println("futureResult is JSONObject");
JSONObject contentUrlObj = new JSONObject(futureResult);
keywords.add(contentUrlObj.toString());
}
} else {
System.out.println("future result is nullllllllllllllllllllllllllllllllllllllll");
}
}
} catch (Exception e) {
System.err.println("Unable to convert to json");
e.printStackTrace();
} finally {
executor.shutdownNow();
}
return keywords;
}
});
log.info("Creating stream: source_o365_user_activity_intermediate");
KStream<String, String> source_o365_user_activity_intermediate = source_o365_contenturl_result.flatMapValues(new ValueMapper<String, Iterable<String>>() {
#Override
public Iterable<String> apply(String value) {
ArrayList<String> keywords = new ArrayList<String>();
try {
if (value.contains("Operation\":")) {
keywords.add(value);
}
} catch (Exception e) {
System.err.println("Unable to convert to json");
e.printStackTrace();
}
return keywords;
}
});
source_o365_user_activity_intermediate.to("o365_user_activity");
log.info("Creating stream: o365_contenturls");
KStream<String, String> o365_contenturls = source_o365_contenturl_result.flatMapValues(new ValueMapper<String, Iterable<String>>() {
#Override
public Iterable<String> apply(String value) {
ArrayList<String> keywords = new ArrayList<String>();
try {
if (value.contains("contentUri\":")) {
keywords.add("["+value+"]");
}
} catch (Exception e) {
System.err.println("Unable to convert to json");
e.printStackTrace();
}
return keywords;
}
});
o365_contenturls.to("o365_activity_contenturl");
log.info("Creating stream: o365_user_activity");
KStream<String, String> source_o365_user_activity = builder.stream("o365_user_activity");
log.info("Creating branch: branches_source_o365_user_activity");
#SuppressWarnings("unchecked")
KStream<String, String>[] branches_source_o365_user_activity = source_o365_user_activity.branch(
(key, value) -> (value.contains("Operation\":\"SharingSet") && value.contains("ItemType\":\"File")), // Sharing Set by Date
(key, value) -> (value.contains("Operation\":\"AddedToSecureLink") && value.contains("ItemType\":\"File")), // Added to secure link
(key, value) -> (value.contains("Operation\":\"AddedToGroup")), // Added to group
(key, value) -> (value.contains("Operation\":\"Add member to role.") || value.contains("Operation\":\"Remove member from role.")),//Role update by date
(key, value) -> (value.contains("Operation\":\"FileUploaded") || value.contains("Operation\":\"FileDeleted")
|| value.contains("Operation\":\"FileRenamed") || value.contains("Operation\":\"FileMoved")), // Upload file by date
(key, value) -> (value.contains("Operation\":\"UserLoggedIn")), // User logged in by date
(key, value) -> (value.contains("Operation\":\"Delete user.") || value.contains("Operation\":\"Add user.")
&& value.contains("ResultStatus\":\"success")) // Manage user by date
);
log.info("Creating branch: branches1_source_o365_user_activity");
#SuppressWarnings("unchecked")
KStream<String, String>[] branches1_source_o365_user_activity = source_o365_user_activity.branch(
(key, value) -> (value.contains("Operation\":\"FileUploaded") || value.contains("Operation\":\"FileModified")
|| value.contains("Operation\":\"FileDeleted")), // File update by date
(key, value) -> (value.contains("Operation\":\"FileAccessed")) // File access by date
);
log.info("Creating branch: branches2_source_o365_user_activity");
#SuppressWarnings("unchecked")
KStream<String, String>[] branches2_source_o365_user_activity = source_o365_user_activity.branch(
(key, value) -> (value.contains("Operation\":\"FileUploaded") || value.contains("Operation\":\"FileModified")
|| value.contains("Operation\":\"FileDeleted") || value.contains("Operation\":\"SharingSet")
&& value.contains("ItemType\":\"File")) // File operation by date
);
log.info("Creating branch: branches3_source_o365_user_activity");
#SuppressWarnings("unchecked")
KStream<String, String>[] branches3_source_o365_user_activity = source_o365_user_activity.branch(
(key, value) -> (value.contains("Workload\":\"AzureActiveDirectory") || value.contains("Workload\":\"OneDrive") || value.contains("Workload\":\"SharePoint")) // Activity log by date
);
log.info("Creating branch: branches4_source_o365_user_activity");
#SuppressWarnings("unchecked")
KStream<String, String>[] branches4_source_o365_user_activity = source_o365_user_activity.branch(
(key, value) -> (value.contains("Operation\":\"FileUploaded") || value.contains("Operation\":\"FileModified")) // Download file for scanning
);
/////////////////////////////////========================= DLP LOGS ========================/////////////////////////////////////////////////////////
AppUtil.pushToTopic(source_scan_result_dlp, Constant.O365_GTB_BREACHED_POLICY_BY_DATE, "o365_gtb_dlp_breached_policy_by_date");
//////////////////////////////////////==================== MALWARE LOGS ================================////////////////////////////////////////////
AppUtil.pushToTopic(source_scan_result_malware, Constant.O365_LAST_LINE_MALWARE, "o365_last_line_malware");
//////////////////////////////////////==================== ALL LOGS ====================================////////////////////////////////////////////
AppUtil.pushToTopic(source_o365_user_activity, Constant.O365_USER_ACTIVITY_BY_DATE, "o365_user_activity_by_date");
////////////////////////////////////====================== STORAGE LOGS ====================================////////////////////////////////////////////
AppUtil.pushToTopic(source_o365_storage, Constant.O365_STORAGE_BY_DATE, "o365_storage_by_date");
//////////////////////////////////////==================== BRANCH LOGS ====================================////////////////////////////////////////////
AppUtil.pushToTopic(branches_source_o365_user_activity[0], Constant.O365_SHARING_SET_BY_DATE, "o365_sharing_set_by_date", Constant.O365_SHARING_SET_BY_DATE_EXCEP_KEYS);
AppUtil.pushToTopic(branches_source_o365_user_activity[1], Constant.O365_ADDED_TO_SECURE_LINK_BY_DATE, "o365_added_to_secure_link_by_date");
AppUtil.pushToTopic(branches_source_o365_user_activity[2], Constant.O365_ADDED_TO_GROUP_BY_DATE, "o365_added_to_group_by_date");
AppUtil.pushToTopic(branches_source_o365_user_activity[3], Constant.O365_ROLE_UPDATE_BY_DATE, "o365_role_update_by_date");
AppUtil.pushToTopic(branches_source_o365_user_activity[4], Constant.O365_UPLOAD_FILE_BY_DATE, "o365_upload_file_by_date", Constant.O365_UPLOAD_FILE_BY_DATE_EXCEP_KEYS);
AppUtil.pushToTopic(branches_source_o365_user_activity[5], Constant.O365_USER_LOGGED_IN_BY_DATE, "o365_user_logged_in_by_date");
AppUtil.pushToTopic(branches_source_o365_user_activity[6], Constant.O365_MANAGE_USER_BY_DATE, "o365_manage_user_by_date");
////////////////////////////////////====================== BRANCH 1 LOGS ====================================////////////////////////////////////////////
AppUtil.pushToTopic(branches1_source_o365_user_activity[0], Constant.O365_FILE_UPDATE_BY_DATE, "o365_file_update_by_date");
AppUtil.pushToTopic(branches1_source_o365_user_activity[1], Constant.O365_FILE_ACCESS_BY_DATE, "o365_file_access_by_date");
////////////////////////////////////====================== BRANCH 2 LOGS ====================================////////////////////////////////////////////
AppUtil.pushToTopic(branches2_source_o365_user_activity[0], Constant.O365_FILE_OPERATION_BY_DATE, "o365_file_operation_by_date");
////////////////////////////////////====================== BRANCH 3 LOGS ====================================////////////////////////////////////////////
AppUtil.pushToTopic(branches3_source_o365_user_activity[0], Constant.O365_ACTIVITY_LOG_BY_DATE, "o365_activity_log_by_date");
////////////////////////////////////====================== BRANCH 4 LOGS ====================================////////////////////////////////////////////
branches4_source_o365_user_activity[0].to("download_file_for_scanning");
final Topology topology = builder.build();
final KafkaStreams streams = new KafkaStreams(topology, props);
final CountDownLatch latch = new CountDownLatch(1);
// attach shutdown handler to catch control-c
Runtime.getRuntime().addShutdownHook(new Thread("streams-shutdown-hook") {
#Override
public void run() {
log.trace("Exiting application.");
streams.close();
latch.countDown();
}
});
try {
streams.start();
latch.await();
} catch (Throwable e) {
System.exit(1);
}
System.exit(0);
}
}
AppUtil:
public final class AppUtil {
static Logger log = Logger.getLogger(Pipe.class.getName());
public static HashMap createHashMap(String[] keys, String[] values) {
HashMap<String, String> hmap = new HashMap<String, String>();
for (int i = 0; i < values.length; i++) {
hmap.put(keys[i], values[i]);
}
return hmap;
}
public static void pushToTopic(KStream<String, String> sourceTopic, HashMap<String, String> hmap, String destTopicName) {
log.info(destTopicName+ " inside function");
System.out.println(destTopicName + " inside function");
sourceTopic.flatMapValues(new ValueMapper<String, Iterable<String>>() {
#Override
public Iterable<String> apply(String value) {
log.info("================================================================================================================================================================================");
log.info("========> " + destTopicName + " Log:\n \n" + value);
System.out.println("================================================================================================================================================================================");
System.out.println("========> " + destTopicName + " Log:\n \n" + value);
ArrayList<String> keywords = new ArrayList<String>();
try {
JSONObject send = new JSONObject();
JSONObject received = processJSON(new JSONObject(value), destTopicName);
send.put("current_date", getCurrentDate().toString());
if (!destTopicName.equals("o365_storage_by_date")) {
send.put("insertion_time", getCurrentDateTime().toString());
}
boolean valid_json = true;
for(String key: hmap.keySet()) {
if (received.has(hmap.get(key))) {
send.put(key, received.get(hmap.get(key)));
}
else {
System.out.println("\n \n Missing Key in JSON: Cannot send log to destination topic = " + destTopicName + " | " + hmap.get(key) + " Key is missing.");
log.error("\n \n Missing Key in JSON: Cannot send log to destination topic = " + destTopicName + " | " + hmap.get(key) + " Key is missing.");
valid_json = false;
}
}
if (valid_json) {
keywords.add(send.toString());
}
// apply regex to value and for each match add it to keywords
} catch (Exception e) {
// TODO: handle exception
log.error("Unable to convert to json");
System.err.println("Unable to convert to json");
e.printStackTrace();
}
return keywords;
}
}).to(destTopicName);
}
//////////////////////////////////////
public static void pushToTopic(KStream<String, String> sourceTopic, HashMap<String, String> hmap, String destTopicName, String[] exceptionalKeys) {
sourceTopic.flatMapValues(new ValueMapper<String, Iterable<String>>() {
#Override
public Iterable<String> apply(String value) {
log.info("================================================================================================================================================================================");
log.info("========> " + destTopicName + " Log:\n \n" + value);
System.out.println("================================================================================================================================================================================");
System.out.println("========> " + destTopicName + " Log:\n \n" + value);
ArrayList<String> keywords = new ArrayList<String>();
try {
JSONObject send = new JSONObject();
JSONObject received = processJSON(new JSONObject(value), destTopicName);
send.put("current_date", getCurrentDate().toString());
if (!destTopicName.equals("o365_storage_by_date")) {
send.put("insertion_time", getCurrentDateTime().toString());
}
boolean valid_json = true;
for(String key: hmap.keySet()) {
if (received.has(hmap.get(key))) {
send.put(key, received.get(hmap.get(key)));
}
else {
System.out.println("\n \n Missing Key in JSON: Sending log to destination topic = " + destTopicName + " with null value | " + hmap.get(key) + " Key is missing.");
log.warn("\n \n Missing Key in JSON: Sending log to destination topic = " + destTopicName + " with null value | " + hmap.get(key) + " Key is missing.");
if(!isExceptionalKey(exceptionalKeys, hmap.get(key))) {
valid_json = false;
}
}
}
if (valid_json) {
keywords.add(send.toString());
}
// apply regex to value and for each match add it to keywords
} catch (Exception e) {
// TODO: handle exception
log.error("Unable to convert to json");
System.err.println("Unable to convert to json");
e.printStackTrace();
}
return keywords;
}
}).to(destTopicName);
}
//////////////////////////////////////
private static boolean isExceptionalKey(String[] exceptionalKeys, String currKey) {
// TODO Auto-generated method stub
boolean isExceptionalKey = false;
for (String string : exceptionalKeys) {
if (string.equals(currKey)) {
isExceptionalKey = true;
break;
}
}
return isExceptionalKey;
}
public static JSONObject processJSON(JSONObject jsonObj, String destTopicName) {
if (jsonObj.has("UserId")) {
String val = jsonObj.get("UserId").toString().toLowerCase();
jsonObj.remove("UserId");
jsonObj.put("UserId", val);
}
if (jsonObj.has("TargetUserOrGroupName")) {
String val = jsonObj.get("TargetUserOrGroupName").toString().toLowerCase();
jsonObj.remove("TargetUserOrGroupName");
jsonObj.put("TargetUserOrGroupName", val);
}
if (jsonObj.has("ObjectId")) {
String val = jsonObj.get("ObjectId").toString().toLowerCase();
jsonObj.remove("ObjectId");
jsonObj.put("ObjectId", val);
}
if (jsonObj.has("EventData")) {
String val = jsonObj.get("EventData").toString().toLowerCase();
jsonObj.remove("EventData");
jsonObj.put("EventData", val);
}
if (destTopicName.equals("o365_last_line_malware")) {
jsonObj.put("MaliciousScore", "-1");
}
if (destTopicName.equals("o365_activity_log_by_date") || destTopicName.equals("o365_gtb_dlp_breached_policy_by_date")) {
jsonObj.put("ActivityDetail", jsonObj.toString());
}
return jsonObj;
}
public static String getCurrentDate() {
Date date = new Date();
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
String UTCdate = dateFormat.format(date);
return UTCdate;
}
private static String getCurrentDateTime() {
DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
Date date = new Date();
String datetime = dateFormat.format(date);
return datetime;
}
}
Related
I have two different topics as follows:
Streams_Input_Topic
Streams_Output_Topic
I have to create a kafka application using java to read a message from the Streams_Input_Topic and if the value already exists in the persistent store then I have to find the differences in the value and send the updated fields to the target topic Streams_Output_Topic with the same key as we received.
{"key":"key1", "value":{"prop1":"value1","prop2":"value2"}}
{"key":"key1", "value":{"prop1":"value1","prop2":"value4"}} ==> "prop2" is updated with new value
When we receive the second message with an updated value for prop2, I have to send the message to the target topic, i.e. Streams_Output_Topic:
{"key":"key1", "value":{"prop1":"value1","prop2":"value4"}}
I have already referred to the link below, but it does not contain a complete example that I can adapt to my requirement.
Kafka compare consecutive values for a key
Below is my code I am using to do the same:
private void prepareStream(String topic, String appId) throws IOException {
System.out.println("Inside the prepare stream");
String storeName = store + "-" + topic;
Properties props = new Properties();
props.put(StreamsConfig.APPLICATION_ID_CONFIG, appId);
Properties p = account.connect();
props.putAll(p);
props.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
props.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
Path path = Paths.get("persistentKeyValueStore/kafka_streams_custom_" + topic);
try {
path = Files.createDirectories(path);
} catch (IOException e) {
e.printStackTrace();
}
LOG.info("Path : " + path.toAbsolutePath().toString());
props.put(StreamsConfig.STATE_DIR_CONFIG, path.toAbsolutePath().toString());
KeyValueBytesStoreSupplier stateStore = Stores.persistentKeyValueStore(storeName);
LOG.debug("Creating Stream Object");
StreamsBuilder builder = new StreamsBuilder();
StoreBuilder<KeyValueStore<String, String>> keyValueStoreBuilder =
Stores.keyValueStoreBuilder(Stores.persistentKeyValueStore(storeName),
Serdes.String(),
Serdes.String());
builder.addStateStore(keyValueStoreBuilder);
builder.stream("Streams_Input_Topic",
Consumed.with(Serdes.String(),
Serdes.String()))
.transformValues(() -> new ValueTransformerWithKey<String, String, String>() {
private KeyValueStore<String, String> state;
#Override
public void init(final ProcessorContext context) {
KafkaStreamLog.printConsole("Inside the init method of processor");
state = (KeyValueStore<String, String>) context.getStateStore(storeName);
}
#Override
public String transform(final String key, final String value) {
String prevValue = state.get(key);
KafkaStreamLog.printConsole("Prev value : " + prevValue);
KafkaStreamLog.printConsole("Curr value : " + prevValue);
if (prevValue != null) {
return prevValue;
} else {
state.put(key, value);
}
return null;
}
#Override
public void close() {
}
}, storeName).to("Streams_Output_Topic");
}
How do I pass the properties to the KStream so that it connects to a specific broker?
If anybody can provide me with a working example that would be really helpful. Thanks in advance.
I am using AWS Textract in a Java Spring boot project. I have set up AWS CLI and have the SDK as a maven dependency.
I have written Java code, converted from C#, in order to extract the key and value pairs, and I am receiving the following error after successfully extracting some words:
"AGENCYCUSTOMERID:FEIN(ifapplicable)MARITALSTATUS/CIVILUNION(ifapplicable)INSUREDLOCATIONCODEBUSPRIMARYE-MAILADDRESS:FEIN(ifapplicable)LINEOFBUSINESSCELLMARITALSTATUScivilUNION(ifapplicable)CELLCELLHOME":
AGENCYCUSTOMERID:FEIN(ifapplicable)MARITALSTATUS/CIVILUNION(ifapplicable)INSUREDLOCATIONCODEBUSPRIMARYE-MAILADDRESS:FEIN(ifapplicable)LINEOFBUSINESSCELLMARITALSTATUScivilUNION(ifapplicable)CELLCELLHOMEException in thread "main" java.lang.NullPointerException
at ai.tautona.lloyds.mailboxprocessor.service.AWSTextractService.Get_text(AWSTextractService.java:112)
at ai.tautona.lloyds.mailboxprocessor.service.AWSTextractService.getKVMapRelationship(AWSTextractService.java:74)
at ai.tautona.lloyds.mailboxprocessor.service.AWSTextractService.getKVMap(AWSTextractService.java:57)
at ai.tautona.lloyds.mailboxprocessor.service.AWSTextractService.main(AWSTextractService.java:148)
Through debugging I found the line that is causing the error to be :
text += "X ";
It appears that after finding a SELECTION ELEMENT / CHECKBOX it fails?
My code :
public class AWSTextractService {
public static void getKVMap(String localFile) throws IOException {
File file = new File(localFile);
byte[] fileContent = Files.readAllBytes(file.toPath());
AmazonTextract client = AmazonTextractClientBuilder.defaultClient();
AnalyzeDocumentRequest request = new AnalyzeDocumentRequest()
.withDocument(new Document()
.withBytes(ByteBuffer.wrap(fileContent))).withFeatureTypes(FeatureType.FORMS);
AnalyzeDocumentResult result = client.analyzeDocument(request);
//Get the text blocks
List<Block> blocks = result.getBlocks();
//get key and value maps
List<Block> key_map = new ArrayList<>();
List<Block> value_map = new ArrayList<>();
List<Block> block_map = new ArrayList<>();
for (Block block : blocks) {
block_map.add(block);
if (block.getBlockType().equals("KEY_VALUE_SET")) {
if (block.getEntityTypes().contains("KEY")) {
key_map.add(block);
} else {
value_map.add(block);
}
}
}
//Get Key Value relationship
getKVMapRelationship(key_map, value_map, block_map).forEach((k, v) -> System.out.println("key: " + k + " value:" + v));
getKeyValueRelationship.forEach((k,v)-> System.out.println("key: "+k+" value:"+v));
}
#NotNull
public static HashMap<String, String> getKVMapRelationship(List<Block> key_map, List<Block> value_map, List<Block> block_map) throws IOException {
HashMap<String, String> kvs = new HashMap<>();
;
Block value_block;
String key, val = "";
for (Block key_block : key_map) {
value_block = Find_value_block(key_block, value_map);
key = Get_text(key_block, block_map);
val = Get_text(value_block, block_map);
System.out.printf(key, val);
kvs.put("1", "2");
}
return kvs;
}
#NotNull
public static Block Find_value_block(Block block, List<Block> value_map) {
Block value_block = new Block();
for (Relationship relationship : block.getRelationships()) {
if (relationship.getType().equals("VALUE")) {
for (String value_id : relationship.getIds()) {
for (Block value : value_map) {
if (value.getId().equals(value_id)) {
value_block = value;
}
}
}
}
}
return value_block;
}
//null
#NotNull
public static String Get_text(Block result, List<Block> block_map) throws IOException {
String text = "";
Block word = new Block();
Block word2 = null;
if (result.getRelationships().stream().count() > 0) {
for (Relationship relationship : result.getRelationships()) {
if (relationship.getType().equals("CHILD")) {
for (String child_id : relationship.getIds()) {
word = block_map.stream()
.filter((x)-> x.getId().equals(child_id)).findFirst().orElse(word2);
if (word.getBlockType().equals("WORD"))
{
text += (word.getText() ==null ? "" : word.getText()) + "";
}
if (word.getBlockType().equals("SELECTION_ELEMENT"))
{
if(word.getSelectionStatus().equals("SELECTED"))
{
text += "X ";
}
}
}
}
}
}
return text;
}
public static void main (String[]args) throws IOException {
String fileStr = "/home/daniel/Documents/atrium_sources/accordImage-1.png";
AWSTextractService.getKVMap(fileStr);
System.out.println("Done!");
}
}
I'm not sure what the issue is.
I am very sure other Java Devs are going to appreciate this Code. I answered my question with the help of Rikus.
package ai.tautona.lloyds.mailboxprocessor.service;
import com.amazonaws.services.textract.AmazonTextract;
import com.amazonaws.services.textract.AmazonTextractClientBuilder;
import com.amazonaws.services.textract.model.Document;
import java.nio.file.Files;
import com.amazonaws.services.textract.model.*;
import org.apache.commons.collections.CollectionUtils;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import javax.validation.constraints.NotNull;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
#Service
#Transactional
public class AWSTextractService {
public static void getKVMap(String localFile) throws IOException {
File file = new File(localFile);
byte[] fileContent = Files.readAllBytes(file.toPath());
AmazonTextract client = AmazonTextractClientBuilder.defaultClient();
AnalyzeDocumentRequest request = new AnalyzeDocumentRequest()
.withDocument(new Document()
.withBytes(ByteBuffer.wrap(fileContent))).withFeatureTypes(FeatureType.FORMS);
AnalyzeDocumentResult result = client.analyzeDocument(request);
//Get the text blocks
List<Block> blocks = result.getBlocks();
//get key and value maps
List<Block> key_map = new ArrayList<>();
List<Block> value_map = new ArrayList<>();
List<Block> block_map = new ArrayList<>();
for (Block block : blocks) {
block_map.add(block);
if (block.getBlockType().equals("KEY_VALUE_SET")) {
if (block.getEntityTypes().contains("KEY")) {
key_map.add(block);
} else {
value_map.add(block);
}
}
}
//Get Key Value relationship
getKVMapRelationship(key_map, value_map, block_map).forEach((k, v) -> System.out.println("key: " + k + " value:" + v));
}
#NotNull
public static HashMap<String, String> getKVMapRelationship(List<Block> key_map, List<Block> value_map, List<Block> block_map) throws IOException {
HashMap<String, String> kvs = new HashMap<>();
;
Block value_block;
String key, val = "";
for (Block key_block : key_map) {
value_block = Find_value_block(key_block, value_map);
key = Get_text(key_block, block_map);
val = Get_text(value_block, block_map);
kvs.put(key, val);
}
return kvs;
}
#NotNull
public static Block Find_value_block(Block block, List<Block> value_map) {
Block value_block = new Block();
for (Relationship relationship : block.getRelationships()) {
if (relationship.getType().equals("VALUE")) {
for (String value_id : relationship.getIds()) {
for (Block value : value_map) {
if (value.getId().equals(value_id)) {
value_block = value;
}
}
}
}
}
return value_block;
}
//null
#NotNull
public static String Get_text(Block result, List<Block> block_map) throws IOException {
String text = "";
Block word2= new Block();
try {
if (result != null
&& CollectionUtils.isNotEmpty(result.getRelationships())) {
for (Relationship relationship : result.getRelationships()) {
if (relationship.getType().equals("CHILD")) {
for (String id : relationship.getIds()) {
Block word= (block_map.stream().filter(x-> x.getId().equals(id)).findFirst().orElse(word2));
if (word.getBlockType().equals("WORD")) {
text += word.getText() + " ";
} else if (word.getBlockType().equals("SELECTION_ELEMENT")) {
if (word.getSelectionStatus().equals("SELECTED")) {
text += "X ";
}
}
}
}
}
}
} catch (Exception e) {
System.out.println(e);
}
return text;
}
public static void main (String[]args) throws IOException {
String fileStr = "/home/daniel/Documents/atrium_sources/accordImage-1.png";
AWSTextractService.getKVMap(fileStr);
System.out.println("Done!");
}
}
I found the article below, which shows how to do this in Python.
https://docs.aws.amazon.com/textract/latest/dg/examples-export-table-csv.html
I also used the article below to extract text.
https://docs.aws.amazon.com/textract/latest/dg/detecting-document-text.html
However, the above article only helped me get the text. I also used the function "block.getBlockType()" of Block, but none of the blocks returned its type as "CELL", even though there are tables in the image/PDF.
Please help me find a Java library similar to "boto3" to extract all tables.
What I did: I created a model for each data set in the JSON response, and these models can be used to build a table view in JSF.
/**
 * Converts the TABLE blocks of a Textract response into {@code TableModel}s.
 *
 * <p>For every TABLE block, each CHILD block of type CELL is placed into a row-index to
 * column-index to text map, with the text resolved by {@code getCellText}. Child ids that do not
 * resolve to a block, and tables without relationships, are skipped so that one incomplete table
 * does not abort the rest.
 *
 * @param textractModel parsed Textract response; may be {@code null}
 * @return the tables found (possibly empty), or {@code null} if {@code textractModel} is
 *     {@code null}; if an unexpected error occurs, the tables built so far are returned
 */
public static List<TableModel> getTablesFromTextract(TextractModel textractModel) {
    if (textractModel == null) {
        return null; // existing contract: no model, no list
    }
    List<TableModel> tables = new ArrayList<>();
    try {
        // Index all blocks by id and collect the table roots in one pass.
        List<BlockModel> tableBlocks = new ArrayList<>();
        Map<String, BlockModel> blockMap = new HashMap<>();
        for (BlockModel block : textractModel.getBlocks()) {
            if ("TABLE".equals(block.getBlockType())) {
                tableBlocks.add(block);
            }
            blockMap.put(block.getId(), block);
        }

        for (BlockModel tableBlock : tableBlocks) {
            Map<Long, Map<Long, String>> rowMap = new HashMap<>();
            if (tableBlock.getRelationships() != null) {
                for (RelationshipModel relationship : tableBlock.getRelationships()) {
                    if (!"CHILD".equals(relationship.getType()) || relationship.getIds() == null) {
                        continue;
                    }
                    for (String id : relationship.getIds()) {
                        BlockModel cell = blockMap.get(id);
                        if (cell == null || !"CELL".equals(cell.getBlockType())) {
                            continue;
                        }
                        long rowIndex = cell.getRowIndex();
                        long columnIndex = cell.getColumnIndex();
                        rowMap.computeIfAbsent(rowIndex, ignored -> new HashMap<>())
                            .put(columnIndex, getCellText(cell, blockMap));
                    }
                }
            }
            tables.add(new TableModel(tableBlock, rowMap));
        }
        System.out.println("row Map " + tables.toString());
    } catch (Exception e) {
        LOG.error("Could not get table from textract model", e);
    }
    return tables;
}
/**
 * Collects the text of a table cell from its CHILD blocks.
 *
 * <p>WORD children contribute their text followed by a space; SELECTION_ELEMENT children
 * contribute {@code "X "} when their status is SELECTED. Child ids that are not present in
 * {@code blockMap} are skipped.
 *
 * @param cell     the cell block; may be {@code null}
 * @param blockMap all blocks of the response, keyed by block id
 * @return the concatenated cell text (each token followed by a space), or an empty string
 */
private static String getCellText(BlockModel cell, Map<String, BlockModel> blockMap) {
    StringBuilder text = new StringBuilder();
    try {
        if (cell != null && CollectionUtils.isNotEmpty(cell.getRelationships())) {
            for (RelationshipModel relationship : cell.getRelationships()) {
                if (!"CHILD".equals(relationship.getType())) {
                    continue;
                }
                for (String id : relationship.getIds()) {
                    BlockModel word = blockMap.get(id);
                    if (word == null) {
                        // Referenced block is missing from the response; nothing to append.
                        continue;
                    }
                    if ("WORD".equals(word.getBlockType())) {
                        text.append(word.getText()).append(' ');
                    } else if ("SELECTION_ELEMENT".equals(word.getBlockType())
                            && "SELECTED".equals(word.getSelectionStatus())) {
                        text.append("X ");
                    }
                }
            }
        }
    } catch (Exception e) {
        LOG.error("Could not get cell text of table", e);
    }
    return text.toString();
}
TableModel to create the view from:
public class TableModel {
private BlockModel table;
private Map<Long, Map<Long, String>> rowMap;
public TableModel(BlockModel table, Map<Long, Map<Long, String>> rowMap) {
this.table = table;
this.rowMap = rowMap;
}
public BlockModel getTable() {
return table;
}
public void setTable(BlockModel table) {
this.table = table;
}
public Map<Long, Map<Long, String>> getRowMap() {
return rowMap;
}
public void setRowMap(Map<Long, Map<Long, String>> rowMap) {
this.rowMap = rowMap;
}
#Override
public String toString() {
return table.getId() + " - " + rowMap.toString();
}
I have something similar:
public class AnalyzeDocument {
public DocumentModel startProcess(byte[] content) {
Region region = Region.EU_WEST_2;
TextractClient textractClient = TextractClient.builder().region(region)
.credentialsProvider(EnvironmentVariableCredentialsProvider.create()).build();
return analyzeDoc(textractClient, content);
}
public DocumentModel analyzeDoc(TextractClient textractClient, byte[] content) {
try {
SdkBytes sourceBytes = SdkBytes.fromByteArray(content);
Util util = new Util();
Document myDoc = Document.builder().bytes(sourceBytes).build();
List<FeatureType> featureTypes = new ArrayList<FeatureType>();
featureTypes.add(FeatureType.FORMS);
featureTypes.add(FeatureType.TABLES);
AnalyzeDocumentRequest analyzeDocumentRequest = AnalyzeDocumentRequest.builder().featureTypes(featureTypes)
.document(myDoc).build();
AnalyzeDocumentResponse analyzeDocument = textractClient.analyzeDocument(analyzeDocumentRequest);
List<Block> docInfo = analyzeDocument.blocks();
// util.displayBlockInfo(docInfo);
PageModel pageModel = util.getTableResults(docInfo);
DocumentModel documentModel = new DocumentModel();
documentModel.getPages().add(pageModel);
Iterator<Block> blockIterator = docInfo.iterator();
while (blockIterator.hasNext()) {
Block block = blockIterator.next();
log.debug("The block type is " + block.blockType().toString());
}
return documentModel;
} catch (TextractException e) {
System.err.println(e.getMessage());
}
return null;
}
and this is the util file:
/**
 * Indexes all blocks by id, collects the TABLE blocks and converts each of them
 * into a table of the returned page.
 *
 * @param blocks the blocks of a Textract response
 * @return a page holding one table model per TABLE block, or {@code null} if there is none
 */
public PageModel getTableResults(List<Block> blocks) {
    final Map<String, Block> blocksById = new HashMap<>();
    final List<Block> tables = new ArrayList<>();
    for (Block current : blocks) {
        blocksById.put(current.id(), current);
        boolean isTable = current.blockType().equals(BlockType.TABLE);
        if (!isTable) {
            continue;
        }
        tables.add(current);
        log.debug("added table: " + current.text());
    }

    PageModel result = new PageModel();
    if (tables.isEmpty()) {
        return null;
    }

    // The position in the list doubles as the running table number handed to generateTable.
    for (int position = 0; position < tables.size(); position++) {
        result.getTables().add(generateTable(tables.get(position), blocksById, position));
    }
    return result;
}
/**
 * Converts one TABLE block into a {@link TableModel} whose rows and cells are ordered
 * by their Textract row and column index.
 *
 * <p>Textract reports row and column indices starting at 1, so cells are read by iterating
 * the indices that are actually present (in ascending order) instead of probing
 * {@code 0..size-1}, which would add a leading {@code null} and drop the last column.
 *
 * @param table    the TABLE block
 * @param blockMap all blocks of the response, keyed by block id
 * @param index    running number used to build the table id ({@code "Table_" + index})
 * @return the populated table model
 */
private TableModel generateTable(Block table, Map<String, Block> blockMap, int index) {
    TableModel model = new TableModel();
    model.setTableId("Table_" + index);

    // Sorted copies make the row/column order deterministic regardless of HashMap iteration order.
    Map<Integer, Map<Integer, String>> sortedRows =
            new java.util.TreeMap<Integer, Map<Integer, String>>(getRowsColumnsMap(table, blockMap));
    for (Map<Integer, String> columns : sortedRows.values()) {
        RowModel rowModel = new RowModel();
        Map<Integer, String> sortedColumns = new java.util.TreeMap<Integer, String>(columns);
        for (String cellText : sortedColumns.values()) {
            rowModel.getCells().add(cellText);
        }
        model.getRows().add(rowModel);
    }
    return model;
}
/**
 * Maps the children of a block to their text, keyed by row index and then by column index.
 *
 * <p>Only CHILD relationships are followed; child ids missing from {@code blockMap} are ignored.
 *
 * @param block    the parent block whose children are laid out in rows and columns
 * @param blockMap all blocks of the response, keyed by block id
 * @return row index -> (column index -> cell text)
 */
private Map<Integer, Map<Integer, String>> getRowsColumnsMap(Block block, Map<String, Block> blockMap) {
    final Map<Integer, Map<Integer, String>> grid = new HashMap<>();
    for (Relationship link : block.relationships()) {
        if (!link.type().equals(RelationshipType.CHILD)) {
            continue;
        }
        for (String id : link.ids()) {
            final Block child = blockMap.get(id);
            if (child == null) {
                continue;
            }
            final int row = child.rowIndex();
            final int column = child.columnIndex();
            Map<Integer, String> cellsInRow = grid.get(row);
            if (cellsInRow == null) {
                cellsInRow = new HashMap<>();
                grid.put(row, cellsInRow);
            }
            cellsInRow.put(column, getText(child, blockMap));
        }
    }
    return grid;
}
public String getText(Block block, Map<String, Block> blockMap) {
String text = "";
if (block.relationships() != null && block.relationships().size() > 0) {
for (Relationship relationship : block.relationships()) {
if (relationship.type().equals(RelationshipType.CHILD)) {
for (String childId : relationship.ids()) {
Block wordBlock = blockMap.get(childId);
if (wordBlock != null && wordBlock.blockType() != null) {
if (wordBlock.blockType().equals(BlockType.WORD))) {
text += wordBlock.text() + " ";
}
}
}
}
}
}
return text;
}
I have the following structure in my Firebase Firestore database:
I want to read the data of Form1 and Form2. How can I achieve that?
Below is what I tried:
// Registers a snapshot listener on the documents whose "UID" field equals the stored "userId"
// preference. For every document change it copies the menu flags found under "mobileMenus"
// into shared_menueditor and then tries to read the nested form entries.
registration= query.whereEqualTo("UID", sharedPref.getString("userId",null)).addSnapshotListener(new EventListener<QuerySnapshot>() {
// NOTE(review): "#Override" looks like a garbled "@Override" annotation - confirm against the original source.
#Override
public void onEvent(QuerySnapshot documentSnapshots, FirebaseFirestoreException e) {
if (e != null) {
// The listener error is ignored: this branch is empty and execution continues with the loop below.
}
for (DocumentChange documentChange : documentSnapshots.getDocumentChanges()) {
if (documentChange.getDocument().getData().get("mobileMenus") != null) {
try {
Log.d("order","one");
// Turns the toString() output of the "mobileMenus" value into JSON-like text:
// spaces and ':' are removed first, then every '=' becomes ':'.
String notesResponse = documentChange.getDocument().getData().get("mobileMenus").toString().replace(" ", "").replace(":", "");
String responseNotes = notesResponse.replace("=", ":");
Log.d("shownotes","***** "+responseNotes);
JSONObject jsonObject = new JSONObject(responseNotes);
// Each menu flag is read as a string and stored in the editor.
isAttendance = jsonObject.getString("Attendance");
shared_menueditor.putString("isAttendance",isAttendance);
isCalender = jsonObject.getString("Calender");
shared_menueditor.putString("Calender",isCalender);
isExpenses = jsonObject.getString("Expenses");
shared_menueditor.putString("Expenses",isExpenses);
isleaves = jsonObject.getString("Leaves");
shared_menueditor.putString("Leaves",isleaves);
isLogout = jsonObject.getString("LogOut");
shared_menueditor.putString("LogOut",isLogout);
isNoticeboard = jsonObject.getString("NoticeBoard");
shared_menueditor.putString("NoticeBoard",isNoticeboard);
isTasks = jsonObject.getString("Tasks");
shared_menueditor.putString("Tasks",isTasks);
isTrackEmployee = jsonObject.getString("TrackEmployee");
shared_menueditor.putString("TrackEmployee",isTrackEmployee);
// documentChange.getDocument().getData().get("dynForms");
// Log.d("total", String.valueOf(documentChange.getDocument().getData().get("dynForms")));
Log.d("order","two");
// forms is the complete "mobileMenus" map, i.e. it contains the flag entries read above
// as well as any nested form entries.
forms= (Map<String, Object>) documentChange.getDocument().getData().get("mobileMenus");
Log.d("showfomsizes","*** "+forms.size());
} catch (JSONException e1) {
e1.printStackTrace();
}
}
// forms is only assigned inside the branch above, so this check sees whatever value it last held.
if (forms!= null) {
for (Map.Entry<String, Object> form : forms.entrySet()) {
String key = form.getKey();
// Every entry value is cast to Map without a type check; this appears to be where the reported
// "java.lang.Boolean cannot be cast to java.util.Map" is raised for the flag entries.
Map<Object, Object> values = (Map<Object, Object>) form.getValue();
name = (String) values.get("name");
String id = (String) values.get("id");
Log.d("nesteddata", name + "......" + id + "......." + key);
if (key.contains("Form1")) {
shared_menueditor.putString("nav_form1",name);
}
if (key.contains("Form2")) {
shared_menueditor.putString("nav_form2",name);
}
if (key.contains("Form3")) {
shared_menueditor.putString("nav_form3",name);
}
}
// NOTE(review): apply() is immediately followed by commit(); presumably shared_menueditor is a
// SharedPreferences.Editor, in which case one of the two calls would suffice - confirm.
shared_menueditor.apply();
shared_menueditor.commit();
}
userprofile();
}
}
});
I am able to get Attendance, Calender, etc., but my problem is that I am not able
to get the Form1 and Form2 data.
Error:java.lang.ClassCastException: java.lang.Boolean cannot be cast to java.util.Map
You are getting that exception because you are not checking for the Boolean values that you have. This should be your code snippet for getting the forms. Your forms object is the entire mobileMenus object.
// Reads the nested form entries out of the mobileMenus map. Entries whose key does not contain
// "Form", or whose value is not a Map (for example the Boolean menu flags), are skipped, so the
// cast below cannot fail with a ClassCastException.
if (forms != null) {
    for (Map.Entry<String, Object> form : forms.entrySet()) {
        String key = form.getKey();
        Object rawValue = form.getValue();
        if (!key.contains("Form") || !(rawValue instanceof Map)) {
            continue;
        }
        Map<?, ?> values = (Map<?, ?>) rawValue;
        name = (String) values.get("name");
        String id = (String) values.get("id");
        Log.d("nesteddata", name + "......" + id + "......." + key);
        if (key.contains("Form1")) {
            shared_menueditor.putString("nav_form1",name);
        }
        if (key.contains("Form2")) {
            shared_menueditor.putString("nav_form2",name);
        }
        if (key.contains("Form3")) {
            shared_menueditor.putString("nav_form3",name);
        }
    }
}
I'm not too familiar with how to output files back to the client with Java. I am trying to create a CSV file to be sent back to the client and opened in Excel.
I found this tool for the server side creation. I'm not sure exactly how to use it to return the actual file though. Here is a sample of code I have used to return a txt file that I think I can use parts of the response for, but I'm not fetching a file anymore since I'm creating this CSV so I'm not sure what I can use.
In the code below my biggest question is what do I have to return with the controller and how do I accomplish that? I'm not sure what I need to be returning between that and also from the CSV writer to the controller. Any help would be appreciated.
Here's my code so far:
Controller:
/**
 * Handles POST /web/csvexport: flattens the JSON request body and hands it to CSVWriter.
 *
 * As written, nothing is written to the HTTP response here; CSVWriter.writeAsCSV writes its
 * output to a file named by fileName.
 *
 * NOTE(review): "#RequestMapping" / "#RequestBody" look like garbled "@RequestMapping" /
 * "@RequestBody" annotations - confirm against the original source.
 */
#RequestMapping(value = "/web/csvexport", method = RequestMethod.POST)
protected void processCSV(HttpServletRequest request, HttpServletResponse response, #RequestBody String jsonRequest)
throws ServletException, IOException {
// The response is declared as HTML even though the goal described above is a CSV download.
response.setContentType("text/html;charset=UTF-8");
try {
CSVWriter csvWriter = new CSVWriter();
JsonFlattener jsonFlattener = new JsonFlattener();
// Passed to writeAsCSV as the name of the file to write.
String fileName = "StandardQuery";
csvWriter.writeAsCSV(jsonFlattener.parseJson(jsonRequest), fileName);
} catch (Exception e) {
// Any failure is only printed to stdout; the client still receives an empty response.
System.out.println("Exception: " + e);
}
}
CSV Writer:
/**
 * Writes flattened JSON records (one map per record) to a CSV file.
 *
 * <p>The header row is the sorted union of all keys. Fields that contain a comma, a double
 * quote or a line break are enclosed in double quotes (embedded quotes are doubled), so no
 * data is lost and the columns stay aligned.
 */
public class CSVWriter {

    /**
     * Writes the given records as CSV to {@code fileName}.
     *
     * @param flatJson records to write; every map is one row keyed by column name
     * @param fileName path of the file to create or overwrite
     * @throws FileNotFoundException if the file cannot be created or opened for writing
     * @throws java.io.UncheckedIOException if writing to the opened file fails
     */
    public void writeAsCSV(List<Map<String, String>> flatJson, String fileName) throws FileNotFoundException {
        Set<String> headers = collectHeaders(flatJson);
        StringBuilder output = new StringBuilder();
        output.append(joinEscaped(headers)).append('\n');
        for (Map<String, String> map : flatJson) {
            output.append(getCommaSeperatedRow(headers, map)).append('\n');
        }
        writeToFile(output.toString(), fileName);
    }

    /** Writes {@code output} to {@code fileName}; the writer is always closed. */
    private void writeToFile(String output, String fileName) throws FileNotFoundException {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(fileName))) {
            writer.write(output);
        } catch (FileNotFoundException e) {
            throw e;
        } catch (IOException e) {
            // A failed export must not look like a successful one to the caller.
            throw new java.io.UncheckedIOException("Could not write CSV file " + fileName, e);
        }
    }

    /** Builds one CSV row with a value (or an empty field) for every header, in header order. */
    private String getCommaSeperatedRow(Set<String> headers, Map<String, String> map) {
        List<String> items = new ArrayList<String>();
        for (String header : headers) {
            items.add(map.get(header));
        }
        return joinEscaped(items);
    }

    /** Returns the sorted union of the keys of all records. */
    private Set<String> collectHeaders(List<Map<String, String>> flatJson) {
        Set<String> headers = new TreeSet<String>();
        for (Map<String, String> map : flatJson) {
            headers.addAll(map.keySet());
        }
        return headers;
    }

    /** Joins the escaped fields with commas. */
    private String joinEscaped(Iterable<String> fields) {
        StringBuilder row = new StringBuilder();
        boolean first = true;
        for (String field : fields) {
            if (!first) {
                row.append(',');
            }
            row.append(escape(field));
            first = false;
        }
        return row.toString();
    }

    /** Quotes a field when it contains a comma, a quote or a line break; {@code null} becomes empty. */
    private String escape(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuotes = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuotes) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }
}
Json Flattener:
/**
 * Flattens JSON documents into string-to-string maps (one map per JSON object) so that
 * they can be written as rows of a table.
 *
 * <p>Scalar values of any JSON type (string, number, boolean) are stored via their string
 * form, so the flattener does not depend on {@code getString} accepting non-string values.
 */
public class JsonFlattener {

    /**
     * Flattens a single JSON object.
     *
     * @param jsonObject the object to flatten
     * @return the flattened key/value pairs
     */
    public Map<String, String> parse(JSONObject jsonObject) {
        Map<String, String> flatJson = new HashMap<String, String>();
        flatten(jsonObject, flatJson, "");
        return flatJson;
    }

    /**
     * Flattens every element of a JSON array of objects.
     *
     * @param jsonArray an array whose elements are JSON objects
     * @return one flattened map per array element, in array order
     */
    public List<Map<String, String>> parse(JSONArray jsonArray) {
        List<Map<String, String>> flatJson = new ArrayList<Map<String, String>>();
        int length = jsonArray.length();
        for (int i = 0; i < length; i++) {
            flatJson.add(parse(jsonArray.getJSONObject(i)));
        }
        return flatJson;
    }

    /**
     * Parses JSON text that is either a single object or an array of objects.
     *
     * @param json the JSON text
     * @return one flattened map per object
     * @throws Exception if the text can be parsed neither as an object nor as an array
     */
    public List<Map<String, String>> parseJson(String json) throws Exception {
        try {
            JSONObject jsonObject = new JSONObject(json);
            List<Map<String, String>> flatJson = new ArrayList<Map<String, String>>();
            flatJson.add(parse(jsonObject));
            return flatJson;
        } catch (JSONException je) {
            // Not a JSON object: retry as an array before giving up.
            return handleAsArray(json);
        }
    }

    /** Parses the text as a JSON array; the original failure is kept as the cause. */
    private List<Map<String, String>> handleAsArray(String json) throws Exception {
        try {
            return parse(new JSONArray(json));
        } catch (Exception e) {
            throw new Exception("Json might be malformed", e);
        }
    }

    /** Flattens an array: scalar elements are stored under {@code prefix + (index + 1)}. */
    private void flatten(JSONArray obj, Map<String, String> flatJson, String prefix) {
        int length = obj.length();
        for (int i = 0; i < length; i++) {
            Object element = obj.get(i);
            if (element instanceof JSONArray) {
                JSONArray jsonArray = (JSONArray) element;
                if (jsonArray.length() < 1) {
                    continue;
                }
                // NOTE(review): nested arrays use the zero-based index while the other branches
                // use index + 1; kept as-is because it determines the generated column names.
                flatten(jsonArray, flatJson, prefix + i);
            } else if (element instanceof JSONObject) {
                flatten((JSONObject) element, flatJson, prefix + (i + 1));
            } else {
                flatJson.put(prefix + (i + 1), element.toString());
            }
        }
    }

    /**
     * Flattens an object: scalar members are stored under {@code prefix + key}; JSON null
     * members and members whose text is {@code "null"} are skipped.
     */
    private void flatten(JSONObject obj, Map<String, String> flatJson, String prefix) {
        Iterator<?> keys = obj.keys();
        while (keys.hasNext()) {
            String key = keys.next().toString();
            Object member = obj.get(key);
            if (member instanceof JSONObject) {
                flatten((JSONObject) member, flatJson, prefix);
            } else if (member instanceof JSONArray) {
                JSONArray jsonArray = (JSONArray) member;
                if (jsonArray.length() < 1) {
                    continue;
                }
                flatten(jsonArray, flatJson, key);
            } else {
                String value = member.toString();
                if (!value.equals("null")) {
                    flatJson.put(prefix + key, value);
                }
            }
        }
    }
}
Here's the service that I'm calling the controller from. I used this to return a .txt file before so I'm not sure how usable it is, but I think if I stream the file back it will handle it...:
// Posts jsonObj to /web/csvexport/ and offers the response body to the user as a download
// named fileName. Three mechanisms are tried in order: navigator.msSaveBlob, an <a download>
// link with a blob URL, and finally navigating window.location to a blob URL.
// NOTE: the snippet ends inside the success callback; the closing of the callback and of
// getFile is not part of the excerpt.
getFile: function(jsonObj, fileName) {
// Deferred created up front; it is not resolved or returned within the visible part of the snippet.
var _defer = $q.defer();
$http.post("/web/csvexport/", jsonObj).success(function(data, status, headers) {
var octetStreamMime = "application/octet-stream";
// Get the headers
headers = headers();
// Use the caller-supplied fileName as the download name (the header-based lookup below is commented out)
//var filename = headers["x-filename"] || "logfile.log";
var filename = fileName;
// Determine the content type from the header or default to "application/octet-stream"
var contentType = headers["content-type"] || octetStreamMime;
if(navigator.msSaveBlob)
{
// msSaveBlob is available, so wrap the data in a blob of its content type and save it directly.
var blob = new Blob([data], { type: contentType });
navigator.msSaveBlob(blob, filename);
console.log("SaveBlob Success");
}
else
{
// Get the blob url creator
var urlCreator = window.URL || window.webkitURL || window.mozURL || window.msURL;
if(urlCreator)
{
// Try to use a download link
var link = document.createElement("a");
if("download" in link)
{
// Prepare a blob URL
var blob = new Blob([data], { type: contentType });
var url = urlCreator.createObjectURL(blob);
link.setAttribute("href", url);
// Set the download attribute (Supported in Chrome 14+ / Firefox 20+)
link.setAttribute("download", filename);
// Simulate clicking the download link
var event = document.createEvent('MouseEvents');
event.initMouseEvent('click', true, true, window, 1, 0, 0, 0, 0, false, false, false, false, 0, null);
link.dispatchEvent(event);
console.log("Download link Success");
} else {
// Prepare a blob URL
// Use application/octet-stream when using window.location to force download
var blob = new Blob([data], { type: octetStreamMime });
var url = urlCreator.createObjectURL(blob);
window.location = url;
console.log("window.location Success");
}
} else {
// No blob URL factory is available, so no download is attempted.
console.log("Not supported");
}
}
First, why not use the CSV MIME type instead of HTML?
replace
response.setContentType("text/html;charset=UTF-8");
by
response.setContentType("text/csv");
Also, did you know that Jackson, the Java JSON library, can handle CSV as well? See
https://github.com/FasterXML/jackson-dataformat-csv
Finally, in the controller you need to use the PrintWriter from the response (response.getWriter()) to write the CSV.
Don't forget to prefer a stream or a buffered writer (rather than building one large String) when handling large files, for better performance.