Getting Unique results for Objects inside List in Elasticsearch - java

I have mapping like this
"custom_metadata": {
"properties": {
"key": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
}
The ingested data looks like this
// data in document 1
"custom_metadata": [
{
"value": "NPL",
"key": "schema"
},
{
"value": "SAPERP",
"key": "system"
}
]
// data in document 2
"custom_metadata": [
{
"value": "trial",
"key": "schema"
},
{
"value": "Oracle",
"key": "system"
}
]
I want to aggregate on each key and get relevant value in search results, like this
"schema": [
{"value": "NPL"},
{ "value": "trial",}
],
"system":[
{"value": "SAPERP"},
{ "value": "Oracle",}
]
Note: The above output is only illustrative. If I can get something like this from ES, I can parse it and build the desired result on the service side.
What I have tried:
"custom_metadata_key": {
"terms": {
"field": "custom_metadata.key"
},
"aggregations": {
"custom_metadata_value": {
"terms": {
"field": "custom_metadata.value"
}
}
}
}
The above nested aggregation groups by each key, but it returns all of the values under every key.
{
"key" : "schema",
"doc_count" : 2,
"custom_metadata_value" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Oracle",
"doc_count" : 2
},
{
"key" : "NPL",
"doc_count" : 1
},
{
"key" : "SAPERP",
"doc_count" : 1
},
{
"key" : "trial",
"doc_count" : 1
}
]
}
}
The above output repeats for every key, giving the same value aggregation for all of them.

You need to change the data type of the field custom_metadata from object to nested; then you can achieve your desired output easily.
Mapping
{
"mappings": {
"properties": {
"custom_metadata":{
"type": "nested"
}
}
}
}
Query
{
"size": 0,
"aggs": {
"data": {
"nested": {
"path": "custom_metadata"
},
"aggs": {
"custom_metadata_key": {
"terms": {
"field": "custom_metadata.key.keyword",
"size": 10
},
"aggs": {
"custom_metadata_value": {
"terms": {
"field": "custom_metadata.value.keyword",
"size": 10
}
}
}
}
}
}
}
}
Output
"aggregations": {
"data": {
"doc_count": 4,
"custom_metadata_key": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "schema",
"doc_count": 2,
"custom_metadata_value": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "NPL",
"doc_count": 1
},
{
"key": "trial",
"doc_count": 1
}
]
}
},
{
"key": "system",
"doc_count": 2,
"custom_metadata_value": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Oracle",
"doc_count": 1
},
{
"key": "SAPERP",
"doc_count": 1
}
]
}
}
]
}
}
}

Related

MongoDB : fields are excluded when invoke multiple lookup stages

db={
"dashboard": [
{
"_id": "dashboard1",
"name": "test",
"user": 1
}
],
"templatefolders": [
{
"dashboardId": "dashboard1",
"folderId": "folder123",
"name": "folder",
"region": "XXX"
}
],
"folders": [
{
"_id": "folder123"
}
],
"user": [
{
"_id": 1,
"name": "alaa"
}
],
}
this is my function:
db.dashboard.aggregate([
{
"$lookup": {
"from": "templatefolders",
"localField": "_id",
"foreignField": "dashboardId",
"as": "joinDashboard"
}
},
{
"$lookup": {
"from": "folders",
"localField": "joinDashboard.folderId",
"foreignField": "_id",
"as": "joinDashboard.joinFolder"
}
},
])
Result :
[
{
"_id": "dashboard1",
"joinDashboard": {
"joinFolder": [
{
"_id": "folder123"
}
]
},
"name": "test",
"user": 1
}
]
[![enter image description here][1]][1]
Why are the fields name and region from the templatefolders collection excluded?
I want to know the reason for this behavior. I would prefer not to use unwind because I have multiple collections with multiple reference relations.
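Your second $lookup is overriding the joinDashboard key completely: its "as" path, joinDashboard.joinFolder, replaces the array produced by the first stage with an object that contains only joinFolder, which is why name and region disappear. Since you want joinFolder to be within joinDashboard, you can try nested lookups like this: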
db.dashboard.aggregate([
{
$lookup: {
from: "templatefolders",
let: {
"boardId": "$_id"
},
pipeline: [
{
$match: {
$expr: {
$eq: [
"$dashboardId",
"$$boardId"
]
}
}
},
{
$lookup: {
from: "folders",
let: {
"folderId": "$folderId"
},
pipeline: [
{
$match: {
$expr: {
$eq: [
"$_id",
"$$folderId"
]
}
}
},
],
as: "joinFolder"
},
},
],
as: "joinDashboard"
}
}
])
MongoPlayground link.

Elasticsearch: Filter the records based on nested field with nested field containing only the filtered object

I am trying to filter the records based on nested field and want only the matching object in that array to be shown as part of the record.
Below is the detailed explanation of my requirement.
So, I have Elasticsearch data like this:
[{
"basicInfo": {
"requestId": 123,
},
"managerInfo": {
"manager": "John",
},
"groupInfo": [
{
"id": "id1",
"name": "abc",
"status": "Approved"
},
{
"id": "id2",
"name": "abc",
"status": "Pending"
}
]
},
{
"basicInfo": {
"requestId": 233,
},
"managerInfo": {
"manager": "John Sr",
},
"groupInfo": [
{
"id": "id3",
"name": "abc",
"status": "Pending"
}
]
}
]
I want to filter the records only with groupInfo.status as Approved and basicInfo.requestId as 123, but my condition is I should only get the Approved record in the groupInfo and not the pending ones. So, the output I am expecting is:
{
"took": 23,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 3.0602708,
"hits": [
{
"_index": "my_index",
"_type": "request",
"_id": "123",
"_score": 3.0602708,
"_source": {
"basicInfo": {
"requestId": 123
},
"managerInfo": {
"manager": "John"
},
"groupInfo": [
{
"id": "id1",
"name": "abc",
"status": "Approved"
}
// No id2 here as it is in pending state
]
}
}
]
}
}
But instead I am able to achieve:
{
"took": 23,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 3.0602708,
"hits": [
{
"_index": "my_index",
"_type": "request",
"_id": "123",
"_score": 3.0602708,
"_source": {
"basicInfo": {
"requestId": 123
},
"managerInfo": {
"manager": "John"
},
"groupInfo": [
{
"id": "id1",
"name": "abc",
"status": "Approved"
},
{
"id": "id2",
"name": "abc",
"status": "Pending"
}
]
}
}
]
}
}
This is the query I am using:
{
"query": {
"bool": {
"must": [
{
"match": {
"basicInfo.requestId": "123"
}
},
{
"nested": {
"path": "groupInfo",
"query": {
"bool": {
"must": [
{
"term": {
"groupInfo.status": "Approved"
}
}
]
}
}
}
}
]
}
}
}
So, my first question is: is what I am expecting even possible? Can we filter the result and make sure that we get only the matching objects of that array in the result?
If yes, how can we do it?
Thanks in advance.
Maybe you are looking for Inner Hits.
In many cases, it’s very useful to know which inner nested objects (in
the case of nested) or children/parent documents (in the case of
parent/child) caused certain information to be returned. The inner
hits feature can be used for this. This feature returns per search hit
in the search response additional nested hits that caused a search hit
to match in a different scope.
{
"query": {
"bool": {
"must": [
{
"match": {
"basicInfo.requestId": "123"
}
},
{
"nested": {
"path": "groupInfo",
"query": {
"bool": {
"must": [
{
"term": {
"groupInfo.status": "Approved"
}
}
]
}
},
"inner_hits":{}
}
}
]
}
}
}

Limiting Bucket Size Returned By Aggregation

I have a huge amount of Elasticsearch data. I need to run aggregations and return buckets, but I need to limit the size of the data returned from Elasticsearch so that I get only a sample of the data, not all of it.
I have tried adding the "size" attribute, but it is not accepted in bucketing aggregations.
{
"size": 0,
"query": {
"bool": {
"adjust_pure_negative": true,
"boost": 1
}
},
"aggregations": {
"my_agg_1": {
"histogram": {
"field": "coAt",
"interval": 86400,
"offset": 1558216800,
"order": {
"_key": "asc"
},
"keyed": false,
"min_doc_count": 1
},
"aggregations": {
"my_agg_2": {
"terms": {
"field": "atr1",
"missing": "NaN",
"value_type": "string",
"size": 2147483647,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false,
"order": [
{
"_count": "desc"
},
{
"_key": "asc"
}
]
},
"aggregations": {
"atr2": {
"top_hits": {
"from": 0,
"size": 1,
"version": false,
"explain": false,
"sort": [
{
"coAt": {
"order": "desc"
}
}
]
}
},
"clientIP_count": {
"value_count": {
"field": "clientIP"
}
}
}
}
}
}
}
}

Loading Graphson into Titan graph database

I have GraphSON like this:
{
"id": 0,
"label": "Person",
"outE": {
"transfer": [
{
"id": 0,
"inV": "ICIC0000008-805000017",
"properties": {
"amount": "228786690"
}
}
]
},
"properties": {
"name": [
{
"id": 0,
"value": "SBIN0000196-33502796303"
}
]
}
}
I am trying to load this into Titan using the following call: graph.io(IoCore.graphson()).readGraph("test.json")
However, I am getting the error: Invalid Vertex:Null

Elasticsearch group by multiple fields and sum the hours (aggregation)

I am working on an elasticsearch(1.5) query to get all the tasks for users and their respective hours within a time interval. For example, 1st Jan 2016 - 31 Dec 2016.
This is what I have managed to get so far:
{
"query": {
"filtered": {
"query" : {
"bool" : {
"must":
{
"term": {
"userId": [1,2,3,4,5,6]
}
}
}
},
"filter": {
"range": {
"spentOn": {
"gte": "1451606400000", // 1st Jan
"lte": "1483142400000" // 31st Dec
}
}
}
}
},
"size":0,
"aggs": {
"group_by_interval": {
"date_histogram": {
"field": "spentOn",
"interval": "month",
"min_doc_count": 0,
"extended_bounds": {
"min": "1451606400000",
"max": "1483142400000"
}
},
"aggs": {
"group_per_project": {
"histogram": {
"field": "taskId",
"interval": 1
},
"aggs": {
"hours": {
"sum": {
"field": "hours"
}
}
}
}
}
}
}
}
The above query gives me the following result:
{
...
[{
"key_as_string" : "2016-01-01T00:00:00.000Z",
"doc_count" : 10,
"group_per_project" : {
"buckets" : [{
"doc_count" : 1,
"key" : Task A,
"hours_per_taskAssignment" : {
"value" : 5
}
}, {
"doc_count" : 15,
"key" : Task B,
"hours_per_taskAssignment" : {
"value" : 60
}
}, {
"doc_count" : 1,
"key" :Task C,
"hours_per_taskAssignment" : {
"value" : 10
}
}
]
},
"key" : 1451606400000
}, {
"key_as_string" : "2016-02-01T00:00:00.000Z",
"doc_count" : 23,
"group_per_project" : {
"buckets" : [{
"doc_count" : 1,
"key" : Task A,
"hours" : {
"value" : 2
}
}, {
"doc_count" : 20,
"key" : Task B,
"hours" : {
"value" : 180
}
}
]
},
"key" : 1454284800000
}
...
]
However, I need the hours to be grouped and summed per user, instead of a summation of all the users' hours. For example, in January:
{
"doc_count" : 2,
"key" : Task A,
{
"userId": 1
"hours": {"value": 2}
},
{
"userId": 2
"hours": {"value": 5}
}
}
Is there any way I can achieve the above result using Elasticsearch 1.5 without having to loop through each individual user to get the total hours, which would reduce the performance of the application?
Thanks in advance!
"aggs": {
"group_by_interval": {
"date_histogram": {
"field": "spentOn",
"interval": "month",
"min_doc_count": 0,
"extended_bounds": {
"min": "1451606400000",
"max": "1483142400000"
}
},
"aggs": {
"group_per_project": {
"histogram": {
"field": "taskId",
"interval": 1
},
"aggs": {
"per_user": {
"terms": {
"field": "userId"
},
"aggs": {
"hours": {
"sum": {
"field": "hours"
}
}
}
}
}
}
}
}
}

Categories