Elasticsearch group by multiple fields and sum the hours (aggregation) - java

I am working on an Elasticsearch (1.5) query to get all the tasks for a set of users, along with their respective hours, within a time interval, for example 1st Jan 2016 to 31st Dec 2016.
This is what I have managed to get so far:
{
  "query": {
    "filtered": {
      "query": {
        "bool": {
          "must": {
            "terms": {
              "userId": [1, 2, 3, 4, 5, 6]
            }
          }
        }
      },
      "filter": {
        "range": {
          "spentOn": {
            "gte": "1451606400000", // 1st Jan
            "lte": "1483142400000"  // 31st Dec
          }
        }
      }
    }
  },
  "size": 0,
  "aggs": {
    "group_by_interval": {
      "date_histogram": {
        "field": "spentOn",
        "interval": "month",
        "min_doc_count": 0,
        "extended_bounds": {
          "min": "1451606400000",
          "max": "1483142400000"
        }
      },
      "aggs": {
        "group_per_project": {
          "histogram": {
            "field": "taskId",
            "interval": 1
          },
          "aggs": {
            "hours": {
              "sum": {
                "field": "hours"
              }
            }
          }
        }
      }
    }
  }
}
The above query gives me the following result:
{
  ...
  [{
      "key_as_string" : "2016-01-01T00:00:00.000Z",
      "doc_count" : 10,
      "group_per_project" : {
        "buckets" : [{
            "doc_count" : 1,
            "key" : "Task A",
            "hours" : {
              "value" : 5
            }
          }, {
            "doc_count" : 15,
            "key" : "Task B",
            "hours" : {
              "value" : 60
            }
          }, {
            "doc_count" : 1,
            "key" : "Task C",
            "hours" : {
              "value" : 10
            }
          }
        ]
      },
      "key" : 1451606400000
    }, {
      "key_as_string" : "2016-02-01T00:00:00.000Z",
      "doc_count" : 23,
      "group_per_project" : {
        "buckets" : [{
            "doc_count" : 1,
            "key" : "Task A",
            "hours" : {
              "value" : 2
            }
          }, {
            "doc_count" : 20,
            "key" : "Task B",
            "hours" : {
              "value" : 180
            }
          }
        ]
      },
      "key" : 1454284800000
    }
  ...
]
However, I need the hours to be grouped and summed per user, rather than as a single sum across all users. For example, in January:
{
  "doc_count" : 2,
  "key" : "Task A",
  "per_user" : [
    {
      "userId" : 1,
      "hours" : { "value" : 2 }
    },
    {
      "userId" : 2,
      "hours" : { "value" : 5 }
    }
  ]
}
Is there any way to achieve the above result in Elasticsearch 1.5 without looping over each individual user to get their total hours, which would hurt the application's performance?
Thanks in advance!

"aggs": {
"group_by_interval": {
"date_histogram": {
"field": "spentOn",
"interval": "month",
"min_doc_count": 0,
"extended_bounds": {
"min": "1451606400000",
"max": "1483142400000"
}
},
"aggs": {
"group_per_project": {
"histogram": {
"field": "taskId",
"interval": 1
},
"aggs": {
"per_user": {
"terms": {
"field": "userId"
},
"aggs": {
"hours": {
"sum": {
"field": "hours"
}
}
}
}
}
}
}
}
}
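Since the question is tagged java, here is a rough sketch of the same request built with the Elasticsearch high-level REST client's AggregationBuilders. This is only an illustration: the original question targets ES 1.5, whose Java API differs, the index name "timesheet" and the client variable are assumptions, and extended_bounds is omitted for brevity.

import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;
import org.elasticsearch.search.builder.SearchSourceBuilder;

// Month buckets -> task buckets -> per-user buckets -> sum of hours
SearchSourceBuilder source = new SearchSourceBuilder()
    .size(0)
    .query(QueryBuilders.boolQuery()
        .must(QueryBuilders.termsQuery("userId", 1, 2, 3, 4, 5, 6))
        .filter(QueryBuilders.rangeQuery("spentOn")
            .gte(1451606400000L)   // 1st Jan
            .lte(1483142400000L))) // 31st Dec
    .aggregation(
        AggregationBuilders.dateHistogram("group_by_interval")
            .field("spentOn")
            .calendarInterval(DateHistogramInterval.MONTH)
            .minDocCount(0)
            .subAggregation(
                AggregationBuilders.histogram("group_per_project")
                    .field("taskId")
                    .interval(1)
                    .subAggregation(
                        AggregationBuilders.terms("per_user")
                            .field("userId")
                            .subAggregation(
                                AggregationBuilders.sum("hours").field("hours")))));

// "client" is assumed to be an existing RestHighLevelClient
SearchResponse response = client.search(
    new SearchRequest("timesheet").source(source), RequestOptions.DEFAULT);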

Related

Getting Unique results for Objects inside List in Elasticsearch

I have a mapping like this:
"custom_metadata": {
  "properties": {
    "key": {
      "type": "keyword"
    },
    "value": {
      "type": "keyword"
    }
  }
}
The ingested data looks like this:
// data in document 1
"custom_metadata": [
  {
    "value": "NPL",
    "key": "schema"
  },
  {
    "value": "SAPERP",
    "key": "system"
  }
]
// data in document 2
"custom_metadata": [
  {
    "value": "trial",
    "key": "schema"
  },
  {
    "value": "Oracle",
    "key": "system"
  }
]
I want to aggregate on each key and get the relevant values in the search results, like this:
"schema": [
  { "value": "NPL" },
  { "value": "trial" }
],
"system": [
  { "value": "SAPERP" },
  { "value": "Oracle" }
]
Note: the above output is just a representation. If I can get something close to this from ES, I can parse it into the desired result on the service side.
What I have tried:
"custom_metadata_key": {
  "terms": {
    "field": "custom_metadata.key"
  },
  "aggregations": {
    "custom_metadata_value": {
      "terms": {
        "field": "custom_metadata.value"
      }
    }
  }
}
The above nested agg aggregates on each key, but it returns all values under every key in the results:
{
"key" : "schema",
"doc_count" : 2,
"custom_metadata_value" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Oracle",
"doc_count" : 2
},
{
"key" : "NPL",
"doc_count" : 1
},
{
"key" : "SAPERP",
"doc_count" : 1
},
{
"key" : "trial",
"doc_count" : 1
}
]
}
}
The above output repeats for every key and gives the same value aggregation for all of them.
You need to change the data type of the custom_metadata field from object to nested, and then you can easily achieve your desired output.
Mapping
{
  "mappings": {
    "properties": {
      "custom_metadata": {
        "type": "nested"
      }
    }
  }
}
Query
{
  "size": 0,
  "aggs": {
    "data": {
      "nested": {
        "path": "custom_metadata"
      },
      "aggs": {
        "custom_metadata_key": {
          "terms": {
            "field": "custom_metadata.key.keyword",
            "size": 10
          },
          "aggs": {
            "custom_metadata_value": {
              "terms": {
                "field": "custom_metadata.value.keyword",
                "size": 10
              }
            }
          }
        }
      }
    }
  }
}
Output
"aggregations": {
"data": {
"doc_count": 4,
"custom_metadata_key": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "schema",
"doc_count": 2,
"custom_metadata_value": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "NPL",
"doc_count": 1
},
{
"key": "trial",
"doc_count": 1
}
]
}
},
{
"key": "system",
"doc_count": 2,
"custom_metadata_value": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Oracle",
"doc_count": 1
},
{
"key": "SAPERP",
"doc_count": 1
}
]
}
}
]
}
}
}
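If the same request needs to be built from Java, a rough equivalent using the high-level REST client's AggregationBuilders could look like the sketch below; this is only an illustration, and attaching it to a SearchRequest and client is assumed to happen elsewhere.

import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;

// nested("data", "custom_metadata") -> terms on key -> terms on value
SearchSourceBuilder source = new SearchSourceBuilder()
    .size(0)
    .aggregation(
        AggregationBuilders.nested("data", "custom_metadata")
            .subAggregation(
                AggregationBuilders.terms("custom_metadata_key")
                    .field("custom_metadata.key.keyword")
                    .size(10)
                    .subAggregation(
                        AggregationBuilders.terms("custom_metadata_value")
                            .field("custom_metadata.value.keyword")
                            .size(10))));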

Elasticsearch: Java Client for Composite Aggregation using Terms queries

I have created a composite query that aggregates on 2 different attributes, as below:
{
"from": 0,
"size": 0,
"query": {
"bool": {
"must": [
{
"nested": {
"query": {
"script": {
"script": {
"source": "params.territoryIds.contains(doc['territoryHierarchy.id'].value) ",
"lang": "painless",
"params": {
"territoryIds": [
12345678
]
}
},
"boost": 1.0
}
},
"path": "territoryHierarchy",
"ignore_unmapped": false,
"score_mode": "none",
"boost": 1.0
}
},
{
"bool": {
"should": [
{
"nested": {
"query": {
"script": {
"script": {
"source": "doc['forecastHeaders.id'].value == params.id && doc['forecastHeaders.revenueCategory'].value == params.revenueCategory ",
"lang": "painless",
"params": {
"revenueCategory": 0,
"id": 987654321
}
},
"boost": 1.0
}
},
"path": "forecastHeaders",
"ignore_unmapped": false,
"score_mode": "none",
"boost": 1.0
}
},
{
"nested": {
"query": {
"script": {
"script": {
"source": "doc['forecastHeaders.id'].value == params.id && doc['forecastHeaders.revenueCategory'].value == params.revenueCategory ",
"lang": "painless",
"params": {
"revenueCategory": 0,
"id": 987654321
}
},
"boost": 1.0
}
},
"path": "forecastHeaders",
"ignore_unmapped": false,
"score_mode": "none",
"boost": 1.0
}
}
],
"adjust_pure_negative": true,
"boost": 1.0
}
},
{
"terms": {
"revnWinProbability": [
40,
50
],
"boost": 1.0
}
},
{
"terms": {
"revenueStatus.keyword": [
"OPEN"
],
"boost": 1.0
}
},
{
"range": {
"recordUpdateTime":{
"gte":1655117440000
}
}
}
],
"adjust_pure_negative": true,
"boost": 1.0
}
},
"version": true,
"aggregations": {
"TopLevelAggregation": {
"composite" : {
"size" : 10000,
"sources" : [
{
"directs": {
"terms": {
"script": {
"source": "def territoryNamesList = new ArrayList(); def name; def thLength = params._source.territoryHierarchy.length; for(int i = 0; i< thLength;i++) { def thRecord = params._source.territoryHierarchy[i]; if (params.territoryIds.contains(thRecord.id) && i+params.levelToReturn < thLength) { territoryNamesList.add(params._source.territoryHierarchy[i+params.levelToReturn].name);} } return territoryNamesList;",
"lang": "painless",
"params": {
"territoryIds": [
12345678
],
"levelToReturn": 1
}
}
}
}
},
{
"qtr" : {
"terms" : {
"field" : "quarter.keyword",
"missing_bucket" : false,
"order" : "asc"
}
}
}
]
},
"aggregations": {
"revnRevenueAmount": {
"sum": {
"script": {
"source": "doc['revenueTypeCategory.keyword'].value != 'Other' ? doc['revnRevenueAmount']:doc['revnRevenueAmount']",
"lang": "painless"
},
"value_type": "long"
}
}
}
}
}
}
So this query does a composite aggregation based on two different terms aggregations, directs and qtr, and it works fine.
Now I am trying to create a corresponding Spring Data Java client implementation for it, so I have created the code below:
BoolQueryBuilder baseQueryBuilder = getQueryBuilder(searchCriteria);
List<TermsAggregationBuilder> aggregationBuilders = getMultiBaseAggregationBuilders(searchCriteria, baseQueryBuilder);
Here the baseQueryBuilder supplies the bool query shown above, and the getMultiBaseAggregationBuilders method returns the two terms aggregations from the query, directs and qtr. However, I cannot find any API to pass this list of terms aggregations to the composite aggregation builder. I would be really grateful for a pointer on how this list of terms aggregations can be used inside the composite aggregation builder, so the Java code produces the same result as the Elasticsearch query above. Thanks in advance.
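As far as I can tell from the standard high-level REST client API (the question does not confirm which client version is in use), the sources of a composite aggregation are not TermsAggregationBuilder objects but TermsValuesSourceBuilder objects (a CompositeValuesSourceBuilder subtype), and the whole list is passed to the CompositeAggregationBuilder constructor. A rough sketch under that assumption, with the painless script abbreviated and the sum sub-aggregation simplified to a plain field:

import java.util.Arrays;
import java.util.List;

import org.elasticsearch.script.Script;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.composite.CompositeAggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.composite.CompositeValuesSourceBuilder;
import org.elasticsearch.search.aggregations.bucket.composite.TermsValuesSourceBuilder;

// Each composite "source" is a TermsValuesSourceBuilder, not a TermsAggregationBuilder
CompositeValuesSourceBuilder<?> directs = new TermsValuesSourceBuilder("directs")
    .script(new Script("/* painless script returning the territory names */"));

CompositeValuesSourceBuilder<?> qtr = new TermsValuesSourceBuilder("qtr")
    .field("quarter.keyword")
    .missingBucket(false);

List<CompositeValuesSourceBuilder<?>> sources = Arrays.asList(directs, qtr);

CompositeAggregationBuilder topLevel =
    new CompositeAggregationBuilder("TopLevelAggregation", sources)
        .size(10000)
        .subAggregation(
            AggregationBuilders.sum("revnRevenueAmount").field("revnRevenueAmount"));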

How to get single field in mongodb query?

I have data like this:
{ id : 1,
book: "Flash",
chapters: [
{
chap_no: "1",
sub_chapter: [
{sub_no: 1, description: "<description>"
},
{sub_no: 2, description: "<description>"
},
]
}
]
}
I want to return just one sub-document like this, based on book -> chap_no -> sub_no, in a MongoDB query:
{
  sub_no: 2, description: "<description>"
}
One option is an aggregation pipeline with the stages $match, $unwind, $unwind, $match and $replaceRoot:
db.collection.aggregate([
{
"$match": {
"chapters.sub_chapter.sub_no": 2
}
},
{
"$unwind": "$chapters"
},
{
"$unwind": "$chapters.sub_chapter"
},
{
"$match": {
"chapters.sub_chapter.sub_no": 2
}
},
{
"$replaceRoot": {
"newRoot": "$chapters.sub_chapter"
}
}
])
mongoplayground
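For reference, the same pipeline can also be built programmatically, for example with the MongoDB Java driver. This is only a sketch; the connection string, database name and collection name are assumptions:

import java.util.Arrays;

import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoClients;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.model.Aggregates;
import com.mongodb.client.model.Filters;
import org.bson.Document;

// Same stages as above: $match -> $unwind -> $unwind -> $match -> $replaceRoot
try (MongoClient client = MongoClients.create("mongodb://localhost:27017")) {
    MongoCollection<Document> books = client.getDatabase("test").getCollection("books");

    books.aggregate(Arrays.asList(
        Aggregates.match(Filters.eq("chapters.sub_chapter.sub_no", 2)),
        Aggregates.unwind("$chapters"),
        Aggregates.unwind("$chapters.sub_chapter"),
        Aggregates.match(Filters.eq("chapters.sub_chapter.sub_no", 2)),
        Aggregates.replaceRoot("$chapters.sub_chapter")
    )).forEach(doc -> System.out.println(doc.toJson()));
}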
You can also do it like this, matching on the book, chapter and sub-chapter:
db.collection.aggregate([
{
"$match": {
$and: [
{
"book": "Flash3"
},
{
"chapters.chap_no": "2"
},
{
"chapters.sub_chapter.sub_no": "1"
}
]
}
},
{
"$unwind": "$chapters"
},
{
"$unwind": "$chapters.sub_chapter"
},
{
"$match": {
$and: [
{
"book": "Flash3"
},
{
"chapters.chap_no": "2"
},
{
"chapters.sub_chapter.sub_no": "1"
}
]
}
},
{
"$replaceRoot": {
"newRoot": "$chapters.sub_chapter"
}
}
])

Elasticsearch: order by length of array properties

I'm using Kibana and ES. I have an index of objects with an array called "reviews", whose elements have properties such as positive_comment, negative_comment, reviewer_name and more.
Some reviews have the positive_comment field empty (not null, just empty).
I need to order by the length of the positive_comment field, so that reviews with an empty positive_comment come last; the results should match ordering in SQL by LEN().
This is my query.
I also tried .value.size() in the script, and type "string", but got no results.
{
"_source":[
"reviews.positive_comment"
],
"query":{
"bool":{
"filter":[
{
"term":{
"id":214
}
}
]
}
},
"sort":{
"_script":{
"script":"doc['reviews.positive_comment'].value.length()",
"type":"number",
"order":"asc"
}
}
}
This is my result:
{
"_source":{
"reviews":[
{
"positive_comment":"Great"
},
{
"positive_comment":"Really good product"
},
{
"positive_comment":""
},
{
"positive_comment":""
},
{
"positive_comment":""
},
{
"positive_comment":""
},
{
"positive_comment":""
},
{
"positive_comment":""
},
{
"positive_comment":""
},
{
"positive_comment":""
},
{
"positive_comment":""
},
{
"positive_comment":""
},
{
"positive_comment":""
},
{
"positive_comment":""
},
{
"positive_comment":""
},
{
"positive_comment":"Awesome"
}
]
},
"sort":[
"0"
]
}
Elasticsearch doesn't support counting array elements, unless you use a script.
However, running a script for every matching document is going to degrade performance for every search query.
A better solution would be to count the values once, at index-time, and store the counts in dedicated fields (positive_comments_count, negative_comments_count, etc.) and use these fields for sorting.
Try the following. It worked for me on ES 5.6.3, so it should work on higher versions too.
GET test/test/_search?filter_path=hits.hits
{
"query": {
"match_all": {}
},
"sort": {
"_script": {
"type": "number",
"script": {
"lang": "painless",
"source": "doc['reviews.positive_comment'].value.length()"
},
"order": "asc"
}
}
}
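If the request has to be sent from Java, the script sort above could be expressed with the high-level REST client roughly as follows. This is a sketch only; the surrounding request/client handling is assumed to exist elsewhere:

import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.script.Script;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.sort.ScriptSortBuilder.ScriptSortType;
import org.elasticsearch.search.sort.SortBuilders;
import org.elasticsearch.search.sort.SortOrder;

// Script sort on the length of reviews.positive_comment, ascending
SearchSourceBuilder source = new SearchSourceBuilder()
    .query(QueryBuilders.matchAllQuery())
    .fetchSource("reviews.positive_comment", null)
    .sort(SortBuilders.scriptSort(
            new Script("doc['reviews.positive_comment'].value.length()"),
            ScriptSortType.NUMBER)
        .order(SortOrder.ASC));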
I have tested this on ES 7.1 too.
Mapping
PUT test
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 1
},
"mappings": {
"properties": {
"somefield": {
"type": "keyword"
},
"reviews": {
"properties": {
"positive_comment": {
"type": "keyword"
},
"item_id": {
"type": "double"
}
}
}
}
}
}
Query:
GET test/_search
{
"query": {
"match_all": {}
},
"_source": "reviews.positive_comment",
"sort": {
"_script": {
"type": "number",
"script": {
"lang": "painless",
"source": "doc['reviews.positive_comment'].value.length() % 100"
},
"order": "asc"
}
}
}
Output:
{
"hits" : {
"total" : {
"value" : 5,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "test",
"_type" : "_doc",
"_id" : "test5",
"_score" : null,
"_source" : {
"reviews" : [
{
"positive_comment" : ""
}
]
},
"sort" : [
0.0
]
},
{
"_index" : "test",
"_type" : "_doc",
"_id" : "test1",
"_score" : null,
"_source" : {
"reviews" : [
{
"positive_comment" : "Awesome"
}
]
},
"sort" : [
7.0
]
},
{
"_index" : "test",
"_type" : "_doc",
"_id" : "test3",
"_score" : null,
"_source" : {
"reviews" : [
{
"positive_comment" : "What a product"
}
]
},
"sort" : [
14.0
]
},
{
"_index" : "test",
"_type" : "_doc",
"_id" : "test2",
"_score" : null,
"_source" : {
"reviews" : [
{
"positive_comment" : "What a product.. amazing"
}
]
},
"sort" : [
24.0
]
},
{
"_index" : "test",
"_type" : "_doc",
"_id" : "test4",
"_score" : null,
"_source" : {
"reviews" : [
{
"positive_comment" : "Thats a great product.. "
}
]
},
"sort" : [
24.0
]
}
]
}
}

elasticsearch search by special character

I have a set of phrases such as [remix], [18+], etc. How can I search by a single character, for example "[", to find all of these variants?
Right now I have the following analyzer config:
{
  "analysis": {
    "analyzer": {
      "bigram_analyzer": {
        "type": "custom",
        "tokenizer": "keyword",
        "filter": [
          "lowercase",
          "bigram_filter"
        ]
      },
      "full_text_analyzer": {
        "type": "custom",
        "tokenizer": "ngram_tokenizer",
        "filter": [
          "lowercase"
        ]
      }
    },
    "filter": {
      "bigram_filter": {
        "type": "edge_ngram",
        "max_gram": 2
      }
    },
    "tokenizer": {
      "ngram_tokenizer": {
        "type": "ngram",
        "min_gram": 3,
        "max_gram": 3,
        "token_chars": [
          "letter",
          "digit",
          "symbol",
          "punctuation"
        ]
      }
    }
  }
}
Mapping is done at the Java entity level using the Spring Boot Data Elasticsearch starter.
If I understand your problem correctly, you want to implement an autocomplete analyzer that returns any term starting with "[" (or any other character). To do so you can create a custom analyzer using edge-ngram autocomplete. Here is an example:
Here is the testing index:
PUT /testing-index-v3
{
"settings": {
"number_of_shards": 1,
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 15
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"lowercase",
"autocomplete_filter"
]
}
}
}
},
"mappings": {
"properties": {
"term": {
"type": "text",
"analyzer": "autocomplete"
}
}
}
}
Here is the documents input:
POST /testing-index-v3/_doc
{
"term": "[+18]"
}
POST testing-index-v3/_doc
{
"term": "[remix]"
}
POST testing-index-v3/_doc
{
"term": "test"
}
And finally our search:
GET testing-index-v3/_search
{
"query": {
"match": {
"term": {
"query": "[remi",
"analyzer": "keyword",
"fuzziness": 0
}
}
}
}
As you can see, I chose the keyword tokenizer together with the autocomplete edge_ngram filter. With min_gram: 1 and max_gram: 15, each indexed term is split into prefix tokens like this:
input-query = i, in, inp, inpu, input, and so on, up to 15 characters. This is wanted only at indexing time. In the match query we specify the keyword analyzer as well; that analyzer is used at search time, so the query text is matched as a single token against the indexed prefixes. Here are some example searches and results:
GET testing-index-v3/_search
{
"query": {
"match": {
"term": {
"query": "[",
"analyzer": "keyword",
"fuzziness": 0
}
}
}
}
result:
"hits" : [
{
"_index" : "testing-index-v3",
"_type" : "_doc",
"_id" : "w5c_IHsBGGZ-oIJIi-6n",
"_score" : 0.7040055,
"_source" : {
"term" : "[remix]"
}
},
{
"_index" : "testing-index-v3",
"_type" : "_doc",
"_id" : "xJc_IHsBGGZ-oIJIju7m",
"_score" : 0.7040055,
"_source" : {
"term" : "[+18]"
}
}
]
GET testing-index-v3/_search
{
"query": {
"match": {
"term": {
"query": "[+",
"analyzer": "keyword",
"fuzziness": 0
}
}
}
}
result:
"hits" : [
{
"_index" : "testing-index-v3",
"_type" : "_doc",
"_id" : "xJc_IHsBGGZ-oIJIju7m",
"_score" : 0.7040055,
"_source" : {
"term" : "[+18]"
}
}
]
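Since the question mentions that mapping is done at the Java entity level with Spring Data Elasticsearch, the custom analyzer could be wired up roughly as below. This is a hedged sketch: the settings file path, class name and field name are made up for illustration, and the "analysis" JSON shown above would go into that settings file.

import org.springframework.data.annotation.Id;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldType;
import org.springframework.data.elasticsearch.annotations.Setting;

// Hypothetical entity; the settings JSON lives at src/main/resources/es/autocomplete-settings.json
@Document(indexName = "testing-index-v3")
@Setting(settingPath = "es/autocomplete-settings.json")
public class Term {

    @Id
    private String id;

    // Indexed with the custom "autocomplete" analyzer, searched with "keyword"
    @Field(type = FieldType.Text, analyzer = "autocomplete", searchAnalyzer = "keyword")
    private String term;

    // getters and setters omitted
}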
Hope this answer helps you. Good luck with your adventures with elasticsearch!
