Related
I've developed a java application that uses an Atlas MongoDB Serverless DB.
This application performs an Aggregation query with the following steps:
$match
$project
$addFields
$sort
$facet
$project
When I perform a query thats returns a lot of results I'm obtaining this exception: QueryExceededMemoryLimitNoDiskUseAllowed.
I've tried to modify my code adding allowDiskUse: true in the aggregation, but didn't resolve the exception.
I've tried to replicate my aggregation pipeline in Atlas console and found that every think works fine until $facet step that returns
Reason: PlanExecutor error during aggregation :: caused by :: Sort exceeded memory limit of 33554432 bytes, but did not opt in to external sorting.
This is my $facet step:
{$facet: {
paginatedResults: [{ $skip: 0 }, { $limit: 50 }],
totalCount: [
{
$count: 'count'
}
]
}
}
As you can see I'm using it to paginate my query results.
Any suggestion to avoid this problem?
I was thinking about making two different query one for the results and one for total count, but I'm not sure this is the best solution.
EDIT: added query
db.vendor_search.aggregate(
{$match: {
$or: [
{'searchKeys.value': {$regex: "vendor"}},
{'searchKeys.value': {$regex: "test"}},
{'searchKeys.valueClean': {$regex: "vendor"}},
{'searchKeys.valueClean': {$regex: "test"}},
],
buyerId: 7
}},
{$project: {
companyId: 1,
buyerId: 1,
companyName: 1,
legalForm: 1,
country: 1,
supplhiCompanyCode: 1,
vat: 1,
erpCode: 1,
visibility: 1,
businessStatus: 1,
city: 1,
logo: 1,
location: {$concat : ["$country.value",'$city']},
searchKeys: {
"$filter": {
"input": "$searchKeys",
"cond": {
"$or": [
{$regexMatch: {input: "$$this.value",regex: "vendor"}},
{$regexMatch: {input: "$$this.value",regex: "test"}}
{$regexMatch: {input: "$$this.valueClean",regex: "vendor"}},
{$regexMatch: {input: "$$this.valueClean",regex: "test"}}
]
}
}
}
}},
{$addFields: {
searchMatching: {
$reduce: {
input: "$searchKeys.type",
initialValue: [],
in: {
$concatArrays: [
"$$value",
{$cond: [{$in: ["$$this", "$$value"]},[],["$$this"]]}
]
}
}
},
'sort.supplhiId': { $toLower: "$supplhiCompanyCode" },
'sort.companyName': { $toLower: "$companyName" },
'sort.location': { $toLower: {$concat : ["$country.value"," ","$city"]}},
'sort.vat': { $toLower: "$vat" },
'sort.companyStatus': { $toLower: "$businessStatus" },
'sort.erpCode': { $toLower: "$erpCode" }
}},
{$sort: {"sort.companyName": 1}},
{$facet: {
paginatedResults: [{ $skip: 0 }, { $limit: 50 }],
totalCount: [
{
$count: 'count'
}
]
}
},
{$project: {paginatedResults:1, 'totalCount': {$first : '$totalCount.count'}}}
)
EDIT: Added model
{
"buyerId": 1,
"companyId": 869048,
"address": "FP8R+52H",
"businessStatus": "AC",
"city": "Chiffa",
"companyName": "Test Algeria 25 agosto",
"country": {
"lookupId": 78,
"code": "DZA",
"value": "Algeria"
},
"erpCode": null,
"legalForm": "Ltd.",
"logo": "fc4d821a-e814-49e4-96d1-f32421fdaa6d_1.jpg",
"searchKeys": [
{
"type": "contact",
"value": "pebiw81522#xitudy.com",
"valueClean": "pebiw81522xitudycom"
},
{
"type": "company_registration_number",
"value": "112211331144",
"valueClean": "112211331144"
},
{
"type": "vendor_name",
"value": "test algeria 25 agosto ltd.",
"valueClean": "test algeria 25 agosto ltd"
},
{
"type": "contact",
"value": "tredicisf2#ottobre2022.com",
"valueClean": "tredicisf2ottobre2022com"
},
{
"type": "contact",
"value": "ty#s.com",
"valueClean": "tyscom"
},
{
"type": "contact",
"value": "info#x.com",
"valueClean": "infoxcom"
},
{
"type": "tin",
"value": "00112341675",
"valueClean": "00112341675"
},
{
"type": "contact",
"value": "hatikog381#rxcay.com",
"valueClean": "hatikog381rxcaycom"
},
{
"type": "supplhi_id",
"value": "100059410",
"valueClean": "100059410"
},
{
"type": "contact",
"value": "tredici#ottobre2022.com",
"valueClean": "trediciottobre2022com"
},
{
"type": "country_key",
"value": "00112341675",
"valueClean": "00112341675"
},
{
"type": "vat",
"value": "00112341675",
"valueClean": "00112341675"
},
{
"type": "address",
"value": "fp8r+52h",
"valueClean": "fp8r52h"
},
{
"type": "city",
"value": "chiffa",
"valueClean": "chiffa"
},
{
"type": "contact",
"value": "prova#supplhi.com",
"valueClean": "provasupplhicom"
},
{
"type": "contact",
"value": "saraxo2669#dmonies.com",
"valueClean": "saraxo2669dmoniescom"
}
],
"supplhiCompanyCode": "100059410",
"vat": "00112341675",
"visibility": true
}
in ATLAS M0 free clusters and M2/M5 shared clusters sort in memory limit is 32 MB. ( ref ) , this limit seems to apply also to serverless
For not limited mongod you may usually increase this limit from 32MB for example to 320MB as follow:
db.adminCommand({setParameter: 1, internalQueryExecMaxBlockingSortBytes: 335544320})
You can check the current value with:
db.runCommand( { getParameter : 1, "internalQueryExecMaxBlockingSortBytes" : 1 } )
But it is best to optimize your queries to not hit this limit , if you post your full query and indexes ( db.collection.getIndexes() perhaps there is a better way ...
I am using Java to perform queries on Elasticsearch, via the ElasticSearchClient. As there are big variables returned, I would like to only retrieve the ones that are relevant but the variables in _source are nested.
Below is a sample index response (multiple indexes can be returned with same _source structure)
[
{
"_index": "kn-tas-20200630",
"_type": "_doc",
"_id": "1122334455",
"_score": null,
"_source": {
"variables": [
{
"rawValue": "DEFH",
"name": "MANAGER"
},
{
"rawValue": "ABCD",
"name": "EMPLOYEE"
},
{
"rawValue": "[{\"rowId\":102030,\"rowType\":\"SIM\"}]",
"name": "extData"
}
]
},
"sort": [
1665735632119
]
}
]
I would like to create a query using SearchSourceBuilder to query ES and only retrieve the following:
Get the rawValue by name (I provide Manager, I get "DFEH")
Get the rowType value (I provide extData + row Type, I get "SIM")
Below is my query:
{
"from": 0,
"size": 100,
"query": {
"bool": {
"must": [
{
"terms": {
"prcKey": [
"K-112"
],
"boost": 1.0
}
}
],
"must_not": [
{
"exists": {
"field": "endDate",
"boost": 1.0
}
},
{
"term": {
"personInCharge": {
"value": "ABC",
"boost": 1.0
}
}
}
],
"adjust_pure_negative": true,
"boost": 1.0
}
},
"_source": {
"includes": [
"variables.name",
"variables.rawValue"
],
"excludes": []
},
"sort": [
{
"createTime": {
"order": "desc"
}
}
]
}
How can I fix my query? I tried using nested queries but without any luck.
I have mapping like this
"custom_metadata": {
"properties": {
"key": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
}
The ingested data looks like this
// data in document 1
"custom_metadata": [
{
"value": "NPL",
"key": "schema"
},
{
"value": "SAPERP",
"key": "system"
}
]
// data in document 2
"custom_metadata": [
{
"value": "trial",
"key": "schema"
},
{
"value": "Oracle",
"key": "system"
}
]
I want to aggregate on each key and get relevant value in search results, like this
"schema": [
{"value": "NPL"},
{ "value": "trial",}
],
"system":[
{"value": "SAPERP"},
{ "value": "Oracle",}
]
Note: Above output is for representation.If I get something like in ES then I can parse and get desired result on service side
What I have tried:
"custom_metadata_key": {
"terms": {
"field": "custom_metadata.key"
},
"aggregations": {
"custom_metadata_value": {
"terms": {
"field": "custom_metadata.value"
}
}
}
}
Above nested agg , aggregates each key and give all the values in results.
{
"key" : "schema",
"doc_count" : 2,
"custom_metadata_value" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "Oracle",
"doc_count" : 2
},
{
"key" : "NPL",
"doc_count" : 1
},
{
"key" : "SAPERP",
"doc_count" : 1
},
{
"key" : "trial",
"doc_count" : 1
}
]
}
}
Above output repeats for all key and gives same aggregation for all.
You need to change data type of field custom_metadata from object to nested and you can achive your desire output easily.
Mapping
{
"mappings": {
"properties": {
"custom_metadata":{
"type": "nested"
}
}
}
}
Query
{
"size": 0,
"aggs": {
"data": {
"nested": {
"path": "custom_metadata"
},
"aggs": {
"custom_metadata_key": {
"terms": {
"field": "custom_metadata.key.keyword",
"size": 10
},
"aggs": {
"custom_metadata_value": {
"terms": {
"field": "custom_metadata.value.keyword",
"size": 10
}
}
}
}
}
}
}
}
Output
"aggregations": {
"data": {
"doc_count": 4,
"custom_metadata_key": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "schema",
"doc_count": 2,
"custom_metadata_value": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "NPL",
"doc_count": 1
},
{
"key": "trial",
"doc_count": 1
}
]
}
},
{
"key": "system",
"doc_count": 2,
"custom_metadata_value": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Oracle",
"doc_count": 1
},
{
"key": "SAPERP",
"doc_count": 1
}
]
}
}
]
}
}
}
I am trying to filter the records based on nested field and want only the matching object in that array to be shown as part of the record.
Below is the detailed explanation of my requirement.
So, I have Elasticsearch data like this:
[{
"basicInfo": {
"requestId": 123,
},
"managerInfo": {
"manager": "John",
},
"groupInfo": [
{
"id": "id1",
"name": "abc",
"status": "Approved"
},
{
"id": "id2",
"name": "abc",
"status": "Pending"
}
]
},
{
"basicInfo": {
"requestId": 233,
},
"managerInfo": {
"manager": "John Sr",
},
"groupInfo": [
{
"id": "id3",
"name": "abc",
"status": "Pending"
}
]
}
]
I want to filter the records only with groupInfo.status as Approved and basicInfo.requestId as 123, but my condition is I should only get the Approved record in the groupInfo and not the pending ones. So, the output I am expecting is:
{
"took": 23,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 3.0602708,
"hits": [
{
"_index": "my_index",
"_type": "request",
"_id": "123",
"_score": 3.0602708,
"_source": {
"basicInfo": {
"requestId": 123
},
"managerInfo": {
"manager": "John"
},
"groupInfo": [
{
"id": "id1",
"name": "abc",
"status": "Approved"
}
// No id2 here as it is in pending state
]
}
}
]
}
}
But instead I am able to achieve:
{
"took": 23,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 3.0602708,
"hits": [
{
"_index": "my_index",
"_type": "request",
"_id": "123",
"_score": 3.0602708,
"_source": {
"basicInfo": {
"requestId": 123
},
"managerInfo": {
"manager": "John"
},
"groupInfo": [
{
"id": "id1",
"name": "abc",
"status": "Approved"
},
{
"id": "id2",
"name": "abc",
"status": "Pending"
}
]
}
}
]
}
}
This is the query I am using:
{
"query": {
"bool": {
"must": [
{
"match": {
"basicInfo.requestId": "123"
}
},
{
"nested": {
"path": "groupInfo",
"query": {
"bool": {
"must": [
{
"term": {
"groupInfo.status": "Approved"
}
}
]
}
}
}
}
]
}
}
}
So, my question is first what I am expecting, is that even possible? Can we filter the result and make sure that we get only the matched array from that result?
If yes, how can we do it?
Thanks in advance.
Maybe you are looking for Inner Hits.
In many cases, it’s very useful to know which inner nested objects (in
the case of nested) or children/parent documents (in the case of
parent/child) caused certain information to be returned. The inner
hits feature can be used for this. This feature returns per search hit
in the search response additional nested hits that caused a search hit
to match in a different scope.
{
"query": {
"bool": {
"must": [
{
"match": {
"basicInfo.requestId": "123"
}
},
{
"nested": {
"path": "groupInfo",
"query": {
"bool": {
"must": [
{
"term": {
"groupInfo.status": "Approved"
}
}
]
}
},
"inner_hits":{}
}
}
]
}
}
}
I have a JSON
{
"Id": "xxx",
"Type": "Transaction.Create",
"Payload": {
"result": 2,
"description": "Pending",
"body": {
"redirect": {
"url": "xxx",
"fields": {
"MD": "8a829449620619e80162252adeb66a39"
}
},
"card": {
"expiryMonth": "1",
"expiryYear": "2033"
},
"order": {
"amount": 1
}
}
}
}
And I want to remove the card info of it like this:
{
"Id": "xxx",
"Type": "Transaction.Create",
"Payload": {
"result": 2,
"description": "Pending",
"body": {
"redirect": {
"url": "xxx",
"fields": {
"MD": "8a829449620619e80162252adeb66a39"
}
},
"order": {
"amount": 1
}
}
}
}
How can I do this with Apache velocity?
What works is:
#set($content = $util.urlEncode($input.json('$')))
#set($new = $content.replaceAll("2033","2055"))
Action=SendMessage&MessageBody={"body": "$new","Event-Signature": "$util.urlEncode($input.params('Event-Signature'))"}
This gives me
{
"Id": "xxx",
"Type": "Transaction.Create",
"Payload": {
"result": 2,
"description": "Pending",
"body": {
"redirect": {
"url": "xxx",
"fields": {
"MD": "8a829449620619e80162252adeb66a39"
}
},
"card": {
"expiryMonth": "1",
"expiryYear": "2050"
},
"order": {
"amount": 1
}
}
}
}
But now I want to remove the card part but it does not work:
#set($content = $util.urlEncode($input.json('$')))
#set($new = $content.delete("$.Payload.body.card"))
Action=SendMessage&MessageBody={"body": "$new","Event-Signature": "$util.urlEncode($input.params('Event-Signature'))"}
what am I doing wrong?
Main goal is transform a mapping template in API Gateway for a webhook. The webhook contains to many information and we want to remove some part of the JSON POST call.
Try using the below
#set($dummy=$content.Payload.remove("card"))