Operational Defect Database

BugZero found this defect 17 days ago.

MongoDB | 2660493

smoketest timeseries $emit processor hit "Invalid checkpoint. Checkpoint has 7 operators, OperatorDag has 8 operators" after restore

Last update date:

5/2/2024

Affected products:

MongoDB Server

Affected releases:

No affected releases provided.

Fixed releases:

8.1.0-rc0

Description:

Info

The smoke tests have run into this in prod; no customers have run into it: https://splunk.corp.mongodb.com/en-US/app/cloud/search?earliest=1714572000&latest=1714659996&q=search%20index%3Dmhouse%20%22error%20response%20from%20mstream%22%20%22Invalid%20checkpoint.%20Checkpoint%20has%22%20%7C%20stats%20count%20by%20tenantID&display.page.search.mode=smart&dispatch.sample_ratio=1&display.page.search.tab=statistics&display.general.type=statistics&sid=1714660838.2907128 This only seems to happen to processors created as part of TestStreamProcessorWithLargePushAccumulator, with pipelines like the one below: {"pipeline":[ {"$source":{"connectionName":"StreamsAtlasConnection","db":"smoke_test_db_c49f0f12","coll":"change_stream_coll_c49f0f12","timeField":{"$toDate":"$timestamp"},"config":{"fullDocument":"required","fullDocumentOnly":true}}}, {"$tumblingWindow":{"interval":{"size":{"$numberInt":"1"},"unit":"second"},"pipeline":[ {"$project":{"docSize":{"$numberInt":"1"},"seed":{"$numberInt":"1"},"timestamp":{"$numberInt":"1"},"value":{"$range":[{"$numberInt":"0"},"$docCount"]}}}, {"$unwind":"$value"},{"$project":{"seed":{"$numberInt":"1"},"timestamp":{"$numberInt":"1"},"bigValue":{"$range":[{"$numberInt":"0"},"$docSize"]}}}, {"$project":{"bigStr":{"$reduce":{"input":"$bigValue","initialValue":{"$toString":{"$rand":{}}},"in":{"$concat":["$$value","$seed"]}}},"timestamp":{"$numberInt":"1"}}}, {"$group":{"_id":"$_id","bigArr":{"$push":"$bigStr"},"timestamp":{"$max":"$timestamp"}}}, {"$unwind":"$bigArr"}]}}, {"$emit":{"connectionName":"StreamsAtlasConnection","db":"smoke_test_db_c49f0f12","coll":"merge_coll_c49f0f12","timeseries":{"timeField":"timestamp"}}} ]} Spot checking one repro (splunk): 6633914d52c42db5fcb280e0 Stop flow fails because the pod is concurrently shut down, due to a known concurrent start/stop issue: https://jira.mongodb.org/browse/MHOUSE-10684 Last checkpoint uploaded: 651f3bac0302051bf2c6f664/6633914d52c42db5fcb280e0/1714655639293/MANIFEST 
/tmp/checkpoints/upload/651f3bac0302051bf2c6f664/6633914d52c42db5fcb280e0/1714655639293/ The manifest is: bash-3.2$ tail -c +5 MANIFEST >> out.manifest bash-3.2$ bsondump out.manifest {"version":{"$numberInt":"1"},"metadata":{"tenantId":"651f3bac0302051bf2c6f664","streamProcessorId":"6633914d52c42db5fcb280e0","checkpointId":{"$numberLong":"1714655639293"},"checkpointStartTime":{"$date":{"$numberLong":"1714655639293"}},"checkpointEndTime":{"$date":{"$numberLong":"1714655639293"}},"checkpointSizeBytes":{"$numberLong":"417"},"hostName":"streams-spp-966bdf65-h67cp","userPipeline":[{"$source":{"connectionName":"StreamsAtlasConnection","db":"smoke_test_db_c49f0f12","coll":"change_stream_coll_c49f0f12","timeField":{"$toDate":"$timestamp"},"config":{"fullDocument":"required","fullDocumentOnly":true}}},{"$tumblingWindow":{"interval":{"size":{"$numberInt":"1"},"unit":"second"},"pipeline":[{"$project":{"docSize":{"$numberInt":"1"},"seed":{"$numberInt":"1"},"timestamp":{"$numberInt":"1"},"value":{"$range":[{"$numberInt":"0"},"$docCount"]}}},{"$unwind":"$value"},{"$project":{"seed":{"$numberInt":"1"},"timestamp":{"$numberInt":"1"},"bigValue":{"$range":[{"$numberInt":"0"},"$docSize"]}}},{"$project":{"bigStr":{"$reduce":{"input":"$bigValue","initialValue":{"$toString":{"$rand":{}}},"in":{"$concat":["$$value","$seed"]}}},"timestamp":{"$numberInt":"1"}}},{"$group":{"_id":"$_id","bigArr":{"$push":"$bigStr"},"timestamp":{"$max":"$timestamp"}}},{"$unwind":"$bigArr"}]}},{"$emit":{"connectionName":"StreamsAtlasConnection","db":"smoke_test_db_c49f0f12","coll":"merge_coll_c49f0f12","timeseries":{"timeField":"timestamp"}}}],"operatorStats": [ {"operatorId":{"$numberInt":"0"},"stats":{"name":"ChangeStreamConsumerOperator","inputDocs":{"$numberLong":"2"},"inputBytes":{"$numberLong":"8936"},"outputDocs":{"$numberLong":"2"},"outputBytes":{"$numberLong":"0"},"dlqDocs":{"$numberLong":"0"},"dlqBytes":{"$numberLong":"0"},"stateSize":{"$numberLong":"0"}}}, 
{"operatorId":{"$numberInt":"1"},"stats":{"name":"ProjectOperator","inputDocs":{"$numberLong":"2"},"inputBytes":{"$numberLong":"0"},"outputDocs":{"$numberLong":"2"},"outputBytes":{"$numberLong":"0"},"dlqDocs":{"$numberLong":"0"},"dlqBytes":{"$numberLong":"0"},"stateSize":{"$numberLong":"0"}}}, {"operatorId":{"$numberInt":"2"},"stats":{"name":"UnwindOperator","inputDocs":{"$numberLong":"2"},"inputBytes":{"$numberLong":"0"},"outputDocs":{"$numberLong":"101"},"outputBytes":{"$numberLong":"0"},"dlqDocs":{"$numberLong":"0"},"dlqBytes":{"$numberLong":"0"},"stateSize":{"$numberLong":"0"}}}, {"operatorId":{"$numberInt":"3"},"stats":{"name":"ProjectOperator","inputDocs":{"$numberLong":"101"},"inputBytes":{"$numberLong":"0"},"outputDocs":{"$numberLong":"101"},"outputBytes":{"$numberLong":"0"},"dlqDocs":{"$numberLong":"0"},"dlqBytes":{"$numberLong":"0"},"stateSize":{"$numberLong":"0"}}}, {"operatorId":{"$numberInt":"4"},"stats":{"name":"ProjectOperator","inputDocs":{"$numberLong":"101"},"inputBytes":{"$numberLong":"0"},"outputDocs":{"$numberLong":"101"},"outputBytes":{"$numberLong":"0"},"dlqDocs":{"$numberLong":"0"},"dlqBytes":{"$numberLong":"0"},"stateSize":{"$numberLong":"0"}}}, {"operatorId":{"$numberInt":"5"},"stats":{"name":"GroupOperator","inputDocs":{"$numberLong":"101"},"inputBytes":{"$numberLong":"0"},"outputDocs":{"$numberLong":"0"},"outputBytes":{"$numberLong":"0"},"dlqDocs":{"$numberLong":"1"},"dlqBytes":{"$numberLong":"258"},"stateSize":{"$numberLong":"4316"}}}, {"operatorId":{"$numberInt":"6"},"stats":{"name":"UnwindOperator","inputDocs":{"$numberLong":"0"},"inputBytes":{"$numberLong":"0"},"outputDocs":{"$numberLong":"0"},"outputBytes":{"$numberLong":"0"},"dlqDocs":{"$numberLong":"0"},"dlqBytes":{"$numberLong":"0"},"stateSize":{"$numberLong":"0"}}} ] }, 
"checkpointFileList":{"checksumAlgo":"murmur3","files":[{"name":"state00000.bin","checksum":{"$numberLong":"726024177"}}]},"operatorCheckpointFileRanges":[{"opid":{"$numberInt":"0"},"fileRanges":[{"file":"state00000.bin","begin":{"$numberInt":"0"},"end":{"$numberInt":"103"}}]},{"opid":{"$numberInt":"5"},"fileRanges":[{"file":"state00000.bin","begin":{"$numberInt":"103"},"end":{"$numberInt":"4352"}}]}]} On restart with this checkpoint mongostream throws: { [-] _p: F clientHostname: streams-spm-6d45765594-jm5bf correlationID: 17cbae38667df4018bd0809a endpoint: /com.xgen.mhouse.services.spagent.grpc.v1.StreamExecutionService/StartStream error: (InternalError) Invalid checkpoint. Checkpoint has 7 operators, OperatorDag has 8 gitVersion: 0e2abf4b6df46b9c9028fda691e60e2806680e76 internalTimestamp: 2024-05-02T13:14:01.611Z kube: { [+] } level: warn logger: agent msg: error response from mstream remoteIP: 10.128.68.69 stream: stdout streamProcessorID: 6633914d52c42db5fcb280e0 streamProcessorName: kanopy_smoke_test_c49f0f12_dc850247 tenantID: 651f3bac0302051bf2c6f664 time: 2024-05-02T13:14:01.611331224Z } Fixes Why didn't this alert fire? https://github.com/10gen/mongohouse/pull/9494

Top User Comments


Steps to Reproduce


Additional Resources / Links

Share:

BugZero® Risk Score

What's this?

Coming soon

Status

Closed

Learn More

Search:

...