Hi,
We are facing some problems with the shredder again. We recently updated to shredder and rdbloader 2.1.0 and also updated the AMI version to 6.5.0 because that was a requirement from our IT security team.
The shredder job usually runs for about 20-30 minutes. At some point it starts slowing down considerably (see screenshot).
The influx of data is constant and rather small, though (less than 1 GB per day).
Our current emr-config looks like this:
{
"schema": "iglu:com.snowplowanalytics.dataflowrunner/ClusterConfig/avro/1-1-0",
"data": {
"name": "com.oneapp",
"logUri": "LOGURI",
"region": "eu-west-1",
"credentials": {
"accessKeyId": "AWS_ACCESS_KEY_ID",
"secretAccessKey": "AWS_SECRET_ACCESS_KEY"
},
"roles": {
"jobflow": "EMR_EC2_DefaultRole",
"service": "EMR_DefaultRole"
},
"ec2": {
"amiVersion": "6.5.0",
"instances": {
"core": {
"count": 1,
"type": "r5.xlarge"
},
"master": {
"ebsConfiguration": {
"ebsBlockDeviceConfigs": [],
"ebsOptimized": true
},
"type": "m5.xlarge"
},
"task": {
"bid": "0.015",
"count": 0,
"type": "m5.xlarge"
}
},
"keyName": "EMR_ECS_KEY_PAIR",
"location": {
"vpc": {
"subnetId": "AWS_SUBNET_PUBLIC_ID"
}
}
},
"tags": [
{
"key": "client",
"value": "com.oneapp"
},
{
"key": "job",
"value": "main"
},
{
"key": "GITC-VulnScanTool",
"value": "tenable_io"
}
],
"bootstrapActionConfigs": [],
"configurations": [
{
"classification": "spark",
"configurations": [],
"properties": {
"maximizeResourceAllocation": "false"
}
},
{
"classification": "spark-defaults",
"configurations": [],
"properties": {
"spark.default.parallelism": "8",
"spark.driver.maxResultSize": "0",
"spark.driver.cores": "1",
"spark.driver.memory": "9G",
"spark.dynamicAllocation.enabled": "false",
"spark.executor.cores": "1",
"spark.executor.instances": "2",
"spark.executor.memory": "9G",
"spark.yarn.driver.memoryOverhead": "1024",
"spark.yarn.executor.memoryOverhead": "1024"
}
},
{
"classification": "yarn-site",
"configurations": [],
"properties": {
"yarn.nodemanager.resource.memory-mb": "24576",
"yarn.nodemanager.vmem-check-enabled": "false",
"yarn.scheduler.maximum-allocation-mb": "24576"
}
}
],
"applications": [ "Hadoop", "Spark" ]
}
}
What causes the shredder to slow down so much?