Yes, we are using the Spark configuration cheatsheet for our setup. Looking at your example configuration, it appears you tweaked spark.default.parallelism; could you explain the reasoning behind this?
For example, the cheatsheet's recommended spark.default.parallelism is 54, but you had 108.
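Just so you can see where my confusion comes from, here is how I assumed that number is derived, using the common Spark guidance of roughly 2-3 tasks per executor core. The executor and core counts below are purely hypothetical placeholders I picked to land on 54, not values taken from either of our configurations:

```scala
// My assumed derivation of spark.default.parallelism (rule of thumb only):
// total executor cores in the cluster multiplied by ~2-3 tasks per core.
// All numbers below are hypothetical placeholders, not from our config.
object ParallelismSketch extends App {
  val executorInstances = 27 // hypothetical executor count
  val coresPerExecutor  = 1  // hypothetical spark.executor.cores
  val tasksPerCore      = 2  // common guidance: 2-3 tasks per CPU core

  val parallelism = executorInstances * coresPerExecutor * tasksPerCore
  println(s"spark.default.parallelism = $parallelism") // 54 with these placeholders
}
```

Is the jump from 54 to 108 essentially just doubling that multiplier (or the core count), or is there more to it?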
I have also attached a copy of our Spark configuration for reference:
"schema":"iglu:com.snowplowanalytics.dataflowrunner/ClusterConfig/avro/1-1-0",
"data":{
"name":"dataflow-runner-snowflake-transformer",
"logUri":"",
"region":"",
"credentials":{
},
"roles":{
"jobflow":"EMR_EC2_DefaultRole",
"service":"EMR_DefaultRole"
},
"ec2":{
"amiVersion":"5.13.0",
"keyName":"snowplow-prod-snowflake",
"location":{
},
"instances":{
"master":{
"type":"m4.large"
},
"core":{
"type":"r5.xlarge",
"count":1
},
"task":{
"type":"m4.large",
"count":0,
"bid":"0.015"
}
}
},
"tags":[ ],
"bootstrapActionConfigs":[ ],
"configurations":[
{
"classification":"core-site",
"properties":{
"Io.file.buffer.size":"65536"
}
},
{
"classification":"mapred-site",
"properties":{
"Mapreduce.user.classpath.first":"true"
}
},
{
"classification":"yarn-site",
"properties":{
"yarn.resourcemanager.am.max-attempts":"1",
"yarn.nodemanager.vmem-check-enabled": "false",
"yarn.nodemanager.resource.memory-mb":"28416",
"yarn.scheduler.maximum-allocation-mb":"28416"
}
},
{
"classification":"spark",
"properties":{
"maximizeResourceAllocation":"false"
}
},
{
"classification":"spark-defaults",
"properties":{
"spark.dynamicAllocation.enabled":"false",
"spark.executor.instances": "2",
"spark.executor.memoryOverhead": "2048",
"spark.executor.memory": "7G",
"spark.driver.memoryOverhead": "2048",
"spark.driver.memory": "7G",
"spark.executor.cores": "1",
"spark.driver.cores": "1",
"spark.default.parallelism": "4"
}
}
],
"applications":[ "Hadoop", "Spark" ]
}
}
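For completeness, the spark.default.parallelism of 4 in the spark-defaults section above is what I get when I apply the same assumed rule of thumb to our own executor settings (again, the 2-tasks-per-core multiplier is my assumption, not something stated in the cheatsheet):

```scala
// Sanity check against the spark-defaults values in our config above:
// spark.executor.instances = 2 and spark.executor.cores = 1.
object OurParallelismCheck extends App {
  val executorInstances = 2 // spark.executor.instances from our config
  val coresPerExecutor  = 1 // spark.executor.cores from our config
  val tasksPerCore      = 2 // assumed rule of thumb

  val parallelism = executorInstances * coresPerExecutor * tasksPerCore
  println(s"spark.default.parallelism = $parallelism") // 4, matching our config
}
```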