Hey everyone, I’ve been trying to use an AWS IAM role instead of passing secrets around to dataflow-runner when launching the EMR cluster. For more details see: Support IAM roles · Issue #34 · snowplow/dataflow-runner · GitHub
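For reference, I launch the job roughly like this, using a transient cluster with the two configs shown below (the file names are just what I call the rendered configs on my side):

dataflow-runner run-transient \
  --emr-config cluster.json \
  --emr-playbook playbook.json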
I checked the codebase and it looks like IAM roles are supported (there are explicit tests for it here and the implementation here). I also used this example config as a reference. But I still get the following error message:
My cluster.json config:
{
"schema": "iglu:com.snowplowanalytics.dataflowrunner/ClusterConfig/avro/1-1-0",
"data": {
"name": "com.oneapp",
"logUri": "LOGURI",
"region": "AWS_DEFAULT_REGION",
"credentials": {
"accessKeyId": "iam",
"secretAccessKey": "iam"
},
"roles": {
"jobflow": "EMR_EC2_DefaultRole",
"service": "EMR_DefaultRole"
},
"ec2": {
"amiVersion": "6.10.0",
"instances": {
"core": {
"count": 1,
"type": "r5.12xlarge"
},
"master": {
"ebsConfiguration": {
"ebsBlockDeviceConfigs": [],
"ebsOptimized": true
},
"type": "m4.large"
},
"task": {
"bid": "0.015",
"count": 0,
"type": "m4.large"
}
},
"keyName": "EMR_ECS_KEY_PAIR",
"location": {
"vpc": {
"subnetId": "AWS_PUBLIC_SUBNET_ID"
}
}
},
"tags": [
{
"key": "client",
"value": "com.oneapp"
},
{
"key": "job",
"value": "main"
}
],
"bootstrapActionConfigs": [],
"configurations": [
{
"classification": "spark",
"configurations": [],
"properties": {
"maximizeResourceAllocation": "false"
}
},
{
"classification": "spark-defaults",
"configurations": [],
"properties": {
"spark.default.parallelism": "80",
"spark.driver.cores": "5",
"spark.driver.memory": "37G",
"spark.dynamicAllocation.enabled": "false",
"spark.executor.cores": "5",
"spark.executor.instances": "8",
"spark.executor.memory": "37G",
"spark.yarn.driver.memoryOverhead": "5G",
"spark.yarn.executor.memoryOverhead": "5G"
}
},
{
"classification": "yarn-site",
"configurations": [],
"properties": {
"yarn.nodemanager.resource.memory-mb": "385024",
"yarn.nodemanager.vmem-check-enabled": "false",
"yarn.scheduler.maximum-allocation-mb": "385024"
}
}
],
"applications": [
"Hadoop",
"Spark"
]
}
}
My playbook.json config:
{
"schema": "iglu:com.snowplowanalytics.dataflowrunner/ClusterConfig/avro/1-1-0",
"data": {
"region": "AWS_DEFAULT_REGION",
"credentials": {
"accessKeyId": "iam",
"secretAccessKey": "iam"
},
"roles": {
"jobflow": "EMR_EC2_DefaultRole",
"service": "EMR_DefaultRole"
},
"steps": [
{
"type": "CUSTOM_JAR",
"name": "S3DistCp enriched data archiving",
"actionOnFailure": "CANCEL_AND_WAIT",
"jar": "/usr/share/aws/emr/s3-dist-cp/lib/s3-dist-cp.jar",
"arguments": [
"--src", "SP_LOADER_URI",
"--dest", "SP_ENRICHED_URIrun={{nowWithFormat "2006-01-02-15-04-05"}}/",
"--srcPattern", ".*",
"--outputCodec", "gz",
"--deleteOnSuccess"
]
},
{
"type": "CUSTOM_JAR",
"name": "RDB Transformer Shredder",
"actionOnFailure": "CANCEL_AND_WAIT",
"jar": "command-runner.jar",
"arguments": [
"spark-submit",
"--class", "com.snowplowanalytics.snowplow.rdbloader.transformer.batch.Main",
"--master", "yarn",
"--deploy-mode", "cluster",
"s3://snowplow-hosted-assets/4-storage/transformer-batch/snowplow-transformer-batch-4.1.0.jar",
"--iglu-config", "{{base64File "resolver.json"}}",
"--config", "{{base64File "config.hocon"}}"
]
}
],
"tags": []
}
}
Note that some values in caps, like AWS_DEFAULT_REGION, are replaced with the values of the corresponding environment variables using sed before the configs are passed to dataflow-runner.
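As a sketch of that substitution step (the template file names, the example values, and the exact sed calls below are illustrative, not my exact script):

export AWS_DEFAULT_REGION=eu-central-1
export LOGURI=s3://my-log-bucket/emr/

# Render the config by replacing the ALL_CAPS placeholders with env var values
sed -e "s|AWS_DEFAULT_REGION|${AWS_DEFAULT_REGION}|g" \
    -e "s|LOGURI|${LOGURI}|g" \
    cluster.json.tmpl > cluster.json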
It looks like it might be an issue with the AWS SDK, but I lack the experience to pin it down. Can someone help me with this?