Hello fellow snowplowers,
I was wondering how I can specify the security groups attached to the EC2 instances that are launched as part of the EMR job. As far as I understand, by default these security groups are created and managed by Amazon EMR (as described in the AWS documentation). The same page lists their default inbound and outbound rules; in our account they show up as ElasticMapReduce-master and ElasticMapReduce-slave.
What we would like to do now is change the cluster’s network and security settings to use a security group without the rule that allows unrestricted inbound traffic on port 22. We could in theory create our own security groups, but how do we attach them to the EC2 instances that get launched? It would be super nice to have a parameter like the AWS CLI’s ServiceAccessSecurityGroup for passing a security group ID (see the CLI sketch below).
More on the problem can be found in this blog post: Why you need to update your risky default EMR managed roles and policies - Security Boulevard
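For reference, this is a rough sketch of the knobs the AWS CLI exposes for this when creating a cluster directly (the sg-… IDs below are made-up placeholders, not our real values, and the rest mirrors the placeholders from our config). What we are looking for is the Dataflow Runner equivalent of these settings:

# Sketch only: overriding the EMR-managed security groups via the AWS CLI.
# ServiceAccessSecurityGroup can also be set in --ec2-attributes, but it only
# applies to clusters launched in a private subnet.
aws emr create-cluster \
  --name "com.oneapp" \
  --release-label emr-6.10.0 \
  --applications Name=Hadoop Name=Spark \
  --service-role EMR_DefaultRole \
  --instance-type m4.large \
  --instance-count 3 \
  --ec2-attributes '{
    "KeyName": "EMR_ECS_KEY_PAIR",
    "SubnetId": "AWS_PUBLIC_SUBNET_ID",
    "InstanceProfile": "EMR_EC2_DefaultRole",
    "EmrManagedMasterSecurityGroup": "sg-xxxxxxxx",
    "EmrManagedSlaveSecurityGroup": "sg-yyyyyyyy",
    "AdditionalMasterSecurityGroups": ["sg-zzzzzzzz"]
  }'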
Our EMR cluster config:
{
"schema": "iglu:com.snowplowanalytics.dataflowrunner/ClusterConfig/avro/1-1-0",
"data": {
"name": "com.oneapp",
"logUri": "LOGURI",
"region": "AWS_DEFAULT_REGION",
"credentials": {
"accessKeyId": "AWS_ACCESS_KEY_ID",
"secretAccessKey": "AWS_SECRET_ACCESS_KEY"
},
"roles": {
"jobflow": "EMR_EC2_DefaultRole",
"service": "EMR_DefaultRole"
},
"ec2": {
"amiVersion": "6.10.0",
"instances": {
"core": {
"count": 1,
"type": "r5.12xlarge"
},
"master": {
"ebsConfiguration": {
"ebsBlockDeviceConfigs": [],
"ebsOptimized": true
},
"type": "m4.large"
},
"task": {
"bid": "0.015",
"count": 0,
"type": "m4.large"
}
},
"keyName": "EMR_ECS_KEY_PAIR",
"location": {
"vpc": {
"subnetId": "AWS_PUBLIC_SUBNET_ID"
}
}
},
"tags": [
{
"key": "client",
"value": "com.oneapp"
},
{
"key": "job",
"value": "main"
}
],
"bootstrapActionConfigs": [],
"configurations": [
{
"classification": "spark",
"configurations": [],
"properties": {
"maximizeResourceAllocation": "false"
}
},
{
"classification": "spark-defaults",
"configurations": [],
"properties": {
"spark.default.parallelism": "80",
"spark.driver.cores": "5",
"spark.driver.memory": "37G",
"spark.dynamicAllocation.enabled": "false",
"spark.executor.cores": "5",
"spark.executor.instances": "8",
"spark.executor.memory": "37G",
"spark.yarn.driver.memoryOverhead": "5G",
"spark.yarn.executor.memoryOverhead": "5G"
}
},
{
"classification": "yarn-site",
"configurations": [],
"properties": {
"yarn.nodemanager.resource.memory-mb": "385024",
"yarn.nodemanager.vmem-check-enabled": "false",
"yarn.scheduler.maximum-allocation-mb": "385024"
}
}
],
"applications": ["Hadoop", "Spark"]
}
}
Our playbook.json:
{
"schema": "iglu:com.snowplowanalytics.dataflowrunner/ClusterConfig/avro/1-1-0",
"data": {
"region": "AWS_DEFAULT_REGION",
"credentials": {
"accessKeyId": "AWS_ACCESS_KEY_ID",
"secretAccessKey": "AWS_SECRET_ACCESS_KEY"
},
"steps": [
{
"type": "CUSTOM_JAR",
"name": "S3DistCp enriched data archiving",
"actionOnFailure": "CANCEL_AND_WAIT",
"jar": "/usr/share/aws/emr/s3-dist-cp/lib/s3-dist-cp.jar",
"arguments": [
"--src", "SP_LOADER_URI",
"--dest", "SP_ENRICHED_URIrun={{nowWithFormat "2006-01-02-15-04-05"}}/",
"--srcPattern", ".*",
"--outputCodec", "gz",
"--deleteOnSuccess"
]
},
{
"type": "CUSTOM_JAR",
"name": "RDB Transformer Shredder",
"actionOnFailure": "CANCEL_AND_WAIT",
"jar": "command-runner.jar",
"arguments": [
"spark-submit",
"--class", "com.snowplowanalytics.snowplow.rdbloader.transformer.batch.Main",
"--master", "yarn",
"--deploy-mode", "cluster",
"s3://snowplow-hosted-assets/4-storage/transformer-batch/snowplow-transformer-batch-4.1.0.jar",
"--iglu-config", "{{base64File "resolver.json"}}",
"--config", "{{base64File "config.hocon"}}"
]
}
],
"tags": []
}
}