Hi all,
I’m new to Snowplow and dataflow-runner, so this may be a misunderstanding on my part, but I’ve copied the setup guide for loading data into Snowflake exactly from here:
Yet I keep getting an error when attempting to run the command:
./dataflow-runner run-transient --emr-config=cluster.json --emr-playbook=playbook.json
The error is "Cannot run program "spark-submit" (in directory "."): error=2, No such file or directory"
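For completeness, this is roughly how I invoke it, with the credentials the playbook below reads from the environment exported first (placeholder values):

# placeholder credentials; the playbook's ENV lookups expect these
export AWS_ACCESS_KEY=xxxxxxxx
export AWS_SECRET_KEY=xxxxxx
./dataflow-runner run-transient --emr-config=cluster.json --emr-playbook=playbook.json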
I can verify through CloudTrail that the command is sent to AWS EMR and the cluster starts, but for some reason some of the parameters from my cluster.json are not making their way into EMR, so the job continuously fails.
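(I assume I could also confirm what actually got installed on the cluster with something like the AWS CLI calls below, where the cluster ID is a placeholder, but the CloudTrail request at the end of this post already seems to tell the story.)

# placeholder cluster ID; shows which applications EMR actually installed
aws emr describe-cluster --cluster-id j-XXXXXXXXXXXXX --query 'Cluster.Applications'
# shows the submitted steps and their states
aws emr list-steps --cluster-id j-XXXXXXXXXXXXX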
My cluster.json is almost an exact copy of the one in the tutorial:
{
  "schema":"iglu:com.snowplowanalytics.dataflowrunner/ClusterConfig/avro/1-1-0",
  "data":{
    "name":"dataflow-runner - snowflake transformer",
    "logUri":"s3://logs/data-snowplow-emr-etl-runner/",
    "region":"us-west-2",
    "credentials":{
      "accessKeyId":"xxxxxxxx",
      "secretAccessKey":"xxxxxx"
    },
    "roles":{
      "jobflow":"x",
      "service":"x"
    },
    "ec2":{
      "amiVersion":"6.1.0",
      "keyName":"test",
      "location":{
        "vpc":{
          "subnetId": "test"
        }
      },
      "instances":{
        "master":{
          "type":"m4.large"
        },
        "core":{
          "type":"r4.xlarge",
          "count":1,
          "ebsConfiguration":{
            "ebs_optimized": false,
            "ebsBlockDeviceConfigs": [
              {
                "volumesPerInstance" : 1
              }
            ]
          }
        },
        "task":{
          "type":"m4.large",
          "count":0,
          "bid":"0.015"
        }
      }
    },
    "tags":[ ],
    "bootstrapActionConfigs":[ ],
    "configurations":[
      {
        "classification":"core-site",
        "properties":{
          "io.file.buffer.size":"65536"
        }
      },
      {
        "classification":"mapred-site",
        "properties":{
          "mapreduce.user.classpath.first":"true"
        }
      },
      {
        "classification":"yarn-site",
        "properties":{
          "yarn.resourcemanager.am.max-attempts":"1"
        }
      },
      {
        "classification":"spark",
        "properties":{
          "maximizeResourceAllocation":"true"
        }
      }
    ],
    "applications":[ "Hadoop", "Spark" ]
  }
}
My playbook.json is also an exact copy:
{
  "schema":"iglu:com.snowplowanalytics.dataflowrunner/PlaybookConfig/avro/1-0-1",
  "data":{
    "region":"region",
    "credentials":{
      "accessKeyId":"<%= ENV['AWS_ACCESS_KEY'] %>",
      "secretAccessKey":"<%= ENV['AWS_SECRET_KEY'] %>"
    },
    "steps":[
      {
        "type":"CUSTOM_JAR",
        "name":"Snowflake Transformer",
        "actionOnFailure":"CANCEL_AND_WAIT",
        "jar":"command-runner.jar",
        "arguments":[
          "spark-submit",
          "--conf",
          "spark.hadoop.mapreduce.job.outputformat.class=com.snowplowanalytics.snowflake.transformer.S3OutputFormat",
          "--deploy-mode",
          "cluster",
          "--class",
          "com.snowplowanalytics.snowflake.transformer.Main",
          "s3://snowplow-hosted-assets/4-storage/snowflake-loader/snowplow-snowflake-transformer-0.7.1.jar",
          "--config",
          "{{base64File "./targets/snowflake.json"}}",
          "--resolver",
          "{{base64File "resolver.json"}}",
          "--events-manifest",
          "{{base64File "dynamodb.json"}}"
        ]
      },
      {
        "type":"CUSTOM_JAR",
        "name":"Snowflake Loader",
        "actionOnFailure":"CANCEL_AND_WAIT",
        "jar":"s3://snowplow-hosted-assets/4-storage/snowflake-loader/snowplow-snowflake-loader-0.7.1.jar",
        "arguments":[
          "load",
          "--base64",
          "--config",
          "{{base64File "./targets/snowflake.json"}}",
          "--resolver",
          "{{base64File "./resolver.json"}}"
        ]
      }
    ],
    "tags":[ ]
  }
}
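One assumption on my side: I’m treating the {{base64File ...}} templating as equivalent to base64-encoding the referenced files by hand, and all three files exist locally next to the playbook, so I don’t think that part is the problem:

# sanity check (my assumption about the templating): the referenced files exist and encode cleanly
base64 ./targets/snowflake.json > /dev/null
base64 ./resolver.json > /dev/null
base64 ./dynamodb.json > /dev/null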
In the end, dataflow-runner has all the control over setting up EMR, so I have no visibility into what’s going wrong. The only thing I have to go on is the following request that dataflow-runner sent, captured through CloudTrail, which does not mention Spark anywhere, even though I’ve added it to the applications in my cluster.json:
"requestParameters": {
"name": "dataflow-runner - snowflake transformer",
"logUri": "s3://logs/data-snowplow-emr-etl-runner/",
"releaseLabel": "emr-6.1.0",
"instances": {
"instanceGroups": [
{
"instanceRole": "MASTER",
"instanceType": "m4.large",
"instanceCount": 1
},
{
"instanceRole": "CORE",
"instanceType": "r4.xlarge",
"instanceCount": 1,
"ebsConfiguration": {
"ebsOptimized": false
}
}
],
"ec2KeyName": "test",
"placement": {
"availabilityZone": ""
},
"keepJobFlowAliveWhenNoSteps": true,
"terminationProtected": false,
"ec2SubnetId": "test"
},
"visibleToAllUsers": true,
"jobFlowRole": "x",
"serviceRole": "x"
},
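Based on my cluster.json, I would have expected the request to also list the applications (and probably the configurations) somewhere, roughly along these lines (this is just my guess at the shape from the EMR RunJobFlow API, not something I actually see in CloudTrail):

"applications": [
  { "name": "Hadoop" },
  { "name": "Spark" }
]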
Did I mess something up? Any help would be really appreciated.