Hello,
I am using dataflow-runner to run the Snowflake Transformer and Loader on EMR.
The "Snowflake Transformer" EMR step is failing. I looked at the container stderr for the corresponding job and can see it is failing with the following:
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 106 in stage 0.0 failed 4 times, most recent failure: Lost task 106.3 in stage 0.0 (TID 124, ip-172-31-38-253.ap-southeast-2.compute.internal, executor 1): com.amazonaws.SdkClientException: Unable to execute HTTP request: Timeout waiting for connection from pool
What does this mean? My best guess is that the executors are exhausting the AWS SDK's HTTP connection pool when talking to S3, but I'm not sure which setting controls that.
I checked the DynamoDB manifest table and can see a RunId matching the name of the only run=xxxx folder in the archive, so it doesn't seem to be an issue with DynamoDB at least…
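For reference, this is roughly how I checked the manifest (table name and region as configured below, just eyeballing the first few items):

aws dynamodb scan --table-name cc-snowplow-snowflake-manifest --region ap-southeast-2 --max-items 10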
Here is my self-describing-config.json:
{
  "schema": "iglu:com.snowplowanalytics.snowplow.storage/snowflake_config/jsonschema/1-0-2",
  "data": {
    "name": "Compareclub Snowflake Storage Target",
    "awsRegion": "ap-southeast-2",
    "auth": {
      "accessKeyId": "xxxxxxxxxx",
      "secretAccessKey": "xxxxxxxxxx",
      "sessionDuration": 900
    },
    "manifest": "cc-snowplow-snowflake-manifest",
    "snowflakeRegion": "us-west-2",
    "database": "snowplow",
    "input": "s3://cc-snowplow-enriched/archive/",
    "stage": "snowplow_stage",
    "badOutputUrl": "s3://cc-snowplow-snowflake/bad/",
    "stageUrl": "s3://cc-snowplow-snowflake/transformed/",
    "warehouse": "snowplow_wh",
    "schema": "atomic",
    "account": "xxxxxxxxx",
    "username": "snowplow_user",
    "password": "xxxxxxxxxx",
    "maxError": 1,
    "jdbcHost": "xxxxxxxx.snowflakecomputing.com",
    "purpose": "ENRICHED_EVENTS"
  }
}
This is my cluster.json file:
{
  "schema": "iglu:com.snowplowanalytics.dataflowrunner/ClusterConfig/avro/1-1-0",
  "data": {
    "name": "dataflow-runner - snowflake transformer",
    "logUri": "s3://cc-snowplow-snowflake-logs/",
    "region": "ap-southeast-2",
    "credentials": {
      "accessKeyId": "xxxxxxxxxx",
      "secretAccessKey": "xxxxxxxxxx"
    },
    "roles": {
      "jobflow": "EMR_EC2_DefaultRole",
      "service": "EMR_DefaultRole"
    },
    "ec2": {
      "amiVersion": "5.9.0",
      "keyName": "snowplow.etl.runner",
      "location": {
        "vpc": {
          "subnetId": "subnet-51d0ce18"
        }
      },
      "instances": {
        "master": {
          "type": "m2.xlarge"
        },
        "core": {
          "type": "m2.xlarge",
          "count": 1
        },
        "task": {
          "type": "m1.medium",
          "count": 0,
          "bid": "0.04"
        }
      }
    },
    "tags": [ ],
    "bootstrapActionConfigs": [ ],
    "configurations": [
      {
        "classification": "core-site",
        "properties": {
          "io.file.buffer.size": "65536"
        }
      },
      {
        "classification": "mapred-site",
        "properties": {
          "mapreduce.user.classpath.first": "true"
        }
      },
      {
        "classification": "yarn-site",
        "properties": {
          "yarn.resourcemanager.am.max-attempts": "1"
        }
      },
      {
        "classification": "spark",
        "properties": {
          "maximizeResourceAllocation": "true"
        }
      }
    ],
    "applications": [ "Hadoop", "Spark" ]
  }
}
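One thing I was wondering: since the timeout looks like the S3 connection pool being exhausted, would it help to raise the EMRFS connection limit? I was thinking of adding something like this to the configurations array above (assuming fs.s3.maxConnections under the emrfs-site classification is the right knob here):

{
  "classification": "emrfs-site",
  "properties": {
    "fs.s3.maxConnections": "200"
  }
}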
This is my playbook.json:
{
  "schema": "iglu:com.snowplowanalytics.dataflowrunner/PlaybookConfig/avro/1-0-1",
  "data": {
    "region": "ap-southeast-2",
    "credentials": {
      "accessKeyId": "xxxxxxxxxx",
      "secretAccessKey": "xxxxxxxxxx"
    },
    "steps": [
      {
        "type": "CUSTOM_JAR",
        "name": "Snowflake Transformer",
        "actionOnFailure": "CANCEL_AND_WAIT",
        "jar": "command-runner.jar",
        "arguments": [
          "spark-submit",
          "--conf",
          "spark.hadoop.mapreduce.job.outputformat.class=com.snowplowanalytics.snowflake.transformer.S3OutputFormat",
          "--deploy-mode",
          "cluster",
          "--class",
          "com.snowplowanalytics.snowflake.transformer.Main",
          "s3://snowplow-hosted-assets/4-storage/snowflake-loader/snowplow-snowflake-transformer-0.6.0.jar",
          "--config",
          "{{base64File "./config/self-describing-config.json"}}",
          "--resolver",
          "{{base64File "./config/iglu_resolver.json"}}",
          "--events-manifest",
          "{{base64File "./config/dynamodb_config.json"}}"
        ]
      },
      {
        "type": "CUSTOM_JAR",
        "name": "Snowflake Loader",
        "actionOnFailure": "CANCEL_AND_WAIT",
        "jar": "s3://snowplow-hosted-assets/4-storage/snowflake-loader/snowplow-snowflake-loader-0.6.0.jar",
        "arguments": [
          "load",
          "--base64",
          "--config",
          "{{base64File "./config/self-describing-config.json"}}",
          "--resolver",
          "{{base64File "./config/iglu_resolver.json"}}"
        ]
      }
    ],
    "tags": [ ]
  }
}
I am executing dataflow-runner using the following command:
./dataflow-runner run-transient --emr-config ./config/cluster.json --emr-playbook ./config/playbook.json
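In case it's useful, this is roughly how I've been pulling the container logs from the log bucket (cluster id redacted):

aws s3 cp s3://cc-snowplow-snowflake-logs/j-xxxxxxxxxx/containers/ ./emr-logs/ --recursive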
Where am I going wrong? I feel like I'm so close to getting the data into Snowflake!
Appreciate your help,
Ryan