The error is:
Value guarded in: Snowplow::EmrEtlRunner::Cli::load_config
With Contract: Maybe, String, Bool => Maybe
At: uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:211
failure_callback at uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb:32
call_with at uri:classloader:/gems/contracts-0.11.0/lib/contracts/call_with.rb:80
block in redefine_method at uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_handler.rb:138
process_options at uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:199
get_args_config_enrichments_resolver at uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:173
send_to at uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_reference.rb:43
call_with at uri:classloader:/gems/contracts-0.11.0/lib/contracts/call_with.rb:76
block in redefine_method at uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_handler.rb:138
at uri:classloader:/emr-etl-runner/bin/snowplow-emr-etl-runner:37
load at org/jruby/RubyKernel.java:994
at uri:classloader:/META-INF/main.rb:1
require at org/jruby/RubyKernel.java:970
(root) at uri:classloader:/META-INF/main.rb:1
at uri:classloader:/META-INF/jruby.home/lib/ruby/stdlib/rubygems/core_ext/kernel_require.rb:1
ERROR: org.jruby.embed.EvalFailedException: (ReturnContractError) Contract violation for return value:
I am using:
Scala Stream Collector (Collector) --> JavaScript Tracker (Tracker) --> S3-Loader (writes data to the S3 bucket) --> EmrEtlRunner --> PostgreSQL
The EmrEtlRunner version is: snowplow_emr_r117_biskupin
The EmrEtlRunner config file is below:
aws:
# Credentials can be hardcoded or set in environment variables
access_key_id: "env" #<%= ENV['AWS_SNOWPLOW_ACCESS_KEY'] %>
secret_access_key: "env" #<%= ENV['AWS_SNOWPLOW_SECRET_KEY'] %>
s3:
region: "us-east-1" #ADD HERE
buckets:
assets: s3://snowplow-hosted-assets # DO NOT CHANGE unless you are hosting the jarfiles etc yourself in your own bucket
jsonpath_assets: # If you have defined your own JSON Schemas, add the s3:// path to your own JSON Path files in your own bucket here
log: s3:// #ADD HERE
encrypted: false
raw:
in: # Multiple in buckets are permitted
- s3:// #- ADD HERE # e.g. s3://my-in-bucket
#- ADD HERE
processing: s3:// #ADD HERE
archive: s3:// #ADD HERE # e.g. s3://my-archive-bucket/in
enriched:
good: s3:// #ADD HERE # e.g. s3://my-out-bucket/enriched/good
bad: s3:// #ADD HERE # e.g. s3://my-out-bucket/enriched/bad
errors: s3:// #ADD HERE # Leave blank unless continue_on_unexpected_error: set to true below
archive: s3:// #ADD HERE # Where to archive enriched events to, e.g. s3://my-archive-bucket/enriched
shredded:
good: s3:// #ADD HERE # e.g. s3://my-out-bucket/shredded/good
bad: s3:// #ADD HERE # e.g. s3://my-out-bucket/shredded/bad
errors: s3:// #ADD HERE # Leave blank unless continue_on_unexpected_error: set to true below
archive: s3:// #ADD HERE # Where to archive shredded events to, e.g. s3://my-archive-bucket/shredded
consolidate_shredded_output: false
emr:
#job_name: Snowplow ETL # Give your job a name
ami_version: 5.9.0 # Don’t change this
region: "us-east-1" #ADD HERE # Always set this
jobflow_role: EMR_EC2_DefaultRole # Created using aws emr create-default-roles
service_role: EMR_DefaultRole # Created using aws emr create-default-roles
placement: #ADD HERE # Set this if not running in VPC. Leave blank otherwise
ec2_subnet_id: #ADD HERE # Set this if running in VPC. Leave blank otherwise
ec2_key_name: snowplow #ADD HERE
security_configuration:
bootstrap: # Set this to specify custom bootstrap actions. Leave empty otherwise
software:
hbase: "0.92.0" # Optional. To launch on cluster, provide version, "0.92.0", keep quotes. Leave empty otherwise.
lingual: "1.1" # Optional. To launch on cluster, provide version, "1.1", keep quotes. Leave empty otherwise.
# Adjust your Spark cluster below
jobflow:
job_name: Snowplow ETL
master_instance_type: m1.medium #m1.medium
core_instance_count: 2 #2
core_instance_type: m1.medium #m1.medium
core_instance_ebs: # Optional. Attach an EBS volume to each core instance.
volume_size: 100 # Gigabytes
volume_type: "gp2"
volume_iops: 400 # Optional. Will only be used if volume_type is “io1”
ebs_optimized: false # Optional. Will default to true
task_instance_count: 0 # Increase to use spot instances
task_instance_type: m1.medium #m1.medium
task_instance_bid: #0.015 # In USD. Adjust bid, or leave blank for non-spot-priced (i.e. on-demand) task instances
bootstrap_failure_tries: 3 # Number of times to attempt the job in the event of bootstrap failures
additional_info: # Optional JSON string for selecting additional features
collectors:
format: 'thrift' # 'cloudfront' or 'clj-tomcat' for the Clojure Collector, or 'thrift' for Thrift records, or 'tsv/com.amazon.aws.cloudfront/wd_access_log' for Cloudfront access logs
enrich:
versions:
spark_enrich: 1.10.0 # Version of the Spark Enrichment process
continue_on_unexpected_error: false # Set to ‘true’ (and set out_errors: above) if you don’t want any exceptions thrown from ETL
output_compression: NONE # Compression only supported with Redshift, set to NONE if you have Postgres targets. Allowed formats: NONE, GZIP
storage:
versions:
rdb_shredder: 0.13.0 # Version of the Relational Database Shredding process
rdb_loader: 0.14.0 # Version of the Relational Database Loader app
hadoop_elasticsearch: 0.1.0 # Version of the Hadoop to Elasticsearch copying process
monitoring:
tags: {} # Name-value pairs describing this job
logging:
level: DEBUG # You can optionally switch to INFO for production
snowplow:
method: get
app_id: snowplow #ADD HERE # e.g. snowplow
collector: localhost:8001/ #ADD HERE # e.g. d3rkrsqld9gmqf.cloudfront.net
Hi @hareeshsoni,
It looks like your config file is missing a new line between jobflow_role and service_role. The parameter service_role is a required parameter, so EmrEtlRunner is complaining that it is not set properly.
Try changing this line:
jobflow_role: EMR_EC2_DefaultRole # Created using aws emr create-default-roles service_role: EMR_DefaultRole # Created using aws emr create-default-roles
to this:
jobflow_role: EMR_EC2_DefaultRole # Created using aws emr create-default-roles
service_role: EMR_DefaultRole # Created using aws emr create-default-roles
You can see an example of setting the roles at lines 32-33 of this example config file.
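If it helps, here is a quick way to confirm the fix before re-running the job. This is only a sketch: it assumes Ruby is on your path and that your config is saved as config.yml (adjust the filename to match yours), and EmrEtlRunner does some extra processing of its own, so treat it as a rough check only. It parses the YAML and prints both roles, so you can see whether service_role comes through as a value or as nil:
require 'yaml'
# parse the EmrEtlRunner config and show the two EMR roles
emr = YAML.load_file('config.yml').fetch('aws').fetch('emr')
puts "jobflow_role: #{emr['jobflow_role'].inspect}"  # expect "EMR_EC2_DefaultRole"
puts "service_role: #{emr['service_role'].inspect}"  # nil here would explain the contract violation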
That explains the error message you contacted us about. You will also need to complete your config so it points at real S3 buckets, i.e. change your s3:// placeholders into real URLs like s3://my-bucket/enriched/good. You will continue to get other error messages until your S3 paths are valid.
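In the same spirit, a rough way to spot leftover s3:// placeholders is to walk the buckets section of the parsed config and print the key paths whose value is still the bare placeholder. Again only a sketch; config.yml and the placeholder_paths helper are just my own illustration:
require 'yaml'
# recursively collect the key paths whose value is still the bare "s3://" placeholder
def placeholder_paths(node, path = [])
  case node
  when Hash  then node.flat_map { |k, v| placeholder_paths(v, path + [k]) }
  when Array then node.flat_map { |v| placeholder_paths(v, path) }
  else node.to_s.strip == 's3://' ? [path.join('.')] : []
  end
end
buckets = YAML.load_file('config.yml').dig('aws', 's3', 'buckets')
puts placeholder_paths(buckets)  # prints e.g. enriched.good for each bucket still left as s3://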
Hopefully that helps.
Hi @istreeter,
Thanks for the reply. I have made the above changes in the configuration file, but I am still facing the same issue.
I applied the changes below:
jobflow_role: EMR_EC2_DefaultRole # Created using aws emr create-default-roles
service_role: EMR_DefaultRole # Created using aws emr create-default-roles
The new configuration file is:
aws:
# Credentials can be hardcoded or set in environment variables
access_key_id: "env" #<%= ENV['AWS_SNOWPLOW_ACCESS_KEY'] %>
secret_access_key: "env" #<%= ENV['AWS_SNOWPLOW_SECRET_KEY'] %>
s3:
region: "us-east-1" #ADD HERE
buckets:
assets: s3://snowplow-hosted-assets # DO NOT CHANGE unless you are hosting the jarfiles etc yourself in your own bucket
jsonpath_assets: # If you have defined your own JSON Schemas, add the s3:// path to your own JSON Path files in your own bucket here
log: s3:// #ADD HERE
encrypted: false
raw:
in: # Multiple in buckets are permitted
- s3:// #- ADD HERE # e.g. s3://my-in-bucket
#- ADD HERE
processing: s3:// #ADD HERE
archive: s3:// #ADD HERE # e.g. s3://my-archive-bucket/in
enriched:
good: s3:// #ADD HERE # e.g. s3://my-out-bucket/enriched/good
bad: s3:// #ADD HERE # e.g. s3://my-out-bucket/enriched/bad
errors: s3:// #ADD HERE # Leave blank unless continue_on_unexpected_error: set to true below
archive: s3:// #ADD HERE # Where to archive enriched events to, e.g. s3://my-archive-bucket/enriched
shredded:
good: s3:// #ADD HERE # e.g. s3://my-out-bucket/shredded/good
bad: s3:// #ADD HERE # e.g. s3://my-out-bucket/shredded/bad
errors: s3:// #ADD HERE # Leave blank unless continue_on_unexpected_error: set to true below
archive: s3:// #ADD HERE # Where to archive shredded events to, e.g. s3://my-archive-bucket/shredded
consolidate_shredded_output: false
emr:
#job_name: Snowplow ETL # Give your job a name
ami_version: 5.9.0 # Don’t change this
region: "us-east-1" #ADD HERE # Always set this
jobflow_role: EMR_EC2_DefaultRole # Created using aws emr create-default-roles
service_role: EMR_DefaultRole # Created using aws emr create-default-roles
placement: #ADD HERE # Set this if not running in VPC. Leave blank otherwise
ec2_subnet_id: #ADD HERE # Set this if running in VPC. Leave blank otherwise
ec2_key_name: snowplow #ADD HERE
security_configuration:
bootstrap: # Set this to specify custom bootstrap actions. Leave empty otherwise
software:
hbase: "0.92.0" # Optional. To launch on cluster, provide version, "0.92.0", keep quotes. Leave empty otherwise.
lingual: "1.1" # Optional. To launch on cluster, provide version, "1.1", keep quotes. Leave empty otherwise.
# Adjust your Spark cluster below
jobflow:
job_name: Snowplow ETL
master_instance_type: m1.medium #m1.medium
core_instance_count: 2 #2
core_instance_type: m1.medium #m1.medium
core_instance_ebs: # Optional. Attach an EBS volume to each core instance.
volume_size: 100 # Gigabytes
volume_type: "gp2"
volume_iops: 400 # Optional. Will only be used if volume_type is “io1”
ebs_optimized: false # Optional. Will default to true
task_instance_count: 0 # Increase to use spot instances
task_instance_type: m1.medium #m1.medium
task_instance_bid: #0.015 # In USD. Adjust bid, or leave blank for non-spot-priced (i.e. on-demand) task instances
bootstrap_failure_tries: 3 # Number of times to attempt the job in the event of bootstrap failures
additional_info: # Optional JSON string for selecting additional features
collectors:
format: 'thrift' # 'cloudfront' or 'clj-tomcat' for the Clojure Collector, or 'thrift' for Thrift records, or 'tsv/com.amazon.aws.cloudfront/wd_access_log' for Cloudfront access logs
enrich:
versions:
spark_enrich: 1.10.0 # Version of the Spark Enrichment process
continue_on_unexpected_error: false # Set to ‘true’ (and set out_errors: above) if you don’t want any exceptions thrown from ETL
output_compression: NONE # Compression only supported with Redshift, set to NONE if you have Postgres targets. Allowed formats: NONE, GZIP
storage:
versions:
rdb_shredder: 0.13.0 # Version of the Relational Database Shredding process
rdb_loader: 0.14.0 # Version of the Relational Database Loader app
hadoop_elasticsearch: 0.1.0 # Version of the Hadoop to Elasticsearch copying process
monitoring:
tags: {} # Name-value pairs describing this job
logging:
level: DEBUG # You can optionally switch to INFO for production
snowplow:
method: get
app_id: snowplow #ADD HERE # e.g. snowplow
collector: localhost:8001/ #ADD HERE # e.g. d3rkrsqld9gmqf.cloudfront.net
Also, I have already specified the correct full s3:// bucket names.
Hi @hareeshsoni,
It is a bit tricky to debug your config because the text gets formatted strangely when you copy it into this forum. Nevertheless, I have taken your config, reformatted it, and it works OK for me. I have pasted the config I tried at the bottom of this message.
Could you please try copying and pasting this config exactly into your local setup? I am very interested to know whether you still get an error with this version.
If you still get errors, it would be helpful if you could paste back your config as preformatted text. You can do this in Discourse either by using Markdown syntax or by clicking the “preformatted text” button in the text editor.
Please also always post back the exact error message.
aws:
# Credentials can be hardcoded or set in environment variables
access_key_id: "env" #<%= ENV['AWS_SNOWPLOW_ACCESS_KEY'] %>
secret_access_key: "env" #<%= ENV['AWS_SNOWPLOW_SECRET_KEY'] %>
s3:
region: "us-east-1" #ADD HERE
buckets:
assets: s3://snowplow-hosted-assets # DO NOT CHANGE unless you are hosting the jarfiles etc yourself in your own bucket
jsonpath_assets: # If you have defined your own JSON Schemas, add the s3:// path to your own JSON Path files in your own bucket here
log: s3://xxx #ADD HERE
encrypted: false
raw:
in: # Multiple in buckets are permitted
- s3://xxx #- ADD HERE # e.g. s3://my-in-bucket
#- ADD HERE
processing: s3://xxx #ADD HERE
archive: s3://xxx #ADD HERE # e.g. s3://my-archive-bucket/in
enriched:
good: s3://xxx #ADD HERE # e.g. s3://my-out-bucket/enriched/good
bad: s3://xxx #ADD HERE # e.g. s3://my-out-bucket/enriched/bad
errors: s3://xxx #ADD HERE # Leave blank unless continue_on_unexpected_error: set to true below
archive: s3://xxx #ADD HERE # Where to archive enriched events to, e.g. s3://my-archive-bucket/enriched
shredded:
good: s3://xxx #ADD HERE # e.g. s3://my-out-bucket/shredded/good
bad: s3://xxx #ADD HERE # e.g. s3://my-out-bucket/shredded/bad
errors: s3://xxx #ADD HERE # Leave blank unless continue_on_unexpected_error: set to true below
archive: s3://xxx #ADD HERE # Where to archive shredded events to, e.g. s3://my-archive-bucket/shredded
consolidate_shredded_output: false
emr:
#job_name: Snowplow ETL # Give your job a name
ami_version: 5.9.0 # Don’t change this
region: "us-east-1" #ADD HERE # Always set this
jobflow_role: EMR_EC2_DefaultRole # Created using aws emr create-default-roles
service_role: EMR_DefaultRole # Created using aws emr create-default-roles
placement: #ADD HERE # Set this if not running in VPC. Leave blank otherwise
ec2_subnet_id: #ADD HERE # Set this if running in VPC. Leave blank otherwise
ec2_key_name: snowplow #ADD HERE
security_configuration:
bootstrap: # Set this to specify custom bootstrap actions. Leave empty otherwise
software:
hbase: "0.92.0" # Optional. To launch on cluster, provide version, "0.92.0", keep quotes. Leave empty otherwise.
lingual: "1.1" # Optional. To launch on cluster, provide version, "1.1", keep quotes. Leave empty otherwise.
# Adjust your Spark cluster below
jobflow:
job_name: Snowplow ETL
master_instance_type: m1.medium #m1.medium
core_instance_count: 2 #2
core_instance_type: m1.medium #m1.medium
core_instance_ebs: # Optional. Attach an EBS volume to each core instance.
volume_size: 100 # Gigabytes
volume_type: "gp2"
volume_iops: 400 # Optional. Will only be used if volume_type is "io1"
ebs_optimized: false # Optional. Will default to true
task_instance_count: 0 # Increase to use spot instances
task_instance_type: m1.medium #m1.medium
task_instance_bid: #0.015 # In USD. Adjust bid, or leave blank for non-spot-priced (i.e. on-demand) task instances
bootstrap_failure_tries: 3 # Number of times to attempt the job in the event of bootstrap failures
additional_info: # Optional JSON string for selecting additional features
collectors:
format: 'thrift' #‘cloudfront’ Or ‘clj-tomcat’ for the Clojure Collector, or ‘thrift’ for Thrift records, or ‘tsv/com.amazon.aws.cloudfront/wd_access_log’ for Cloudfront access logs
enrich:
versions:
spark_enrich: 1.10.0 # Version of the Spark Enrichment process
continue_on_unexpected_error: false # Set to ‘true’ (and set out_errors: above) if you don’t want any exceptions thrown from ETL
output_compression: NONE # Compression only supported with Redshift, set to NONE if you have Postgres targets. Allowed formats: NONE, GZIP
storage:
versions:
rdb_shredder: 0.13.0 # Version of the Relational Database Shredding process
rdb_loader: 0.14.0 # Version of the Relational Database Loader app
hadoop_elasticsearch: 0.1.0 # Version of the Hadoop to Elasticsearch copying process
monitoring:
tags: {} # Name-value pairs describing this job
logging:
level: DEBUG # You can optionally switch to INFO for production
snowplow:
method: get
app_id: snowplow #ADD HERE # e.g. snowplow
collector: localhost:8001/ #ADD HERE # e.g. d3rkrsqld9gmqf.cloudfront.net
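Once you have this config saved locally (as config.yml, say), the invocation would look roughly like the line below. The file and directory names are only placeholders for your own paths, and it is worth double-checking the exact options against ./snowplow-emr-etl-runner run --help for your release:
./snowplow-emr-etl-runner run --config config.yml --resolver resolver.json --enrichments enrichments/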