Hi @ihor ,
I have comeup with the new issue when I tried to run the EmrEtlRunner.
etl/snowplow-emr-etl-runner run --config etl/config.yml --resolver etl/resolver.json
Following is the config.yml
aws:
access_key_id: *************************
secret_access_key: ****************************
s3:
region: eu-west-1
buckets:
assets: 's3://sp-dev-hostedassetsmirror-25zbzj9qbp5t'
jsonpath_assets: 's3://sp-dev-iglurepo-3in6ep4wt6ld/jsonpaths'
log: 's3://sp-dev-badevents-recovered/logs/2020-11-01/'
encrypted: false
raw:
in:
- 's3://sp-dev-badevents-recovered/2020-11-01/'
processing: 's3://sp-dev-badevents-recovered/2020-11-01/processing'
archive: 's3://sp-dev-archive-badevents-recovered/raw/'
enriched:
good: 's3://sp-dev-badevents-recovered/enriched/good/2020-11-01/'
bad: 's3://sp-dev-badevents-recovered/enriched/bad/2020-11-01/'
errors: 's3://sp-dev-badevents-recovered/enriched/errors/2020-11-01/'
archive: 's3://sp-dev-archive-badevents-recovered/enriched/good/2020-11-01/'
shredded:
good: 's3://sp-dev-badevents-recovered/shredded/good/2020-11-01/'
bad: 's3://sp-dev-badevents-recovered/shredded/bad/2020-11-01/'
errors: 's3://sp-dev-badevents-recovered/shredded/errors/2020-11-01/'
archive: 's3://sp-dev-archive-badevents-recovered/shredded/good/2020-11-01/'
emr:
ami_version: 5.9.0
region: eu-west-1
jobflow_role: EMR_EC2_DefaultRole
service_role: EMR_DefaultRole
placement: null
ec2_subnet_id: null
ec2_key_name: '--'
bootstrap: []
software:
hbase: null
lingual: null
jobflow:
job_name: Snowplow BadData ETL
master_instance_type: m4.xlarge
core_instance_count: 1
core_instance_type: m4.xlarge
core_instance_ebs:
volume_size: 100
volume_type: gp2
volume_iops: null
ebs_optimized: false
task_instance_count: 2
task_instance_type: m4.xlarge
task_instance_bid: 0.999
bootstrap_failure_tries: 2
configuration:
yarn-site:
yarn.resourcemanager.am.max-attempts: '1'
spark:
maximizeResourceAllocation: 'true'
additional_info: null
collectors:
format: thrift
enrich:
versions:
spark_enrich: 1.14.0
continue_on_unexpected_error: true
output_compression: GZIP
storage:
versions:
rdb_loader: 0.14.0
rdb_shredder: 0.13.1
hadoop_elasticsearch: 0.1.0
monitoring:
tags: {}
logging: null
level: DEBUG
snowplow:
method: get
app_id: pf-dev-snowplow
collector: c-dev.propertyfinder.ae
Failing with the following error :
ERROR: org.jruby.embed.EvalFailedException: (ReturnContractError) Contract violation for return value:
Expected: #<Contracts::Maybe:0x333e01c6 @vals=[{:aws=>{:access_key_id=>String, :secret_access_key=>String, :s3=>{:region=>String, :buckets=>{:assets=>String, :jsonpath_assets=>#<Contracts::Maybe:0x2487b621 @vals=[String, nil]>, :log=>String, :raw=>#<Contracts::Maybe:0x504b4a97 @vals=[{:in=>#<Contracts::CollectionOf:0x3e79473d @contract=String, @collection_class=Array>, :processing=>String, :archive=>String}, nil]>, :enriched=>{:good=>String, :bad=>#<Contracts::Maybe:0x49741e80 @vals=[String, nil]>, :errors=>#<Contracts::Maybe:0x39acf187 @vals=[String, nil]>, :archive=>#<Contracts::Maybe:0xdd3e1e3 @vals=[String, nil]>, :stream=>#<Contracts::Maybe:0x7878459f @vals=[String, nil]>}, :shredded=>{:good=>String, :bad=>String, :errors=>#<Contracts::Maybe:0x4ef10d3b @vals=[String, nil]>, :archive=>#<Contracts::Maybe:0x749ffdc7 @vals=[String, nil]>}}}, :emr=>{:ami_version=>String, :region=>String, :jobflow_role=>String, :service_role=>String, :placement=>#<Contracts::Maybe:0x74ab8610 @vals=[String, nil]>, :ec2_subnet_id=>#<Contracts::Maybe:0x296949c8 @vals=[String, nil]>, :ec2_key_name=>String, :bootstrap=>#<Contracts::Maybe:0x729d1428 @vals=[#<Contracts::CollectionOf:0x257e8c43 @contract=String, @collection_class=Array>, nil]>, :software=>{:hbase=>#<Contracts::Maybe:0x3f0b5619 @vals=[String, nil]>, :lingual=>#<Contracts::Maybe:0x36ce9eaf @vals=[String, nil]>}, :jobflow=>{:job_name=>String, :master_instance_type=>String, :core_instance_count=>Contracts::Num, :core_instance_type=>String, :core_instance_ebs=>#<Contracts::Maybe:0xdc3eda6 @vals=[{:volume_size=>#<Proc:0x77587422@uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb:28 (lambda)>, :volume_type=>#<Proc:0x39eea4f6@uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb:27 (lambda)>, :volume_iops=>#<Contracts::Maybe:0x5c94d4b8 @vals=[#<Proc:0x77587422@uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb:28 (lambda)>, nil]>, :ebs_optimized=>#<Contracts::Maybe:0x308d8de8 @vals=[Contracts::Bool, nil]>}, nil]>, :task_instance_count=>Contracts::Num, :task_instance_type=>String, :task_instance_bid=>#<Contracts::Maybe:0x325236f5 @vals=[Contracts::Num, nil]>}, :additional_info=>#<Contracts::Maybe:0x73633230 @vals=[String, nil]>, :bootstrap_failure_tries=>Contracts::Num, :configuration=>#<Contracts::Maybe:0x69d2c460 @vals=[#<Contracts::HashOf:0x60be9fdf @key=Symbol, @value=#<Contracts::HashOf:0x7d816d32 @key=Symbol, @value=String>>, nil]>}}, :collectors=>#<Contracts::Maybe:0x3e984100 @vals=[{:format=>String}, nil]>, :enrich=>{:versions=>#<Contracts::Maybe:0x47d7e4b6 @vals=[{:spark_enrich=>String}, nil]>, :continue_on_unexpected_error=>#<Contracts::Maybe:0x49e92724 @vals=[Contracts::Bool, nil]>, :output_compression=>#<Proc:0x36e95b7b@uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb:26 (lambda)>}, :storage=>{:versions=>{:rdb_shredder=>String, :hadoop_elasticsearch=>String, :rdb_loader=>String}}, :monitoring=>{:tags=>#<Contracts::HashOf:0x58189132 @key=Symbol, @value=String>, :logging=>{:level=>String}, :snowplow=>#<Contracts::Maybe:0x6f9999f6 @vals=[{:method=>String, :collector=>String, :app_id=>String}, nil]>}}, nil]>,
Actual: {:aws=>{:access_key_id=>"", :secret_access_key=>"", :s3=>{:region=>“eu-west-1”, :buckets=>{:assets=>“s3://sp-dev-hostedassetsmirror-25zbzj9qbp5t”, :jsonpath_assets=>“s3://sp-dev-iglurepo-3in6ep4wt6ld/jsonpaths”, :log=>“s3://sp-dev-badevents-recovered/logs/2020-11-01/”, :encrypted=>false, :raw=>{:in=>[“s3://sp-dev-badevents-recovered/2020-11-01/”], :processing=>“s3://sp-dev-badevents-recovered/2020-11-01/processing”, :archive=>“s3://sp-dev-archive-badevents-recovered/raw/”}, :enriched=>{:good=>“s3://sp-dev-badevents-recovered/enriched/good/2020-11-01/”, :bad=>“s3://sp-dev-badevents-recovered/enriched/bad/2020-11-01/”, :errors=>“s3://sp-dev-badevents-recovered/enriched/errors/2020-11-01/”, :archive=>“s3://sp-dev-archive-badevents-recovered/enriched/good/2020-11-01/”}, :shredded=>{:good=>“s3://sp-dev-badevents-recovered/shredded/good/2020-11-01/”, :bad=>“s3://sp-dev-badevents-recovered/shredded/bad/2020-11-01/”, :errors=>“s3://sp-dev-badevents-recovered/shredded/errors/2020-11-01/”, :archive=>“s3://sp-dev-archive-badevents-recovered/shredded/good/2020-11-01/”}}}, :emr=>{:ami_version=>“5.9.0”, :region=>“eu-west-1”, :jobflow_role=>“EMR_EC2_DefaultRole”, :service_role=>“EMR_DefaultRole”, :placement=>nil, :ec2_subnet_id=>nil, :ec2_key_name=>"–", :bootstrap=>, :software=>{:hbase=>nil, :lingual=>nil}, :jobflow=>{:job_name=>“Snowplow BadData ETL”, :master_instance_type=>“m4.xlarge”, :core_instance_count=>1, :core_instance_type=>“m4.xlarge”, :core_instance_ebs=>{:volume_size=>100, :volume_type=>“gp2”, :volume_iops=>nil, :ebs_optimized=>false}, :task_instance_count=>2, :task_instance_type=>“m4.xlarge”, :task_instance_bid=>0.999}, :bootstrap_failure_tries=>2, :configuration=>{:“yarn-site”=>{:“yarn.resourcemanager.am.max-attempts”=>“1”}, :spark=>{:maximizeResourceAllocation=>“true”}}, :additional_info=>nil}, :collectors=>{:format=>“thrift”}, :enrich=>{:versions=>{:spark_enrich=>“1.14.0”}, :continue_on_unexpected_error=>true, :output_compression=>“GZIP”}, :storage=>{:versions=>{:rdb_loader=>“0.14.0”, :rdb_shredder=>“0.13.1”, :hadoop_elasticsearch=>“0.1.0”}}, :monitoring=>{:tags=>{}, :logging=>nil, :level=>“DEBUG”}, :snowplow=>{:method=>“get”, :app_id=>“pf-dev-snowplow”, :collector=>“c-dev.propertyfinder.ae”}}}
Value guarded in: Snowplow::EmrEtlRunner::Cli::load_config
** With Contract: Maybe, String, Bool => Maybe**
** At: uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:202**
I checked actual and expected results and i couldnot find out the real issue.
Can you please guide me on this plus I would like to know whats the relevance of the snowplow block part in this YAML.
Whatever name I specified as app_id and collector are just a name I defined. (no relevance), let me know if it has to be the real snowplow collector name that we use for the real time streaming pipeline.