Hello @alex,
#!/usr/bin/env bash
#
# Launches an EMR cluster (emr-4.3.0) that recovers bad Snowplow events in
# three sequential steps:
#   1. "Combine Months"  — s3-dist-cp compacts the 2016-08 bad rows from S3
#      into larger LZO files on the cluster's local HDFS.
#   2. "Fix up bad rows" — runs the Snowplow hadoop-event-recovery jar with a
#      base64-encoded JavaScript `process` function that, for GET events,
#      rewrites the 'p' querystring parameter to 'mob'.
#   3. "Back to S3"      — copies the recovered rows back to S3 for
#      reprocessing by the regular pipeline.
#
# Requires: the AWS CLI configured with credentials permitted to create EMR
# clusters in eu-west-1.
#
# NOTE(review): the JSON payloads below are passed verbatim to the AWS CLI
# and are intentionally left byte-for-byte unmodified.

set -euo pipefail  # fail fast: abort on command failure, unset vars, or a failing pipe stage

main() {
# Single create-cluster call; each --steps entry runs in order and the
# "Combine Months" / "Back to S3" steps terminate the cluster on failure,
# while "Fix up bad rows" continues so logs can still be flushed.
aws emr create-cluster --applications Name=Hadoop --ec2-attributes '{
"InstanceProfile":"EMR_EC2_DefaultRole",
"AvailabilityZone":"eu-west-1b"
}' --service-role EMR_DefaultRole --enable-debugging --release-label emr-4.3.0 --log-uri 's3n://company-snowplow-troubleshoot/recovery/logs/cluster/' --steps '[
{
"Args":[
"--src",
"s3n://company-snowplow-efritin/logs/enriched/bad/",
"--dest",
"hdfs:///local/monthly/",
"--groupBy",
".*(run)=2016-08.*",
"--targetSize",
"128",
"--outputCodec",
"lzo"
],
"Type":"CUSTOM_JAR",
"ActionOnFailure":"TERMINATE_CLUSTER",
"Jar":"/usr/share/aws/emr/s3-dist-cp/lib/s3-dist-cp.jar",
"Name":"Combine Months"
},
{
"Args":[
"com.snowplowanalytics.hadoop.scalding.SnowplowEventRecoveryJob",
"--hdfs",
"--input",
"hdfs:///local/monthly/*",
"--output",
"hdfs:///local/recovery/",
"--inputFormat",
"bad",
"--script",
"ZnVuY3Rpb24gcHJvY2VzcyhldmVudCwgZXJyb3JzKSB7CiAgdmFyIGZpZWxkcyA9IHRzdlRvQXJyYXkoZXZlbnQpOwogICAgaWYgKGZpZWxkc1s1XSA9PSAnR0VUJykgewogICAgICAgIHZhciBxdWVyeXN0cmluZyA9IHBhcnNlUXVlcnlzdHJpbmcoZmllbGRzWzExXSk7CiAgICAgICAgcXVlcnlzdHJpbmdbJ3AnXSA9ICdtb2InOwogICAgICAgIGZpZWxkc1sxMV0gPSBidWlsZFF1ZXJ5c3RyaW5nKHF1ZXJ5c3RyaW5nKTsKICAgIH0KICAgIHJldHVybiBhcnJheVRvVHN2KGZpZWxkcyk7Cn0KCg==" ],
"Type":"CUSTOM_JAR",
"ActionOnFailure":"CONTINUE",
"Jar":"s3://snowplow-hosted-assets/3-enrich/hadoop-event-recovery/snowplow-hadoop-event-recovery-0.2.0.jar",
"Name":"Fix up bad rows"
},
{
"Args":[
"--src",
"hdfs:///local/recovery/",
"--dest",
"s3n://company-snowplow-troubleshoot/logs/raw/processing/"
],
"Type":"CUSTOM_JAR",
"ActionOnFailure":"TERMINATE_CLUSTER",
"Jar":"/usr/share/aws/emr/s3-dist-cp/lib/s3-dist-cp.jar",
"Name":"Back to S3"
}
]' --name 'Snowplow-Recovery-Cluster' --instance-groups '[
{
"InstanceCount":1,
"InstanceGroupType":"MASTER",
"InstanceType":"m1.medium",
"Name":"MASTER"
},
{
"InstanceCount":2,
"InstanceGroupType":"CORE",
"InstanceType":"m3.xlarge",
"Name":"CORE"
}
]'
}
main "$@"
This is my EMR configuration. When I run it, the "Fix up bad rows"
step fails. Here is the log:
2016-08-17 18:36:59,456 INFO cascading.flow.hadoop.util.HadoopUtil (main): resolving application jar from found main method on: com.snowplowanalytics.hadoop.scalding.JobRunner$
2016-08-17 18:36:59,459 INFO cascading.flow.hadoop.planner.HadoopPlanner (main): using application jar: /mnt/var/lib/hadoop/steps/s-2G3U750KH6Q9K/snowplow-hadoop-event-recovery-0.2.0.jar
2016-08-17 18:36:59,478 INFO cascading.property.AppProps (main): using app.id: 04799887A3E1448CB011342C972A9FE3
2016-08-17 18:36:59,692 INFO org.apache.hadoop.conf.Configuration.deprecation (main): mapred.used.genericoptionsparser is deprecated. Instead, use mapreduce.client.genericoptionsparser.used
2016-08-17 18:36:59,756 INFO org.apache.hadoop.conf.Configuration.deprecation (main): mapred.output.compress is deprecated. Instead, use mapreduce.output.fileoutputformat.compress
2016-08-17 18:36:59,770 WARN cascading.util.Version (main): found multiple 'cascading/version.properties' files on the CLASSPATH. Please check your dependencies: file:/mnt/var/lib/hadoop/steps/s-2G3U750KH6Q9K/tmp/hadoop-unjar2414297679156794865/cascading/version.properties,jar:file:/mnt/var/lib/hadoop/steps/s-2G3U750KH6Q9K/snowplow-hadoop-event-recovery-0.2.0.jar!/cascading/version.properties, using first returned
2016-08-17 18:37:00,117 INFO cascading.util.Version (flow com.snowplowanalytics.hadoop.scalding.SnowplowEventRecoveryJob): Concurrent, Inc - Cascading 2.7.0
2016-08-17 18:37:00,121 INFO cascading.flow.Flow (flow com.snowplowanalytics.hadoop.scalding.SnowplowEventRecoveryJob): [com.snowplowanalytics....] starting
2016-08-17 18:37:00,122 INFO cascading.flow.Flow (flow com.snowplowanalytics.hadoop.scalding.SnowplowEventRecoveryJob): [com.snowplowanalytics....] source: Hfs["TextLine[['offset', 'line']->[ALL]]"]["hdfs:/local/monthly/*"]
2016-08-17 18:37:00,122 INFO cascading.flow.Flow (flow com.snowplowanalytics.hadoop.scalding.SnowplowEventRecoveryJob): [com.snowplowanalytics....] sink: Hfs["TextDelimited[['altered']]"]["hdfs:/local/recovery"]
2016-08-17 18:37:00,122 INFO cascading.flow.Flow (flow com.snowplowanalytics.hadoop.scalding.SnowplowEventRecoveryJob): [com.snowplowanalytics....] parallel execution is enabled: true
2016-08-17 18:37:00,122 INFO cascading.flow.Flow (flow com.snowplowanalytics.hadoop.scalding.SnowplowEventRecoveryJob): [com.snowplowanalytics....] starting jobs: 1
2016-08-17 18:37:00,123 INFO cascading.flow.Flow (flow com.snowplowanalytics.hadoop.scalding.SnowplowEventRecoveryJob): [com.snowplowanalytics....] allocating threads: 1
2016-08-17 18:37:00,124 INFO cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] starting step: (1/1) hdfs:/local/recovery
2016-08-17 18:37:00,291 INFO org.apache.hadoop.yarn.client.RMProxy (pool-4-thread-1): Connecting to ResourceManager at ip-10-73-164-48.eu-west-1.compute.internal/10.73.164.48:8032
2016-08-17 18:37:00,743 INFO org.apache.hadoop.yarn.client.RMProxy (pool-4-thread-1): Connecting to ResourceManager at ip-10-73-164-48.eu-west-1.compute.internal/10.73.164.48:8032
2016-08-17 18:37:01,775 INFO amazon.emr.metrics.MetricsSaver (pool-4-thread-1): MetricsConfigRecord disabledInCluster: false instanceEngineCycleSec: 60 clusterEngineCycleSec: 60 disableClusterEngine: false maxMemoryMb: 3072 maxInstanceCount: 500 lastModified: 1471458713864
2016-08-17 18:37:01,775 INFO amazon.emr.metrics.MetricsSaver (pool-4-thread-1): Created MetricsSaver j-2QPHXXGPPETKV:i-d2cf1c59:RunJar:08173 period:60 /mnt/var/em/raw/i-d2cf1c59_20160817_RunJar_08173_raw.bin
2016-08-17 18:37:02,807 INFO com.hadoop.compression.lzo.GPLNativeCodeLoader (pool-4-thread-1): Loaded native gpl library
2016-08-17 18:37:02,824 INFO com.hadoop.compression.lzo.LzoCodec (pool-4-thread-1): Successfully loaded & initialized native-lzo library [hadoop-lzo rev 02f444f0932ea7710dcc4bcdc1aa7ca55adf48c9]
2016-08-17 18:37:02,943 INFO org.apache.hadoop.mapred.FileInputFormat (pool-4-thread-1): Total input paths to process : 10
2016-08-17 18:37:03,163 INFO org.apache.hadoop.mapreduce.JobSubmitter (pool-4-thread-1): number of splits:10
2016-08-17 18:37:03,723 INFO org.apache.hadoop.mapreduce.JobSubmitter (pool-4-thread-1): Submitting tokens for job: job_1471458701193_0002
2016-08-17 18:37:04,037 INFO org.apache.hadoop.yarn.client.api.impl.YarnClientImpl (pool-4-thread-1): Submitted application application_1471458701193_0002
2016-08-17 18:37:04,096 INFO org.apache.hadoop.mapreduce.Job (pool-4-thread-1): The url to track the job: http://ip-10-73-164-48.eu-west-1.compute.internal:20888/proxy/application_1471458701193_0002/
2016-08-17 18:37:04,097 INFO cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] submitted hadoop job: job_1471458701193_0002
2016-08-17 18:37:04,098 INFO cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] tracking url: http://ip-10-73-164-48.eu-west-1.compute.internal:20888/proxy/application_1471458701193_0002/
2016-08-17 18:37:30,317 INFO cascading.util.Update (UpdateRequestTimer): newer Cascading release available: 2.7.1
2016-08-17 18:48:40,069 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] hadoop job job_1471458701193_0002 state at FAILED
2016-08-17 18:48:40,070 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] failure info: Task failed task_1471458701193_0002_m_000001
Job failed as tasks failed. failedMaps:1 failedReduces:0
2016-08-17 18:48:40,106 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] task completion events identify failed tasks
2016-08-17 18:48:40,106 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] task completion events count: 10
2016-08-17 18:48:40,108 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] event = Task Id : attempt_1471458701193_0002_m_000009_0, Status : FAILED
2016-08-17 18:48:40,109 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] event = Task Id : attempt_1471458701193_0002_m_000001_0, Status : FAILED
2016-08-17 18:48:40,109 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] event = Task Id : attempt_1471458701193_0002_m_000009_1, Status : FAILED
2016-08-17 18:48:40,109 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] event = Task Id : attempt_1471458701193_0002_m_000002_0, Status : FAILED
2016-08-17 18:48:40,109 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] event = Task Id : attempt_1471458701193_0002_m_000007_0, Status : FAILED
2016-08-17 18:48:40,109 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] event = Task Id : attempt_1471458701193_0002_m_000001_1, Status : FAILED
2016-08-17 18:48:40,109 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] event = Task Id : attempt_1471458701193_0002_m_000002_1, Status : FAILED
2016-08-17 18:48:40,110 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] event = Task Id : attempt_1471458701193_0002_m_000008_0, Status : SUCCEEDED
2016-08-17 18:48:40,110 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] event = Task Id : attempt_1471458701193_0002_m_000007_1, Status : FAILED
2016-08-17 18:48:40,110 WARN cascading.flow.FlowStep (pool-4-thread-1): [com.snowplowanalytics....] event = Task Id : attempt_1471458701193_0002_m_000009_2, Status : FAILED
2016-08-17 18:48:40,119 INFO cascading.flow.Flow (flow com.snowplowanalytics.hadoop.scalding.SnowplowEventRecoveryJob): [com.snowplowanalytics....] stopping all jobs
2016-08-17 18:48:40,119 INFO cascading.flow.FlowStep (flow com.snowplowanalytics.hadoop.scalding.SnowplowEventRecoveryJob): [com.snowplowanalytics....] stopping: (1/1) hdfs:/local/recovery
2016-08-17 18:48:40,121 INFO cascading.flow.Flow (flow com.snowplowanalytics.hadoop.scalding.SnowplowEventRecoveryJob): [com.snowplowanalytics....] stopped all jobs
2016-08-17 18:48:40,131 INFO cascading.tap.hadoop.util.Hadoop18TapUtil (flow com.snowplowanalytics.hadoop.scalding.SnowplowEventRecoveryJob): deleting temp path hdfs:/local/recovery/_temporary
Can you help me figure out what I am missing?
// Recovery hook for Snowplow hadoop-event-recovery: receives one bad event
// line (TSV) plus its error list, and returns the repaired TSV line.
// For GET events, forces the platform ('p') querystring parameter to 'mob';
// all other events are passed through unchanged (beyond TSV round-tripping).
function process(event, errors) {
  var cols = tsvToArray(event);

  // Field 5 is the HTTP method; only GET events carry the querystring
  // (field 11) that needs patching.
  if (cols[5] != 'GET') {
    return arrayToTsv(cols);
  }

  var params = parseQuerystring(cols[11]);
  params['p'] = 'mob'; // force the platform parameter to 'mob'
  cols[11] = buildQuerystring(params);
  return arrayToTsv(cols);
}