From e46e378cb58dbf8ce287c560777278f28b54c59b Mon Sep 17 00:00:00 2001 From: jfg9 Date: Wed, 19 Feb 2014 11:25:22 +0000 Subject: [PATCH 1/2] Added hyphen to hostname regex Hyphen is also a valid character in hostnames --- camus2hive.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/camus2hive.sh b/camus2hive.sh index ffee6fd..72dcdb9 100755 --- a/camus2hive.sh +++ b/camus2hive.sh @@ -85,7 +85,7 @@ fi HIVE="hive --database $DATABASE -S" # What namenode Hive is communicating with for this database -NAME_NODE_URI=$(${HIVE} -e "describe database $DATABASE;" | sed -re 's%.*\t(hdfs://[a-zA-Z0-9]+)(:[0-9]+)?.*%\1\2%') +NAME_NODE_URI=$(${HIVE} -e "describe database $DATABASE;" | sed -re 's%.*\t(hdfs://[a-zA-Z0-9\-]+)(:[0-9]+)?.*%\1\2%') # Behavior config REQUERY_HADOOP_DIRS=true From 17fd26b5cd60920720a52f21d326dcba5f1fee85 Mon Sep 17 00:00:00 2001 From: jfg9 Date: Wed, 19 Feb 2014 11:29:08 +0000 Subject: [PATCH 2/2] Remove logging lines from Hive output Filter Hive output by removing any logging lines (detected as a line beginning with '['). Currently, when running Hive with Hadoop 2, Hive will output log messages to the console. E.g. [deprecation] - mapred.reduce.tasks is deprecated. Instead, use mapreduce.job.reduces This commit filters out these log messages before processing the Hive output. 
--- camus2hive.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/camus2hive.sh b/camus2hive.sh index 72dcdb9..f793957 100755 --- a/camus2hive.sh +++ b/camus2hive.sh @@ -85,7 +85,7 @@ fi HIVE="hive --database $DATABASE -S" # What namenode Hive is communicating with for this database -NAME_NODE_URI=$(${HIVE} -e "describe database $DATABASE;" | sed -re 's%.*\t(hdfs://[a-zA-Z0-9\-]+)(:[0-9]+)?.*%\1\2%') +NAME_NODE_URI=$(${HIVE} -e "describe database $DATABASE;" | grep -v '^\[' | sed -re 's%.*\t(hdfs://[a-zA-Z0-9\-]+)(:[0-9]+)?.*%\1\2%') # Behavior config REQUERY_HADOOP_DIRS=true @@ -175,7 +175,7 @@ while read topic; do fi # Check if the table already exists in Hive -${HIVE} -e "SHOW PARTITIONS $topic" 1> $EXISTING_HIVE_PARTITIONS_WITH_SLASHES 2> $HIVE_STDERR +${HIVE} -e "SHOW PARTITIONS $topic" 2> $HIVE_STDERR | grep -v '^\[' 1> $EXISTING_HIVE_PARTITIONS_WITH_SLASHES; (exit "${PIPESTATUS[0]}") # preserve Hive's stderr capture and exit status (not grep's) for the check below if ! hive_success_check "Table '$topic' does not currently exist in Hive (or Hive returned some other error on SHOW PARTITIONS $topic)."; then if [[ ! -z "$AVRO_SCHEMA_REPOSITORY" ]]; then @@ -214,7 +214,7 @@ ${HIVE} -e "SHOW PARTITIONS $topic" 1> $EXISTING_HIVE_PARTITIONS_WITH_SLASHES 2> fi fi - cat $EXISTING_HIVE_PARTITIONS_WITH_SLASHES | sed 's%/%, %g' > $EXISTING_HIVE_PARTITIONS + cat $EXISTING_HIVE_PARTITIONS_WITH_SLASHES | grep -v '^\[' | sed 's%/%, %g' > $EXISTING_HIVE_PARTITIONS # Extract all partitions currently ingested by Camus hdfs dfs -ls -R $CAMUS_DESTINATION_DIR/$topic | sed "s%.*$CAMUS_DESTINATION_DIR/$topic/hourly/\([0-9]*\)/\([0-9]*\)/\([0-9]*\)/\([0-9]*\)/.*%year=\1, month=\2, day=\3, hour=\4%" | grep "year.*" | sort | uniq > $EXISTING_CAMUS_PARTITIONS