Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
##################################################################
#
# This script uploads files on hdfs directory to s3 RECURSIVELY
#
# usage: hdf2s3.sh <source hdfs dir> <s3://your bucket>
#
# Every regular file found under the HDFS directory is downloaded
# into a private temporary directory, uploaded with s3cmd to
# s3://<bucket>/<full hdfs path>, and then removed locally.
# Objects that already exist at the exact destination URI are
# skipped. Requires the `hdfs` and `s3cmd` commands on PATH.
#
##################################################################
set -eu

# Path of the scratch directory; set by main, removed by cleanup.
tmp_folder=""

#######################################
# Print a message to stderr and abort the script.
# Arguments: $* - message text
# Returns:   never (exits with status 1)
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Remove the scratch directory. Installed as an EXIT trap so that
# partially downloaded files are not left behind when a step fails.
# Globals:   tmp_folder (read)
#######################################
cleanup() {
  if [[ -n "${tmp_folder}" && -d "${tmp_folder}" ]]; then
    rm -rf -- "${tmp_folder}"
  fi
}

#######################################
# List every non-directory entry below an HDFS directory.
# Arguments: $1 - HDFS directory to walk
# Outputs:   one path per line on stdout
# Returns:   non-zero when `hdfs dfs -ls -R` fails
#######################################
list_hdfs_files() {
  local listing
  # errexit is not active inside command substitutions, so the failure
  # has to be propagated by hand.
  listing=$(hdfs dfs -ls -R "$1") || return

  # A listing row is: perms, replication, owner, group, size, date, time,
  # path. Directories are recognised by the leading "d" of the permission
  # column only, so a file whose name merely contains "drw" is kept.
  # The first seven columns are peeled off one at a time so that a path
  # containing spaces survives intact; rows with fewer than eight columns
  # (summary/header lines) are ignored.
  awk '
    $1 !~ /^d/ && NF >= 8 {
      path = $0
      for (i = 1; i < 8; i++) {
        sub(/^ *[^ ]+ +/, "", path)
      }
      print path
    }
  ' <<<"${listing}"
}

#######################################
# Build the destination URI for one file.
# Arguments: $1 - bucket (and optional key prefix) without "s3://"
#            $2 - source path as reported by hdfs
# Outputs:   s3://<bucket>/<path> with every run of "/" collapsed
#######################################
build_s3_dest() {
  local joined
  joined=$(printf '%s/%s' "$1" "$2" | tr -s '/')
  printf 's3://%s\n' "${joined}"
}

#######################################
# Check whether an object exists at exactly the given URI.
# `s3cmd ls` matches by prefix, so counting its output lines would
# report "foo.txt" as present when only "foo.txt.bak" exists; the
# listing is therefore searched for a line ending in the exact URI.
# Arguments: $1 - full s3:// URI of the object
# Returns:   0 if present, 1 otherwise
#######################################
s3_object_exists() {
  local listing line
  # A failing `s3cmd ls` is treated as "not present", as the original
  # script did; a real problem then surfaces when the upload is attempted.
  listing=$(s3cmd ls "$1") || return 1
  while IFS= read -r line; do
    if [[ "${line}" == *" $1" ]]; then
      return 0
    fi
  done <<<"${listing}"
  return 1
}

#######################################
# Entry point: validate arguments, then copy each file.
# Globals:   tmp_folder (written)
# Arguments: $1 - source HDFS directory
#            $2 - target, must start with s3://
#######################################
main() {
  if [[ $# -ne 2 ]]; then
    die " usage: hdf2s3.sh <source hdfs dir> <s3://your bucket>"
  fi
  if [[ ! "$2" =~ ^s3:// ]]; then
    die "s3 URI should start with s3://"
  fi

  local directory_to_upload=$1
  # Keep only "<bucket>[/prefix]": drop the scheme and one trailing slash.
  local target_bucket=${2#s3://}
  target_bucket=${target_bucket%/}
  if [[ -z "${target_bucket}" ]]; then
    die "s3 URI should contain a bucket name"
  fi

  tmp_folder=$(mktemp -d "/tmp/${0##*/}.XXXXXX") || exit 1
  trap cleanup EXIT

  local file_list
  file_list=$(list_hdfs_files "${directory_to_upload}") \
    || die "failed to list ${directory_to_upload}"

  # Collect the paths into an array; blank lines are dropped so that an
  # empty listing is reported as 0 files rather than 1.
  local -a files=()
  local uploadfile_count=0
  local line
  while IFS= read -r line; do
    if [[ -n "${line}" ]]; then
      files+=("${line}")
      uploadfile_count=$((uploadfile_count + 1))
    fi
  done <<<"${file_list}"

  echo "TOTAL FILES TO UPLOAD = ${uploadfile_count}"
  if [[ ${uploadfile_count} -eq 0 ]]; then
    # Also avoids expanding an empty array under `set -u` on older bash.
    return 0
  fi

  local counter=0
  local src filename dest
  for src in "${files[@]}"; do
    filename=${src##*/}
    dest=$(build_s3_dest "${target_bucket}" "${src}")
    counter=$((counter + 1))
    echo "##### Uploading ${counter}/${uploadfile_count} #####"

    if s3_object_exists "${dest}"; then
      echo "The file ${dest} already exists. Skipping..."
      continue
    fi

    hdfs dfs -get "${src}" "${tmp_folder}/"
    echo s3cmd --quiet put "${tmp_folder}/${filename}" "${dest}"
    s3cmd --quiet put "${tmp_folder}/${filename}" "${dest}"
    # Free local disk immediately; only one file is staged at a time.
    rm -f -- "${tmp_folder}/${filename}"
  done
}

main "$@"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement