Untitled

#!/bin/bash

# START OF CONFIGURATION

# dbname is the name of the Canvas Data database
dbname=canvas_data

# basedir is the directory containing the data files.
# These may be from the CLI tool or files you've manually downloaded
# The names of the files do not depend on the structure of the CLI tool
# It may be a relative path to the current directory or an absolute path
basedir='/home/muhe/dataFiles'

# checksequence queries the database before importing data to see if
# this sequence has already been imported. It mostly applies to
# incremental files like the requests table. It allows you to leave your
# files on the disk without having to extract them or truncate your tables
# It can also be used to pick back up where you left off, although if the
# sequence bombs in the middle, you may need to truncate the offended table
# Note that this relies on a versions table that was created using the
# accompanying SQL script
checksequence=1

# incrementaltables is a comma separated list of the tables that are partial
# and should not have their tables truncated before importing
# If you are using checksequence=1, then it will try to figure this out from
# the database
incrementaltables=requests

# leaveasgzip temporarily extracts the compressed gzip file for importing
# but then removes the uncompressed version after the import
# This is an attempt to save file space
leaveasgzip=1

# sortdata will pipe the data through the sort -u command
# This can have a huge impact on the import process
sortdata=1

# MYSQL is the mysql command that is needed to execute mysql
# You can put items like username and password here, but it is recommended
# that you configure the ~/.my.cnf file instead
MYSQL='mysql --local-infile --user=yourusername --password=yourpassword'

# verbosity controls the logging of messages
# 0 = no logging of messages
# 1 = minimal logging of 1 per file
# 2 = log importing of data into database
# 3 = more verbose logging including decompressing file and truncating tables
# 4 = log little things that probably don't need logged
verbosity=2

# END OF CONFIGURATION


# Create a temporary file to hold all of the files to be considered
# Store only the directory and basenames to allow for both the compressed
# and uncompressed versions to exist
# This file will be created in your systems $TMPDIR folder,
# which is often /tmp or /var/tmp
# The file will be removed if the process successfully completes, but will
# be orphaned if it aborts
tmpfile="$(mktemp)"

# Exclude requests table ( ! -name "*requests*" )
find "${basedir}" -type f -regextype posix-egrep -regex '.*/([0-9]+_)?[a-z_]+-[0-9]{5}-[0-9a-f]{8}(\.gz)?$' ! -name "*requests*" -printf '%h/' -exec basename {} .gz \; | sort -u > "${tmpfile}"


if [ ${checksequence} -eq 1 ]
then
  # Try to fetch the list of incremental tables from the database
  tables=$( ${MYSQL} ${dbname} -sse "SELECT CONCAT_WS(',', table_name) FROM versions WHERE incremental=1" )
  if [ "${tables}" != "" ]
  then
    incrementaltables="${tables}"
  fi
fi

# Initialize some variables
oldtable=""
oldseq=""
hasprocessed=0

# Iterate through the files
for pathname in $(cat "${tmpfile}")
do

  if [ ${verbosity} -gt 0 ]
  then
    echo "Processing ${pathname}"
  fi

  # Split the filename into parts
  dirname=$(dirname "${pathname}")
  filename=$(basename "${pathname}")
  firstpart=$(echo "${filename}" | cut -f1 -d-)
  tablepart=$(echo "${firstpart}" | sed -r "s/^[0-9]+_//")
  seqpart=$(echo "${firstpart}" | grep -Eo "^[0-9]+")
  numidpart=$(echo "${filename}" | cut -f2- -d-)

  # Check to see if the previous table has been processed, if so then update the database
  if [ ${hasprocessed} -eq 1 -a ${checksequence} -eq 1 -a "${oldtable}" != "${tablepart}" -a "$oldseq" != "" ]
  then

    if [ ${verbosity} -ge 4 ]
    then
      echo "Updating sequence number for ${oldtable} to ${oldseq}"
    fi

    $( ${MYSQL} ${dbname} -sqe "UPDATE versions SET version = ${oldseq} WHERE table_name = '${oldtable}'" )
    hasprocessed=0
  fi

  process=1

  # Get the last version imported
  if [ ${checksequence} -eq 1 -a "$seqpart" != ""  ]
  then

    if [ ${verbosity} -ge 4 ]
    then
      echo "Checking for previously saved version of ${tablepart}"
    fi

    extseq=$( ${MYSQL} ${dbname} -sse "SELECT IFNULL(version,0) FROM versions WHERE table_name='${tablepart}'" )
    if [ "${extseq}" == "" -o ${extseq} -ge $seqpart ]
    then
      process=0
    fi
  fi

  # Process this file
  if [ ${process} -eq 1 ]
  then
    removefile=0
    datafile="${dirname}/${filename}"
    if [ ! -f "${datafile}" ]
    then
      # There is no already-extracted file
      if [ -f "${datafile}.gz" ]
      then
        # There is a gzipped version

        if [ ${verbosity} -ge 3 ]
        then
          echo "Uncompressing ${filename}"
        fi

        if [ ${leaveasgzip} -eq 1 ]
        then
          # Extract it, but plan on removing it later
          gzip -dc "${datafile}.gz" > "${datafile}"
          if [ ${sortdata} -eq 1 ]
          then
            sort -u -o "${datafile}" "${datafile}"
          fi
          removefile=1
        else
          # Extract it and leave it extracted
          gzip -d "${datafile}.gz"
          if [ ${sortdata} -eq 1 ]
          then
            sort -u -o "${datafile}" "${datafile}"
          fi
        fi
      fi
    fi

    # Check to see if this is an incremental table
    partial=0
    if [ "x${incrementaltables/$tablepart}" != "x${incrementaltables}" ]
    then
      partial=1
    fi

    # If it is incremental or the previous file used the same table,
    # then don't truncate it first, but do an append instead
    if [ ${partial} -eq 0 -a "${oldtable}" != "${tablepart}" ]
    then

      if [ ${verbosity} -ge 3 ]
      then
        echo "Truncating ${tablepart}"
      fi

      ${MYSQL} ${dbname} -sqe "TRUNCATE ${tablepart}"

    fi
    # Load the data into the table

    if [ ${verbosity} -ge 2 ]
    then
      echo "Loading ${filename} into ${tablepart}"
    fi

    ${MYSQL} ${dbname} -sqe "LOAD DATA LOCAL INFILE '${datafile}' INTO TABLE ${tablepart}"

    hasprocessed=1
    # Remove the uncompressed version if needed
    if [ ${removefile} -eq 1 ]
    then

      if [ ${verbosity} -ge 4 ]
      then
        echo "Removing uncompressed version of ${datafile}"
      fi

      rm "${datafile}"
    fi

  fi

  # Update the previous table and sequence data
  oldtable=${tablepart}
  oldseq=${seqpart}

done

# Update the sequence on the final file if necessary
if [ ${hasprocessed} -eq 1 -a ${checksequence} -eq 1 -a "${oldtable}" != "" -a "$oldseq" != "" ]
then

  if [ ${verbosity} -ge 4 ]
  then
    echo "Updating sequence number for ${oldtable} to ${oldseq}"
  fi

  $( ${MYSQL} ${dbname} -sqe "UPDATE versions SET version = ${oldseq} WHERE table_name = '${oldtable}'" )
fi

# Remove the list of files
rm "${tmpfile}"