main.py

# Code for Melanie (oDesk)

# Basic Libraries
import datetime
import os.path
import traceback
import csv
import utilities
import sys

# Listdir libraries
from os import listdir
from os.path import isfile, join

# Excel library
from openpyxl import load_workbook

# Declare break words for each individual state

# Following are break words for 1940, 1950, 1960. Uncomment whichever is required
break_words = {
    'AC': ['TERRITORIO', 'ESTADO', 'Total'],
    'AL': ['ESTADO'],
    'AM': ['ESTADO'],
    'BA': ['ESTSADO', 'ESTADO'],
    'CE': ['ESTADO'],
    'ES': ['ESTADO'],
    'GO': ['ESTADO'],
    'MA': ['ESTADO'],
    'PA': ['ESTADO', 'ESTADOS', 'Zona Guajarina (Conclusao)'],
    'MT': ['ESTADO'],
    'MG': ['ESTADO'],
    'PB': ['ESTADO'],
    'PI': ['ESTADO'],
    'PR': ['ESTADO'],
    'PE': ['ESTADO'],
    'RJ': ['ESTADO'],
    'RN': ['ESTADO'],
    'RS': ['ESTADO'],
    'SC': ['ESTADO', 'ESTADO (4)'],
    'SP': ['ESTADO'],
    'DF': ['TOTAIS *', 'TOTAIS', 'DISTRITO FEDERAL', 'DISTRITO FEDERAL*'],
    'SE': ['ESTADO'],
    'AP': ['TERRITORIO'],
    'RO': ['TERRITORIO'],
    'RR' : ['TERRITORIO'],
    'GB': ['ESTADO']
}

# function for returning break word
def estado(state):
    global break_words
    if state in break_words:
        bw = break_words[state]
        if len(bw) == 1: # if there is only break word, return it
            return (bw[0], True)
        else: # if there are more than one break words, return list
            return (bw, False)
    else: # state not found
        return ("", False)

# function for debug message printing
def println(msg):
    print(msg + "\n\n")

println("Script init complete.")

# get current time
now = datetime.datetime.now()

# read input directory argument
println("Reading arguments for input directory..")
args = sys.argv
try:
    input_dir_name = args[1]
except IndexError:
    # invalid syntax used
    println("ERROR! Please use correct syntax.")
    println("Syntax is python main.py <input folder name> .")
    println("Example: python main.py 2014-05-20")
    sys.exit(0)

# Select input and output directory
input_dir = 'input/' + input_dir_name

# Iterate over all files in the input directory
println("Reading input from directory ./" + input_dir + "..")
try:
    # Get list of all files
    allfiles = [ f for f in listdir(input_dir) if isfile(join(input_dir, f)) ]
except:
    # Invalid directory
    println("ERROR! The specified input directory was not found.")
    println("Please make sure you are running the script in the correct directory.")
    sys.exit(0)

# create output folder if output folder doesn't exist
output_dir = 'output/' + input_dir_name
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for f in allfiles:
    # initialize variables
    matrix = []
    rows = cols = i = j = 0
    single_rows = break_row = break_col = 0
    single_rows_complete = break_word_found = 0

    println("New input file found: " + f)

    # Get state and file name without extension
    f_state = utilities.getFileState(f)
    f_name = utilities.removeFileExt(f)
    break_word, bw_is_string = estado(f_state)

    if (break_word == ""):
        println("Omitting input file: " + f + " as no break word was found for this state. State code was: " + f_state)
        continue

    input_file = input_dir + '/' + f
    output_file = output_dir + '/' + f_name + '.csv'

    # Open csv writer for output
    writer = csv.writer(open(output_file, 'wb'))
    println("Output file path: " + repr(output_file))

    # Load first worksheet of input file for processing
    println("Opening Input Excel Workbook..")
    wb = load_workbook(filename = input_file, use_iterators = True)
    ws = wb.active

    # Count number of rows and columns
    println("Counting number of rows and columns..")
    for row in ws.iter_rows():
        rows = rows + 1
        if (rows == 1):
            for cell in row:
                cols = cols + 1

    println("Complete. Total Rows = " + repr(rows) + " and Total Cols = " + repr(cols) + ".")

    # define matrix 2d array for storing excel data
    matrix = [[0 for x in xrange(cols)] for x in xrange(rows)]

    # read excel file into the 2d array
    i = -1;
    println("Reading data into THE MATRIX..")
    for row in ws.iter_rows():
        i = i + 1
        j = -1

        for cell in row:
            j = j + 1
            # println("i = " + repr(i) + " & j = " + repr(j))
            if (cell.value is None):
                matrix[i][j] = ""
            else:
                matrix[i][j] = cell.value

    println("Complete.")

    # print matrix
    # utilities.printMatrix(matrix, 6, 3)

    # println("Value at i = 3, j = 1 is: " + repr(matrix[i][j]))

    # parse matrix again to find estado, in the same iteration count single rows
    if (bw_is_string == 1):
        println("Parsing matrix to find the break word: " + break_word)
    else:
        println("Parsing matrix to find the break word: " + ', '.join(break_word))

    # loop on all rows until break word has been found
    i = 0
    while (i < rows) and (break_word_found == 0):
        j = 0

        # if processing single rows is not over, set v to 0 (to store no of values in that row)
        if (single_rows_complete == 0):
            v = 0

        # loop over all columns in the current row searching for breakword
        while (j < cols):
            # get content of current cell
            cnt = matrix[i][j]

            # if content is not a string then go to next column
            if (not isinstance(cnt, basestring)):
                j = j + 1
                continue

            # format content
            cnt = utilities.removeNonAlNum(cnt.strip())

            # if breakword is found, store its row and col
            if (bw_is_string == True): # if there is only one break word
                if (cnt == break_word): # check equality
                    break_word_found = 1
                    break_row = i
                    break_col = j
            else: # if there are more than one break words
                if (cnt in break_word): # check if content exists in the list
                    break_word_found = 1
                    break_row = i
                    break_col = j

            # if processing single rows is not complete and there is a value in this cell increment v
            if (not (cnt == "")) and (single_rows_complete == 0):
                v = v + 1

            j = j + 1 # Col increment

        # if processing single rows is not complete
        if (single_rows_complete == 0):
            if (v == 1):
                # and there is a single value in this row, increment single row
                # println("There was a single valued row here")
                single_rows = single_rows + 1
            elif (v > 1):
                # there are multiple values, set processing single rows complete
                # println("More than one value was found in the same row here")
                single_rows_complete = 1

        i = i + 1 # Row increment

    if (bw_is_string == 1):
        println("Break word (" + break_word + ") found at row: " + repr(break_row) + " and col: " + repr(break_col))
    else:
        println("Break word (one of " + ', '.join(break_word) + ") found at row: " + repr(break_row) + " and col: " + repr(break_col))

    println(repr(single_rows) + " rows were found at the top with only single string value")

    # Generate output csv heading row
    println("Generating Column Names for Output..")
    if (break_col == 0): # Content starts from first column
        csv_headers = ['Municipios']
    else: # Content starts from second column, with first column being S.No.
        csv_headers = ['S.No.', 'Municipios']
    j = break_col + 1
    i = break_row - 1

    # loop over columns starting from break col's next col
    while (j < cols):
        # call the get col label method
        col_label = utilities.getRecursiveColLabel(matrix, i, j, single_rows, break_row)

        # format it so that there is no _ at the end
        col_label = col_label[1:]
        if (col_label[-1:] == '_'):
            col_label = col_label[:-1]

        # println("Column name for col=" + repr(j) + " is: " + col_label)
        csv_headers.append(col_label)
        j = j + 1 # col increment

    # write into output file
    writer.writerows([
        (csv_headers)
    ])
    println("Column names written to output successfully.")

    # start from break row to write actual statistics data
    i = break_row

    println("Writing Statistics data to output..")

    # loop over all rows
    while (i < rows):
        # this data var will be stored into the csv finally
        stats_data = []

        # If break word is in first column, that means the names are in the format <int> <str>
        # println("In first col: " + repr(matrix[i][0]))
        if (break_col == 0):
            if (i == break_row):
                # if this is the row containing breakword then format is only <str>
                # stats_data.append(0)
                stats_data.append(matrix[i][0].encode('utf-8'))
            else:
                # here format is <int> <str>
                try:
                    parts = matrix[i][break_col].split()
                    try:
                        if utilities.RepresentsInt(parts[0]):
                            # stats_data.append(int(parts[0]))
                            stats_data.append(''.join(parts[1:]).encode('utf-8').strip())
                        else:
                            # if format is something else just append the value as it is
                            stats_data.append(''.join(matrix[i][break_col]).encode('utf-8'))
                    except:
                        stats_data.append(''.join(parts).encode('utf-8').strip())
                except:
                    stats_data.append(''.join(str(matrix[i][break_col])))
        else:
            # if breakword is not in the first column, that means input is properly numbered with 123 in col A
            # append directly into output
            if (isinstance(matrix[i][0], basestring)):
                stats_data.append(''.join(matrix[i][0]).encode('utf-8'))
            else:
                stats_data.append(matrix[i][0])

            if (isinstance(matrix[i][1], basestring)):
                stats_data.append(''.join(matrix[i][1]).encode('utf-8'))
            else:
                stats_data.append(matrix[i][1])

        j = break_col + 1

        # loop over all columns starting from next the break word col
        # this does the actual statistics numbers in the input
        while (j < cols):
            # format output
            # println("Here i= " + repr(i) + " and j=" + repr(j))
            c = matrix[i][j]
            if (isinstance(c, basestring)):
                if (c.strip().lower() == "(x)"): # replace (x) values with 0
                    stats_data.append(0)
                elif (c.strip() == u'\xad'): # special replacement for unicode problems
                    stats_data.append("-")
                else: # if it is a string (-,chars) then it needs to be properly encoded
                    stats_data.append(c.encode('utf-8'))
            elif utilities.RepresentsInt(c): # if the number is an integer
                stats_data.append(int(c))
            else: # if its not a string just append it as it is
                stats_data.append(c)

            j = j + 1 # col increment

        # write current row to output csv
        writer.writerows([
            (stats_data)
        ])

        i = i + 1 # row increment

    println("Writing output complete successfully.")
    println("Closing Input and Output files..")

println("Finished processing all files in the specified input folder. Please check the generated output files.")
println("Thank you for using this program.\n- a1ananth")