Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Code for Melanie (oDesk)
- # Basic Libraries
- import datetime
- import os.path
- import traceback
- import csv
- import utilities
- import sys
- # Listdir libraries
- from os import listdir
- from os.path import isfile, join
- # Excel library
- from openpyxl import load_workbook
- # Declare break words for each individual state
- # Following are break words for 1940, 1950, 1960. Uncomment whichever is required
- break_words = {
- 'AC': ['TERRITORIO', 'ESTADO', 'Total'],
- 'AL': ['ESTADO'],
- 'AM': ['ESTADO'],
- 'BA': ['ESTSADO', 'ESTADO'],
- 'CE': ['ESTADO'],
- 'ES': ['ESTADO'],
- 'GO': ['ESTADO'],
- 'MA': ['ESTADO'],
- 'PA': ['ESTADO', 'ESTADOS', 'Zona Guajarina (Conclusao)'],
- 'MT': ['ESTADO'],
- 'MG': ['ESTADO'],
- 'PB': ['ESTADO'],
- 'PI': ['ESTADO'],
- 'PR': ['ESTADO'],
- 'PE': ['ESTADO'],
- 'RJ': ['ESTADO'],
- 'RN': ['ESTADO'],
- 'RS': ['ESTADO'],
- 'SC': ['ESTADO', 'ESTADO (4)'],
- 'SP': ['ESTADO'],
- 'DF': ['TOTAIS *', 'TOTAIS', 'DISTRITO FEDERAL', 'DISTRITO FEDERAL*'],
- 'SE': ['ESTADO'],
- 'AP': ['TERRITORIO'],
- 'RO': ['TERRITORIO'],
- 'RR' : ['TERRITORIO'],
- 'GB': ['ESTADO']
- }
- # function for returning break word
- def estado(state):
- global break_words
- if state in break_words:
- bw = break_words[state]
- if len(bw) == 1: # if there is only break word, return it
- return (bw[0], True)
- else: # if there are more than one break words, return list
- return (bw, False)
- else: # state not found
- return ("", False)
- # function for debug message printing
- def println(msg):
- print(msg + "\n\n")
- println("Script init complete.")
- # get current time
- now = datetime.datetime.now()
- # read input directory argument
- println("Reading arguments for input directory..")
- args = sys.argv
- try:
- input_dir_name = args[1]
- except IndexError:
- # invalid syntax used
- println("ERROR! Please use correct syntax.")
- println("Syntax is python main.py <input folder name> .")
- println("Example: python main.py 2014-05-20")
- sys.exit(0)
- # Select input and output directory
- input_dir = 'input/' + input_dir_name
- # Iterate over all files in the input directory
- println("Reading input from directory ./" + input_dir + "..")
- try:
- # Get list of all files
- allfiles = [ f for f in listdir(input_dir) if isfile(join(input_dir, f)) ]
- except:
- # Invalid directory
- println("ERROR! The specified input directory was not found.")
- println("Please make sure you are running the script in the correct directory.")
- sys.exit(0)
- # create output folder if output folder doesn't exist
- output_dir = 'output/' + input_dir_name
- if not os.path.exists(output_dir):
- os.makedirs(output_dir)
- for f in allfiles:
- # initialize variables
- matrix = []
- rows = cols = i = j = 0
- single_rows = break_row = break_col = 0
- single_rows_complete = break_word_found = 0
- println("New input file found: " + f)
- # Get state and file name without extension
- f_state = utilities.getFileState(f)
- f_name = utilities.removeFileExt(f)
- break_word, bw_is_string = estado(f_state)
- if (break_word == ""):
- println("Omitting input file: " + f + " as no break word was found for this state. State code was: " + f_state)
- continue
- input_file = input_dir + '/' + f
- output_file = output_dir + '/' + f_name + '.csv'
- # Open csv writer for output
- writer = csv.writer(open(output_file, 'wb'))
- println("Output file path: " + repr(output_file))
- # Load first worksheet of input file for processing
- println("Opening Input Excel Workbook..")
- wb = load_workbook(filename = input_file, use_iterators = True)
- ws = wb.active
- # Count number of rows and columns
- println("Counting number of rows and columns..")
- for row in ws.iter_rows():
- rows = rows + 1
- if (rows == 1):
- for cell in row:
- cols = cols + 1
- println("Complete. Total Rows = " + repr(rows) + " and Total Cols = " + repr(cols) + ".")
- # define matrix 2d array for storing excel data
- matrix = [[0 for x in xrange(cols)] for x in xrange(rows)]
- # read excel file into the 2d array
- i = -1;
- println("Reading data into THE MATRIX..")
- for row in ws.iter_rows():
- i = i + 1
- j = -1
- for cell in row:
- j = j + 1
- # println("i = " + repr(i) + " & j = " + repr(j))
- if (cell.value is None):
- matrix[i][j] = ""
- else:
- matrix[i][j] = cell.value
- println("Complete.")
- # print matrix
- # utilities.printMatrix(matrix, 6, 3)
- # println("Value at i = 3, j = 1 is: " + repr(matrix[i][j]))
- # parse matrix again to find estado, in the same iteration count single rows
- if (bw_is_string == 1):
- println("Parsing matrix to find the break word: " + break_word)
- else:
- println("Parsing matrix to find the break word: " + ', '.join(break_word))
- # loop on all rows until break word has been found
- i = 0
- while (i < rows) and (break_word_found == 0):
- j = 0
- # if processing single rows is not over, set v to 0 (to store no of values in that row)
- if (single_rows_complete == 0):
- v = 0
- # loop over all columns in the current row searching for breakword
- while (j < cols):
- # get content of current cell
- cnt = matrix[i][j]
- # if content is not a string then go to next column
- if (not isinstance(cnt, basestring)):
- j = j + 1
- continue
- # format content
- cnt = utilities.removeNonAlNum(cnt.strip())
- # if breakword is found, store its row and col
- if (bw_is_string == True): # if there is only one break word
- if (cnt == break_word): # check equality
- break_word_found = 1
- break_row = i
- break_col = j
- else: # if there are more than one break words
- if (cnt in break_word): # check if content exists in the list
- break_word_found = 1
- break_row = i
- break_col = j
- # if processing single rows is not complete and there is a value in this cell increment v
- if (not (cnt == "")) and (single_rows_complete == 0):
- v = v + 1
- j = j + 1 # Col increment
- # if processing single rows is not complete
- if (single_rows_complete == 0):
- if (v == 1):
- # and there is a single value in this row, increment single row
- # println("There was a single valued row here")
- single_rows = single_rows + 1
- elif (v > 1):
- # there are multiple values, set processing single rows complete
- # println("More than one value was found in the same row here")
- single_rows_complete = 1
- i = i + 1 # Row increment
- if (bw_is_string == 1):
- println("Break word (" + break_word + ") found at row: " + repr(break_row) + " and col: " + repr(break_col))
- else:
- println("Break word (one of " + ', '.join(break_word) + ") found at row: " + repr(break_row) + " and col: " + repr(break_col))
- println(repr(single_rows) + " rows were found at the top with only single string value")
- # Generate output csv heading row
- println("Generating Column Names for Output..")
- if (break_col == 0): # Content starts from first column
- csv_headers = ['Municipios']
- else: # Content starts from second column, with first column being S.No.
- csv_headers = ['S.No.', 'Municipios']
- j = break_col + 1
- i = break_row - 1
- # loop over columns starting from break col's next col
- while (j < cols):
- # call the get col label method
- col_label = utilities.getRecursiveColLabel(matrix, i, j, single_rows, break_row)
- # format it so that there is no _ at the end
- col_label = col_label[1:]
- if (col_label[-1:] == '_'):
- col_label = col_label[:-1]
- # println("Column name for col=" + repr(j) + " is: " + col_label)
- csv_headers.append(col_label)
- j = j + 1 # col increment
- # write into output file
- writer.writerows([
- (csv_headers)
- ])
- println("Column names written to output successfully.")
- # start from break row to write actual statistics data
- i = break_row
- println("Writing Statistics data to output..")
- # loop over all rows
- while (i < rows):
- # this data var will be stored into the csv finally
- stats_data = []
- # If break word is in first column, that means the names are in the format <int> <str>
- # println("In first col: " + repr(matrix[i][0]))
- if (break_col == 0):
- if (i == break_row):
- # if this is the row containing breakword then format is only <str>
- # stats_data.append(0)
- stats_data.append(matrix[i][0].encode('utf-8'))
- else:
- # here format is <int> <str>
- try:
- parts = matrix[i][break_col].split()
- try:
- if utilities.RepresentsInt(parts[0]):
- # stats_data.append(int(parts[0]))
- stats_data.append(''.join(parts[1:]).encode('utf-8').strip())
- else:
- # if format is something else just append the value as it is
- stats_data.append(''.join(matrix[i][break_col]).encode('utf-8'))
- except:
- stats_data.append(''.join(parts).encode('utf-8').strip())
- except:
- stats_data.append(''.join(str(matrix[i][break_col])))
- else:
- # if breakword is not in the first column, that means input is properly numbered with 123 in col A
- # append directly into output
- if (isinstance(matrix[i][0], basestring)):
- stats_data.append(''.join(matrix[i][0]).encode('utf-8'))
- else:
- stats_data.append(matrix[i][0])
- if (isinstance(matrix[i][1], basestring)):
- stats_data.append(''.join(matrix[i][1]).encode('utf-8'))
- else:
- stats_data.append(matrix[i][1])
- j = break_col + 1
- # loop over all columns starting from next the break word col
- # this does the actual statistics numbers in the input
- while (j < cols):
- # format output
- # println("Here i= " + repr(i) + " and j=" + repr(j))
- c = matrix[i][j]
- if (isinstance(c, basestring)):
- if (c.strip().lower() == "(x)"): # replace (x) values with 0
- stats_data.append(0)
- elif (c.strip() == u'\xad'): # special replacement for unicode problems
- stats_data.append("-")
- else: # if it is a string (-,chars) then it needs to be properly encoded
- stats_data.append(c.encode('utf-8'))
- elif utilities.RepresentsInt(c): # if the number is an integer
- stats_data.append(int(c))
- else: # if its not a string just append it as it is
- stats_data.append(c)
- j = j + 1 # col increment
- # write current row to output csv
- writer.writerows([
- (stats_data)
- ])
- i = i + 1 # row increment
- println("Writing output complete successfully.")
- println("Closing Input and Output files..")
- println("Finished processing all files in the specified input folder. Please check the generated output files.")
- println("Thank you for using this program.\n- a1ananth")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement