Advertisement
a1ananth

main.py

Aug 11th, 2014
139
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 12.27 KB | None | 0 0
  1. # Code for Melanie (oDesk)
  2.  
  3. # Basic Libraries
  4. import datetime
  5. import os.path
  6. import traceback
  7. import csv
  8. import utilities
  9. import sys
  10.  
  11. # Listdir libraries
  12. from os import listdir
  13. from os.path import isfile, join
  14.  
  15. # Excel library
  16. from openpyxl import load_workbook
  17.  
  18. # Declare break words for each individual state
  19.  
  20. # Following are break words for 1940, 1950, 1960. Uncomment whichever is required
  21. break_words = {
  22.     'AC': ['TERRITORIO', 'ESTADO', 'Total'],
  23.     'AL': ['ESTADO'],
  24.     'AM': ['ESTADO'],
  25.     'BA': ['ESTSADO', 'ESTADO'],
  26.     'CE': ['ESTADO'],
  27.     'ES': ['ESTADO'],
  28.     'GO': ['ESTADO'],
  29.     'MA': ['ESTADO'],
  30.     'PA': ['ESTADO', 'ESTADOS', 'Zona Guajarina (Conclusao)'],
  31.     'MT': ['ESTADO'],
  32.     'MG': ['ESTADO'],
  33.     'PB': ['ESTADO'],
  34.     'PI': ['ESTADO'],
  35.     'PR': ['ESTADO'],
  36.     'PE': ['ESTADO'],
  37.     'RJ': ['ESTADO'],
  38.     'RN': ['ESTADO'],
  39.     'RS': ['ESTADO'],
  40.     'SC': ['ESTADO', 'ESTADO (4)'],
  41.     'SP': ['ESTADO'],
  42.     'DF': ['TOTAIS *', 'TOTAIS', 'DISTRITO FEDERAL', 'DISTRITO FEDERAL*'],
  43.     'SE': ['ESTADO'],
  44.     'AP': ['TERRITORIO'],
  45.     'RO': ['TERRITORIO'],
  46.     'RR' : ['TERRITORIO'],
  47.     'GB': ['ESTADO']
  48. }
  49.  
  50. # function for returning break word
  51. def estado(state):
  52.     global break_words
  53.     if state in break_words:
  54.         bw = break_words[state]
  55.         if len(bw) == 1: # if there is only break word, return it
  56.             return (bw[0], True)
  57.         else: # if there are more than one break words, return list
  58.             return (bw, False)
  59.     else: # state not found
  60.         return ("", False)
  61.  
  62. # function for debug message printing
  63. def println(msg):
  64.     print(msg + "\n\n")
  65.    
  66. println("Script init complete.")
  67.  
  68. # get current time
  69. now = datetime.datetime.now()
  70.  
  71. # read input directory argument
  72. println("Reading arguments for input directory..")
  73. args = sys.argv
  74. try:
  75.     input_dir_name = args[1]
  76. except IndexError:
  77.     # invalid syntax used
  78.     println("ERROR! Please use correct syntax.")
  79.     println("Syntax is python main.py <input folder name> .")
  80.     println("Example: python main.py 2014-05-20")
  81.     sys.exit(0)
  82.  
  83. # Select input and output directory
  84. input_dir = 'input/' + input_dir_name
  85.  
  86. # Iterate over all files in the input directory
  87. println("Reading input from directory ./" + input_dir + "..")
  88. try:
  89.     # Get list of all files
  90.     allfiles = [ f for f in listdir(input_dir) if isfile(join(input_dir, f)) ]
  91. except:
  92.     # Invalid directory
  93.     println("ERROR! The specified input directory was not found.")
  94.     println("Please make sure you are running the script in the correct directory.")
  95.     sys.exit(0)
  96.    
  97. # create output folder if output folder doesn't exist
  98. output_dir = 'output/' + input_dir_name
  99. if not os.path.exists(output_dir):
  100.     os.makedirs(output_dir)
  101.    
  102. for f in allfiles:
  103.     # initialize variables
  104.     matrix = []
  105.     rows = cols = i = j = 0
  106.     single_rows = break_row = break_col = 0
  107.     single_rows_complete = break_word_found = 0
  108.    
  109.     println("New input file found: " + f)
  110.    
  111.     # Get state and file name without extension
  112.     f_state = utilities.getFileState(f)
  113.     f_name = utilities.removeFileExt(f)
  114.     break_word, bw_is_string = estado(f_state)
  115.    
  116.     if (break_word == ""):
  117.         println("Omitting input file: " + f + " as no break word was found for this state. State code was: " + f_state)
  118.         continue
  119.    
  120.     input_file = input_dir + '/' + f
  121.     output_file = output_dir + '/' + f_name + '.csv'
  122.    
  123.     # Open csv writer for output
  124.     writer = csv.writer(open(output_file, 'wb'))
  125.     println("Output file path: " + repr(output_file))
  126.    
  127.     # Load first worksheet of input file for processing
  128.     println("Opening Input Excel Workbook..")
  129.     wb = load_workbook(filename = input_file, use_iterators = True)
  130.     ws = wb.active
  131.    
  132.     # Count number of rows and columns
  133.     println("Counting number of rows and columns..")
  134.     for row in ws.iter_rows():
  135.         rows = rows + 1
  136.         if (rows == 1):
  137.             for cell in row:
  138.                 cols = cols + 1
  139.                
  140.     println("Complete. Total Rows = " + repr(rows) + " and Total Cols = " + repr(cols) + ".")
  141.    
  142.     # define matrix 2d array for storing excel data
  143.     matrix = [[0 for x in xrange(cols)] for x in xrange(rows)]
  144.    
  145.     # read excel file into the 2d array
  146.     i = -1;
  147.     println("Reading data into THE MATRIX..")
  148.     for row in ws.iter_rows():
  149.         i = i + 1
  150.         j = -1
  151.        
  152.         for cell in row:
  153.             j = j + 1
  154.             # println("i = " + repr(i) + " & j = " + repr(j))
  155.             if (cell.value is None):
  156.                 matrix[i][j] = ""
  157.             else:
  158.                 matrix[i][j] = cell.value
  159.                
  160.     println("Complete.")
  161.    
  162.     # print matrix
  163.     # utilities.printMatrix(matrix, 6, 3)
  164.    
  165.     # println("Value at i = 3, j = 1 is: " + repr(matrix[i][j]))
  166.    
  167.     # parse matrix again to find estado, in the same iteration count single rows
  168.     if (bw_is_string == 1):
  169.         println("Parsing matrix to find the break word: " + break_word)
  170.     else:
  171.         println("Parsing matrix to find the break word: " + ', '.join(break_word))
  172.    
  173.     # loop on all rows until break word has been found
  174.     i = 0
  175.     while (i < rows) and (break_word_found == 0):
  176.         j = 0
  177.        
  178.         # if processing single rows is not over, set v to 0 (to store no of values in that row)
  179.         if (single_rows_complete == 0):
  180.             v = 0
  181.            
  182.         # loop over all columns in the current row searching for breakword
  183.         while (j < cols):
  184.             # get content of current cell
  185.             cnt = matrix[i][j]
  186.            
  187.             # if content is not a string then go to next column
  188.             if (not isinstance(cnt, basestring)):
  189.                 j = j + 1
  190.                 continue
  191.                
  192.             # format content
  193.             cnt = utilities.removeNonAlNum(cnt.strip())
  194.            
  195.             # if breakword is found, store its row and col
  196.             if (bw_is_string == True): # if there is only one break word
  197.                 if (cnt == break_word): # check equality
  198.                     break_word_found = 1
  199.                     break_row = i
  200.                     break_col = j
  201.             else: # if there are more than one break words
  202.                 if (cnt in break_word): # check if content exists in the list
  203.                     break_word_found = 1
  204.                     break_row = i
  205.                     break_col = j
  206.            
  207.             # if processing single rows is not complete and there is a value in this cell increment v
  208.             if (not (cnt == "")) and (single_rows_complete == 0):
  209.                 v = v + 1
  210.                
  211.             j = j + 1 # Col increment
  212.            
  213.         # if processing single rows is not complete
  214.         if (single_rows_complete == 0):
  215.             if (v == 1):
  216.                 # and there is a single value in this row, increment single row
  217.                 # println("There was a single valued row here")
  218.                 single_rows = single_rows + 1
  219.             elif (v > 1):
  220.                 # there are multiple values, set processing single rows complete
  221.                 # println("More than one value was found in the same row here")
  222.                 single_rows_complete = 1
  223.                
  224.         i = i + 1 # Row increment
  225.        
  226.     if (bw_is_string == 1):
  227.         println("Break word (" + break_word + ") found at row: " + repr(break_row) + " and col: " + repr(break_col))
  228.     else:
  229.         println("Break word (one of " + ', '.join(break_word) + ") found at row: " + repr(break_row) + " and col: " + repr(break_col))
  230.        
  231.     println(repr(single_rows) + " rows were found at the top with only single string value")
  232.    
  233.     # Generate output csv heading row
  234.     println("Generating Column Names for Output..")
  235.     if (break_col == 0): # Content starts from first column
  236.         csv_headers = ['Municipios']
  237.     else: # Content starts from second column, with first column being S.No.
  238.         csv_headers = ['S.No.', 'Municipios']
  239.     j = break_col + 1
  240.     i = break_row - 1
  241.    
  242.     # loop over columns starting from break col's next col
  243.     while (j < cols):
  244.         # call the get col label method
  245.         col_label = utilities.getRecursiveColLabel(matrix, i, j, single_rows, break_row)
  246.        
  247.         # format it so that there is no _ at the end
  248.         col_label = col_label[1:]
  249.         if (col_label[-1:] == '_'):
  250.             col_label = col_label[:-1]
  251.            
  252.         # println("Column name for col=" + repr(j) + " is: " + col_label)
  253.         csv_headers.append(col_label)
  254.         j = j + 1 # col increment
  255.        
  256.     # write into output file
  257.     writer.writerows([
  258.         (csv_headers)
  259.     ])
  260.     println("Column names written to output successfully.")
  261.    
  262.     # start from break row to write actual statistics data
  263.     i = break_row
  264.    
  265.     println("Writing Statistics data to output..")
  266.    
  267.     # loop over all rows
  268.     while (i < rows):
  269.         # this data var will be stored into the csv finally
  270.         stats_data = []
  271.  
  272.         # If break word is in first column, that means the names are in the format <int> <str>
  273.         # println("In first col: " + repr(matrix[i][0]))
  274.         if (break_col == 0):
  275.             if (i == break_row):
  276.                 # if this is the row containing breakword then format is only <str>
  277.                 # stats_data.append(0)
  278.                 stats_data.append(matrix[i][0].encode('utf-8'))
  279.             else:
  280.                 # here format is <int> <str>
  281.                 try:
  282.                     parts = matrix[i][break_col].split()
  283.                     try:
  284.                         if utilities.RepresentsInt(parts[0]):
  285.                             # stats_data.append(int(parts[0]))
  286.                             stats_data.append(''.join(parts[1:]).encode('utf-8').strip())
  287.                         else:
  288.                             # if format is something else just append the value as it is
  289.                             stats_data.append(''.join(matrix[i][break_col]).encode('utf-8'))
  290.                     except:
  291.                         stats_data.append(''.join(parts).encode('utf-8').strip())
  292.                 except:
  293.                     stats_data.append(''.join(str(matrix[i][break_col])))
  294.         else:
  295.             # if breakword is not in the first column, that means input is properly numbered with 123 in col A
  296.             # append directly into output
  297.             if (isinstance(matrix[i][0], basestring)):
  298.                 stats_data.append(''.join(matrix[i][0]).encode('utf-8'))
  299.             else:
  300.                 stats_data.append(matrix[i][0])
  301.                
  302.             if (isinstance(matrix[i][1], basestring)):
  303.                 stats_data.append(''.join(matrix[i][1]).encode('utf-8'))
  304.             else:
  305.                 stats_data.append(matrix[i][1])
  306.            
  307.         j = break_col + 1
  308.        
  309.         # loop over all columns starting from next the break word col
  310.         # this does the actual statistics numbers in the input
  311.         while (j < cols):
  312.             # format output
  313.             # println("Here i= " + repr(i) + " and j=" + repr(j))
  314.             c = matrix[i][j]
  315.             if (isinstance(c, basestring)):
  316.                 if (c.strip().lower() == "(x)"): # replace (x) values with 0
  317.                     stats_data.append(0)
  318.                 elif (c.strip() == u'\xad'): # special replacement for unicode problems
  319.                     stats_data.append("-")
  320.                 else: # if it is a string (-,chars) then it needs to be properly encoded
  321.                     stats_data.append(c.encode('utf-8'))
  322.             elif utilities.RepresentsInt(c): # if the number is an integer
  323.                 stats_data.append(int(c))
  324.             else: # if its not a string just append it as it is
  325.                 stats_data.append(c)
  326.  
  327.             j = j + 1 # col increment
  328.            
  329.         # write current row to output csv
  330.         writer.writerows([
  331.             (stats_data)
  332.         ])
  333.        
  334.         i = i + 1 # row increment
  335.        
  336.     println("Writing output complete successfully.")
  337.     println("Closing Input and Output files..")
  338.  
  339. println("Finished processing all files in the specified input folder. Please check the generated output files.")
  340. println("Thank you for using this program.\n- a1ananth")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement