Guest User

Untitled

a guest
Jun 23rd, 2018
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.98 KB | None | 0 0
  1. """
  2. From an Excel table containing the following format in a tab called <Catalog1>,
  3. generate a query to create a table for a serialized JSON to be used by Athena:
  4.  
  5. Excel table:
  6.  
  7. A | B | C | D | E
  8. ---------------------------------------------------
  9. this is text | this too | 123 | 122.12 | 12-12-2012
  10.  
  11.  
  12. Result query string:
  13.  
  14. CREATE EXTERNAL TABLE IF NOT EXISTS test_one.Catalog1(
  15. `A STRING,
  16. `B STRING,
  17. `C FLOAT,
  18. `D FLOAT,
  19. `E TIMESTAMP
  20. )
  21. ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
  22. WITH SERDEPROPERTIES (
  23. 'serialization.format' = '1'
  24. ) LOCATION '%s'
  25. TBLPROPERTIES ('has_encrypted_data'='false');
  26. """
  27.  
  28. import xlrd
  29. import datetime
  30. import boto3
  31.  
  32. def sheet_to_dict(sheet, date_mode):
  33. """
  34. Parse an excel sheet and return it as a json, considering the
  35. first row is always containing the headers for the following
  36. values
  37. """
  38. dict_sheet = {}
  39. headers = []
  40. try:
  41. for row in range(sheet.nrows):
  42. for column in range(sheet.ncols):
  43. value_ = sheet.cell(row, column).value
  44. if row == 0:
  45. headers.append(value_)
  46. dict_sheet[value_] = []
  47. else:
  48. value_ = value_best_fit(value_, date_mode)
  49. dict_sheet[headers[column]].append(value_)
  50. except (ValueError, IndexError) as error:
  51. print('Invalid value or position attempted to be retrieved')
  52. return dict_sheet
  53.  
  54. def value_best_fit(value, date_mode):
  55. """
  56. Guess the best type for a cell and fortmat it; precedence order:
  57. date, float, string
  58. """
  59. if isinstance(value, str):
  60. return value_as_digit(value)
  61. else:
  62. try:
  63. as_date = xlrd.xldate_as_tuple(value, date_mode)
  64. value_ = datetime.datetime.strptime('{}-{}-{} {}:{}:{}'.format(
  65. as_date[0],
  66. as_date[1],
  67. as_date[2],
  68. as_date[3],
  69. as_date[4],
  70. as_date[5]
  71. ), '%Y-%m-%d %H:%M:%S')
  72. if value_.year <= 1900:
  73. value_ = value_as_digit(str(value))
  74. return value_
  75. except TypeError:
  76. print('Value is not a valid date, trying numeric formats instead')
  77. return value_as_digit(str(value))
  78.  
  79. def value_as_digit(value):
  80. """
  81. Convert a string to a digit if possible, float is preferred so no mismatches
  82. occur if a float is parsed after an integer and a column was set to integer
  83. """
  84. try:
  85. if any(char in value for char in (',', '.')):
  86. if value.replace(',', '').replace('.', '').isdigit():
  87. value = float(value.replace(',', '.'))
  88. return value
  89. elif value.isdigit():
  90. return float(value)
  91. else:
  92. return value
  93. except ValueError:
  94. print('The value is neither a string nor a number, I give up')
  95. return None
  96.  
  97. def type_as_string(value):
  98. """
  99. Possible types mapped as table types
  100. """
  101. if isinstance(value, datetime.datetime):
  102. return 'TIMESTAMP'
  103. elif isinstance(value, float):
  104. return 'FLOAT'
  105. return 'STRING'
  106.  
  107. def create_table_query(database, sheet, date_mode):
  108. """
  109. Table query to create a table using the headers and types
  110. identified from creating the JSON
  111. """
  112. data = sheet_to_dict(sheet, date_mode)
  113. create_query = 'CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}('.format(
  114. database,
  115. sheet.name
  116. )
  117. for key in data:
  118. create_query += '\n `{} {},'.format(key, type_as_string(data[key][0]))
  119. create_query = create_query[:-1]
  120. create_query += '''\n )
  121. ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
  122. WITH SERDEPROPERTIES (
  123. 'serialization.format' = '1'
  124. ) LOCATION '%s'
  125. TBLPROPERTIES ('has_encrypted_data'='false');
  126. '''
  127. return create_query, data
  128.  
  129. if __name__ == '__main__':
  130. book = xlrd.open_workbook('test_one.xlsx')
  131. for sheet in book.sheets():
  132. query, data = create_table_query('test_one', sheet, book.datemode)
  133. print(query)
Add Comment
Please, Sign In to add comment