Advertisement
Guest User

Untitled

a guest
Feb 14th, 2016
55
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.15 KB | None | 0 0
  1. """
  2. Utilities for converting JSON-like structures to CSV-like tables and back
  3.  
  4. You should be able to convert your structure to a table and back perfectly:
  5. assert equal( next(structure(flatten(doc))), doc )
  6.  
  7.  
  8. TODO
  9. - convert table schemas to JSON schemas and back
  10.  
  11. """
  12.  
  13. def fields(structured_object, prefix=''):
  14. """ Get the field names of a structured object """
  15. if type(structured_object) == dict:
  16. for k,v in structured_object.iteritems():
  17. for field in fields(v, prefix='.'+k):
  18. yield prefix+field
  19. elif type(structured_object) == list:
  20. for row in structured_object:
  21. for field in fields(row, prefix='[]'):
  22. yield prefix+field
  23. else:
  24. yield prefix
  25.  
  26. def equal(a, b):
  27. assert type(a) == type(b)
  28. if type(a) == dict:
  29. for k in a:
  30. if not (k in b and equal(a[k], b[k])):
  31. return False
  32. elif type(a) == list:
  33. for a_,b_ in zip(sorted(a), sorted(b)):
  34. if not equal(a_,b_):
  35. return False
  36. else:
  37. if not a==b:
  38. return False
  39. return True
  40.  
  41. def flatten(structured_object, prefix=''):
  42. """ Flatten a structured object into rows of flat dicts """
  43. if type(structured_object) == dict:
  44. flat_row = {}
  45. nested_rows = []
  46. for k,v in structured_object.iteritems():
  47. for row in flatten(v, prefix='.'+k):
  48. for k_ in row.keys():
  49. if '[]' not in k_:
  50. flat_row[prefix+k_] = row.pop(k_)
  51. if row:
  52. nested_rows.append(row)
  53.  
  54. if nested_rows:
  55. # all lists in the structure create extra rows
  56. for row in nested_rows:
  57. row = {prefix+k_:v_ for k_,v_ in row.iteritems()}
  58. row.update(flat_row)
  59. yield row
  60. else:
  61. # if the structure didn't contain any lists, it's just one row
  62. yield flat_row
  63.  
  64. elif type(structured_object) == list:
  65. for row in structured_object:
  66. for row_ in flatten(row, prefix='[]'):
  67. yield {prefix+k_:v_ for k_,v_ in row_.iteritems()}
  68. else:
  69. yield {prefix: structured_object}
  70.  
  71. def add_nested(obj, key, val):
  72. assert type(obj) == dict, 'obj must be a dict'
  73. _, _, key_ = key.rpartition('[]')
  74. k1, _, k2 = key_.partition('.')
  75. if k2:
  76. add_nested(obj.setdefault(k1, {}) if k1 else obj, k2, val)
  77. else:
  78. obj[k1] = val
  79.  
  80. def structure(flat_objects):
  81. """ Proof of concept """
  82. struct = {}
  83. key = {}
  84. for obj in flat_objects:
  85. for k,v in obj.iteritems():
  86. prefix, _, suffix = k.rpartition('[]')
  87.  
  88. # add to a dict for everything on this level
  89. # the dict is then indexed by everything except
  90. # items that only share the prefix and not the rest
  91. outside = {}
  92. for k_,v_ in obj.iteritems():
  93. prefix_, _, suffix_ = k_.rpartition('[]')
  94. if not(prefix_.startswith(prefix) and prefix!=prefix_):
  95. if v_:
  96. outside[k_] = v_
  97. index = tuple(sorted(outside.items()))
  98.  
  99. if v:
  100. struct.setdefault(index, {})[k] = v
  101. key[index] = prefix
  102.  
  103. root = []
  104. for outside, obj in struct.iteritems():
  105. parent = {k:v for k,v in outside if k not in obj}
  106. index = tuple(sorted(parent.items()))
  107. if index in struct:
  108. for k in obj.keys():
  109. # add non-object list items
  110. if k.endswith('[]'):
  111. struct[index].setdefault(key[outside], []).append(obj.pop(k))
  112. if obj:
  113. struct[index].setdefault(key[outside], []).append( obj )
  114. else:
  115. root.append(obj)
  116.  
  117. for obj in struct.itervalues():
  118. for key in obj.keys():
  119. add_nested(obj, key, obj.pop(key))
  120.  
  121. for part in root:
  122. yield part
  123.  
  124. if __name__ == '__main__':
  125. import argparse, sys, csv, json
  126. parser = argparse.ArgumentParser(description=__doc__)
  127. parser.add_argument('command', choices=['fields', 'structure', 'flatten'])
  128. parser.add_argument('--input', '-i', nargs='?', type=argparse.FileType('r'),
  129. default=sys.stdin, help='default: stdin')
  130. args = parser.parse_args()
  131.  
  132. def csv_field(field):
  133. return field.lstrip('[]').lstrip('.')
  134.  
  135. if args.command == 'structure':
  136. with sys.stdout as w:
  137. struct = list(structure(csv.DictReader(args.input)))
  138. if len(struct) == 1:
  139. struct = struct[0]
  140. print >> w, json.dumps(struct, indent=4)
  141. elif args.command == 'flatten':
  142. full_structure = json.loads(args.input.read())
  143. header = map(csv_field, set(fields(full_structure)))
  144. with sys.stdout as w:
  145. writer = csv.DictWriter(w, header)
  146. writer.writeheader()
  147. for row in flatten(full_structure):
  148. writer.writerow({csv_field(k): v for k,v in row.items()})
  149. elif args.command == 'fields':
  150. full_structure = json.loads(args.input.read())
  151. for field in set(fields(full_structure)):
  152. print csv_field(field)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement