Advertisement
Guest User

Untitled

a guest
Mar 29th, 2017
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.84 KB | None | 0 0
  1. # Print DataFrame Fill Rates and Top Values
  2. # For easy pasting to Excel
  3. # Charlie Hack, 3/24/17
  4. # <charles.hack@accenture.com>
  5. # In [0]: summarize(df)
  6. # Paste into a text editor first to strip formatting
  7.  
  8.  
from __future__ import print_function

import re
import string
import sys
import types
  14.  
  15.  
  16. # text utils borrowed from fuzzywuzzy
  17.  
  18. PY3 = sys.version_info[0] == 3
  19. bad_chars = str("").join([chr(i) for i in range(128, 256)]) # ascii!
  20. if PY3:
  21. translation_table = dict((ord(c), None) for c in bad_chars)
  22. unicode = str
  23.  
  24. def asciionly(s):
  25. if PY3:
  26. return s.translate(translation_table)
  27. else:
  28. return s.translate(None, bad_chars)
  29.  
  30. def force_ascii(s):
  31. if type(s) is str:
  32. return asciionly(s)
  33. elif type(s) is unicode:
  34. return asciionly(s.encode('ascii', 'ignore'))
  35. else:
  36. return force_ascii(unicode(s))
  37.  
  38.  
  39. # print fill rates
  40.  
  41. def fillcount(frame, col):
  42. return len(frame[col].dropna()) / float(len(frame[col]))
  43.  
  44. def value_counts_percents(frame, col, top_n=15):
  45. counts = list(reversed(sorted(frame[col].fillna('NULL').value_counts(normalize=True).to_dict().items(), key=lambda x: x[1])))
  46. names = map(lambda x: force_ascii(x[0]), counts)
  47. total_count = len(frame)
  48.  
  49. if len(counts) > top_n:
  50. counts = counts[:top_n]
  51. names = names[:top_n]
  52.  
  53. out = []
  54. out.append(total_count)
  55. out.append(frame[col].nunique())
  56. out.append(", ".join(names))
  57. for x in counts:
  58. out.append(x[1])
  59. return out
  60.  
  61. def summarize(subframe, top_n=15):
  62. print("Data Element", "Fill Rate", "Count", "Unique", "Top {} Values".format(top_n), sep="\t", end="\t")
  63. print(*("Value {} % of Total".format(x+1) for x in xrange(top_n)), sep="\t")
  64. for col in subframe.columns:
  65. row = [col, fillcount(subframe,col)] + value_counts_percents(subframe,col, top_n=top_n)
  66. print(*row, sep="\t")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement