mbirth

fix_fbleak_csv.py

Apr 5th, 2021
452
347 days
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python3
  2.  
  3. FILENAME = "Germany.txt"
  4. FIELD_COUNT = 12
  5. SEPARATOR = ","
  6.  
  7. FB_RELATIONSHIPS = [
  8.     "Divorced",
  9.     "Engaged",
  10.     "In a civil union",
  11.     "In a domestic partnership",
  12.     "In a relationship",
  13.     "In an open relationship",
  14.     "It's complicated",
  15.     "Married",
  16.     "Separated",
  17.     "Single",
  18.     "Widowed",
  19.     ""
  20. ]
  21.  
  22. import re
  23.  
  24. re_timestr = re.compile(r"(\d\d),(\d\d),(\d\d [AP]M,)")
  25.  
  26. good = open("good.txt", "wt")
  27. bad = open("bad.txt", "wt")
  28.  
  29. fields = {}
  30. with open(FILENAME, "rt") as f:
  31.     for line in f:
  32.         line = line.strip()
  33.  
  34.         # Fix time format (hh,mm,ss is stupid when delimiter is , too)
  35.         line = re.sub(re_timestr, r"\1-\2-\3", line)
  36.  
  37.         parts = line.split(SEPARATOR)
  38.  
  39.         i = 0
  40.         sexok = True
  41.         relok = True
  42.         while i < len(parts):
  43.             if len(parts[i]) > 0 and parts[i][0] == " " and len(parts[i-1]) > 0:
  44.                 # starts with a space, so probably belongs to previous field
  45.                 parts[i-1] += SEPARATOR + parts[i]
  46.                 parts.pop(i)
  47.             elif parts[i] == "male (":
  48.                 # common issue in exported data
  49.                 parts[i] = "male"
  50.             elif i == 4 and parts[i] not in ["male", "female", ""]:
  51.                 # column mismatch in name
  52.                 # @TODO
  53.                 print("SEX ({}): ".format(parts[i]) + line)
  54.                 sexok = False
  55.                 i += 1
  56.             elif i == 7 and parts[i] not in FB_RELATIONSHIPS:
  57.                 # column mismatch between name and relationship (cities)
  58.                 # @TODO
  59.                 print("REL ({}): ".format(parts[i]) + line)
  60.                 relok = False
  61.                 i += 1
  62.             elif parts[i] == "real" and len(parts[i+1]) > 0 and parts[i+1][0] == "-":
  63.                 # "real,-"
  64.                 parts[i] += SEPARATOR + parts[i+1]
  65.                 parts.pop(i+1)
  66.                 i += 1
  67.             else:
  68.                 i += 1
  69.  
  70.         if sexok and relok and len(parts) > 12:
  71.             # all OK up to relationship col - must be workplace (col 8) that's wrong
  72.             while len(parts) > 12:
  73.                 parts[8] += SEPARATOR + parts[9]
  74.                 parts.pop(9)
  75.  
  76.         fieldstr = str(len(parts))
  77.         if not fieldstr in fields:
  78.             fields[fieldstr] = 1
  79.         else:
  80.             fields[fieldstr] += 1
  81.  
  82.         parts = list(map(lambda x: x.replace('"', '""'), parts))    # quote quotes in to-be-quoted strings ;)
  83.  
  84.         if len(parts) != 12:
  85.             bad.write("\"{}\"\n".format("\"{}\"".format(SEPARATOR).join(parts)))
  86.         else:
  87.             good.write("\"{}\"\n".format("\"{}\"".format(SEPARATOR).join(parts)))
  88.  
  89. good.close()
  90. bad.close()
  91. print(repr(fields))
  92.  
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sounds or popup ads; we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×