Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python3
"""Repair a comma-separated export whose fields contain unescaped commas.

Reads FILENAME line by line, re-joins fields that were split on commas that
belong to the data rather than to the format, and writes fully quoted rows to
``good.txt`` (exactly FIELD_COUNT fields) or ``bad.txt`` (any other count).
A histogram of the resulting field counts is printed at the end.
"""
import re

FILENAME = "Germany.txt"
FIELD_COUNT = 12
SEPARATOR = ","
FB_RELATIONSHIPS = [
    "Divorced",
    "Engaged",
    "In a civil union",
    "In a domestic partnership",
    "In a relationship",
    "In an open relationship",
    "It's complicated",
    "Married",
    "Separated",
    "Single",
    "Widowed",
    "",
]

# Zero-based positions of the columns the repair heuristics rely on.
SEX_COLUMN = 4
RELATIONSHIP_COLUMN = 7
WORKPLACE_COLUMN = 8
SEX_VALUES = ("male", "female", "")

# Matches a "hh,mm,ss AM," / "hh,mm,ss PM," time whose parts are separated by
# the same character as the fields themselves.
re_timestr = re.compile(r"(\d\d),(\d\d),(\d\d [AP]M,)")


def repair_fields(line):
    """Split one raw export line into fields, undoing bogus splits.

    The line is stripped, its time string is rewritten to ``hh-mm-ss``, and it
    is split on SEPARATOR. The pieces are then walked left to right:

    * a piece starting with a space is glued back onto a non-empty predecessor;
    * the truncated value ``"male ("`` is normalised to ``"male"``;
    * an unexpected value in the sex or relationship column is reported on
      stdout and marks the line as misaligned;
    * ``"real"`` followed by a piece starting with ``"-"`` is re-joined.

    If both checked columns look fine and there are still more than
    FIELD_COUNT pieces, the surplus is folded into the workplace column.

    Returns the list of repaired fields (not yet quote-escaped).
    """
    line = re_timestr.sub(r"\1-\2-\3", line.strip())
    parts = line.split(SEPARATOR)
    sex_ok = True
    rel_ok = True

    i = 0
    while i < len(parts):
        part = parts[i]

        if i > 0 and part.startswith(" ") and parts[i - 1]:
            # Starts with a space, so it probably belongs to the previous
            # field. Do not advance: the next piece slid into position i.
            parts[i - 1] += SEPARATOR + part
            parts.pop(i)
            continue

        if part == "male (":
            # Common issue in the exported data; re-check the fixed value.
            parts[i] = "male"
            continue

        if i == SEX_COLUMN and part not in SEX_VALUES:
            # Column mismatch somewhere in the name columns (not repaired).
            print("SEX ({}): ".format(part) + line)
            sex_ok = False
        elif i == RELATIONSHIP_COLUMN and part not in FB_RELATIONSHIPS:
            # Column mismatch between name and relationship (cities).
            print("REL ({}): ".format(part) + line)
            rel_ok = False
        elif (
            part == "real"
            and i + 1 < len(parts)  # "real" may be the very last piece
            and parts[i + 1].startswith("-")
        ):
            # "real,-..." was one value before splitting.
            parts[i] += SEPARATOR + parts.pop(i + 1)
        i += 1

    surplus = len(parts) - FIELD_COUNT
    if sex_ok and rel_ok and surplus > 0:
        # Everything up to the relationship column lines up, so the extra
        # separators must have come from the workplace column.
        stop = WORKPLACE_COLUMN + surplus + 1
        parts[WORKPLACE_COLUMN:stop] = [
            SEPARATOR.join(parts[WORKPLACE_COLUMN:stop])
        ]
    return parts


def format_row(parts):
    """Return *parts* as one output line with every field double-quoted.

    Embedded double quotes are escaped by doubling them.
    """
    escaped = [part.replace('"', '""') for part in parts]
    return '"{}"\n'.format('"{}"'.format(SEPARATOR).join(escaped))


def main():
    """Repair every line of FILENAME and sort the rows into good/bad files."""
    fields = {}  # str(field count) -> number of lines with that count
    # Output files are opened first (as before) and closed even on error.
    with open("good.txt", "wt") as good, open("bad.txt", "wt") as bad:
        with open(FILENAME, "rt") as source:
            for line in source:
                parts = repair_fields(line)
                key = str(len(parts))
                fields[key] = fields.get(key, 0) + 1
                target = good if len(parts) == FIELD_COUNT else bad
                target.write(format_row(parts))
    print(repr(fields))


if __name__ == "__main__":
    main()
RAW Paste Data