Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import csv
- import sys
- import re
- import psycopg2
- conn = psycopg2.connect(host="db-postgresql-nyc1-82368-do-user-4775635-0.db.ondigitalocean.com", port="25060",
- database="big_bang", user="doadmin", password="jujyszdagjsh6rb6")
- conn.autocommit = True
- cur = conn.cursor()
- seen = []
- def strip_common(s):
- common = [
- "AND", "ASSN", "ASSOC", "PLC", "INC", "PLLC", "ASSOCIATION", "INCORPORATED", "THE", "OF", "CO", "LIMITED",
- "SERVICE", "COMP", "LLC", "SERVICES",
- "COMPANY", "LLP", "STORE", "CORP", "LP", "SVCS", "CORPORATION", "LTD"]
- words = s.split()
- res_words = [word for word in words if word not in common]
- res = ' '.join(res_words)
- return res if len(res) >= 4 else s
- def rem_vowel(s):
- vowels = ('A', 'E', 'I', 'O', 'U')
- for x in s:
- if x in vowels:
- s = s.replace(x, "")
- return s
- def address_id(record):
- s = ""
- record["STATE"] = record["STATE"].upper()
- record["STREET"] = record["STREET"].upper()
- if record["STATE"] and len(record["STATE"]) == 2:
- s += record["STATE"]
- else:
- s += "NN"
- street = re.sub('[^a-zA-Z]+', '', record["STREET"])
- if len(street) > 0:
- s += street[0]
- s += street[int(len(street)/2)]
- else:
- s += "NN"
- return s
- def company_id(record):
- s = ""
- fingerprint = strip_common(re.sub("[^A-Z\s]", "", record["COMPANY"].strip().upper())).replace(" ", "")
- if len(fingerprint) > 0:
- s += fingerprint[:2]
- s += fingerprint[int(len(fingerprint)/2)]
- s += fingerprint[-2:]
- return s.ljust(5, "*")
- def branch_hq_id(record):
- sql = "SELECT account_link_id FROM xplorer.crosswalk WHERE infogroup_id = %s;"
- cur.execute(sql, (record["DAX_HQ_ID"],))
- hq_id = cur.fetchone()
- return hq_id[0][:4] if hq_id else "0000"
- def generate_id(record):
- acct_link_id = ""
- if record["PLACE_TYPE"] == "headquarters":
- acct_link_id += company_id(record)
- acct_link_id += "00000"
- elif record["PLACE_TYPE"] == "branch":
- acct_link_id += branch_hq_id(record)
- acct_link_id += company_id(record)
- else:
- acct_link_id += "00000"
- acct_link_id += company_id(record)
- acct_link_id += address_id(record)
- return acct_link_id
- def save_place(record, acct_link_id):
- values = (
- acct_link_id, record["DOMAIN"], record["COMPANY"], record["STREET"], record["SUITE"], record["CITY"], record["STATE"],
- int(record["ZIP"]) if record["ZIP"] else None,
- int(record["ZIP_4"]) if record["ZIP_4"] else None,)
- sql = "INSERT INTO xplorer.places " \
- "(account_link_id, domain, name, street, suite, city, state, zip, zip_4, created_at, updated_at) " \
- "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, NOW(), NOW());"
- try:
- cur.execute(sql, values)
- except Exception as e:
- print(cur.query)
- print(e)
- print("\n")
- with open(sys.argv[1]) as fd_in, open(sys.argv[2], "w") as fd_out:
- csv_in = csv.DictReader(fd_in)
- csv_out = csv.writer(fd_out)
- csv_out.writerow(["ACCOUNT_LINK_ID", "INFOGROUP_ID"])
- for row in csv_in:
- if row["PLACE_TYPE"] == "branch":
- continue
- if row["tab1_DAX_COMP_ID"] not in seen:
- seen.append(row["tab1_DAX_COMP_ID"])
- account_link_id = generate_id(row)
- save_place(row, account_link_id)
- csv_out.writerow([account_link_id, row["tab1_DAX_COMP_ID"]])
- conn.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement