import psycopg2

# --- SETUP ---
# For printing some messages in the console
DEBUG = True

# The only things that need to be filled in to run the script (plus the `tables` list further down, which will change later):
# dbname, user, password
# and
# year, old data filename, category, empty string (filled in later)
input_for_script = (2016, 'ongeval.csv', 'ongevallen', '')

# Connect to an existing database
conn = psycopg2.connect("dbname=rws_2011_2017 user=postgres password=Prandall19s!")
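
# A minimal sketch (not part of the original) of reading the credentials from
# environment variables instead of hard-coding them; the variable names
# RWS_DB_NAME / RWS_DB_USER / RWS_DB_PASSWORD are made up for illustration:
#
#   import os
#   conn = psycopg2.connect(
#       dbname=os.environ["RWS_DB_NAME"],
#       user=os.environ["RWS_DB_USER"],
#       password=os.environ["RWS_DB_PASSWORD"],
#   )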

# --- EXECUTION ---
# This code adds a new year of RWS data to the old dataset.
# Requirements:
#   1. all tables are located in the same schema (old years, new year to add, reference tables)
#   2. the result will only keep the attributes (columns) of the old table
#   3. the names of the reference tables end in '.txt.csv'

# Open a cursor to perform database operations
cur = conn.cursor()

# Create the settings table. Input it takes:
#   the year (to add),
#   the name of the old table (containing the previous years),
#   and the category (partij/ongeval/slachtoffer/voertuig/etc.)
cur.execute("""
DROP TABLE IF EXISTS _rws_settings;
CREATE TABLE _rws_settings (
    year_to_add     INT,
    old_data        text,
    category        text,
    year_filename   text);
""")

cur.execute("""
INSERT INTO _rws_settings
    (year_to_add, old_data, category, year_filename)
VALUES
    (%s, %s, %s, %s);
""",
    input_for_script)

cur.execute("""
UPDATE _rws_settings
SET year_filename = CONCAT(_rws_settings.category, '_', _rws_settings.year_to_add, '.csv');
""")
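
# With the example input above (category 'ongevallen', year 2016) the UPDATE sets
# year_filename to 'ongevallen_2016.csv', i.e. the table name the new year's data
# is expected to have.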

# Create lookup table _rws_datatypes, which lists the column names and datatypes of old_data from _rws_settings
cur.execute("""
DROP TABLE IF EXISTS _rws_datatypes;
CREATE TABLE _rws_datatypes AS
SELECT column_name, data_type FROM INFORMATION_SCHEMA.columns
WHERE table_name = (SELECT old_data FROM _rws_settings);
""")

# Create lookup table _rws_references: it holds the columns ending in '_ID' or '_CODE' and the name of the reference table.
# With this, it is possible to look up whether columns in the new table (year to add) have to be transformed into '_OMS'.
cur.execute("""
DROP TABLE IF EXISTS _rws_references;
CREATE TABLE _rws_references AS
SELECT column_name, table_name FROM INFORMATION_SCHEMA.columns
WHERE table_name LIKE '%.txt.csv'
AND (column_name LIKE '%\_ID' OR column_name LIKE '%\_CODE');
""")

# Select all columns from the new table that end in '_ID' or '_CODE' (or contain '_ID_'), and output them into a Python list
cur.execute("""
DROP TABLE IF EXISTS _rws_columns_to_convert;
CREATE TABLE _rws_columns_to_convert AS
SELECT column_name FROM INFORMATION_SCHEMA.columns
WHERE table_name LIKE (SELECT _rws_settings.year_filename FROM _rws_settings)
AND (column_name LIKE '%\_ID' OR column_name LIKE '%\_CODE' OR column_name LIKE '%_ID_%');
""")

# Make a list of the columns that need to be converted with a reference table
cur.execute("""
SELECT _rws_columns_to_convert."column_name" FROM _rws_columns_to_convert;
""")
list_to_convert = cur.fetchall()
# fetchall() returns a list of 1-tuples; keep only the column name from each
list_to_convert = [i[0] for i in list_to_convert]
# Column names that do end in _ID or _CODE, but should not be converted (checked manually)
columns_not_to_convert = ['JTE_ID', 'WVK_ID', 'GME_ID', 'PVE_CODE', 'WSE_ID']

# Remove those names from the conversion list that will be iterated through (the table in pgAdmin is left unchanged)
for name in columns_not_to_convert:
    if name in list_to_convert:
        list_to_convert.remove(name)
print(list_to_convert)
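
# (An equivalent one-step filter, as an alternative sketch:
#  list_to_convert = [c for c in list_to_convert if c not in columns_not_to_convert])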

# Query for the loop: selects the correct reference table for the column name in the current iteration
get_table_name_query = """
SELECT table_name FROM _rws_references WHERE column_name = '{0}'
"""
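
# A sketch (not in the original) of the same lookup done with psycopg2 parameter
# binding instead of str.format, which avoids quoting issues:
#
#   cur.execute("SELECT table_name FROM _rws_references WHERE column_name = %s", (item,))
#   from_table = cur.fetchone()[0]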

# List of tables that have ID/CODE columns to convert to OMS, e.g. ["ongevallen_2016.csv", "ongevallen_2017.csv"]
tables = ["ongevallen_2016.csv"]

# Create a temporary table and fill it with a copy of the old data.
# (The old-data table name is read from _rws_settings first; cur.execute() itself returns None.)
cur.execute("SELECT old_data FROM _rws_settings;")
old_data_table = cur.fetchone()[0]
cur.execute("""
DROP TABLE IF EXISTS _rws_all_temp;
CREATE TABLE _rws_all_temp AS SELECT * FROM "{0}";
""".format(old_data_table))

# VKL_NUMMER <-- the column used as row ID

# Function to print the query for easy debugging and then execute it
def run_query(query):
    if DEBUG:
        print(query)
    cur.execute(query)

# Function to make a copy of a table to edit in
def copy_table(table_name):
    copy_table_query = """
    DROP TABLE IF EXISTS "{0}_copy";
    CREATE TABLE "{0}_copy" AS
    SELECT * FROM "{0}"
    """.format(table_name)

    run_query(copy_table_query)

# Function to add a column to a table
def add_column(table_name, column_name, datatype):
    add_column_query = """
    ALTER TABLE "{0}"
    ADD COLUMN "{1}" {2};
    """.format(table_name, column_name, datatype)

    run_query(add_column_query)

# Copy each table listed in the `tables` variable
for table in tables:
    copy_table(table)

# Function that converts _ID or _CODE into _OMS
# 0 = data table (2016 for example) - table_to_convert
# 1 = the reference table
# 2 = item name, iterable (_ID or _CODE)
# 3 = item name (but the _OMS version, needs function turn_id_or_code_into_oms)
def run_queries(table_to_convert, from_table, item_name, item_oms):

    add_column(table_to_convert, item_oms, "VARCHAR")

    if item_name == 'BZD_%':
        newname = item_name.replace("_ID", "")
        newname = newname + "_OMS"
        bzd_id = "BZD_ID"
        grab_oms_query = """
        DROP TABLE IF EXISTS "{4}_temp";
        CREATE TABLE "{4}_temp" AS
        SELECT "{0}"."VKL_NUMMER", "{0}"."{2}", oms_table."{3}" AS "{4}"
        FROM "{0}"
        LEFT JOIN "{1}" AS oms_table
        ON CAST("{0}"."{2}" AS VARCHAR) = CAST(oms_table."{5}" AS VARCHAR);
        """.format(table_to_convert, from_table, item_name, item_oms, newname, bzd_id)
    else:
        grab_oms_query = """
        DROP TABLE IF EXISTS "{3}_temp";
        CREATE TABLE "{3}_temp" AS
        SELECT "{0}"."VKL_NUMMER", "{0}"."{2}", oms_table."{3}"
        FROM "{0}"
        LEFT JOIN "{1}" AS oms_table
        ON CAST("{0}"."{2}" AS VARCHAR) = CAST(oms_table."{2}" AS VARCHAR);
        """.format(table_to_convert, from_table, item_name, item_oms)

    fill_oms_column_query = """
        UPDATE "{0}"
        SET "{1}" = "{1}_temp"."{1}"
        FROM "{1}_temp"
        WHERE "{0}"."VKL_NUMMER" = "{1}_temp"."VKL_NUMMER"
    """.format(table_to_convert, item_oms)

    drop_temp_table_query = """
        DROP TABLE IF EXISTS "{0}";
    """.format(item_oms + "_temp")

    run_query(grab_oms_query)
    run_query(fill_oms_column_query)
    run_query(drop_temp_table_query)

# Change the suffix _ID or _CODE into _OMS (for easy lookup in the reference table)
def turn_id_or_code_into_oms(name):
    if "_CODE" in name:
        if name == "DAG_CODE" or name == "PVE_CODE":
            return name.replace("_CODE", "_NAAM")
        else:
            return name.replace("_CODE", "_OMS")
    # Handling for BZD_ID_IF1 -> BZD_IF1_OMS (still to be finished, see TO DO below)
    elif name == "BZD_ID_%":
        # newname = name.replace("_ID", "")
        # newname = newname + str("_OMS")
        newname = "BZD_OMS"
        return newname
    else:
        return name.replace("_ID", "_OMS")
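
# A few example mappings the function above produces:
#   turn_id_or_code_into_oms("WSE_ID")   -> "WSE_OMS"
#   turn_id_or_code_into_oms("DAG_CODE") -> "DAG_NAAM"
#   turn_id_or_code_into_oms("BZD_ID_%") -> "BZD_OMS"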

# Get list of column names from the original datafile
cur.execute("""
SELECT _rws_datatypes."column_name" FROM _rws_datatypes;
""")
columns_for_union = cur.fetchall()
# fetchall() returns a list of 1-tuples; keep only the column name from each
columns_for_union = [i[0] for i in columns_for_union]

# Loop through all the tables and convertible items, find the matching reference tables and run run_queries
for table in tables:
    table_copy = table + "_copy"
    for item in list_to_convert:
        print("PROCESSING: " + table + " - item: " + item)
        if item == 'BZD_%':
            item_id = 'BZD_ID'
            cur.execute(get_table_name_query.format(item_id))
            from_table = cur.fetchone()[0]  # ERROR: NoneType object is not subscriptable
        else:
            cur.execute(get_table_name_query.format(item))
            from_table = cur.fetchone()[0]

        run_queries(table_copy, from_table, item, turn_id_or_code_into_oms(item))

    # Get list of column names from the new year's datafile
    cur.execute("""
    SELECT column_name FROM INFORMATION_SCHEMA.columns
    WHERE table_name = '{0}';
    """.format(table_copy))
    # fetchall() returns a list of 1-tuples; keep only the column name from each
    columns_in_new_data = cur.fetchall()
    columns_in_new_data = [i[0] for i in columns_in_new_data]
    print(table_copy + ':', columns_in_new_data)

    for column_name in columns_for_union:
        # If the column exists in the original datafile but not in the new datafile: add the column with the correct datatype
        if column_name not in columns_in_new_data:
            cur.execute("""SELECT data_type FROM _rws_datatypes WHERE column_name = '{0}'""".format(column_name))
            datatype_column = cur.fetchone()
            datatype_column = datatype_column[0]
            print(datatype_column)
            cur.execute("""
            ALTER TABLE "{0}"
            ADD COLUMN "{1}" {2};
            """.format(table_copy, column_name, datatype_column))

    # If the column exists in the new datafile but not in the original datafile: drop the column
    for column_name in columns_in_new_data:
        if column_name not in columns_for_union:
            cur.execute("""
            ALTER TABLE "{0}"
            DROP COLUMN "{1}";
            """.format(table_copy, column_name))

    # Get all column names
    cur.execute("""SELECT column_name FROM _rws_datatypes""")
    all_columns = cur.fetchall()
    all_columns = [i[0] for i in all_columns]

    # Check whether each column has the same datatype in the old and the new dataset
    for column_name in all_columns:
        cur.execute("""SELECT data_type FROM _rws_datatypes WHERE column_name = '{0}'""".format(column_name))
        wanted_datatype = cur.fetchone()
        wanted_datatype = wanted_datatype[0]
        print("column name: ", column_name, "datatype: ", wanted_datatype)
        cur.execute("""SELECT data_type FROM INFORMATION_SCHEMA.columns WHERE table_name = '{0}' AND column_name = '{1}';""".format(table_copy, column_name))
        current_datatype = cur.fetchone()
        current_datatype = current_datatype[0]
        print('current: ', current_datatype, 'wanted: ', wanted_datatype)

        # Change a varchar column to the correct datatype, turning empty strings into NULLs first
        if current_datatype != wanted_datatype and current_datatype == 'character varying':
            cur.execute("""UPDATE "{0}" SET "{1}" = NULL WHERE "{1}" = '';""".format(table_copy, column_name))
            cur.execute("""
            ALTER TABLE "{0}"
            ALTER COLUMN "{1}" TYPE {2} USING "{1}"::{2};
            """.format(table_copy, column_name, wanted_datatype))
        # Change a non-varchar column to the correct datatype - no NULLs changed
        elif current_datatype != wanted_datatype and current_datatype != 'character varying':
            cur.execute("""
            ALTER TABLE "{0}"
            ALTER COLUMN "{1}" TYPE {2} USING "{1}"::{2};
            """.format(table_copy, column_name, wanted_datatype))

    # The union goes wrong with SELECT * because a column containing only NULL timestamps is treated
    # as varchar (even though it doesn't show); listing all column names explicitly works fine.
    # Union of old and new table; still needs to work with multiple years.
    def concat_column_names(column_names):
        all_column_names = ""
        for i in range(len(column_names) - 1):
            all_column_names += ("\"" + column_names[i] + "\"" + ", ")

        all_column_names += ("\"" + column_names[len(column_names) - 1] + "\"")
        return all_column_names
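
    # An equivalent one-liner using str.join (alternative sketch, same quoting):
    #   all_column_names = ", ".join('"{}"'.format(c) for c in column_names)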

    union_query = ("""DROP TABLE IF EXISTS _all_data; CREATE TABLE _all_data AS SELECT {0} FROM "{1}" UNION SELECT {0} FROM "{2}";""".format(concat_column_names(all_columns), input_for_script[1], table_copy))
    cur.execute(union_query)


# --- TO DO
# build in BZD_ID_IF1 -> BZD_IF1_OMS
# union old and new data: make sure that it can add multiple years
# create linking tables
# make it work with partij
# make it work with voertuig
# make it work with slachtoffer
# create index? aliases?

# New scripts - restructure for better readability
# 1. functions and queries
# 2. find and convert all txt to csv (reference tables) (input: main folder and year(s))
# 3. prepare datasets rws specific (coordinate system mainly) (input: main folder and year(s))
# 4. import all data found in the folders to pgAdmin (input: main folder, old data table(s) and year(s))
# 5. add new year's data ongeval/partij/voertuig/slachtoffer
# 6. make linking tables


# --- FINISH UP RUNNING THE SCRIPT
# Make the changes to the database persistent
conn.commit()

# Close communication with the database
cur.close()
conn.close()