Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import os
- import numpy as np
- import psycopg2
- def generate_headers(df):
- '''function to create list of headers with their respective data type. Input a pandas dataframe'''
- headers = df.columns.tolist()
- type_list = []
- for col in headers: # for loop where we are defining a string representation of the postgres syntax for that column's dataframe datatype and appending it to type_list
- df_type = str(df[col].dtype)
- if 'int' in df_type:
- type_list.append('int')
- elif 'float' in df_type:
- type_list.append('decimal')
- else:
- type_list.append('varchar')
- return headers, type_list
- def create_table_sql(tablename, headers, type_list):
- '''function to generate string of "create table" statements. takes in tablename, columns, and datatypes of each column'''
- statement = '''drop table if exists dl.''' + tablename + ''';
- create table dl.''' + tablename + ''' (''' # may need to change schema here!
- for i in range(len(headers)):
- if type_list[i] == 'varchar':
- statement = (statement + '\n{} varchar ,').format(headers[i].lower())
- else:
- statement = (statement + '\n' + '{} {}' + ' ,').format(headers[i].lower(), type_list[i])
- statement = statement + ' primary key (frn_adjusted));'
- return statement
- ############ Main code to run
- df = pd.read_csv('features.csv')
- # assuming you have your dataframe df of features from the csv and tablename is 'ml_final_features' and schema is 'dl',
- # will output a sql script that you need to execute in the forked DB
- headers, type_list = generate_headers(df)
- statement = create_table_sql('ml_final_features',headers,type_list)
- outputile = open('create_ml_final_features.sql','w')
- outputile.write(statement) # now definitely inspect the file before running anything below!!
- ##### Now for inserting the data from your df
- # get forked DB credentials whatever they're called
- HOST = os.environ.get("HOST_TEMP")
- USER = os.environ.get("USER_TEMP")
- PASSWORD = os.environ.get("PASSWORD_TEMP")
- DB = os.environ.get("DB_TEMP")
- myConnection = psycopg2.connect(host=HOST, user=USER, password=PASSWORD, database=DB, port=5432)
- cursor = myConnection.cursor()
- # create the table -- hope this works, not sure if I've ever done this way and not manually
- cursor.execute(open('create_ml_final_features.sql', "r").read())
- # function to insert into tables with lots of columns
- def insert_large_table(df, full_tablename, cursor):
- columns_list = df.columns.values.tolist()
- columns = ', '.join(map(str, columns_list))
- query = '(' + ','.join(["%s"] * len(columns.split(','))) + ')'
- df = df.fillna(None) # hope this works; might need to try 'NULL', or maybe even do nothing
- final_query = ','.join(cursor.mogrify(query, row.values.tolist()) for index, row in df.iterrows())
- cursor.execute("INSERT INTO " + full_tablename + " VALUES " + final_query)
- print(tablename + " inserted")
- # finally, insert the data
- insert_large_table(df, 'dl.ml_final_features', cursor)
- cursor.close()
- myConnection.commit()
- myConnection.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement