Advertisement
Guest User

Untitled

a guest
Feb 24th, 2019
212
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.05 KB | None | 0 0
  1. import pandas as pd
  2. import os
  3. import numpy as np
  4. import psycopg2
  5.  
  6. def generate_headers(df):
  7. '''function to create list of headers with their respective data type. Input a pandas dataframe'''
  8. headers = df.columns.tolist()
  9. type_list = []
  10. for col in headers: # for loop where we are defining a string representation of the postgres syntax for that column's dataframe datatype and appending it to type_list
  11. df_type = str(df[col].dtype)
  12. if 'int' in df_type:
  13. type_list.append('int')
  14. elif 'float' in df_type:
  15. type_list.append('decimal')
  16. else:
  17. type_list.append('varchar')
  18. return headers, type_list
  19.  
  20. def create_table_sql(tablename, headers, type_list):
  21. '''function to generate string of "create table" statements. takes in tablename, columns, and datatypes of each column'''
  22.  
  23. statement = '''drop table if exists dl.''' + tablename + ''';
  24. create table dl.''' + tablename + ''' (''' # may need to change schema here!
  25.  
  26. for i in range(len(headers)):
  27. if type_list[i] == 'varchar':
  28. statement = (statement + '\n{} varchar ,').format(headers[i].lower())
  29. else:
  30. statement = (statement + '\n' + '{} {}' + ' ,').format(headers[i].lower(), type_list[i])
  31.  
  32. statement = statement + ' primary key (frn_adjusted));'
  33. return statement
  34.  
  35. ############ Main code to run
  36.  
  37. df = pd.read_csv('features.csv')
  38.  
  39. # assuming you have your dataframe df of features from the csv and tablename is 'ml_final_features' and schema is 'dl',
  40. # will output a sql script that you need to execute in the forked DB
  41. headers, type_list = generate_headers(df)
  42. statement = create_table_sql('ml_final_features',headers,type_list)
  43. outputile = open('create_ml_final_features.sql','w')
  44. outputile.write(statement) # now definitely inspect the file before running anything below!!
  45.  
  46. ##### Now for inserting the data from your df
  47.  
  48. # get forked DB credentials whatever they're called
  49. HOST = os.environ.get("HOST_TEMP")
  50. USER = os.environ.get("USER_TEMP")
  51. PASSWORD = os.environ.get("PASSWORD_TEMP")
  52. DB = os.environ.get("DB_TEMP")
  53.  
  54. myConnection = psycopg2.connect(host=HOST, user=USER, password=PASSWORD, database=DB, port=5432)
  55. cursor = myConnection.cursor()
  56.  
  57. # create the table -- hope this works, not sure if I've ever done this way and not manually
  58. cursor.execute(open('create_ml_final_features.sql', "r").read())
  59.  
  60. # function to insert into tables with lots of columns
  61. def insert_large_table(df, full_tablename, cursor):
  62.  
  63. columns_list = df.columns.values.tolist()
  64. columns = ', '.join(map(str, columns_list))
  65. query = '(' + ','.join(["%s"] * len(columns.split(','))) + ')'
  66. df = df.fillna(None) # hope this works; might need to try 'NULL', or maybe even do nothing
  67. final_query = ','.join(cursor.mogrify(query, row.values.tolist()) for index, row in df.iterrows())
  68.  
  69. cursor.execute("INSERT INTO " + full_tablename + " VALUES " + final_query)
  70.  
  71. print(tablename + " inserted")
  72.  
  73. # finally, insert the data
  74. insert_large_table(df, 'dl.ml_final_features', cursor)
  75.  
  76. cursor.close()
  77. myConnection.commit()
  78. myConnection.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement