Advertisement
Guest User

Untitled

a guest
Mar 30th, 2017
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.69 KB | None | 0 0
  1. # Libraries that are helpful
  2. import pandas as pd # all your dataframe needs
  3. import matplotlib.pyplot as plt # plotting
  4. from scipy import stats # statistical tests, like hypothesis testing
  5. import statsmodels.api as sm # linear regressions
  6. import numpy as np # arrays and vectorized conditionals (np.array, np.where)
  7. #######################################################
  8. # Reading/munging data #
  9. #######################################################
  10.  
  11. # Connect to a redshift/postgres database
  12. # and import to a numpy array (this is fast)
  13. import psycopg2
  14. connection = psycopg2.connect(dbname='', host='', port='', user='', password='')
  15. cursor = connection.cursor()
  16. cursor.execute("YOUR SQL QUERY")
  17. data = cursor.fetchall()
  18. data_ary = np.array(data)
  19.  
  20. # Connect to a redshift/postgres database
  21. # and import to a pandas dataframe (this is slow)
  22. from sqlalchemy import create_engine
  23. engine = create_engine('database://url')
  24. df = pd.read_sql_query("YOUR SQL QUERY", engine)
  25.  
  26. # Reading data into a data frame
  27. df = pd.read_csv("your_data.csv",index_col="index_you_might_already_have")
  28.  
  29. # Remove a column - if more than one, pass a list of names (drop returns a new DataFrame)
  30. df = df.drop('column_name', axis=1)
  31.  
  32. # Looking at your data
  33. df.head()
  34. df.describe() # percentiles and other goodies
  35. df.corr() # correlation matrix
  36.  
  37. # Describe each field when you have lots of fields and it cuts them off
  38. col_list = list(df.columns.values)
  39. for col in col_list:
  40.     print(df[col].describe())
  41.  
  42. # Print number of missing values for each column
  43. def num_missing(x):
  44.     return sum(x.isnull())
  45.  
  46. for col in col_list:
  47.     print(df[col].name, num_missing(df[col]))
  48.  
  49. # Sorting (returns a new, sorted DataFrame; assign it to keep the result)
  50. df.sort_values(['col_one','col_two','col_etc'],ascending=False)
  51.  
  52. # Adding a column based on some conditional
  53. df['new_column'] = np.where(df['base_column'] == condition, 'value_if_true', 'value_if_false')
  54.  
  55. #######################################################
  56. # Exploration #
  57. #######################################################
  58.  
  59. # Crosstabs
  60. pd.crosstab(col_one,col_two,margins=True)
  61.  
  62. # Linear regression, style 1
  63. model = sm.formula.ols(formula='y_variable ~ x_variable', data=df)
  64. result = model.fit()
  65. result.summary()
  66.  
  67. # Linear regression, style 2 - see the documentation
  68.  
  69.  
  70. # t-testing (student's t-test)
  71. stats.ttest_ind(data_set_1, data_set_2)
  72.  
  73. #######################################################
  74. # Plotting #
  75. #######################################################
  76.  
  77. # Start here
  78. plt.figure();
  79.  
  80. # Regular line graph
  81. plt.plot(x,y)
  82.  
  83. # Make a histogram with 10 bins
  84. df['column_of_interest'].hist(bins=10)
  85.  
  86. # Additions to your plot
  87. plt.xlabel('x_data')
  88. plt.ylabel('y_data')
  89. plt.title('title')
  90.  
  91. # Show whatever plot you've just made
  92. plt.show()
  93.  
  94. # Or save it to a file (call savefig before plt.show(); afterwards the saved figure may be blank)
  95. plt.savefig('name.pdf')
  96. plt.savefig('name.png')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement