# Untitled

# Libraries that are helpful
import matplotlib.pyplot as plt # plotting
import numpy as np # arrays and vectorized conditionals (np.array, np.where)
import pandas as pd # all your dataframe needs
from scipy import stats # statistical tests, like hypothesis testing
import statsmodels.api as sm # linear regressions

#######################################################
#                  Reading/munging data               #
#######################################################

# Connect to a redshift/postgres database
# and import to a numpy array (this is fast)
import psycopg2
connection = psycopg2.connect(dbname='', host='', port='', user='', password='')
try:
    # The cursor context manager closes the cursor when the block exits
    with connection.cursor() as cursor:
        cursor.execute("YOUR SQL QUERY")
        data = cursor.fetchall()
finally:
    # Always release the database connection, even if the query fails
    connection.close()
# Convert the fetched rows into a numpy array
data_ary = np.array(data)

# Pull a query result from a redshift/postgres database
# straight into a pandas dataframe (this is slow)
from sqlalchemy import create_engine
engine = create_engine('database://url')
df = pd.read_sql_query(sql="YOUR SQL QUERY", con=engine)

# Reading data into a data frame
df = pd.read_csv("your_data.csv", index_col="index_you_might_already_have")

# Remove a column - to remove several, pass a list of names.
# drop() returns a new data frame and leaves the original untouched,
# so the result is assigned back to keep the change.
df = df.drop('column_name', axis=1)

# Looking at your data
# (these are bare expressions: their results are displayed only in an
# interactive session/notebook - wrap them in print() inside a script)
df.head() # first rows of the frame
df.describe() # percentiles and other goodies
df.corr() # correlation matrix

# Summarize every column one at a time, so that frames with lots of
# fields do not get their describe() output cut off
col_list = list(df.columns.values)
for col in col_list:
    column_summary = df[col].describe()
    print(column_summary)

# Print number of missing values for each column
def num_missing(x):
    """Return the number of missing (null/NaN) entries in ``x``.

    ``x`` is expected to be a pandas Series. The count uses the vectorized
    ``isnull().sum()`` rather than looping over the elements with the
    builtin ``sum``, and is returned as a plain ``int``.
    """
    return int(x.isnull().sum())

# One line per column: the column's name followed by its missing-value count
for col in col_list:
    column = df[col]
    print(column.name, num_missing(column))

# Sorting - descending on every listed column; the sorted frame is
# returned as a new object (df itself keeps its order)
sort_columns = ['col_one', 'col_two', 'col_etc']
df.sort_values(by=sort_columns, ascending=False)

# Adding a column based on some conditional
# np.where picks 'value_if_true' on the rows where the comparison holds
# and 'value_if_false' everywhere else (note the == comparison, not =)
df['new_column'] = np.where(df['base_column'] == condition, 'value_if_true', 'value_if_false')

#######################################################
#                     Exploration                     #
#######################################################

# Crosstabs - margins=True adds the row/column totals
pd.crosstab(index=col_one, columns=col_two, margins=True)

# Linear regression, style 1 (formula interface)
regression_formula = 'y_variable ~ x_variable'
model = sm.formula.ols(formula=regression_formula, data=df)
result = model.fit()
result.summary()

# Linear regression, style 2 - see the documentation


# t-testing (student's t-test) on two independent samples
stats.ttest_ind(a=data_set_1, b=data_set_2)

#######################################################
#                      Plotting                       #
#######################################################

# Start here - open a fresh figure
plt.figure()

# Regular line graph of y against x
plt.plot(x, y)

# Histogram of one column, split into 10 bins
df['column_of_interest'].hist(bins=10)

# Additions to your plot: axis labels and a title
plt.xlabel(xlabel='x_data')
plt.ylabel(ylabel='y_data')
plt.title(label='title')

# Save the plot to a file first: once a blocking plt.show() returns, the
# figure has been closed and a later savefig() would write an empty image
plt.savefig('name.pdf')
plt.savefig('name.png')

# Then show whatever plot you've just made
plt.show()