Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Libraries that are helpful
import matplotlib.pyplot as plt  # plotting
import numpy as np  # arrays and vectorized helpers such as np.where
import pandas as pd  # all your dataframe needs
import statsmodels.api as sm  # linear regressions
from scipy import stats  # statistical tests, like hypothesis testing
#######################################################
# Reading/munging data #
#######################################################

# Connect to a redshift/postgres database
# and import to a numpy array (this is fast)
import psycopg2

connection = psycopg2.connect(dbname='', host='', port='', user='', password='')
try:
    # The cursor context manager closes the cursor when the block exits.
    with connection.cursor() as cursor:
        cursor.execute("YOUR SQL QUERY")
        data = cursor.fetchall()
finally:
    # Close explicitly so the connection is released even if the query fails.
    connection.close()
data_ary = np.array(data)

# Connect to a redshift/postgres database
# and import to a pandas dataframe (this is slow)
from sqlalchemy import create_engine

engine = create_engine('database://url')
df = pd.read_sql_query("YOUR SQL QUERY", engine)
# Reading data into a data frame
df = pd.read_csv("your_data.csv", index_col="index_you_might_already_have")

# Remove a column - if more than one, pass a list of names.
# drop() returns a new frame instead of modifying df, so keep the result.
df = df.drop('column_name', axis=1)

# Looking at your data
# (bare expressions like these only display their value in an interactive
# session; wrap them in print() when running as a script)
df.head()
df.describe()  # percentiles and other goodies
df.corr()  # correlation matrix

# Describe each field when you have lots of fields and it cuts them off
col_list = list(df.columns.values)
for col in col_list:
    print(df[col].describe())
# Print number of missing values for each column
def num_missing(x):
    """Return the number of missing (null) entries in *x*.

    Args:
        x: A pandas Series (anything exposing ``isnull()`` whose result
            has a ``sum()`` method).

    Returns:
        The count of null entries as a plain ``int``.
    """
    # Series.sum() counts the True flags in native code instead of looping
    # over every element with the builtin sum().
    return int(x.isnull().sum())
# Report the missing-value count for every column
for col in col_list:
    print(df[col].name, num_missing(df[col]))

# Sorting
# sort_values() returns a sorted copy, so assign it back to keep the order.
df = df.sort_values(['col_one', 'col_two', 'col_etc'], ascending=False)

# Adding a column based on some conditional
# (`==` compares element-wise; a single `=` inside the call is a syntax error)
df['new_column'] = np.where(df['base_column'] == condition, 'value_if_true', 'value_if_false')
#######################################################
# Exploration #
#######################################################

# Crosstabs
# crosstab() takes the column data itself, so index df by the column labels.
pd.crosstab(df['col_one'], df['col_two'], margins=True)

# Linear regression, style 1
model = sm.formula.ols(formula='y_variable ~ x_variable', data=df)
result = model.fit()
result.summary()

# Linear regression, style 2 - see the documentation

# t-testing (student's t-test)
# data_set_1 / data_set_2 are placeholders for the two samples to compare.
stats.ttest_ind(data_set_1, data_set_2)
#######################################################
# Plotting #
#######################################################

# Start here
plt.figure()

# Regular line graph
plt.plot(x, y)

# Make a histogram with 10 bins
df['column_of_interest'].hist(bins=10)

# Additions to your plot
plt.xlabel('x_data')
plt.ylabel('y_data')
plt.title('title')

# Save it to a file - do this BEFORE plt.show(): once the window shown by
# show() is closed the figure is typically gone, and a later savefig()
# writes a blank image.
plt.savefig('name.pdf')
plt.savefig('name.png')

# Show whatever plot you've just made
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement