Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- GroupBy: convert to DataFrame, OLS Regression, group predictions
- Created on Mon Jun 12 20:57:32 2017
- """
- import pandas as pd
- import numpy as np
- import statsmodels.api as sm
- from sklearn.linear_model import LinearRegression
#%% Converting a Pandas GroupBy object to DataFrame
# https://stackoverflow.com/questions/10373660/
df1 = pd.DataFrame(
    {
        "Name": ["Alice", "Bob", "Mallory", "Mallory", "Bob", "Mallory"],
        "City": ["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"],
    }
)

# count() on a GroupBy already returns a DataFrame; its index is the
# hierarchical (Name, City) group key.
g1 = df1.groupby(["Name", "City"]).count()

# Suffix the counted columns, then move the group keys out of the index and
# back into ordinary columns.
g2 = g1.add_suffix("_Count").reset_index()

# size() returns a Series of rows-per-group; naming it "count" gives a
# one-column frame that is still indexed by (Name, City).
df3 = df1.groupby(["Name", "City"]).size().to_frame("count")

# Same counts with the group keys flattened into regular columns.
df2 = df3.reset_index()
#%% OLS Regression with groupby
M = 500  # number of synthetic observations

# Synthetic data set for the per-group regression below.
# Every column is sized by M so the frame stays consistent if M is changed
# (the FID column used a literal 500 before, which breaks for any other M).
df = pd.DataFrame({
    # Group key; randint's `high` bound is exclusive, so FID is 1 or 2.
    'FID': np.random.randint(low=1, high=3, size=M),
    'MEAN': np.random.randn(M),
    'Accum_Prcp': np.random.randn(M) * 300,
    'Accum_HDD': np.random.randn(M) * 1000,
})
def group_ols(grps):
    """Fit one OLS regression per group and collect the summaries.

    For each ``(key, frame)`` pair obtained by iterating ``grps``, the
    ``MEAN`` column is regressed on the ``Accum_Prcp`` and ``Accum_HDD``
    columns exactly as they are (no constant column is added here).

    Parameters
    ----------
    grps : iterable of (key, DataFrame)
        For example a pandas GroupBy object.

    Returns
    -------
    list of tuple
        ``(key, summary)`` for every group, in iteration order.
    """
    predictors = ['Accum_Prcp', 'Accum_HDD']
    # Iterating a GroupBy yields (key, frame) pairs, so the key must be
    # unpacked here even though only the frame feeds the regression.
    return [
        (fid, sm.OLS(grp.loc[:, 'MEAN'], grp.loc[:, predictors]).fit().summary())
        for fid, grp in grps
    ]
# Run the per-group regression, one model per distinct FID value.
r_ols = group_ols(df.groupby(by=['FID']))
# Bare expression: shows the result when the cell is run interactively;
# it has no effect when the file is executed as a plain script.
r_ols
#%% group predictions
# Goal: predict each group's value on 2016-01-10.
# Sample observations: six rows for group A, three for B, two for C.
df4 = pd.DataFrame({
    'group': ['A'] * 6 + ['B'] * 3 + ['C'] * 2,
    'date': (
        ['2016-1-2', '2016-1-3', '2016-1-4', '2016-1-5', '2016-1-6', '2016-1-7']  # A
        + ['2016-1-2', '2016-1-3', '2016-1-4']  # B
        + ['2016-1-2', '2016-1-3']  # C
    ),
    'value': [16, 15, 14, 17, 19, 20] + [16, 13, 13] + [16, 16],
})
def model(df4, delta):
    """Fit ``value ~ date_delta`` on one group and predict at ``delta``.

    Parameters
    ----------
    df4 : DataFrame
        Rows of a single group; must contain the 'value' and 'date_delta'
        columns.
    delta : float or array-like of float
        The ``date_delta`` position(s) at which to predict.

    Returns
    -------
    numpy.ndarray
        The prediction(s) with length-1 axes squeezed away (0-d for a
        single ``delta``).
    """
    y = df4[['value']].values
    X = df4[['date_delta']].values
    # predict() requires a 2-D (n_samples, n_features) array; a bare scalar is
    # rejected by scikit-learn's input validation, so shape delta into a
    # single-feature column first.
    X_new = np.asarray(delta, dtype=float).reshape(-1, 1)
    return np.squeeze(LinearRegression().fit(X, y).predict(X_new))
def group_predictions(df, date):
    """Predict ``value`` for every group of ``df`` at the given ``date``.

    Dates are turned into a day offset from the earliest date in ``df``;
    ``model`` is then applied to each 'group' with the target date's offset.

    Note: ``df`` is modified in place -- its 'date' column is converted to
    datetime and a 'date_delta' column (days since the earliest date) is
    added.

    Parameters
    ----------
    df : DataFrame
        Must contain 'group', 'date' and 'value' columns.
    date : str or datetime-like
        Target date; anything ``pd.to_datetime`` accepts.

    Returns
    -------
    The result of ``df.groupby('group').apply(model, delta=...)``: one
    prediction per group.
    """
    date = pd.to_datetime(date)
    # Operate on the frame that was passed in, not on a module-level frame,
    # so the function works for any caller-supplied data.
    df['date'] = pd.to_datetime(df['date'])
    day = np.timedelta64(1, 'D')
    mn = df['date'].min()
    # Day offsets relative to the earliest observation.
    df['date_delta'] = df['date'].sub(mn).div(day)
    # Offset of the target date on the same scale.
    dd = (date - mn) / day
    return df.groupby('group').apply(model, delta=dd)
# Per-group forecast for 10 January 2016 (result shown when run as a cell).
group_predictions(df4, date='2016-1-10')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement