Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- df = pd.DataFrame(
- {'trial_num': [1, 2, 3, 1, 2, 3],
- 'subject': [1, 1, 1, 2, 2, 2],
- 'samples': [list(np.random.randn(3).round(2)) for i in range(6)]
- }
- )
- df
- Out[10]:
- samples subject trial_num
- 0 [0.57, -0.83, 1.44] 1 1
- 1 [-0.01, 1.13, 0.36] 1 2
- 2 [1.18, -1.46, -0.94] 1 3
- 3 [-0.08, -4.22, -2.05] 2 1
- 4 [0.72, 0.79, 0.53] 2 2
- 5 [0.4, -0.32, -0.13] 2 3
- subject trial_num sample sample_num
- 0 1 1 0.57 0
- 1 1 1 -0.83 1
- 2 1 1 1.44 2
- 3 1 2 -0.01 0
- 4 1 2 1.13 1
- 5 1 2 0.36 2
- 6 1 3 1.18 0
- # etc.
- lst_col = 'samples'
- r = pd.DataFrame({
- col:np.repeat(df[col].values, df[lst_col].str.len())
- for col in df.columns.drop(lst_col)}
- ).assign(**{lst_col:np.concatenate(df[lst_col].values)})[df.columns]
- In [103]: r
- Out[103]:
- samples subject trial_num
- 0 0.10 1 1
- 1 -0.20 1 1
- 2 0.05 1 1
- 3 0.25 1 2
- 4 1.32 1 2
- 5 -0.17 1 2
- 6 0.64 1 3
- 7 -0.22 1 3
- 8 -0.71 1 3
- 9 -0.03 2 1
- 10 -0.65 2 1
- 11 0.76 2 1
- 12 1.77 2 2
- 13 0.89 2 2
- 14 0.65 2 2
- 15 -0.98 2 3
- 16 0.65 2 3
- 17 -0.30 2 3
- In [10]: np.repeat(df['trial_num'].values, df[lst_col].str.len())
- Out[10]: array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3, 3], dtype=int64)
- In [11]: pd.DataFrame({
- ...: col:np.repeat(df[col].values, df[lst_col].str.len())
- ...: for col in df.columns.drop(lst_col)}
- ...: )
- Out[11]:
- trial_num subject
- 0 1 1
- 1 1 1
- 2 1 1
- 3 2 1
- 4 2 1
- 5 2 1
- 6 3 1
- .. ... ...
- 11 1 2
- 12 2 2
- 13 2 2
- 14 2 2
- 15 3 2
- 16 3 2
- 17 3 2
- [18 rows x 2 columns]
- In [12]: np.concatenate(df[lst_col].values)
- Out[12]: array([-1.04, -0.58, -1.32, 0.82, -0.59, -0.34, 0.25, 2.09, 0.12, 0.83, -0.88, 0.68, 0.55, -0.56, 0.65, -0.04, 0.36, -0.31])
- In [13]: pd.DataFrame({
- ...: col:np.repeat(df[col].values, df[lst_col].str.len())
- ...: for col in df.columns.drop(lst_col)}
- ...: ).assign(**{lst_col:np.concatenate(df[lst_col].values)})
- Out[13]:
- trial_num subject samples
- 0 1 1 -1.04
- 1 1 1 -0.58
- 2 1 1 -1.32
- 3 2 1 0.82
- 4 2 1 -0.59
- 5 2 1 -0.34
- 6 3 1 0.25
- .. ... ... ...
- 11 1 2 0.68
- 12 2 2 0.55
- 13 2 2 -0.56
- 14 2 2 0.65
- 15 3 2 -0.04
- 16 3 2 0.36
- 17 3 2 -0.31
- [18 rows x 3 columns]
- >>> df
- samples subject trial_num
- 0 [-0.07, -2.9, -2.44] 1 1
- 1 [-1.52, -0.35, 0.1] 1 2
- 2 [-0.17, 0.57, -0.65] 1 3
- 3 [-0.82, -1.06, 0.47] 2 1
- 4 [0.79, 1.35, -0.09] 2 2
- 5 [1.17, 1.14, -1.79] 2 3
- >>>
- >>> s = df.apply(lambda x: pd.Series(x['samples']),axis=1).stack().reset_index(level=1, drop=True)
- >>> s.name = 'sample'
- >>>
- >>> df.drop('samples', axis=1).join(s)
- subject trial_num sample
- 0 1 1 -0.07
- 0 1 1 -2.90
- 0 1 1 -2.44
- 1 1 2 -1.52
- 1 1 2 -0.35
- 1 1 2 0.10
- 2 1 3 -0.17
- 2 1 3 0.57
- 2 1 3 -0.65
- 3 2 1 -0.82
- 3 2 1 -1.06
- 3 2 1 0.47
- 4 2 2 0.79
- 4 2 2 1.35
- 4 2 2 -0.09
- 5 2 3 1.17
- 5 2 3 1.14
- 5 2 3 -1.79
- >>> res = df.set_index(['subject', 'trial_num'])['samples'].apply(pd.Series).stack()
- >>> res = res.reset_index()
- >>> res.columns = ['subject','trial_num','sample_num','sample']
- >>> res
- subject trial_num sample_num sample
- 0 1 1 0 1.89
- 1 1 1 1 -2.92
- 2 1 1 2 0.34
- 3 1 2 0 0.85
- 4 1 2 1 0.24
- 5 1 2 2 0.72
- 6 1 3 0 -0.96
- 7 1 3 1 -2.72
- 8 1 3 2 -0.11
- 9 2 1 0 -1.33
- 10 2 1 1 3.13
- 11 2 1 2 -0.65
- 12 2 2 0 0.10
- 13 2 2 1 0.65
- 14 2 2 2 0.15
- 15 2 3 0 0.64
- 16 2 3 1 -0.10
- 17 2 3 2 -0.76
- >>> objs = [df, pd.DataFrame(df['samples'].tolist())]
- >>> pd.concat(objs, axis=1).drop('samples', axis=1)
- subject trial_num 0 1 2
- 0 1 1 -0.49 -1.00 0.44
- 1 1 2 -0.28 1.48 2.01
- 2 1 3 -0.52 -1.84 0.02
- 3 2 1 1.23 -1.36 -1.06
- 4 2 2 0.54 0.18 0.51
- 5 2 3 -2.18 -0.13 -1.35
- >>> pd.melt(_, var_name='sample_num', value_name='sample',
- ... value_vars=[0, 1, 2], id_vars=['subject', 'trial_num'])
- subject trial_num sample_num sample
- 0 1 1 0 -0.49
- 1 1 2 0 -0.28
- 2 1 3 0 -0.52
- 3 2 1 0 1.23
- 4 2 2 0 0.54
- 5 2 3 0 -2.18
- 6 1 1 1 -1.00
- 7 1 2 1 1.48
- 8 1 3 1 -1.84
- 9 2 1 1 -1.36
- 10 2 2 1 0.18
- 11 2 3 1 -0.13
- 12 1 1 2 0.44
- 13 1 2 2 2.01
- 14 1 3 2 0.02
- 15 2 1 2 -1.06
- 16 2 2 2 0.51
- 17 2 3 2 -1.35
- items_as_cols = df.apply(lambda x: pd.Series(x['samples']), axis=1)
- # Keep original df index as a column so it's retained after melt
- items_as_cols['orig_index'] = items_as_cols.index
- melted_items = pd.melt(items_as_cols, id_vars='orig_index',
- var_name='sample_num', value_name='sample')
- melted_items.set_index('orig_index', inplace=True)
- df.merge(melted_items, left_index=True, right_index=True)
- samples subject trial_num sample_num sample
- 0 [1.84, 1.05, -0.66] 1 1 0 1.84
- 0 [1.84, 1.05, -0.66] 1 1 1 1.05
- 0 [1.84, 1.05, -0.66] 1 1 2 -0.66
- 1 [-0.24, -0.9, 0.65] 1 2 0 -0.24
- 1 [-0.24, -0.9, 0.65] 1 2 1 -0.90
- 1 [-0.24, -0.9, 0.65] 1 2 2 0.65
- 2 [1.15, -0.87, -1.1] 1 3 0 1.15
- 2 [1.15, -0.87, -1.1] 1 3 1 -0.87
- 2 [1.15, -0.87, -1.1] 1 3 2 -1.10
- 3 [-0.8, -0.62, -0.68] 2 1 0 -0.80
- 3 [-0.8, -0.62, -0.68] 2 1 1 -0.62
- 3 [-0.8, -0.62, -0.68] 2 1 2 -0.68
- 4 [0.91, -0.47, 1.43] 2 2 0 0.91
- 4 [0.91, -0.47, 1.43] 2 2 1 -0.47
- 4 [0.91, -0.47, 1.43] 2 2 2 1.43
- 5 [-1.14, -0.24, -0.91] 2 3 0 -1.14
- 5 [-1.14, -0.24, -0.91] 2 3 1 -0.24
- 5 [-1.14, -0.24, -0.91] 2 3 2 -0.91
- column_to_explode = 'samples'
- res = (df
- .set_index([x for x in df.columns if x != column_to_explode])[column_to_explode]
- .apply(pd.Series)
- .stack()
- .reset_index())
- res = res.rename(columns={
- res.columns[-2]:'exploded_{}_index'.format(column_to_explode),
- res.columns[-1]: '{}_exploded'.format(column_to_explode)})
- df.samples.apply(lambda x: pd.Series(x)).join(df).
- melt(['subject','trial_num'],[0,1,2],var_name='sample')
- subject trial_num sample value
- 0 1 1 0 -0.24
- 1 1 2 0 0.14
- 2 1 3 0 -0.67
- 3 2 1 0 -1.52
- 4 2 2 0 -0.00
- 5 2 3 0 -1.73
- 6 1 1 1 -0.70
- 7 1 2 1 -0.70
- 8 1 3 1 -0.29
- 9 2 1 1 -0.70
- 10 2 2 1 -0.72
- 11 2 3 1 1.30
- 12 1 1 2 -0.55
- 13 1 2 2 0.10
- 14 1 3 2 -0.44
- 15 2 1 2 0.13
- 16 2 2 2 -1.44
- 17 2 3 2 0.73
- df = df.reset_index(drop=True)
- lstcol = df.lstcol.values
- lstcollist = []
- indexlist = []
- countlist = []
- for ii in range(len(lstcol)):
- lstcollist.extend(lstcol[ii])
- indexlist.extend([ii]*len(lstcol[ii]))
- countlist.extend([jj for jj in range(len(lstcol[ii]))])
- df = pd.merge(df.drop("lstcol",axis=1),pd.DataFrame({"lstcol":lstcollist,"lstcol_num":countlist},
- index=indexlist),left_index=True,right_index=True).reset_index(drop=True)
- df = pd.DataFrame({
- 'var1': [['a', 'b', 'c'], ['d', 'e',], [], np.nan],
- 'var2': [1, 2, 3, 4]
- })
- df
- var1 var2
- 0 [a, b, c] 1
- 1 [d, e] 2
- 2 [] 3
- 3 NaN 4
- df.explode('var1')
- var1 var2
- 0 a 1
- 0 b 1
- 0 c 1
- 1 d 2
- 1 e 2
- 2 NaN 3 # empty list converted to NaN
- 3 NaN 4 # NaN entry preserved as-is
- # to reset the index to be monotonically increasing...
- df.explode('var1').reset_index(drop=True)
- var1 var2
- 0 a 1
- 1 b 1
- 2 c 1
- 3 d 2
- 4 e 2
- 5 NaN 3
- 6 NaN 4
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement