import pandas as pd
import numpy as np

df = pd.DataFrame({
    'trial_num': [1, 2, 3, 1, 2, 3],
    'subject': [1, 1, 1, 2, 2, 2],
    'samples': [list(np.random.randn(3).round(2)) for i in range(6)]
})

In [10]: df
Out[10]:
                 samples  subject  trial_num
0    [0.57, -0.83, 1.44]        1          1
1    [-0.01, 1.13, 0.36]        1          2
2   [1.18, -1.46, -0.94]        1          3
3  [-0.08, -4.22, -2.05]        2          1
4     [0.72, 0.79, 0.53]        2          2
5    [0.4, -0.32, -0.13]        2          3

# Desired long format: one row per sample, keeping the within-list position
   subject  trial_num  sample  sample_num
0        1          1    0.57           0
1        1          1   -0.83           1
2        1          1    1.44           2
3        1          2   -0.01           0
4        1          2    1.13           1
5        1          2    0.36           2
6        1          3    1.18           0
# etc.

lst_col = 'samples'

r = pd.DataFrame({
        col: np.repeat(df[col].values, df[lst_col].str.len())
        for col in df.columns.drop(lst_col)
    }).assign(**{lst_col: np.concatenate(df[lst_col].values)})[df.columns]

In [103]: r
Out[103]:
    samples  subject  trial_num
0      0.10        1          1
1     -0.20        1          1
2      0.05        1          1
3      0.25        1          2
4      1.32        1          2
5     -0.17        1          2
6      0.64        1          3
7     -0.22        1          3
8     -0.71        1          3
9     -0.03        2          1
10    -0.65        2          1
11     0.76        2          1
12     1.77        2          2
13     0.89        2          2
14     0.65        2          2
15    -0.98        2          3
16     0.65        2          3
17    -0.30        2          3
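
The repeat/concatenate idea packages naturally as a small helper. A minimal sketch: the name explode_lists is hypothetical, and it assumes every cell in lst_col really is a list (NaN cells would break .str.len() and np.concatenate).

def explode_lists(frame, lst_col):
    # repeat the scalar columns once per list element, then line the flattened
    # list values up alongside them
    lens = frame[lst_col].str.len()
    out = pd.DataFrame({
        col: np.repeat(frame[col].values, lens)
        for col in frame.columns.drop(lst_col)})
    return out.assign(**{lst_col: np.concatenate(frame[lst_col].values)})[frame.columns]

explode_lists(df, 'samples')  # same result as r above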

In [10]: np.repeat(df['trial_num'].values, df[lst_col].str.len())
Out[10]: array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3, 3], dtype=int64)

In [11]: pd.DataFrame({
    ...:     col: np.repeat(df[col].values, df[lst_col].str.len())
    ...:     for col in df.columns.drop(lst_col)}
    ...: )
Out[11]:
    trial_num  subject
0           1        1
1           1        1
2           1        1
3           2        1
4           2        1
5           2        1
6           3        1
..        ...      ...
11          1        2
12          2        2
13          2        2
14          2        2
15          3        2
16          3        2
17          3        2

[18 rows x 2 columns]

In [12]: np.concatenate(df[lst_col].values)
Out[12]: array([-1.04, -0.58, -1.32, 0.82, -0.59, -0.34, 0.25, 2.09, 0.12, 0.83, -0.88, 0.68, 0.55, -0.56, 0.65, -0.04, 0.36, -0.31])

In [13]: pd.DataFrame({
    ...:     col: np.repeat(df[col].values, df[lst_col].str.len())
    ...:     for col in df.columns.drop(lst_col)}
    ...: ).assign(**{lst_col: np.concatenate(df[lst_col].values)})
Out[13]:
    trial_num  subject  samples
0           1        1    -1.04
1           1        1    -0.58
2           1        1    -1.32
3           2        1     0.82
4           2        1    -0.59
5           2        1    -0.34
6           3        1     0.25
..        ...      ...      ...
11          1        2     0.68
12          2        2     0.55
13          2        2    -0.56
14          2        2     0.65
15          3        2    -0.04
16          3        2     0.36
17          3        2    -0.31

[18 rows x 3 columns]

>>> df
                samples  subject  trial_num
0  [-0.07, -2.9, -2.44]        1          1
1   [-1.52, -0.35, 0.1]        1          2
2  [-0.17, 0.57, -0.65]        1          3
3  [-0.82, -1.06, 0.47]        2          1
4   [0.79, 1.35, -0.09]        2          2
5   [1.17, 1.14, -1.79]        2          3
>>>
>>> s = df.apply(lambda x: pd.Series(x['samples']), axis=1).stack().reset_index(level=1, drop=True)
>>> s.name = 'sample'
>>>
>>> df.drop('samples', axis=1).join(s)
   subject  trial_num  sample
0        1          1   -0.07
0        1          1   -2.90
0        1          1   -2.44
1        1          2   -1.52
1        1          2   -0.35
1        1          2    0.10
2        1          3   -0.17
2        1          3    0.57
2        1          3   -0.65
3        2          1   -0.82
3        2          1   -1.06
3        2          1    0.47
4        2          2    0.79
4        2          2    1.35
4        2          2   -0.09
5        2          3    1.17
5        2          3    1.14
5        2          3   -1.79
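
The join keeps the original (now duplicated) index labels. If a fresh 0..N-1 index is preferred, a reset is enough; a minimal follow-up sketch:

>>> df.drop('samples', axis=1).join(s).reset_index(drop=True)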

>>> res = df.set_index(['subject', 'trial_num'])['samples'].apply(pd.Series).stack()
>>> res = res.reset_index()
>>> res.columns = ['subject', 'trial_num', 'sample_num', 'sample']
>>> res
    subject  trial_num  sample_num  sample
0         1          1           0    1.89
1         1          1           1   -2.92
2         1          1           2    0.34
3         1          2           0    0.85
4         1          2           1    0.24
5         1          2           2    0.72
6         1          3           0   -0.96
7         1          3           1   -2.72
8         1          3           2   -0.11
9         2          1           0   -1.33
10        2          1           1    3.13
11        2          1           2   -0.65
12        2          2           0    0.10
13        2          2           1    0.65
14        2          2           2    0.15
15        2          3           0    0.64
16        2          3           1   -0.10
17        2          3           2   -0.76
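
An equivalent spelling of the same recipe that names the index levels instead of overwriting res.columns by hand; a sketch, not a different method:

>>> res = (df.set_index(['subject', 'trial_num'])['samples']
...          .apply(pd.Series)
...          .stack()
...          .rename_axis(['subject', 'trial_num', 'sample_num'])
...          .reset_index(name='sample'))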

>>> objs = [df, pd.DataFrame(df['samples'].tolist())]
>>> pd.concat(objs, axis=1).drop('samples', axis=1)
   subject  trial_num      0      1      2
0        1          1  -0.49  -1.00   0.44
1        1          2  -0.28   1.48   2.01
2        1          3  -0.52  -1.84   0.02
3        2          1   1.23  -1.36  -1.06
4        2          2   0.54   0.18   0.51
5        2          3  -2.18  -0.13  -1.35
>>> pd.melt(_, var_name='sample_num', value_name='sample',
...         value_vars=[0, 1, 2], id_vars=['subject', 'trial_num'])
    subject  trial_num  sample_num  sample
0         1          1           0   -0.49
1         1          2           0   -0.28
2         1          3           0   -0.52
3         2          1           0    1.23
4         2          2           0    0.54
5         2          3           0   -2.18
6         1          1           1   -1.00
7         1          2           1    1.48
8         1          3           1   -1.84
9         2          1           1   -1.36
10        2          2           1    0.18
11        2          3           1   -0.13
12        1          1           2    0.44
13        1          2           2    2.01
14        1          3           2    0.02
15        2          1           2   -1.06
16        2          2           2    0.51
17        2          3           2   -1.35
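
melt interleaves the sample positions (all of position 0 first, then 1, then 2); sorting restores the per-trial grouping. A sketch, reusing the REPL's _ for the melted result just shown:

>>> _.sort_values(['subject', 'trial_num', 'sample_num']).reset_index(drop=True)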

items_as_cols = df.apply(lambda x: pd.Series(x['samples']), axis=1)
# Keep original df index as a column so it's retained after melt
items_as_cols['orig_index'] = items_as_cols.index

melted_items = pd.melt(items_as_cols, id_vars='orig_index',
                       var_name='sample_num', value_name='sample')
melted_items.set_index('orig_index', inplace=True)

df.merge(melted_items, left_index=True, right_index=True)

                 samples  subject  trial_num  sample_num  sample
0    [1.84, 1.05, -0.66]        1          1           0    1.84
0    [1.84, 1.05, -0.66]        1          1           1    1.05
0    [1.84, 1.05, -0.66]        1          1           2   -0.66
1    [-0.24, -0.9, 0.65]        1          2           0   -0.24
1    [-0.24, -0.9, 0.65]        1          2           1   -0.90
1    [-0.24, -0.9, 0.65]        1          2           2    0.65
2    [1.15, -0.87, -1.1]        1          3           0    1.15
2    [1.15, -0.87, -1.1]        1          3           1   -0.87
2    [1.15, -0.87, -1.1]        1          3           2   -1.10
3   [-0.8, -0.62, -0.68]        2          1           0   -0.80
3   [-0.8, -0.62, -0.68]        2          1           1   -0.62
3   [-0.8, -0.62, -0.68]        2          1           2   -0.68
4    [0.91, -0.47, 1.43]        2          2           0    0.91
4    [0.91, -0.47, 1.43]        2          2           1   -0.47
4    [0.91, -0.47, 1.43]        2          2           2    1.43
5  [-1.14, -0.24, -0.91]        2          3           0   -1.14
5  [-1.14, -0.24, -0.91]        2          3           1   -0.24
5  [-1.14, -0.24, -0.91]        2          3           2   -0.91
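
The merged result still carries the original list column. If only the long columns are wanted, dropping it afterwards is enough; a sketch (the name tidy is hypothetical):

tidy = (df.merge(melted_items, left_index=True, right_index=True)
          .drop('samples', axis=1)
          .reset_index(drop=True))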

column_to_explode = 'samples'
res = (df
       .set_index([x for x in df.columns if x != column_to_explode])[column_to_explode]
       .apply(pd.Series)
       .stack()
       .reset_index())
res = res.rename(columns={
    res.columns[-2]: 'exploded_{}_index'.format(column_to_explode),
    res.columns[-1]: '{}_exploded'.format(column_to_explode)})
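
The same recipe wrapped as a reusable helper plus a usage line; a sketch, and the function name is hypothetical:

def explode_via_stack(frame, column_to_explode):
    # index on every other column, expand the lists to columns, stack back to rows
    res = (frame
           .set_index([x for x in frame.columns if x != column_to_explode])[column_to_explode]
           .apply(pd.Series)
           .stack()
           .reset_index())
    return res.rename(columns={
        res.columns[-2]: 'exploded_{}_index'.format(column_to_explode),
        res.columns[-1]: '{}_exploded'.format(column_to_explode)})

explode_via_stack(df, 'samples')
# columns: subject, trial_num, exploded_samples_index, samples_exploded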

(df.samples.apply(lambda x: pd.Series(x))
   .join(df)
   .melt(['subject', 'trial_num'], [0, 1, 2], var_name='sample'))

    subject  trial_num  sample  value
0         1          1       0  -0.24
1         1          2       0   0.14
2         1          3       0  -0.67
3         2          1       0  -1.52
4         2          2       0  -0.00
5         2          3       0  -1.73
6         1          1       1  -0.70
7         1          2       1  -0.70
8         1          3       1  -0.29
9         2          1       1  -0.70
10        2          2       1  -0.72
11        2          3       1   1.30
12        1          1       2  -0.55
13        1          2       2   0.10
14        1          3       2  -0.44
15        2          1       2   0.13
16        2          2       2  -1.44
17        2          3       2   0.73
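
Here var_name='sample' labels the list position and the values land in 'value'; renaming lines it up with the target layout at the top. A sketch, assuming the melted frame above is bound to a (hypothetical) name long:

long = long.rename(columns={'sample': 'sample_num', 'value': 'sample'})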

# Build the exploded values, their source row index and their within-list position
# with a plain loop, then merge back on the index.
df = df.reset_index(drop=True)
lstcol = df.lstcol.values
lstcollist = []
indexlist = []
countlist = []
for ii in range(len(lstcol)):
    lstcollist.extend(lstcol[ii])
    indexlist.extend([ii] * len(lstcol[ii]))
    countlist.extend([jj for jj in range(len(lstcol[ii]))])
df = pd.merge(df.drop("lstcol", axis=1),
              pd.DataFrame({"lstcol": lstcollist, "lstcol_num": countlist},
                           index=indexlist),
              left_index=True, right_index=True).reset_index(drop=True)
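
The same bookkeeping reads a bit more directly with enumerate; a sketch of the same idea, starting again from the original frame and still assuming a list column literally named "lstcol":

rows = [(i, j, v)
        for i, lst in enumerate(df["lstcol"])
        for j, v in enumerate(lst)]
exploded = (pd.DataFrame(rows, columns=["orig_index", "lstcol_num", "lstcol"])
              .set_index("orig_index"))
df = (df.drop("lstcol", axis=1)
        .merge(exploded, left_index=True, right_index=True)
        .reset_index(drop=True))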

df = pd.DataFrame({
    'var1': [['a', 'b', 'c'], ['d', 'e'], [], np.nan],
    'var2': [1, 2, 3, 4]
})
df
        var1  var2
0  [a, b, c]     1
1     [d, e]     2
2         []     3
3        NaN     4

df.explode('var1')

  var1  var2
0    a     1
0    b     1
0    c     1
1    d     2
1    e     2
2  NaN     3   # empty list converted to NaN
3  NaN     4   # NaN entry preserved as-is

# to reset the index to be monotonically increasing...
df.explode('var1').reset_index(drop=True)

  var1  var2
0    a     1
1    b     1
2    c     1
3    d     2
4    e     2
5  NaN     3
6  NaN     4
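
With pandas >= 0.25, the samples frame from the top of the paste can be exploded the same way; a sketch that also rebuilds sample_num with a groupby cumcount (samples_df and out are hypothetical names standing in for the original frame and the result):

out = (samples_df.explode('samples')
                 .rename(columns={'samples': 'sample'})
                 .reset_index(drop=True))
# number the samples 0, 1, 2 within each (subject, trial_num) group
out['sample_num'] = out.groupby(['subject', 'trial_num']).cumcount()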