Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Baseline permutation loop written with pandas only.
# N (number of permutations), df and s are expected to be defined by the
# caller; the larger N is, the better.
for i in range(N):
    # df is the large DataFrame (10K rows x 1K columns); np.random.permutation
    # is applied to each row separately because of axis=1.
    # NOTE(review): on recent pandas versions an array-returning function
    # applied with axis=1 may produce a Series of arrays instead of a
    # DataFrame -- confirm whether result_type="broadcast" is required.
    df_sh = df.apply(np.random.permutation, axis=1)
    # s is the provided Series (shape (1000,)); correlate it row-wise with
    # the shuffled frame.
    corr = df_sh.corrwith(s, axis=1)
def corr2_coeff_2d_1d(A, B):
    """Return the Pearson correlation of every row of ``A`` with ``B``.

    Parameters
    ----------
    A : array-like, shape (m, n)
        One observation vector per row.
    B : array-like, shape (n,)
        The vector each row of ``A`` is correlated with.

    Returns
    -------
    numpy.ndarray, shape (m,)
        Correlation coefficient of each row of ``A`` with ``B``.  An entry
        is NaN when the corresponding row of ``A`` (or ``B`` itself) is
        constant, because the correlation is undefined in that case.

    Raises
    ------
    ValueError
        If ``A`` is not 2-D, ``B`` is not 1-D, or their lengths differ.
    """
    # Accept lists / pandas objects as well as ndarrays.
    A = np.asarray(A)
    B = np.asarray(B)
    if A.ndim != 2 or B.ndim != 1 or A.shape[1] != B.shape[0]:
        raise ValueError(
            "expected A with shape (m, n) and B with shape (n,), got "
            "%s and %s" % (A.shape, B.shape)
        )

    # Row-wise mean of input arrays & subtract from input arrays themselves
    A_mA = A - A.mean(axis=1, keepdims=True)
    B_mB = B - B.mean()

    # Sum of squares across rows
    ssA = np.einsum('ij,ij->i', A_mA, A_mA)
    ssB = B_mB.dot(B_mB)

    # Finally get the correlation coefficients.  A zero variance gives 0/0,
    # which is deliberately reported as NaN without a runtime warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        return A_mA.dot(B_mB) / np.sqrt(ssA * ssB)
# Extract the underlying array data for faster NumPy processing in the loop
# later on.  A copy is taken because np.random.shuffle works in place and the
# array returned by df.values may share memory with df itself.
a = df.values.copy()
s_ar = s.values

for i in range(N):
    # Shuffling the transposed view reorders the columns of `a` in place;
    # every row therefore receives the same column permutation.
    np.random.shuffle(a.T)

    # Compute correlation of each shuffled row with s
    corr = corr2_coeff_2d_1d(a, s_ar)
# Variant of the permutation loop with the loop-invariant work hoisted out.
# A copy is taken because np.random.shuffle works in place and the array
# returned by df.values may share memory with df itself.
a = df.values.copy()
s_ar = s.values

# Terms that depend only on s: the centred vector and its sum of squares.
B_mB = s_ar - s_ar.mean()
ssB = B_mB.dot(B_mB)

# Reordering the values inside a row changes neither the row mean nor the
# row sum of squares, so both (and the denominator) are computed once.
A_mean = a.mean(axis=1, keepdims=True)
A_mA = a - A_mean
ssA = np.einsum('ij,ij->i', A_mA, A_mA)
denom = np.sqrt(ssA * ssB)

for i in range(N):
    # Reorders the columns of `a` in place (same permutation for every row).
    np.random.shuffle(a.T)

    # Compute correlation: only the numerator depends on the shuffle.
    A_mA = a - A_mean
    corr = A_mA.dot(B_mB) / denom
- In [170]: df = pd.DataFrame(np.random.rand(10000,1000))
- In [171]: s = pd.Series(df.iloc[0])
- In [172]: %%timeit
- ...: df_sh = df.apply(np.random.permutation, axis=1)
- ...: corr = df_sh.corrwith(s, axis = 1)
- 1 loop, best of 3: 2.09 s per loop
- In [173]: a = df.values
- ...: s_ar = s.values
- ...: r = np.arange(a.shape[0])[:,None]
- ...:
- ...: B = s_ar
- ...: B_mB = B - B.mean()
- ...: ssB = B_mB.dot(B_mB)
- ...:
- ...: A = a
- ...: A_mean = A.mean(1,keepdims=1)
- In [174]: %%timeit
- ...: np.random.shuffle(a.T)
- ...:
- ...: # Compute correlation
- ...: A = a
- ...: A_mA = A - A_mean
- ...: ssA = np.einsum('ij,ij->i',A_mA,A_mA)
- ...: corr = A_mA.dot(B_mB)/np.sqrt(ssA*ssB)
- 1 loop, best of 3: 181 ms per loop
- In [175]: 2090.0/181
- Out[175]: 11.546961325966851
Add Comment
Please, Sign In to add comment