Advertisement
Guest User

Untitled

a guest
Aug 8th, 2016
114
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.36 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. from pandas.io.parsers import StringIO
  4.  
  5. def find_closest_date(timepoint, time_series, add_time_delta_column=True):
  6. # takes a pd.Timestamp() instance and a pd.Series with dates in it
  7. # calcs the delta between `timepoint` and each date in `time_series`
  8. # returns the closest date and optionally the number of days in its time delta
  9. deltas = np.abs(time_series - timepoint)
  10. idx_closest_date = np.argmin(deltas)
  11. res = {"closest_date": time_series.ix[idx_closest_date]}
  12. idx = ['closest_date']
  13. if add_time_delta_column:
  14. res["closest_delta"] = deltas[idx_closest_date]
  15. idx.append('closest_delta')
  16. return pd.Series(res, index=idx)
  17.  
  18.  
  19. a = """timestamp,email,subject
  20. 2016-07-01 10:17:00,a@gmail.com,subject3
  21. 2016-07-01 02:01:02,a@gmail.com,welcome
  22. 2016-07-01 14:45:04,a@gmail.com,subject3
  23. 2016-07-01 08:14:02,a@gmail.com,subject2
  24. 2016-07-01 16:26:35,a@gmail.com,subject4
  25. 2016-07-01 10:17:00,b@gmail.com,subject3
  26. 2016-07-01 02:01:02,b@gmail.com,welcome
  27. 2016-07-01 14:45:04,b@gmail.com,subject3
  28. 2016-07-01 08:14:02,b@gmail.com,subject2
  29. 2016-07-01 16:26:35,b@gmail.com,subject4
  30. """
  31.  
  32. b = """timestamp,email,subject,clicks,var1
  33. 2016-07-01 02:01:14,a@gmail.com,welcome,1,1
  34. 2016-07-01 08:15:48,a@gmail.com,subject2,2,2
  35. 2016-07-01 10:17:39,a@gmail.com,subject3,1,7
  36. 2016-07-01 14:46:01,a@gmail.com,subject3,1,2
  37. 2016-07-01 16:27:28,a@gmail.com,subject4,1,2
  38. 2016-07-01 10:17:05,b@gmail.com,subject3,0,0
  39. 2016-07-01 02:01:03,b@gmail.com,welcome,0,0
  40. 2016-07-01 14:45:05,b@gmail.com,subject3,0,0
  41. 2016-07-01 08:16:00,b@gmail.com,subject2,0,0
  42. 2016-07-01 17:00:00,b@gmail.com,subject4,0,0
  43. """
  44.  
  45. a = """timestamp,email,subject
  46. 2016-07-01 10:17:00,a@gmail.com,subject3
  47. 2016-07-01 10:17:00,b@gmail.com,subject3
  48. """
  49.  
  50. b = """timestamp,email,subject,clicks,var1
  51. 2016-07-01 10:17:39,a@gmail.com,subject3,1,7
  52. 2016-07-01 10:17:05,b@gmail.com,subject3,0,0
  53. """
  54. df1 = pd.read_csv(StringIO(a), parse_dates=['timestamp'])
  55. df2 = pd.read_csv(StringIO(b), parse_dates=['timestamp'])
  56.  
  57. df1[['closest', 'time_bt_x_and_y']] = df1.timestamp.apply(find_closest_date, args=[df2.timestamp])
  58. df1
  59.  
  60. df3 = pd.merge(df1, df2, left_on=['email','subject','closest'], right_on=['email','subject','timestamp'],how='left')
  61.  
  62. df3
  63. timestamp_x email subject closest time_bt_x_and_y timestamp_y clicks var1
  64. 2016-07-01 10:17:00 a@gmail.com subject3 2016-07-01 10:17:05 00:00:05 NaT NaN NaN
  65. 2016-07-01 02:01:02 a@gmail.com welcome 2016-07-01 02:01:03 00:00:01 NaT NaN NaN
  66. 2016-07-01 14:45:04 a@gmail.com subject3 2016-07-01 14:45:05 00:00:01 NaT NaN NaN
  67. 2016-07-01 08:14:02 a@gmail.com subject2 2016-07-01 08:15:48 00:01:46 2016-07-01 08:15:48 2.0 2.0
  68. 2016-07-01 16:26:35 a@gmail.com subject4 2016-07-01 16:27:28 00:00:53 2016-07-01 16:27:28 1.0 2.0
  69. 2016-07-01 10:17:00 b@gmail.com subject3 2016-07-01 10:17:05 00:00:05 2016-07-01 10:17:05 0.0 0.0
  70. 2016-07-01 02:01:02 b@gmail.com welcome 2016-07-01 02:01:03 00:00:01 2016-07-01 02:01:03 0.0 0.0
  71. 2016-07-01 14:45:04 b@gmail.com subject3 2016-07-01 14:45:05 00:00:01 2016-07-01 14:45:05 0.0 0.0
  72. 2016-07-01 08:14:02 b@gmail.com subject2 2016-07-01 08:15:48 00:01:46 NaT NaN NaN
  73. 2016-07-01 16:26:35 b@gmail.com subject4 2016-07-01 16:27:28 00:00:53 NaT NaN NaN
  74.  
  75. df1.groupby(['email','subject'])['timestamp'].apply(find_closest_date, args=[df1.timestamp])
  76.  
  77. In [108]: result = pd.merge(df1, df2, how='left', on=['email','subject'], suffixes=['', '_y']); result
  78. Out[108]:
  79. timestamp email subject timestamp_y clicks var1
  80. 0 2016-07-01 10:17:00 a@gmail.com subject3 2016-07-01 10:17:39 1 7
  81. 1 2016-07-01 10:17:00 a@gmail.com subject3 2016-07-01 14:46:01 1 2
  82. 2 2016-07-01 02:01:02 a@gmail.com welcome 2016-07-01 02:01:14 1 1
  83. 3 2016-07-01 14:45:04 a@gmail.com subject3 2016-07-01 10:17:39 1 7
  84. 4 2016-07-01 14:45:04 a@gmail.com subject3 2016-07-01 14:46:01 1 2
  85. 5 2016-07-01 08:14:02 a@gmail.com subject2 2016-07-01 08:15:48 2 2
  86. 6 2016-07-01 16:26:35 a@gmail.com subject4 2016-07-01 16:27:28 1 2
  87. 7 2016-07-01 10:17:00 b@gmail.com subject3 2016-07-01 10:17:05 0 0
  88. 8 2016-07-01 10:17:00 b@gmail.com subject3 2016-07-01 14:45:05 0 0
  89. 9 2016-07-01 02:01:02 b@gmail.com welcome 2016-07-01 02:01:03 0 0
  90. 10 2016-07-01 14:45:04 b@gmail.com subject3 2016-07-01 10:17:05 0 0
  91. 11 2016-07-01 14:45:04 b@gmail.com subject3 2016-07-01 14:45:05 0 0
  92. 12 2016-07-01 08:14:02 b@gmail.com subject2 2016-07-01 08:16:00 0 0
  93. 13 2016-07-01 16:26:35 b@gmail.com subject4 2016-07-01 17:00:00 0 0
  94.  
  95. result['diff'] = (result['timestamp_y'] - result['timestamp']).abs()
  96.  
  97. idx = result.groupby(['timestamp','email','subject'])['diff'].idxmin()
  98. result = result.loc[idx]
  99.  
  100. import numpy as np
  101. import pandas as pd
  102. from pandas.io.parsers import StringIO
  103.  
  104. a = """timestamp,email,subject
  105. 2016-07-01 10:17:00,a@gmail.com,subject3
  106. 2016-07-01 02:01:02,a@gmail.com,welcome
  107. 2016-07-01 14:45:04,a@gmail.com,subject3
  108. 2016-07-01 08:14:02,a@gmail.com,subject2
  109. 2016-07-01 16:26:35,a@gmail.com,subject4
  110. 2016-07-01 10:17:00,b@gmail.com,subject3
  111. 2016-07-01 02:01:02,b@gmail.com,welcome
  112. 2016-07-01 14:45:04,b@gmail.com,subject3
  113. 2016-07-01 08:14:02,b@gmail.com,subject2
  114. 2016-07-01 16:26:35,b@gmail.com,subject4
  115. """
  116.  
  117. b = """timestamp,email,subject,clicks,var1
  118. 2016-07-01 02:01:14,a@gmail.com,welcome,1,1
  119. 2016-07-01 08:15:48,a@gmail.com,subject2,2,2
  120. 2016-07-01 10:17:39,a@gmail.com,subject3,1,7
  121. 2016-07-01 14:46:01,a@gmail.com,subject3,1,2
  122. 2016-07-01 16:27:28,a@gmail.com,subject4,1,2
  123. 2016-07-01 10:17:05,b@gmail.com,subject3,0,0
  124. 2016-07-01 02:01:03,b@gmail.com,welcome,0,0
  125. 2016-07-01 14:45:05,b@gmail.com,subject3,0,0
  126. 2016-07-01 08:16:00,b@gmail.com,subject2,0,0
  127. 2016-07-01 17:00:00,b@gmail.com,subject4,0,0
  128. """
  129.  
  130. df1 = pd.read_csv(StringIO(a), parse_dates=['timestamp'])
  131. df2 = pd.read_csv(StringIO(b), parse_dates=['timestamp'])
  132.  
  133. result = pd.merge(df1, df2, how='left', on=['email','subject'], suffixes=['', '_y'])
  134. result['diff'] = (result['timestamp_y'] - result['timestamp']).abs()
  135. idx = result.groupby(['timestamp','email','subject'])['diff'].idxmin()
  136. result = result.loc[idx].drop(['timestamp_y','diff'], axis=1)
  137. result = result.sort_index()
  138. print(result)
  139.  
  140. timestamp email subject clicks var1
  141. 0 2016-07-01 10:17:00 a@gmail.com subject3 1 7
  142. 2 2016-07-01 02:01:02 a@gmail.com welcome 1 1
  143. 4 2016-07-01 14:45:04 a@gmail.com subject3 1 2
  144. 5 2016-07-01 08:14:02 a@gmail.com subject2 2 2
  145. 6 2016-07-01 16:26:35 a@gmail.com subject4 1 2
  146. 7 2016-07-01 10:17:00 b@gmail.com subject3 0 0
  147. 9 2016-07-01 02:01:02 b@gmail.com welcome 0 0
  148. 11 2016-07-01 14:45:04 b@gmail.com subject3 0 0
  149. 12 2016-07-01 08:14:02 b@gmail.com subject2 0 0
  150. 13 2016-07-01 16:26:35 b@gmail.com subject4 0 0
  151.  
  152. a = """timestamp,email,subject
  153. 2016-07-01 10:17:00,a@gmail.com,subject3
  154. 2016-07-01 02:01:02,a@gmail.com,welcome
  155. 2016-07-01 14:45:04,a@gmail.com,subject3
  156. 2016-07-01 08:14:02,a@gmail.com,subject2
  157. 2016-07-01 16:26:35,a@gmail.com,subject4
  158. 2016-07-01 10:17:00,b@gmail.com,subject3
  159. 2016-07-01 02:01:02,b@gmail.com,welcome
  160. 2016-07-01 14:45:04,b@gmail.com,subject3
  161. 2016-07-01 08:14:02,b@gmail.com,subject2
  162. 2016-07-01 16:26:35,b@gmail.com,subject4
  163. """
  164.  
  165. b = """timestamp,email,subject,clicks,var1
  166. 2016-07-01 02:01:14,a@gmail.com,welcome,1,1
  167. 2016-07-01 08:15:48,a@gmail.com,subject2,2,2
  168. 2016-07-01 10:17:39,a@gmail.com,subject3,1,7
  169. 2016-07-01 14:46:01,a@gmail.com,subject3,1,2
  170. 2016-07-01 16:27:28,a@gmail.com,subject4,1,2
  171. 2016-07-01 10:17:05,b@gmail.com,subject3,0,0
  172. 2016-07-01 02:01:03,b@gmail.com,welcome,0,0
  173. 2016-07-01 14:45:05,b@gmail.com,subject3,0,0
  174. 2016-07-01 08:16:00,b@gmail.com,subject2,0,0
  175. 2016-07-01 17:00:00,b@gmail.com,subject4,0,0
  176. """
  177.  
  178. df1 = pd.read_csv(StringIO(a), parse_dates=['timestamp'])
  179. df2 = pd.read_csv(StringIO(b), parse_dates=['timestamp'])
  180. df2 = df2.set_index(['email', 'subject'])
  181.  
  182. def find_closest_date(timepoint, time_series, add_time_delta_column=True):
  183. # takes a pd.Timestamp() instance and a pd.Series with dates in it
  184. # calcs the delta between `timepoint` and each date in `time_series`
  185. # returns the closest date and optionally the number of days in its time delta
  186. time_series = time_series.values
  187. timepoint = np.datetime64(timepoint)
  188. deltas = np.abs(np.subtract(time_series, timepoint))
  189. idx_closest_date = np.argmin(deltas)
  190. res = {"closest_date": time_series[idx_closest_date]}
  191. idx = ['closest_date']
  192. if add_time_delta_column:
  193. res["closest_delta"] = deltas[idx_closest_date]
  194. idx.append('closest_delta')
  195. return pd.Series(res, index=idx)
  196.  
  197. # Then group df1 as needed
  198. grouped = df1.groupby(['email', 'subject'])
  199.  
  200. # Finally loop over the group items, finding the closest timestamps
  201. join_ts = pd.DataFrame()
  202. for name, group in grouped:
  203. try:
  204. join_ts = pd.concat([join_ts, group['timestamp']
  205. .apply(find_closest_date, time_series=df2.loc[name, 'timestamp'])],
  206. axis=0)
  207. except KeyError:
  208. pass
  209.  
  210. df3 = pd.merge(pd.concat([df1, join_ts], axis=1), df2, left_on=['closest_date'], right_on=['timestamp'])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement