Advertisement
Guest User

Untitled

a guest
Aug 6th, 2016
116
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.23 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. from pandas.io.parsers import StringIO
  4.  
  def find_closest_date(timepoint, time_series, add_time_delta_column=True):
      """Return the entry of ``time_series`` nearest in time to ``timepoint``.

      Args:
          timepoint: A ``pd.Timestamp`` to compare against.
          time_series: A ``pd.Series`` of dates. Its index may be arbitrary
              (for example a filtered slice of a larger frame).
          add_time_delta_column: When true, also report the absolute time
              difference between ``timepoint`` and the closest date.

      Returns:
          A ``pd.Series`` indexed by ``'closest_date'`` and, when requested,
          ``'closest_delta'``.

      Raises:
          ValueError: If ``time_series`` is empty (there is no minimum).
      """
      deltas = (time_series - timepoint).abs()

      # Locate the minimum by *position* on the raw values and index with
      # ``.iloc``.  Mixing a positional argmin with label-based lookups
      # (``.ix`` / ``deltas[...]``) picks the wrong row, or raises, whenever
      # the series does not have a default 0..n-1 index.
      closest_pos = int(deltas.values.argmin())

      res = {"closest_date": time_series.iloc[closest_pos]}
      idx = ["closest_date"]
      if add_time_delta_column:
          res["closest_delta"] = deltas.iloc[closest_pos]
          idx.append("closest_delta")
      return pd.Series(res, index=idx)
  17.  
  18.  
  # Sample data set "a": one row per (timestamp, email, subject).
  # Built with join() so the CSV text does not depend on source indentation;
  # the trailing "" element keeps the final newline of the original literal.
  a = "\n".join([
      "timestamp,email,subject",
      "2016-07-01 10:17:00,a@gmail.com,subject3",
      "2016-07-01 02:01:02,a@gmail.com,welcome",
      "2016-07-01 14:45:04,a@gmail.com,subject3",
      "2016-07-01 08:14:02,a@gmail.com,subject2",
      "2016-07-01 16:26:35,a@gmail.com,subject4",
      "2016-07-01 10:17:00,b@gmail.com,subject3",
      "2016-07-01 02:01:02,b@gmail.com,welcome",
      "2016-07-01 14:45:04,b@gmail.com,subject3",
      "2016-07-01 08:14:02,b@gmail.com,subject2",
      "2016-07-01 16:26:35,b@gmail.com,subject4",
      "",
  ])

  # Sample data set "b": same keys plus the `clicks` and `var1` columns, with
  # timestamps that are close to, but not equal to, those in "a".
  b = "\n".join([
      "timestamp,email,subject,clicks,var1",
      "2016-07-01 02:01:14,a@gmail.com,welcome,1,1",
      "2016-07-01 08:15:48,a@gmail.com,subject2,2,2",
      "2016-07-01 10:17:39,a@gmail.com,subject3,1,7",
      "2016-07-01 14:46:01,a@gmail.com,subject3,1,2",
      "2016-07-01 16:27:28,a@gmail.com,subject4,1,2",
      "2016-07-01 10:17:05,b@gmail.com,subject3,0,0",
      "2016-07-01 02:01:03,b@gmail.com,welcome,0,0",
      "2016-07-01 14:45:05,b@gmail.com,subject3,0,0",
      "2016-07-01 08:16:00,b@gmail.com,subject2,0,0",
      "2016-07-01 17:00:00,b@gmail.com,subject4,0,0",
      "",
  ])

  df1 = pd.read_csv(StringIO(a), parse_dates=['timestamp'])
  df2 = pd.read_csv(StringIO(b), parse_dates=['timestamp'])


  def _closest_for_same_email(row):
      """Find the df2 timestamp nearest to this df1 row, within the same email."""
      same_email = df2.loc[df2['email'] == row['email'], 'timestamp']
      return find_closest_date(row['timestamp'], same_email)


  # Search only among df2 rows that share the row's email.  Searching all of
  # df2 lets one address match another address's timestamp, which then finds
  # no partner when the result is joined back per email.
  matches = df1.apply(_closest_for_same_email, axis=1)
  df1['closest'] = pd.to_datetime(matches['closest_date'])
  df1['time_bt_x_and_y'] = pd.to_timedelta(matches['closest_delta'])
  df1

  # Join on email as well as the matched timestamp so that identical
  # timestamps belonging to different addresses are never paired.
  df3 = pd.merge(
      df1,
      df2,
      left_on=['email', 'closest'],
      right_on=['email', 'timestamp'],
  )

  df3
  54. timestamp_x email subject closest time_bt_x_and_y timestamp_y clicks var1
  55. 2016-07-01 10:17:00 a@gmail.com subject3 2016-07-01 10:17:05 00:00:05 NaT NaN NaN
  56. 2016-07-01 02:01:02 a@gmail.com welcome 2016-07-01 02:01:03 00:00:01 NaT NaN NaN
  57. 2016-07-01 14:45:04 a@gmail.com subject3 2016-07-01 14:45:05 00:00:01 NaT NaN NaN
  58. 2016-07-01 08:14:02 a@gmail.com subject2 2016-07-01 08:15:48 00:01:46 2016-07-01 08:15:48 2.0 2.0
  59. 2016-07-01 16:26:35 a@gmail.com subject4 2016-07-01 16:27:28 00:00:53 2016-07-01 16:27:28 1.0 2.0
  60. 2016-07-01 10:17:00 b@gmail.com subject3 2016-07-01 10:17:05 00:00:05 2016-07-01 10:17:05 0.0 0.0
  61. 2016-07-01 02:01:02 b@gmail.com welcome 2016-07-01 02:01:03 00:00:01 2016-07-01 02:01:03 0.0 0.0
  62. 2016-07-01 14:45:04 b@gmail.com subject3 2016-07-01 14:45:05 00:00:01 2016-07-01 14:45:05 0.0 0.0
  63. 2016-07-01 08:14:02 b@gmail.com subject2 2016-07-01 08:15:48 00:01:46 NaT NaN NaN
  64. 2016-07-01 16:26:35 b@gmail.com subject4 2016-07-01 16:27:28 00:00:53 NaT NaN NaN
  65.  
  # Apply find_closest_date once per (email, subject) group of df1 timestamps.
  # NOTE(review): groupby(...).apply hands each group's timestamp Series (not
  # a single Timestamp) to find_closest_date as `timepoint`, and the series it
  # is compared against is df1's own timestamp column -- confirm this is the
  # intended comparison.
  timestamps_by_key = df1.groupby(['email', 'subject'])['timestamp']
  timestamps_by_key.apply(find_closest_date, args=[df1.timestamp])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement