Advertisement
n8henrie

Pandas DataFrame Resample

Feb 10th, 2014
187
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.29 KB | None | 0 0
  1. #! /usr/bin/env python3
  2.  
  3. import pandas as pd
  4. import numpy as np
  5. import datetime
  6.  
  7. dr = pd.date_range('11/6/2012', periods = 12, freq = 'W-Tue')
  8. df = pd.DataFrame(np.random.randn(len(dr)), index = dr, columns = ['column1'])
  9. df = df.resample(rule = '28D', how = 'mean')
  10.  
  11. # The start dates for each period
  12. df
  13. #              column1
  14. # 2012-11-06 -0.134757
  15. # 2012-12-04 -0.382133
  16. # 2013-01-01 -0.468343
  17. # 2013-01-29  0.000000
  18.  
  19. # [4 rows x 1 columns]
  20.  
  21. # Vacation dates
  22. vaca_days = list(pd.date_range(start = '2012-12-1', periods = 12, freq = 'D'))
  23. vaca_days
  24. # Timestamp('2012-12-01 00:00:00', tz=None),
  25. #  Timestamp('2012-12-02 00:00:00', tz=None),
  26. #  Timestamp('2012-12-03 00:00:00', tz=None),
  27. #  Timestamp('2012-12-04 00:00:00', tz=None),
  28. #  Timestamp('2012-12-05 00:00:00', tz=None),
  29. #  Timestamp('2012-12-06 00:00:00', tz=None),
  30. #  Timestamp('2012-12-07 00:00:00', tz=None),
  31. #  Timestamp('2012-12-08 00:00:00', tz=None),
  32. #  Timestamp('2012-12-09 00:00:00', tz=None),
  33. #  Timestamp('2012-12-10 00:00:00', tz=None),
  34. #  Timestamp('2012-12-11 00:00:00', tz=None),
  35. #  Timestamp('2012-12-12 00:00:00', tz=None)]
  36.  
  37. # Clearly there should be 3 vacation dates in the 2012-11-06 period
  38. # and 9 vacation dates in the 2013-12-04 period
  39. # ---------------------------
  40. # This gives me wrong numbers, because it tries to start the period on vaca_days[0]
  41. pd.Series(1, vaca_days).resample('28D', how='sum')
  42. # 2012-12-01    12
  43. # 2012-12-29     0
  44. # Freq: 28D, dtype: int64
  45.  
  46. # No help here
  47. pd.Series(1, vaca_days).resample('28D', how='sum').shift(-1)
  48. # 2012-12-01     0
  49. # 2012-12-29   NaN
  50. # Freq: 28D, dtype: float64
  51.  
  52. # The only workaround I could figure out: Append on the first start day
  53. vaca_days.append(pd.Timestamp('2012-11-06'))
  54.  
  55. vaca_series = pd.Series([0] + [1 for each in vaca_days[:-1]], sorted(vaca_days))
  56. vaca_series = vaca_series.resample(rule = '28D', how = sum)
  57.  
  58. vaca_series
  59. # 2012-11-06    3
  60. # 2012-12-04    9
  61. # 2013-01-01    0
  62. # Freq: 28D, dtype: int64
  63.  
  64. df['Vacation Days'] = vaca_series
  65. df['Vacation Days'] = df['Vacation Days'].fillna(0)
  66. df
  67.  
  68. #                   column1  Vacation Days
  69. # 2012-11-06  -4.761727e-01              3
  70. # 2012-12-04   5.715697e-01              9
  71. # 2013-01-01   4.065451e-02              0
  72. # 2013-01-29  2.781364e-309              0
  73.  
  74. # [4 rows x 2 columns]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement