Guest User

Untitled

a guest
Jul 16th, 2018
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 13.63 KB | None | 0 0
  1. import pandas as pd
  2. from datetime import datetime
  3. from datetime import timedelta
  4. import time
  5.  
  6. ## Filenames
  7. #chicago = 'chicago.csv'
  8. #new_york_city = 'new_york_city.csv'
  9. #washington = 'washington.csv'
  10.  
  11.  
  def get_city():
      '''Asks the user for a city and returns the filename for that city's bike share data.

      The question is repeated until the answer is Chicago, New York or
      Washington. Letter case and surrounding whitespace are ignored.

      Args:
          none.
      Returns:
          (str) Filename for a city's bikeshare data.
      '''
      # One lookup table replaces the if/elif chain and the separate validity list.
      filenames = {
          'chicago': 'chicago.csv',
          'new york': 'new_york_city.csv',
          'washington': 'washington.csv',
      }
      while True:
          answer = input('\nHello! Let\'s explore some US bikeshare data!\n'
                         'Would you like to see data for Chicago, New York, or'
                         ' Washington?\n')
          # strip() so that an answer such as ' chicago ' is not rejected.
          filename = filenames.get(answer.strip().lower())
          if filename is not None:
              return filename
          print('Sorry, I do not understand your input. Please input either '
                'Chicago, New York, or Washington.')
  33.  
  def get_time_period():
      '''Asks the user for a time period and returns the specified filter.

      The question is repeated until the answer is month, day or none.

      Args:
          none.
      Returns:
          (str) Time filter for the bikeshare data: 'month', 'day' or 'none',
          always lower-case and without surrounding whitespace.
      '''
      valid_periods = ('month', 'day', 'none')
      while True:
          answer = input('\nWould you like to filter the data by month, day,'
                         ' or not at all? Type "none" for no time filter.\n')
          # Normalise before returning: statistics() compares the result with
          # the lower-case literals, so returning e.g. 'Month' unchanged would
          # match none of its branches.
          time_period = answer.strip().lower()
          if time_period in valid_periods:
              return time_period
          print('Sorry, I do not understand your input.')
  48.  
  def get_month():
      '''Asks the user for a month and returns the specified month.

      The question is repeated until the answer names a month from January
      to June. Letter case and surrounding whitespace are ignored.

      Args:
          none.
      Returns:
          (tuple) Lower limit, upper limit of month for the bikeshare data, as
          the strings '2017-<month number>' and '2017-<next month number>'.
      '''
      months_dict = {'january': 1, 'february': 2, 'march': 3, 'april': 4,
                     'may': 5, 'june': 6}
      while True:
          month_input = input('\nWhich month? January, February, March, April,'
                              ' May, or June?\n').strip().lower()
          if month_input in months_dict:
              break
          print('Sorry, I do not understand your input. Please type in a '
                'month between January and June')
      month = months_dict[month_input]
      # The upper limit is the following month; statistics() treats it as exclusive.
      return ('2017-{}'.format(month), '2017-{}'.format(month + 1))
  67.  
  def get_day():
      '''Asks the user for a month and a day and returns that single day.

      The month is obtained through get_month(). The day is asked for again
      until it is an integer that forms a valid 2017 date in that month.

      Args:
          none.
      Returns:
          (tuple) Lower limit, upper limit of date for the bikeshare data; the
          upper limit is the start of the following day.
      '''
      month_start = get_month()[0]
      # get_month() yields '2017-<n>'; everything after the dash is the month number.
      month_number = int(month_start[5:])
      day_prompt = '\nWhich day? Please type your response as an integer.\n'
      while True:
          raw_day = input(day_prompt)
          # Keep asking until the answer parses as an integer.
          while True:
              try:
                  day_number = int(raw_day)
              except ValueError:
                  print('Sorry, I do not understand your input. Please type your'
                        ' response as an integer.')
                  raw_day = input(day_prompt)
              else:
                  break
          # datetime() rejects days that do not exist in the chosen month.
          try:
              start_date = datetime(2017, month_number, day_number)
          except ValueError as error:
              print(str(error).capitalize())
          else:
              break
      end_date = start_date + timedelta(days=1)
      return (str(start_date), str(end_date))
  96.  
  def popular_month(df):
      '''Finds and prints the most popular month for start time.

      When several months are tied, the first value returned by mode() is
      reported. When there are no start times, a notice is printed instead.

      Args:
          bikeshare dataframe with a datetime 'start_time' column
      Returns:
          none
      '''
      months = ['January', 'February', 'March', 'April', 'May', 'June',
                'July', 'August', 'September', 'October', 'November',
                'December']
      month_modes = df['start_time'].dt.month.mode()
      if month_modes.empty:
          print('There is no trip data to determine the most popular month.')
          return
      # mode() returns a Series; int() on the whole Series fails when there is
      # a tie, so take its first element explicitly.
      index = int(month_modes.iloc[0])
      most_pop_month = months[index - 1]
      print('The most popular month is {}.'.format(most_pop_month))
  108.  
  def popular_day(df):
      '''Finds and prints the most popular day of week (Monday, Tuesday, etc.) for start time.

      When several days are tied, the first value returned by mode() is
      reported. When there are no start times, a notice is printed instead.

      Args:
          bikeshare dataframe with a datetime 'start_time' column
      Returns:
          none
      '''
      # Ordered to match dt.dayofweek, which numbers Monday as 0.
      days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
                      'Saturday', 'Sunday']
      day_modes = df['start_time'].dt.dayofweek.mode()
      if day_modes.empty:
          print('There is no trip data to determine the most popular day of week.')
          return
      # Take the first mode explicitly; int() on a multi-element Series raises.
      index = int(day_modes.iloc[0])
      most_pop_day = days_of_week[index]
      print('The most popular day of week for start time is {}.'.format(most_pop_day))
  121.  
  def popular_hour(df):
      '''Finds and prints the most popular hour of day for start time.

      The hour is printed on a 12-hour clock (12am for midnight, 12pm for
      noon). When several hours are tied, the first value returned by mode()
      is reported. When there are no start times, a notice is printed instead.

      Args:
          bikeshare dataframe with a datetime 'start_time' column
      Returns:
          none
      '''
      hour_modes = df['start_time'].dt.hour.mode()
      if hour_modes.empty:
          print('There is no trip data to determine the most popular hour of day.')
          return
      # Take the first mode explicitly; int() on a multi-element Series raises.
      most_pop_hour = int(hour_modes.iloc[0])
      # Hours 0-11 are am and 12-23 are pm, so noon (12) is pm.
      am_pm = 'am' if most_pop_hour < 12 else 'pm'
      # 0 and 12 both display as 12; every other hour wraps modulo 12.
      pop_hour_readable = most_pop_hour % 12 or 12
      print('The most popular hour of day for start time is {}{}.'.format(pop_hour_readable, am_pm))
  140.  
  def trip_duration(df):
      '''Finds and prints the total trip duration and average trip duration in
      hours, minutes, and seconds.

      Both figures are rounded to whole seconds before being split, so the
      output is the same for integer and floating-point columns. When there
      are no durations, a notice is printed instead.

      Args:
          bikeshare dataframe with a numeric 'trip_duration' column
          (the arithmetic treats the values as seconds)
      Returns:
          none
      '''
      durations = df['trip_duration'].dropna()
      if durations.empty:
          print('There is no trip data to calculate trip durations.')
          return
      total_duration = int(round(durations.sum()))
      minute, second = divmod(total_duration, 60)
      hour, minute = divmod(minute, 60)
      print('The total trip duration is {} hours, {} minutes and {}'
            ' seconds.'.format(hour, minute, second))
      average_duration = int(round(durations.mean()))
      m, s = divmod(average_duration, 60)
      # >= so that an average of exactly one hour is not printed as "60 minutes".
      if m >= 60:
          h, m = divmod(m, 60)
          print('The average trip duration is {} hours, {} minutes and {}'
                ' seconds.'.format(h, m, s))
      else:
          print('The average trip duration is {} minutes and {} seconds.'.format(m, s))
  162.  
  def popular_stations(df):
      '''Finds and prints the most popular start station and most popular end station.

      When several stations are tied, the first value returned by mode() is
      reported. The station name is printed in full, independent of the
      pandas display column width. When a column has no values, a notice is
      printed for it instead.

      Args:
          bikeshare dataframe with 'start_station' and 'end_station' columns
      Returns:
          none
      '''
      start_modes = df['start_station'].mode()
      end_modes = df['end_station'].mode()
      # Read the first mode as a scalar: to_string() on the whole Series would
      # print one line per tied station and shorten long names.
      if start_modes.empty:
          print('There is no trip data to determine the most popular start station.')
      else:
          print('The most popular start station is {}.'.format(start_modes.iloc[0]))
      if end_modes.empty:
          print('There is no trip data to determine the most popular end station.')
      else:
          print('The most popular end station is {}.'.format(end_modes.iloc[0]))
  174.  
  def popular_trip(df):
      '''Finds and prints the most popular trip.

      When several trips are tied, the first value returned by mode() is
      reported. The trip is printed in full, independent of the pandas
      display column width. When there are no trips, a notice is printed
      instead.

      Args:
          bikeshare dataframe with a 'journey' column
      Returns:
          none
      '''
      # The 'journey' column is created in the statistics() function.
      trip_modes = df['journey'].mode()
      if trip_modes.empty:
          print('There is no trip data to determine the most popular trip.')
          return
      # Read the first mode as a scalar: to_string() on the whole Series would
      # print one line per tied trip and shorten long names.
      most_pop_trip = trip_modes.iloc[0]
      print('The most popular trip is {}.'.format(most_pop_trip))
  185.  
  def users(df):
      '''Counts the Subscriber rows and the Customer rows and prints both totals.

      Args:
          bikeshare dataframe with a 'user_type' column
      Returns:
          none
      '''
      # Run the same equality query once per user type, in print order.
      totals = [
          df.query('user_type == "{}"'.format(kind)).user_type.count()
          for kind in ('Subscriber', 'Customer')
      ]
      print('There are {} Subscribers and {} Customers.'.format(*totals))
  196.  
  def gender(df):
      '''Finds and prints the counts of gender.

      Rows whose gender is missing or has any other value are not counted.

      Args:
          bikeshare dataframe with a 'gender' column
      Returns:
          none
      '''
      male_count = (df['gender'] == 'Male').sum()
      # Count "Female" here; the earlier version repeated the "Male" filter
      # and so reported the male total twice.
      female_count = (df['gender'] == 'Female').sum()
      print('There are {} male users and {} female users.'.format(male_count, female_count))
  207.  
  def birth_years(df):
      ''' Finds and prints the earliest (i.e. oldest user), most recent (i.e.
      youngest user), and most popular birth years.

      Missing birth years are ignored. When several years are tied for most
      popular, the first value returned by mode() is reported. When no birth
      year is present at all, a notice is printed instead.

      Args:
          bikeshare dataframe with a numeric 'birth_year' column
      Returns:
          none
      '''
      years = df['birth_year'].dropna()
      if years.empty:
          print('There is no birth year data available.')
          return
      earliest = int(years.min())
      latest = int(years.max())
      # Take the first mode explicitly; int() on a multi-element Series raises.
      mode = int(years.mode().iloc[0])
      print('The oldest users are born in {}.\nThe youngest users are born in {}.'
            '\nThe most popular birth year is {}.'.format(earliest, latest, mode))
  221.  
  def display_data(df):
      '''Displays five lines of data if the user specifies that they would like to.
      After displaying five lines, ask the user if they would like to see five more,
      continuing asking until they say stop or until every row has been shown.

      Args:
          data frame; its 'journey' column, if present, is left out of the display
      Returns:
          none
      '''
      def ask_yes_no(prompt):
          # Repeats the question until the answer is yes or no; True means yes.
          while True:
              answer = input(prompt).strip().lower()
              if answer in ('yes', 'no'):
                  return answer == 'yes'
              print("Sorry, I do not understand your input. Please type 'yes' or"
                    " 'no'.")

      # Hide the 'journey' column created in statistics() by name rather than
      # by position, so no other column is dropped if it is absent or not last.
      trips = df.drop(columns=['journey'], errors='ignore')
      chunk_size = 5
      head = 0
      prompt = ('\nWould you like to view individual trip data? '
                'Type \'yes\' or \'no\'.\n')
      while ask_yes_no(prompt):
          print(trips.iloc[head:head + chunk_size])
          head += chunk_size
          # Stop offering more once the last row has been printed instead of
          # printing empty frames forever.
          if head >= len(trips):
              print('\nThere is no more trip data to display.')
              break
          prompt = ('\nWould you like to view more individual'
                    ' trip data? Type \'yes\' or \'no\'.\n')
  269.  
  270.  
  def _run_timed(stat_function, df):
      '''Runs one statistic function on df and prints how long it took.'''
      start_time = time.time()
      stat_function(df)
      print("That took %s seconds." % (time.time() - start_time))


  def statistics():
      '''Calculates and prints out the descriptive statistics about a city and
      time period specified by the user via raw input.

      After each report the user may restart; restarts run in a loop, so any
      number of them can follow one another without nesting calls.

      Args:
          none.
      Returns:
          none.
      '''
      while True:
          # Filter by city (Chicago, New York, Washington)
          city = get_city()
          print('Loading data...')
          df = pd.read_csv(city, parse_dates=['Start Time', 'End Time'])

          # change all column names to lowercase letters and replace spaces with underscores
          df.columns = [col.replace(' ', '_').lower() for col in df.columns]

          # increases the column width so that the long strings in the 'journey'
          # column can be displayed fully
          pd.set_option('display.max_colwidth', 100)

          # creates a 'journey' column that concatenates 'start_station' with
          # 'end_station' for the use popular_trip() function
          df['journey'] = df['start_station'].str.cat(df['end_station'], sep=' to ')

          # Filter by time period (month, day, none). Lower-cased here so that
          # an answer such as 'Month' still selects a branch below.
          time_period = get_time_period().lower()
          if time_period == 'none':
              df_filtered = df
          else:
              if time_period == 'month':
                  filter_lower, filter_upper = get_month()
              else:
                  filter_lower, filter_upper = get_day()
              print('Filtering data...')
              df_filtered = df[(df['start_time'] >= filter_lower) & (df['start_time'] < filter_upper)]

          # Choose the statistics that make sense for the selected filter.
          stat_functions = []
          if time_period == 'none':
              # most popular month is only meaningful across several months
              stat_functions.append(popular_month)
          if time_period in ('none', 'month'):
              # most popular day of week is skipped when a single day is selected
              stat_functions.append(popular_day)
          stat_functions += [popular_hour, trip_duration, popular_stations,
                             popular_trip, users]
          if city in ('chicago.csv', 'new_york_city.csv'):
              # gender and birth year statistics are only run for these two cities
              stat_functions += [gender, birth_years]

          print('\nCalculating the first statistic...')
          for position, stat_function in enumerate(stat_functions):
              if position > 0:
                  print("\nCalculating the next statistic...")
              _run_timed(stat_function, df_filtered)

          # Display five lines of data at a time if user specifies that they would like to
          display_data(df_filtered)

          # Restart?
          restart = input('\nWould you like to restart? Type \'yes\' or \'no\'.\n')
          while restart.lower() not in ['yes', 'no']:
              print("Invalid input. Please type 'yes' or 'no'.")
              restart = input('\nWould you like to restart? Type \'yes\' or \'no\'.\n')
          if restart.lower() != 'yes':
              break
  382.  
  # Start the interactive session only when this file is run as a script.
  if __name__ == '__main__':
      statistics()
Add Comment
Please, Sign In to add comment