Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/python3
"""Explore the sales rows stored in BreadBasket_DMS.csv with pandas.

The script reads the CSV and prints several summaries built from its
'Date', 'Time', 'Transaction' and 'Item' columns:

* Question #1 - per date, how many rows each item has.
* Question #2 - per item, how many rows exist in the whole file,
  sorted ascending and then descending.
* Question #3 - per date and hour, how many distinct transactions occurred.

The summaries live in small functions so they can be reused on any
DataFrame with those columns; the CSV is only read when the file is run
as a script.
"""
import pandas as pd

# CSV file read by main(); resolved relative to the current directory.
CSV_FILENAME = 'BreadBasket_DMS.csv'
SEPARATOR = '===================================='


def rows_for_item(df: pd.DataFrame, item: str) -> pd.DataFrame:
    """Return the rows of *df* whose 'Item' equals *item*.

    Raises:
        KeyError: if *item* does not occur in the 'Item' column.
    """
    # Group by the scalar label, not ['Item']: with a length-1 list grouper,
    # get_group() with a scalar key is deprecated and newer pandas versions
    # require a 1-tuple instead.
    return df.groupby('Item').get_group(item)


def count_sold_by_date(df: pd.DataFrame) -> pd.Series:
    """Return a Series, indexed by (Date, Item), counting rows per pair."""
    return df.groupby(['Date', 'Item'])['Date'].count()


def count_sold_by_item(df: pd.DataFrame) -> pd.DataFrame:
    """Return a one-column DataFrame counting 'Transaction' rows per item."""
    return df.groupby('Item')[['Transaction']].count()


def transactions_per_hour(df: pd.DataFrame) -> pd.DataFrame:
    """Return the number of distinct transactions per (Date, Hour).

    Grouping on the raw 'Time' column would produce one group per distinct
    clock time rather than per hour, so the hour is extracted first.
    """
    # Assumes 'Time' is text whose first ':'-separated field is the hour
    # (e.g. HH:MM:SS) - TODO confirm against the CSV.  Values that do not
    # parse become NaN and are therefore left out by groupby.
    hour = pd.to_numeric(
        df['Time'].astype(str).str.split(':').str[0], errors='coerce'
    ).rename('Hour')
    # nunique() rather than count(): a transaction ID that appears on several
    # rows is counted once.  If every row has its own ID both are equal.
    return df.groupby(['Date', hour])[['Transaction']].nunique()


def main() -> None:
    """Read the CSV and print every summary in order."""
    df = pd.read_csv(CSV_FILENAME)

    # Column names (headers) and non-null counts per column.
    print(df.columns.values.tolist())
    print(df.count())

    # Unique dates: how many, plus the smallest and largest value.
    datelist = df['Date'].unique().tolist()
    print(len(datelist), min(datelist), max(datelist))

    # Unique items.
    itemlist = df['Item'].unique().tolist()
    print(itemlist)
    print(len(itemlist))

    # Whole frame sorted by the 'Item' column.
    print(df.sort_values('Item'))

    # Rows of a single item, picked out of a groupby.
    print(rows_for_item(df, 'Tacos/Fajita'))

    # Non-null count of every column, per item.
    print(df.groupby('Item').count())

    # Question #1: by date, how many of each item were sold.
    print(count_sold_by_date(df))

    # Question #2: by item, total sold for the entire period.  The counts are
    # computed once and sorted both ways instead of regrouping each time.
    item_counts = count_sold_by_item(df)

    print(SEPARATOR)
    print('SortedItemCount #1 - Question #2')
    ascending_counts = item_counts.sort_values('Transaction')
    print(ascending_counts)
    print(ascending_counts.head(10))
    print(ascending_counts.tail(10))

    print(SEPARATOR)
    print('SortedItemCount #2 - Question #2')
    descending_counts = item_counts.sort_values('Transaction', ascending=False)
    print(descending_counts)
    print(descending_counts.head(10))
    print(descending_counts.tail(10))

    # Question #3: transactions per hour, for each date.
    print(SEPARATOR)
    print('Question #3')
    print('By Hour')
    print(transactions_per_hour(df))


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement