Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# ======================================================
# Script for FCM 142 - February 2019
# Written by G.D. Walters
# Version #2
# ======================================================
# Loads the "When do children learn words" data set, reports how many
# cells are missing in each column, fills the gaps in the columns of
# interest, and then shows the words learned before 18 months of age,
# unsorted and sorted.
import pandas as pd

_RULE = '========================================================'
_PROMPT = 'Press <Enter> to continue -> '


def _banner(title):
    """Print *title* framed above and below by a rule of '=' characters."""
    print(_RULE)
    print(title)
    print(_RULE)


# Change the path to where your data actually is, or leave the path off
# altogether and keep the data in the same folder as the code.
csvfile = '/home/greg/Downloads/WhenDoChildrenLearnWords/main_data.csv'
rawdata = pd.read_csv(csvfile)

# read_csv already returns a DataFrame, so no extra wrapping is needed.
df = rawdata

# Find out how many fields in each column hold 'bad' (missing) data.
_banner(' Number of cells with bad data by column.... ')
print(df.isnull().sum(axis=0))

# ...and hold so the data can be seen.
print('Make sure your terminal is pretty wide so you can see all the data that comes next')
print('(about 160 characters wide if possible), and then')
keyin = input(_PROMPT)

# The columns that we are actually interested in right now.
cols_to_use = ['Word_NW', 'Translation', 'AoA', 'VSoA', 'Lex_cat', 'Freq', 'CDS_freq']

# Create another DataFrame with just the columns we want.
# The explicit .copy() makes df2 independent of df, so df stays unchanged
# and later modifications to df2 cannot trigger chained-assignment
# warnings. That is why the warning no longer has to be silenced through
# pd.set_option('mode.chained_assignment', None).
_banner(' Our new DataFrame with just the columns of interest ')
df2 = df[cols_to_use].copy()
print(df2)

# Clean the data: numeric columns get the mean of that column, the text
# column 'Lex_cat' gets 'unknown'. A single DataFrame.fillna call with a
# per-column dict is used instead of df2[col].fillna(..., inplace=True);
# the latter works on a temporary Series and is not guaranteed to write
# the values back into df2. Columns missing from the dict are not filled.
_banner(' Cleaning the data.... ')
fill_values = {col: df2[col].mean() for col in ('AoA', 'VSoA', 'Freq', 'CDS_freq')}
fill_values['Lex_cat'] = 'unknown'
df2 = df2.fillna(value=fill_values)

# And check to make sure that everything is filled.
_banner(' Number of cells STILL with bad data by column.... ')
print(df2.isnull().sum(axis=0))
keyin = input(_PROMPT)

# Select just the rows where AoA is less than 18 months of age
# (will return 21 rows). The selection is computed once and reused for
# every listing below.
under_18 = df2[df2.AoA < 18]
_banner(' Data of column AoA less than 18 (months) .... ')
print(under_18)
keyin = input(_PROMPT)

# Now display the same data, but sorted by 'AoA' ascending.
_banner(' Same data sorted by AoA ascending.... ')
print(under_18.sort_values('AoA', ascending=True))
keyin = input(_PROMPT)

# Sample output.
# NOTE: the listing below shows NaN in 'Freq' for row 2; with the mean
# fill applied above that cell holds the column mean instead.
# ~ Word_NW Translation AoA VSoA Lex_cat Freq CDS_freq
# ~ 370 'en mamma' 'mommy' 12.0 20.0 people 36751.0 171.0
# ~ 409 'takk' 'thank you' 13.0 40.0 games & routines 106589.0 106.0
# ~ 2 'brrr (bil-lyd)' 'vroom' 13.0 20.0 sound effects NaN 20.0
# ~ 8 'nam - nam' 'yum yum' 13.0 40.0 sound effects 35.0 19.0
# ~ 394 'hei' 'hi' 13.0 40.0 games & routines 64086.0 74.0
# ~ 379 'en pappa' 'daddy' 13.0 20.0 people 24689.0 118.0
# ~ 391 'hadet' 'bye' 14.0 40.0 games & routines 467.0 6.0
# ~ 1 'bææ' 'baa baa' 15.0 40.0 sound effects 18.0 5.0
# ~ 401 'Borte! (lek hvor ansikt gjemmes og plutselig ... 'peek-a-boo' 15.0 40.0 games & routines 5.0 27.0
# ~ 397 'ja' 'yes' 15.0 40.0 games & routines 158496.0 3291.0
# ~ 7 'møø' 'moo' 15.0 40.0 sound effects 8.0 7.0
# ~ 11 'voff voff' 'woof woof' 15.0 40.0 sound effects 10.0 52.0
# ~ 404 'nei' 'no' 15.0 40.0 games & routines 136554.0 470.0
# ~ 0 'au' 'ouch' 16.0 40.0 sound effects 4366.0 7.0
# ~ 70 'en ball' 'ball' 16.0 40.0 common nouns 96368.0 74.0
# ~ 387 'å bade' 'bathe' 17.0 60.0 games & routines 8123.0 16.0
# ~ 91 'en banan' 'banana' 17.0 60.0 common nouns 4673.0 17.0
# ~ 6 'mjau' 'meow' 17.0 60.0 sound effects 214.0 7.0
# ~ 3 'gakk gakk' 'quack quack' 17.0 40.0 sound effects 16.0 3.0
# ~ 350 'en baby' 'baby' 17.0 60.0 people 16739.0 33.0
# ~ 110 'is' 'ice cream' 17.0 60.0 common nouns 180159.0 11.0

# Now sort first by 'AoA' then by 'VSoA': when they first learn the word
# and the "average" number of words they know at this point.
_banner(' Finally same data sorted by AoA then VSoA Ascending....')
print(under_18.sort_values(['AoA', 'VSoA'], ascending=True))
keyin = input(_PROMPT)

# Sample output (same NOTE about the NaN in 'Freq' for row 2 applies).
# ~ Word_NW Translation AoA VSoA Lex_cat Freq CDS_freq
# ~ 370 'en mamma' 'mommy' 12.0 20.0 people 36751.0 171.0
# ~ 2 'brrr (bil-lyd)' 'vroom' 13.0 20.0 sound effects NaN 20.0
# ~ 379 'en pappa' 'daddy' 13.0 20.0 people 24689.0 118.0
# ~ 8 'nam - nam' 'yum yum' 13.0 40.0 sound effects 35.0 19.0
# ~ 394 'hei' 'hi' 13.0 40.0 games & routines 64086.0 74.0
# ~ 409 'takk' 'thank you' 13.0 40.0 games & routines 106589.0 106.0
# ~ 391 'hadet' 'bye' 14.0 40.0 games & routines 467.0 6.0
# ~ 1 'bææ' 'baa baa' 15.0 40.0 sound effects 18.0 5.0
# ~ 7 'møø' 'moo' 15.0 40.0 sound effects 8.0 7.0
# ~ 11 'voff voff' 'woof woof' 15.0 40.0 sound effects 10.0 52.0
# ~ 397 'ja' 'yes' 15.0 40.0 games & routines 158496.0 3291.0
# ~ 401 'Borte! (lek hvor ansikt gjemmes og plutselig ... 'peek-a-boo' 15.0 40.0 games & routines 5.0 27.0
# ~ 404 'nei' 'no' 15.0 40.0 games & routines 136554.0 470.0
# ~ 0 'au' 'ouch' 16.0 40.0 sound effects 4366.0 7.0
# ~ 70 'en ball' 'ball' 16.0 40.0 common nouns 96368.0 74.0
# ~ 3 'gakk gakk' 'quack quack' 17.0 40.0 sound effects 16.0 3.0
# ~ 6 'mjau' 'meow' 17.0 60.0 sound effects 214.0 7.0
# ~ 91 'en banan' 'banana' 17.0 60.0 common nouns 4673.0 17.0
# ~ 110 'is' 'ice cream' 17.0 60.0 common nouns 180159.0 11.0
# ~ 350 'en baby' 'baby' 17.0 60.0 people 16739.0 33.0
# ~ 387 'å bade' 'bathe' 17.0 60.0 games & routines 8123.0 16.0
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement