# FCM142 - Kidswords

# ======================================================
# Script for FCM 142 - February 2019
# Written by G.D. Walters
# Version #2
# ======================================================

import pandas as pd
# Location of the source data.  Point this at wherever your copy of
# main_data.csv lives, or use just the file name when the CSV sits in the
# same folder as this script.
csvfile = '/home/greg/Downloads/WhenDoChildrenLearnWords/main_data.csv'
rawdata = pd.read_csv(csvfile)
# Build the base DataFrame that the rest of the script works from.
df = pd.DataFrame(rawdata)
# Report, column by column, how many cells hold missing ('bad') data.
print(
    '========================================================',
    ' Number of cells with bad data by column....            ',
    '========================================================',
    sep='\n',
)
print(df.isna().sum())
# Pause here so the counts can be read before the wide tables are printed.
print(
    'Make sure your terminal is pretty wide so you can see all the data that comes next',
    '(about 160 characters wide if possible), and then',
    sep='\n',
)
keyin = input('Press <Enter> to continue -> ')

# The subset of columns this exercise actually looks at.
cols_to_use = [
    'Word_NW',
    'Translation',
    'AoA',
    'VSoA',
    'Lex_cat',
    'Freq',
    'CDS_freq',
]
# Build a second DataFrame holding only those columns and show it.
# NOTE: the original DataFrame df is left unchanged.
print(
    '========================================================',
    ' Our new DataFrame with just the columns of interest    ',
    '========================================================',
    sep='\n',
)

df2 = df[cols_to_use]
print(df2)
# Silence pandas' chained-assignment warning on older pandas releases.
# The cleaning step below no longer relies on this (it never assigns through
# an intermediate selection); the option is kept only so the script's global
# pandas configuration stays as it was.
pd.set_option('mode.chained_assignment', None)
print('========================================================')
print(' Cleaning the data....                                  ')
print('========================================================')

# Clean the data: missing numeric values become the mean of their own column
# and a missing lexical category becomes the text 'unknown'.
#
# fillna() is applied to the whole DataFrame and the result is bound back to
# df2.  Calling fillna(inplace=True) on df2['col'] instead would operate on an
# intermediate Series, which under pandas' Copy-on-Write behaviour leaves df2
# itself unfilled.  Each mean is taken from the still-unfilled column, which
# gives the intended per-column average because mean() skips NaN by default.
df2 = df2.fillna({
    'AoA': df2['AoA'].mean(),
    'VSoA': df2['VSoA'].mean(),
    'Freq': df2['Freq'].mean(),
    'CDS_freq': df2['CDS_freq'].mean(),
    'Lex_cat': 'unknown',
})

# Re-count the missing cells per column to confirm the fill step worked.
print(
    '========================================================',
    ' Number of cells STILL with bad data by column....      ',
    '========================================================',
    sep='\n',
)

print(df2.isna().sum())
keyin = input('Press <Enter> to continue -> ')

# Keep only the rows whose AoA is below 18 (months of age).
# (The original author notes that this yields 21 rows.)
print(
    '========================================================',
    ' Data of column AoA less than 18 (months) ....          ',
    '========================================================',
    sep='\n',
)

early_words = df2[df2['AoA'] < 18]
print(early_words)
keyin = input('Press <Enter> to continue -> ')
# Show the same rows again, this time ordered by 'AoA', lowest value first.
print(
    '========================================================',
    ' Same data sorted by AoA ascending....                  ',
    '========================================================',
    sep='\n',
)

print(early_words.sort_values(by='AoA'))
keyin = input('Press <Enter> to continue -> ')
                                               # ~ Word_NW    Translation   AoA  VSoA           Lex_cat      Freq  CDS_freq
# ~ 370                                         'en mamma'        'mommy'  12.0  20.0            people   36751.0     171.0
# ~ 409                                             'takk'    'thank you'  13.0  40.0  games & routines  106589.0     106.0
# ~ 2                                     'brrr (bil-lyd)'        'vroom'  13.0  20.0     sound effects       NaN      20.0
# ~ 8                                          'nam - nam'      'yum yum'  13.0  40.0     sound effects      35.0      19.0
# ~ 394                                              'hei'           'hi'  13.0  40.0  games & routines   64086.0      74.0
# ~ 379                                         'en pappa'        'daddy'  13.0  20.0            people   24689.0     118.0
# ~ 391                                            'hadet'          'bye'  14.0  40.0  games & routines     467.0       6.0
# ~ 1                                                'bææ'      'baa baa'  15.0  40.0     sound effects      18.0       5.0
# ~ 401  'Borte! (lek hvor ansikt gjemmes og plutselig ...   'peek-a-boo'  15.0  40.0  games & routines       5.0      27.0
# ~ 397                                               'ja'          'yes'  15.0  40.0  games & routines  158496.0    3291.0
# ~ 7                                                'møø'          'moo'  15.0  40.0     sound effects       8.0       7.0
# ~ 11                                         'voff voff'    'woof woof'  15.0  40.0     sound effects      10.0      52.0
# ~ 404                                              'nei'           'no'  15.0  40.0  games & routines  136554.0     470.0
# ~ 0                                                 'au'         'ouch'  16.0  40.0     sound effects    4366.0       7.0
# ~ 70                                           'en ball'         'ball'  16.0  40.0      common nouns   96368.0      74.0
# ~ 387                                           'å bade'        'bathe'  17.0  60.0  games & routines    8123.0      16.0
# ~ 91                                          'en banan'       'banana'  17.0  60.0      common nouns    4673.0      17.0
# ~ 6                                               'mjau'         'meow'  17.0  60.0     sound effects     214.0       7.0
# ~ 3                                          'gakk gakk'  'quack quack'  17.0  40.0     sound effects      16.0       3.0
# ~ 350                                          'en baby'         'baby'  17.0  60.0            people   16739.0      33.0
# ~ 110                                               'is'    'ice cream'  17.0  60.0      common nouns  180159.0      11.0
# Finally, order the under-18 rows by 'AoA' first and by 'VSoA' second:
# when the word is first learned, then the "average" number of words
# known at that point.
print(
    '========================================================',
    ' Finally same data sorted by AoA then VSoA Ascending....',
    '========================================================',
    sep='\n',
)

under_18 = df2['AoA'] < 18
print(df2[under_18].sort_values(by=['AoA', 'VSoA']))
keyin = input('Press <Enter> to continue -> ')
                                               # ~ Word_NW    Translation   AoA  VSoA           Lex_cat      Freq  CDS_freq
# ~ 370                                         'en mamma'        'mommy'  12.0  20.0            people   36751.0     171.0
# ~ 2                                     'brrr (bil-lyd)'        'vroom'  13.0  20.0     sound effects       NaN      20.0
# ~ 379                                         'en pappa'        'daddy'  13.0  20.0            people   24689.0     118.0
# ~ 8                                          'nam - nam'      'yum yum'  13.0  40.0     sound effects      35.0      19.0
# ~ 394                                              'hei'           'hi'  13.0  40.0  games & routines   64086.0      74.0
# ~ 409                                             'takk'    'thank you'  13.0  40.0  games & routines  106589.0     106.0
# ~ 391                                            'hadet'          'bye'  14.0  40.0  games & routines     467.0       6.0
# ~ 1                                                'bææ'      'baa baa'  15.0  40.0     sound effects      18.0       5.0
# ~ 7                                                'møø'          'moo'  15.0  40.0     sound effects       8.0       7.0
# ~ 11                                         'voff voff'    'woof woof'  15.0  40.0     sound effects      10.0      52.0
# ~ 397                                               'ja'          'yes'  15.0  40.0  games & routines  158496.0    3291.0
# ~ 401  'Borte! (lek hvor ansikt gjemmes og plutselig ...   'peek-a-boo'  15.0  40.0  games & routines       5.0      27.0
# ~ 404                                              'nei'           'no'  15.0  40.0  games & routines  136554.0     470.0
# ~ 0                                                 'au'         'ouch'  16.0  40.0     sound effects    4366.0       7.0
# ~ 70                                           'en ball'         'ball'  16.0  40.0      common nouns   96368.0      74.0
# ~ 3                                          'gakk gakk'  'quack quack'  17.0  40.0     sound effects      16.0       3.0
# ~ 6                                               'mjau'         'meow'  17.0  60.0     sound effects     214.0       7.0
# ~ 91                                          'en banan'       'banana'  17.0  60.0      common nouns    4673.0      17.0
# ~ 110                                               'is'    'ice cream'  17.0  60.0      common nouns  180159.0      11.0
# ~ 350                                          'en baby'         'baby'  17.0  60.0            people   16739.0      33.0
# ~ 387                                           'å bade'        'bathe'  17.0  60.0  games & routines    8123.0      16.0