Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- import seaborn as sns
# Load seaborn's "titanic" example dataset and show it, followed by a
# blank separator line.
titanic_data = sns.load_dataset("titanic")
print(titanic_data, end="\n\n")

# Part 1: count the missing values in every column.
print("PART1\n")
zero_cnt = titanic_data.isna().sum()
print("zero_cnt:\n", zero_cnt, end="\n\n")
# Part 2
print("\nPART2\n")
# Drop every column whose number of missing values exceeds half the number
# of rows in the table.  The counts are integers, so comparing against the
# floor of n / 2 selects the same columns as comparing against n / 2.
half_rows = titanic_data.shape[0] // 2
columns_to_drop = titanic_data.columns[zero_cnt > half_rows]
titanic_data = titanic_data.drop(columns=columns_to_drop)
print('columns_to_drop:', columns_to_drop, "- удаляем эти столбцы")
print()
# With the sparse columns gone, drop every row whose number of missing
# values exceeds half the number of remaining columns.
row_zero_cnt = titanic_data.isna().sum(axis=1)
half_cols = titanic_data.shape[1] // 2
# The printed label stays "half_rows:" so the script's output is unchanged.
print("half_rows:", half_cols)
print("Максимальное пропусков в строке:", row_zero_cnt.max())
to_be_left = row_zero_cnt <= half_cols
rows_before = len(titanic_data)
# .copy() yields an independent frame, so later in-place .loc assignments
# on titanic_data are not made on a selection of the previous frame.
titanic_data = titanic_data[to_be_left].copy()
rows_removed = rows_before - len(titanic_data)
# Report what actually happened instead of printing a fixed conclusion.
if rows_removed == 0:
    print("Вывод: мы не удалили ни одной строки")
else:
    print("Вывод: удалено строк:", rows_removed)
# Part 3
print("\nPART3\n")
# Fill each missing age with the rounded median age of the passenger's own
# "who" category.  One loop handles the three categories that were
# previously three copy-pasted stanzas.
for who_value in ("man", "woman", "child"):
    in_group = titanic_data["who"] == who_value
    known_ages = titanic_data.loc[in_group, "age"].dropna()
    if known_ages.empty:
        # No known age in this category, so there is no median to fill
        # with; its missing ages (if any) are left untouched.
        continue
    group_age_median = np.median(known_ages).round()
    needs_fill = in_group & titanic_data["age"].isna()
    titanic_data.loc[needs_fill, "age"] = group_age_median
# Part 4: keep only the rows that have at most one missing value.
print("\nPART4\n")
null_counts = titanic_data.isna().sum(axis=1)
mask = null_counts.le(1)
titanic_data = titanic_data.loc[mask]
# Part 5: print the "embark_town" value that occurs most often.
print("\nPART5\n")
city = titanic_data["embark_town"].value_counts().idxmax()
print(city)
# Part 6: percentage of rows whose "survived" value is 1, rounded to two
# decimals.
print("\nPART6\n")
survival_shares = titanic_data["survived"].value_counts(normalize=True)
# .get() falls back to 0.0 when label 1 is absent (nobody survived);
# plain [1] indexing would raise KeyError in that case.
surv_percent = np.round(survival_shares.get(1, 0.0) * 100, 2)
print(surv_percent)
# Part 7: number of rows with survived == 1 for each "embarked" value.
print("\nPART7\n")
survived_rows = titanic_data.loc[titanic_data["survived"].eq(1)]
surv_cnt = survived_rows.groupby("embarked")["survived"].count()
print(surv_cnt)
# Part 8: number of rows with survived == 1 for each "class" value;
# observed=True limits the result to categories present in the data.
print("\nPART8\n")
is_survivor = titanic_data["survived"] == 1
survivors_only = titanic_data[is_survivor]
surv_class_cnt = survivors_only.groupby("class", observed=True)["survived"].count()
print(surv_class_cnt)
# Part 9: percentage of rows with survived == 1 among passengers whose
# fare is at least 100, rounded to two decimals.
print("\nPART9\n")
rich_survival = titanic_data.loc[titanic_data["fare"] >= 100, "survived"]
# .get() falls back to 0.0 when label 1 is absent from the subset (no such
# passengers, or none of them survived); [1] would raise KeyError.
surv_rich_percent = 100 * rich_survival.value_counts(normalize=True).get(1, 0.0)
print(np.round(surv_rich_percent, 2))
# Part 10: rows where "who" is "child" and "alone" is set; print the shape
# of that selection and then the rows themselves.
print("\nPART10\n")
# "alone" is used directly as the mask instead of being compared to True
# (assumes the column is boolean, as the original comparison implies).
is_child = titanic_data["who"] == "child"
child_alone = titanic_data.loc[is_child & titanic_data["alone"]]
print(child_alone.shape)
print(child_alone)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement