 # violin and scatter plot with stratification, frequency table and FacetGrid

May 12th, 2022 (edited)
576
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. import numpy as np
2. import pandas as pd
3. import matplotlib.pyplot as plt
4. import seaborn as sns
5. from scipy import stats
6.
8.     "C:/Users/eli/Desktop/YtPruboBEemdqA7UJJ_tgg_63e179e3722f4ef783f58ff6e395feb7_nhanes_2015_2016.csv")
9.
10. '''
11. Question 1
12. Make a scatterplot showing the relationship between the first and second measurements of diastolic blood pressure (BPXDI1 and BPXDI2).
13. Also obtain the 4x4 matrix of correlation coefficients among the first two systolic and the first two diastolic blood pressure measures.
14. '''
# Question 1: compare the first and second diastolic blood pressure readings.
sns.scatterplot(data=da, x="BPXDI1", y="BPXDI2", alpha=0.3)
# Most of the data is concentrated between 40 and 100 BPXDI1 and between 40 and 100 BPXDI2

# Correlate over every row and all four blood pressure columns. The previous
# `da.loc[:1, ["BPXDI1", "BPXDI2"]]` slice kept only the rows up to index
# label 1 and only the diastolic columns, so the result was a 2x2 matrix whose
# coefficients were all trivially 1.0 instead of the requested 4x4 matrix.
# BPXSY1/BPXSY2 are the NHANES names for the first two systolic readings.
bp_columns = ["BPXSY1", "BPXSY2", "BPXDI1", "BPXDI2"]
df = da[bp_columns]
df.corr()
23.
24.
25. '''
26. Question 2
27. Construct a grid of scatterplots between the first systolic and the first diastolic blood pressure measurement.
28. Stratify the plots by gender (rows) and by race/ethnicity groups (columns).
29. '''
# Add a human-readable gender column next to the numeric RIAGENDR codes.
# NOTE(review): the FacetGrid calls in this script facet on "RIAGENDR", not on
# this labelled column -- confirm which one the plots are meant to use.
gender_labels = {1: "Male", 2: "Female"}
da["RIAGENDRx"] = da["RIAGENDR"].replace(gender_labels)
31. sns.FacetGrid(da, row="RIAGENDR",  col="RIDRETH1").map(
33.
34.
35. '''
36. Question 3
37.
38. Use "violin plots" to compare the distributions of ages within groups defined by gender and educational attainment.
39. '''
40. sns.FacetGrid(da, row="RIAGENDR", col="DMDEDUC2").map(
42.
43.
44. '''
45. Question 4
46.
47. Use violin plots to compare the distributions of BMI within a series of 10-year age bands. Also stratify these plots by gender.
48. '''
# Bucket ages into 10-year bands with edges at 10, 20, ..., 100.
age_bin_edges = list(range(10, 101, 10))
da["agegroup"] = pd.cut(da["RIDAGEYR"], age_bin_edges)
50.
51. sns.FacetGrid(da, row="RIAGENDR", col="agegroup").map(
53.
54.
55. '''
56. Question 5
57.
58. Construct a frequency table for the joint distribution of ethnicity groups (RIDRETH1) and health-insurance status (HIQ210).
59. Normalize the results so that the values within each ethnic group are proportions that sum to 1.
60. '''
# Joint counts of ethnicity group (rows) by health-insurance status (columns).
x = pd.crosstab(da["RIDRETH1"], da["HIQ210"])
# Divide each row by its own total so the proportions within every ethnic
# group sum to 1.
row_totals = x.sum(axis=1)
x.div(row_totals, axis=0)
63.