# Untitled

import numpy as np
import matplotlib.pyplot as plt
from sklearn import  linear_model
import statsmodels.api as sm
import seaborn as sns
import pandas as pd

### then create a dataframe with three columns: "state", "urbanization" and "income".
### Call this dataframe df

# Fit urbanization against income, then draw a scatter plot of the states
# coloured by absolute residual, with the fitted line overlaid and the
# worst-fitting states labelled by name.

# Take the regression inputs straight from df so this section does not depend
# on separately defined `urbanization` / `income` variables.
urbanization = df["urbanization"]
income = df["income"]

# Only states whose absolute residual exceeds this value get a text label.
RESIDUAL_LABEL_THRESHOLD = 15

# NOTE(review): sm.OLS does not add an intercept on its own, so this fits a
# line through the origin. If an intercept is wanted, wrap the regressor in
# sm.add_constant(...) for both the fit and the prediction -- confirm intent.
model = sm.OLS(urbanization, income).fit()
pred = model.predict(income)

# Absolute residual per state; stored on df so it can be inspected later.
res = abs(urbanization - pred)
df["res"] = res

plt.figure(figsize=(15, 8))
plt.scatter(df["income"], df["urbanization"],
            c=res, cmap="inferno", s=60)
plt.tick_params(axis="both", which="major", labelsize=15)

# Label the poorly fitted states. Selecting rows with a boolean mask and
# iterating over the rows themselves avoids label-based lookups such as
# df.res[i], which only work when df has a default 0..n-1 index.
poorly_fitted = df.loc[df["res"] > RESIDUAL_LABEL_THRESHOLD,
                       ["income", "urbanization", "state"]]
for row in poorly_fitted.itertuples(index=False):
    # The +1 offset nudges the label just above its marker.
    plt.text(row.income, row.urbanization + 1, row.state,
             horizontalalignment="left", size=15, color="blue",
             weight="semibold")

# Fitted regression line in red.
plt.plot(income, pred, "r")

cbar = plt.colorbar()
cbar.set_label("Linear regression residuals", size=20)
plt.xlabel("annual income in 2015 (dollars)", size=20)
plt.ylabel("urbanization rate (percentage)", size=20)
plt.title("Urbanization and Annual Income by US States", size=30)
plt.show()