SHARE
TWEET

Untitled

a guest May 23rd, 2019 66 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  # --- Setup: imports, data loading and a first look at the data ---
import pandas as pd
import numpy as np                     # For mathematical calculations
import seaborn as sns                  # For data visualization
import matplotlib.pyplot as plt        # For plotting graphs
import warnings                        # To ignore any warnings

# NOTE: this silences every warning, including deprecation notices from the
# libraries used below; comment it out when debugging.
warnings.filterwarnings("ignore")

train = pd.read_csv("training.csv")
test = pd.read_csv("test.csv")

# Untouched copies of the raw data. Copying the frames that were just loaded
# avoids reading each CSV from disk a second time.
train_original = train.copy()
test_original = test.copy()

print(train.columns)
print(train.dtypes)
print(train.shape, test.shape)

  # --- Univariate analysis: the target and each feature on its own ---
print(train['Loan_Status'].value_counts())
print(train['Loan_Status'].value_counts(normalize=True))

# Every chart below gets its own figure/axes. No plt.show() call separates
# the charts, and a pandas Series plot without an explicit `ax` draws onto
# the current axes, so without this the charts would pile up on each other.
fig, ax = plt.subplots()
train['Loan_Status'].value_counts().plot.bar(ax=ax)

# Share of each category, 2x2 grid: (column, panel title).
grid_panels = [
    ('Gender', 'Gender'),
    ('Married', 'Married'),
    ('Self_Employed', 'S-E'),
    ('Credit_History', 'C-H'),
]
fig, axes = plt.subplots(2, 2, figsize=(20, 10))
for ax, (column, title) in zip(axes.ravel(), grid_panels):
    train[column].value_counts(normalize=True).plot.bar(ax=ax, title=title)

# Share of each category, one row of three panels.
row_panels = ['Dependents', 'Education', 'Property_Area']
fig, axes = plt.subplots(1, 3, figsize=(24, 6))
for ax, column in zip(axes, row_panels):
    train[column].value_counts(normalize=True).plot.bar(ax=ax, title=column)


def plot_distribution_and_box(column):
    """Draw the distribution and the box plot of `train[column]` side by side.

    Missing values of that column only are dropped before the distribution
    plot; the box plot receives the column as-is.
    """
    fig, (dist_ax, box_ax) = plt.subplots(1, 2, figsize=(16, 5))
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept here to stay compatible with the version this script targets.
    sns.distplot(train[column].dropna(), ax=dist_ax)
    train[column].plot.box(ax=box_ax)


plot_distribution_and_box('ApplicantIncome')

# DataFrame.boxplot with `by=` creates its own figure; suptitle replaces
# the automatic "grouped by" title on that figure.
train.boxplot(column='ApplicantIncome', by='Education')
plt.suptitle("boxplot")

plot_distribution_and_box('CoapplicantIncome')
plot_distribution_and_box('LoanAmount')

# Call plt.show() here to display the figures when running as a script.

  # --- Bivariate analysis: how each feature relates to Loan_Status ---
def plot_status_share(table, **plot_kwargs):
    """Draw a stacked bar chart of the row-wise Loan_Status proportions.

    `table` is a crosstab (feature categories x Loan_Status). Each row is
    divided by its row total so the bars show shares instead of raw counts.
    Extra keyword arguments are forwarded to DataFrame.plot. Returns the
    matplotlib Axes the chart was drawn on.
    """
    shares = table.div(table.sum(axis=1).astype(float), axis=0)
    return shares.plot(kind="bar", stacked=True, **plot_kwargs)


# (column, figsize); None keeps the default figure size.
categorical_features = [
    ('Gender', (4, 4)),
    ('Married', (4, 4)),
    ('Dependents', None),
    ('Education', (4, 4)),
    ('Self_Employed', (4, 4)),
    ('Credit_History', (4, 4)),
    ('Property_Area', None),
]
for column, figsize in categorical_features:
    plot_status_share(
        pd.crosstab(train[column], train['Loan_Status']),
        figsize=figsize,
    )

# Mean applicant income per outcome. A Series plot reuses the current axes,
# so give it a fresh figure instead of drawing over the previous chart.
fig, ax = plt.subplots()
train.groupby('Loan_Status')['ApplicantIncome'].mean().plot.bar(ax=ax)

# Temporary helper column, dropped again at the end of this section.
train['Total_Income'] = train['ApplicantIncome'] + train['CoapplicantIncome']

income_edges = [0, 2500, 4000, 6000, 81000]
income_labels = ['Low', 'Average', 'High', 'Very high']
three_level_labels = ['Low', 'Average', 'High']

# (source column, bin column, bin edges, labels, figsize, x-axis label)
binned_features = [
    ('ApplicantIncome', 'Income_bin',
     income_edges, income_labels, (10, 4), None),
    ('CoapplicantIncome', 'Coapplicant_Income_bin',
     [0, 1000, 3000, 42000], three_level_labels, None, None),
    ('Total_Income', 'Total_Income_bin',
     income_edges, income_labels, None, 'Total_Income'),
    ('LoanAmount', 'LoanAmount_bin',
     [0, 100, 200, 700], three_level_labels, None, 'LoanAmount'),
]
for source, bin_column, edges, labels, figsize, xlabel in binned_features:
    # include_lowest=True: pd.cut bins are right-closed and by default leave
    # out the first edge, which would silently discard values equal to 0
    # (e.g. applicants without any co-applicant income).
    # Bins are computed from `train` itself; missing values simply stay
    # missing and are ignored by crosstab.
    train[bin_column] = pd.cut(
        train[source], edges, labels=labels, include_lowest=True
    )
    ax = plot_status_share(
        pd.crosstab(train[bin_column], train['Loan_Status']),
        figsize=figsize,
    )
    if xlabel is not None:
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Percentage')

# The bin columns were only needed for the charts above.
train = train.drop(
    ['Income_bin', 'Coapplicant_Income_bin', 'LoanAmount_bin',
     'Total_Income_bin', 'Total_Income'],
    axis=1,
)

  # --- Encode Dependents / Loan_Status and inspect numeric correlations ---
# Results are assigned back to the frame. Calling replace(..., inplace=True)
# on a column selection only modifies a temporary object under pandas
# Copy-on-Write, leaving the frame unchanged.
# '3+' becomes the string '3' so the column keeps a single value type; the
# resulting dummy column name ("Dependents_3") is the same as with int 3.
train['Dependents'] = train['Dependents'].replace('3+', '3')
test['Dependents'] = test['Dependents'].replace('3+', '3')

# Target as 0/1. Any label other than 'N'/'Y' would become NaN here.
train['Loan_Status'] = train['Loan_Status'].map({'N': 0, 'Y': 1})

# Restrict to numeric columns explicitly: recent pandas versions raise on
# string columns instead of silently skipping them.
matrix = train.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(matrix, vmax=.8, square=True, cmap="BuPu", ax=ax)

  # --- Impute missing values (statistics always come from the training set) ---
print(train.isnull().sum())

# These columns are filled with their most frequent training value ...
mode_filled_columns = [
    'Gender', 'Married', 'Dependents', 'Self_Employed',
    'Credit_History', 'Loan_Amount_Term',
]
fill_values = {
    column: train[column].mode()[0] for column in mode_filled_columns
}
# ... and LoanAmount with the training median.
fill_values['LoanAmount'] = train['LoanAmount'].median()

# DataFrame.fillna with a dict only touches the listed columns. The result
# is assigned back: `df[col].fillna(..., inplace=True)` acts on a temporary
# object under pandas Copy-on-Write and would leave the frame unchanged.
train = train.fillna(value=fill_values)

print('-------')
print(train.isnull().sum())

# Same fill values for the test set, so 'Married' is now covered there too.
test = test.fillna(value=fill_values)

  # --- Log-transform LoanAmount and build the model matrices ---
train['LoanAmount_log'] = np.log(train['LoanAmount'])
test['LoanAmount_log'] = np.log(test['LoanAmount'])

# Own figure, so the histogram is not drawn onto the previous chart's axes.
fig, ax = plt.subplots()
train['LoanAmount_log'].hist(bins=20, ax=ax)

# Loan_ID is an identifier, not a feature.
train = train.drop('Loan_ID', axis=1)
test = test.drop('Loan_ID', axis=1)

# `axis` is passed by keyword; the positional form is gone in pandas 2.0.
X = train.drop('Loan_Status', axis=1)
y = train.Loan_Status

# One-hot encode the categorical columns.
X = pd.get_dummies(X)
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# Guarantee that test has exactly the feature columns of X, in the same
# order: a category absent from test becomes an all-zero column.
test = test.reindex(columns=X.columns, fill_value=0)

  # --- Baseline: logistic regression evaluated on a 70/30 hold-out split ---
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fixed seed so the reported accuracy is reproducible between runs.
x_train, x_cv, y_train, y_cv = train_test_split(
    X, y, test_size=0.3, random_state=1
)

model = LogisticRegression()
model.fit(x_train, y_train)

pred_cv = model.predict(x_cv)
# Print the score; as a bare expression it is discarded in a script.
print('accuracy_score', accuracy_score(y_cv, pred_cv))

pred_test = model.predict(test)
submission = pd.read_csv("Sample_Submission_ZAuTl8O_FK3zQHh.csv")
# Map 0/1 predictions back to 'N'/'Y' while assigning; other values pass
# through unchanged. (An in-place replace on the selected column would act
# on a temporary object under pandas Copy-on-Write.)
submission['Loan_Status'] = pd.Series(
    pred_test, index=submission.index
).replace({0: 'N', 1: 'Y'})
submission['Loan_ID'] = test_original['Loan_ID']

pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('logistic.csv')

  # --- Stratified 5-fold CV, engineered features, and submission files ---
from sklearn.model_selection import StratifiedKFold


def cross_validate_logistic(features, target, print_scores=True):
    """Fit a LogisticRegression on each of 5 stratified, shuffled folds.

    Prints a header for every fold and, when `print_scores` is true, the
    accuracy on that fold's validation part. Returns the model fitted on
    the last fold (the one the script then uses to predict the test set).
    """
    kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    fold_model = None
    splits = kf.split(features, target)
    for fold, (train_index, test_index) in enumerate(splits, start=1):
        print('\n{} of kfold {}'.format(fold, kf.n_splits))
        # split() yields positional indices, so select rows with iloc.
        xtr, xvl = features.iloc[train_index], features.iloc[test_index]
        ytr, yvl = target.iloc[train_index], target.iloc[test_index]

        fold_model = LogisticRegression(random_state=1)
        fold_model.fit(xtr, ytr)
        score = accuracy_score(yvl, fold_model.predict(xvl))
        if print_scores:
            print('accuracy_score', score)
    return fold_model


def write_submission(predictions, path):
    """Store `predictions` as 'N'/'Y' in `submission` and write it to `path`.

    0 is written as 'N' and 1 as 'Y'; Loan_ID is taken from test_original.
    """
    # Map while assigning: an in-place replace on the selected column would
    # act on a temporary object under pandas Copy-on-Write.
    submission['Loan_Status'] = pd.Series(
        predictions, index=submission.index
    ).replace({0: 'N', 1: 'Y'})
    submission['Loan_ID'] = test_original['Loan_ID']
    pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv(path)


def add_loan_features(frame):
    """Return a copy of `frame` with the engineered loan features.

    Adds Total_Income, Total_Income_log, EMI and 'Balance Income', then
    drops the raw columns they were derived from.
    """
    frame = frame.copy()
    frame['Total_Income'] = frame['ApplicantIncome'] + frame['CoapplicantIncome']
    frame['Total_Income_log'] = np.log(frame['Total_Income'])
    frame['EMI'] = frame['LoanAmount'] / frame['Loan_Amount_Term']
    # NOTE(review): the factor 1000 presumably converts EMI to the unit of
    # the income columns (LoanAmount in thousands) - confirm with the data.
    frame['Balance Income'] = frame['Total_Income'] - (frame['EMI'] * 1000)
    return frame.drop(
        ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'],
        axis=1,
    )


# Round 1: cross-validate on the current features.
model = cross_validate_logistic(X, y)
pred_test = model.predict(test)
write_submission(pred_test, 'Logistic2.csv')

# Round 2: same procedure on the engineered features.
train = add_loan_features(train)
test = add_loan_features(test)

# Own figure, so the plot is not drawn onto the previous chart's axes.
fig, ax = plt.subplots()
sns.distplot(train['Total_Income_log'], ax=ax)

# `axis` is passed by keyword; the positional form is gone in pandas 2.0.
X = train.drop('Loan_Status', axis=1)
y = train.Loan_Status

# Fold accuracies are not printed for this round.
model = cross_validate_logistic(X, y, print_scores=False)
pred_test = model.predict(test)
write_submission(pred_test, 'Log-new1.csv')
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top