• API
• FAQ
• Tools
• Archive
SHARE
TWEET

# Untitled

a guest May 23rd, 2019 66 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. import pandas as pd
2. import numpy as np                     # For mathematical calculations
3. import seaborn as sns                  # For data visualization
4. import matplotlib.pyplot as plt        # For plotting graphs
5. import warnings                        # To ignore any warnings
6. warnings.filterwarnings("ignore")
7.
12.
13.
# --- Dataset overview -------------------------------------------------
# Print the column labels and dtypes of the training frame, then the
# (rows, columns) shape of both frames.
for summary in (train.columns, train.dtypes):
    print(summary)
print(train.shape, test.shape)

# --- Target variable --------------------------------------------------
# Loan_Status is reported as absolute counts first, then as proportions,
# and the counts are finally drawn as a bar chart.
loan_status = train['Loan_Status']
status_counts = loan_status.value_counts()
print(status_counts)
print(loan_status.value_counts(normalize=True))
status_counts.plot.bar()
# plt.show()  # display intentionally left disabled
23.
24.
# --- Univariate analysis: categorical and ordinal features ------------
# Each grid is drawn on a figure of its own, with explicit axes.  The
# earlier plt.subplot() calls added panels to whichever figure was
# current, so with plt.show() disabled every grid ended up layered on top
# of the plots drawn before it.

# 2x2 grid.  axes.ravel() walks the panels row by row, i.e. in the same
# order as subplot positions 221, 222, 223, 224.
_, axes = plt.subplots(2, 2, figsize=(20, 10))
grid_panels = [
    ('Gender', 'Gender'),
    ('Married', 'Married'),
    ('Self_Employed', 'S-E'),
    ('Credit_History', 'C-H'),
]
for panel_ax, (column, title) in zip(axes.ravel(), grid_panels):
    # normalize=True plots the share of each category instead of raw counts
    train[column].value_counts(normalize=True).plot.bar(ax=panel_ax, title=title)
# plt.show()  # display intentionally left disabled

# 1x3 row, matching subplot positions 131, 132, 133.
_, axes = plt.subplots(1, 3, figsize=(24, 6))
for panel_ax, column in zip(axes, ['Dependents', 'Education', 'Property_Area']):
    train[column].value_counts(normalize=True).plot.bar(ax=panel_ax, title=column)
# plt.show()  # display intentionally left disabled
43.
44.
45.
46.
# --- Univariate analysis: numeric features ----------------------------
def _plot_distribution_and_box(values):
    """Draw a histogram/density plot and a box plot of *values* side by side.

    A new 16x5 figure is opened for every call so that the pair never lands
    on a figure left over from an earlier plot.  Only the missing entries of
    *values* itself are removed before the histogram is drawn; the box plot
    receives the series unchanged.
    """
    _, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(16, 5))
    sns.distplot(values.dropna(), ax=hist_ax)
    values.plot.box(ax=box_ax)


_plot_distribution_and_box(train['ApplicantIncome'])
# plt.show()  # display intentionally left disabled

# Applicant income split by education level.  DataFrame.boxplot(by=...)
# opens its own figure, which the suptitle call below then labels.
train.boxplot(column='ApplicantIncome', by='Education')
plt.suptitle("boxplot")
# plt.show()  # display intentionally left disabled

_plot_distribution_and_box(train['CoapplicantIncome'])
# plt.show()  # display intentionally left disabled

# The histogram used to be built from train.dropna(), which discards every
# row that has a gap in *any* column.  The helper drops only the rows where
# LoanAmount itself is missing, so histogram and box plot now describe the
# same observations.
_plot_distribution_and_box(train['LoanAmount'])
# plt.show()  # display intentionally left disabled
72.
# --- Bivariate analysis: categorical feature vs. Loan_Status ----------
def _plot_status_share(table, **plot_kwargs):
    """Plot a category-by-Loan_Status count table as stacked row shares.

    Every row of *table* is divided by its own total, so each bar adds up
    to 1 and shows how the loan outcomes split within that category.
    Extra keyword arguments (e.g. ``figsize``) are forwarded to
    ``DataFrame.plot``.
    """
    row_totals = table.sum(axis=1).astype(float)
    table.div(row_totals, axis=0).plot(kind="bar", stacked=True, **plot_kwargs)


# One contingency table per feature (the Gender table was previously
# computed twice in a row; once is enough).
Gender = pd.crosstab(train['Gender'], train['Loan_Status'])
Married = pd.crosstab(train['Married'], train['Loan_Status'])
Dependents = pd.crosstab(train['Dependents'], train['Loan_Status'])
Education = pd.crosstab(train['Education'], train['Loan_Status'])
Self_Employed = pd.crosstab(train['Self_Employed'], train['Loan_Status'])
Credit_History = pd.crosstab(train['Credit_History'], train['Loan_Status'])
Property_Area = pd.crosstab(train['Property_Area'], train['Loan_Status'])

# Dependents and Property_Area keep the default figure size; the other
# charts are drawn at 4x4, exactly as before.
_plot_status_share(Gender, figsize=(4, 4))
# plt.show()  # display intentionally left disabled

_plot_status_share(Married, figsize=(4, 4))
# plt.show()

_plot_status_share(Dependents)
# plt.show()

_plot_status_share(Education, figsize=(4, 4))
# plt.show()

_plot_status_share(Self_Employed, figsize=(4, 4))
# plt.show()

_plot_status_share(Credit_History, figsize=(4, 4))
# plt.show()

_plot_status_share(Property_Area)
# plt.show()
105.
# --- Bivariate analysis: numeric feature vs. Loan_Status --------------
# Mean applicant income per loan outcome.  A Series plot reuses the
# current axes when a figure already exists, so open a fresh figure first.
plt.figure()
train.groupby('Loan_Status')['ApplicantIncome'].mean().plot.bar()
# plt.show()  # display intentionally left disabled


def _status_share_by_bin(column, values, edges, labels, **plot_kwargs):
    """Bin *values*, store the bins in ``train[column]`` and plot outcome shares.

    The bins are built from the full series that is passed in.  The earlier
    code binned ``train.dropna()`` instead, so any row with a gap in an
    unrelated column received a NaN bin and silently dropped out of the
    chart.

    ``pd.cut`` intervals are closed on the right by default, which leaves a
    value equal to the lowest edge outside every bin;
    ``include_lowest=True`` makes the first interval include that edge.

    Returns the bin-by-Loan_Status count table.  Extra keyword arguments
    are forwarded to ``DataFrame.plot``.
    """
    train[column] = pd.cut(values, edges, labels=labels, include_lowest=True)
    counts = pd.crosstab(train[column], train['Loan_Status'])
    shares = counts.div(counts.sum(axis=1).astype(float), axis=0)
    shares.plot(kind="bar", stacked=True, **plot_kwargs)
    return counts


Income_bin = _status_share_by_bin(
    'Income_bin',
    train['ApplicantIncome'],
    [0, 2500, 4000, 6000, 81000],
    ['Low', 'Average', 'High', 'Very high'],
    figsize=(10, 4),
)
# plt.show()

Coapplicant_Income_bin = _status_share_by_bin(
    'Coapplicant_Income_bin',
    train['CoapplicantIncome'],
    [0, 1000, 3000, 42000],
    ['Low', 'Average', 'High'],
)
# plt.show()

# Combined income of applicant and co-applicant.
train['Total_Income'] = train['ApplicantIncome'] + train['CoapplicantIncome']
Total_Income_bin = _status_share_by_bin(
    'Total_Income_bin',
    train['Total_Income'],
    [0, 2500, 4000, 6000, 81000],
    ['Low', 'Average', 'High', 'Very high'],
)
plt.xlabel('Total_Income')
plt.ylabel('Percentage')
# plt.show()

LoanAmount_bin = _status_share_by_bin(
    'LoanAmount_bin',
    train['LoanAmount'],
    [0, 100, 200, 700],
    ['Low', 'Average', 'High'],
)
plt.xlabel('LoanAmount')
plt.ylabel('Percentage')
146.
147.
# --- Clean-up and encoding --------------------------------------------
# Remove the helper columns that were only needed for the binned charts.
train = train.drop(
    ['Income_bin', 'Coapplicant_Income_bin', 'LoanAmount_bin',
     'Total_Income_bin', 'Total_Income'],
    axis=1,
)

# Results are assigned back to the column.  Calling replace(...,
# inplace=True) on a selected column operates on an intermediate object
# and does not update the frame when pandas Copy-on-Write is active.
train['Dependents'] = train['Dependents'].replace('3+', 3)
test['Dependents'] = test['Dependents'].replace('3+', 3)

# Encode the target as 0/1.  map() yields a numeric column directly; any
# value other than 'N' or 'Y' would become NaN.
train['Loan_Status'] = train['Loan_Status'].map({'N': 0, 'Y': 1})

# --- Correlation heatmap ----------------------------------------------
# Restrict to numeric columns explicitly: DataFrame.corr() no longer skips
# text columns on its own in pandas >= 2.0 and raises instead.
matrix = train.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(matrix, vmax=.8, square=True, cmap="BuPu")
# plt.show()  # display intentionally left disabled
161.
# --- Missing-value imputation -----------------------------------------
print(train.isnull().sum())

# Fill values are derived from the training data only and then reused for
# the test data: the most frequent value for the categorical / discrete
# columns, the median for LoanAmount.
mode_columns = ['Gender', 'Married', 'Dependents', 'Self_Employed',
                'Credit_History', 'Loan_Amount_Term']
fill_values = {column: train[column].mode()[0] for column in mode_columns}
fill_values['LoanAmount'] = train['LoanAmount'].median()

# DataFrame.fillna with a dict fills each listed column with its own value
# and returns a new frame.  The previous per-column fillna(...,
# inplace=True) calls acted on an intermediate object and do not update
# the frame when pandas Copy-on-Write is active.
train = train.fillna(value=fill_values)

print('-------')
print(train.isnull().sum())

# The same dict is applied to the test frame.  This also covers 'Married',
# the one column the earlier test-side code did not fill; it is a no-op if
# that column has no gaps.
test = test.fillna(value=fill_values)
182.
183.
184.
# --- Log-transform of the loan amount ---------------------------------
train['LoanAmount_log'] = np.log(train['LoanAmount'])
test['LoanAmount_log'] = np.log(test['LoanAmount'])

# Series.hist draws on the current axes when a figure exists, so open a
# fresh figure instead of drawing over the previous plot.
plt.figure()
train['LoanAmount_log'].hist(bins=20)
# plt.show()  # display intentionally left disabled

# --- Model inputs -----------------------------------------------------
# Loan_ID is dropped from both frames before modelling.
train = train.drop('Loan_ID', axis=1)
test = test.drop('Loan_ID', axis=1)

# axis is passed by keyword: the positional form drop('Loan_Status', 1)
# is rejected by pandas >= 2.0.
X = train.drop('Loan_Status', axis=1)
y = train.Loan_Status

# One-hot encode the categorical columns.
X = pd.get_dummies(X)
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# Give the test matrix exactly the training feature columns, in the same
# order.  A column missing from the test frame is added filled with 0;
# a column that exists only in the test frame is discarded.
test = test.reindex(columns=X.columns, fill_value=0)
201.
202.
# --- Hold-out validation with logistic regression ---------------------
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Keep 30% of the training rows for validation.  The split is seeded with
# the same random_state=1 used by the cross-validation further down, so
# repeated runs produce the same split.
x_train, x_cv, y_train, y_cv = train_test_split(
    X, y, test_size=0.3, random_state=1
)

model = LogisticRegression()
model.fit(x_train, y_train)

# Report the validation accuracy; in a script a bare accuracy_score(...)
# expression is evaluated and thrown away.
pred_cv = model.predict(x_cv)
print('accuracy_score', accuracy_score(y_cv, pred_cv))

# --- First submission file --------------------------------------------
pred_test = model.predict(test)

# NOTE(review): `submission` and `test_original` are not created anywhere
# in the lines visible here (the paste's own numbering skips a line just
# above this point) -- confirm both are loaded before this runs.
submission['Loan_Status'] = pred_test
submission['Loan_ID'] = test_original['Loan_ID']

# Turn the 0/1 predictions back into 'N'/'Y'.  The result is assigned back
# because replace(..., inplace=True) on a selected column does not update
# the frame when pandas Copy-on-Write is active.
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})

# NOTE(review): to_csv also writes the row index as an unnamed first
# column; pass index=False if only Loan_ID and Loan_Status are wanted.
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('logistic.csv')
239.
240.
# --- Stratified 5-fold cross-validation -------------------------------
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(fold, kf.n_splits))

    # kf.split yields integer positions, so rows are selected with .iloc.
    # Label-based .loc / y[...] only gives the same rows while the index
    # happens to be the default 0..n-1 range.
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    model = LogisticRegression(random_state=1)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)

# After the loop `model` is the estimator fitted on the last fold; it is
# the one used to predict the test set.
pred_test = model.predict(test)
# Class-1 probabilities for the last validation fold.  Nothing in the
# visible script reads `pred`; the name is kept available as before.
pred = model.predict_proba(xvl)[:, 1]

# --- Second submission file -------------------------------------------
# NOTE(review): `submission` and `test_original` are not created in the
# lines visible here -- confirm both exist before this runs.
submission['Loan_Status'] = pred_test
submission['Loan_ID'] = test_original['Loan_ID']
# Assigned back rather than replaced in place: replace(..., inplace=True)
# on a selected column does not update the frame under Copy-on-Write.
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('Logistic2.csv')
266.
267.
268.
269.
270.
# --- Feature engineering ----------------------------------------------
# The same derived columns are added to the training and the test frame:
#   Total_Income      applicant plus co-applicant income
#   Total_Income_log  natural log of Total_Income
#   EMI               LoanAmount divided by Loan_Amount_Term
#   Balance Income    Total_Income minus EMI * 1000
for frame in (train, test):
    frame['Total_Income'] = frame['ApplicantIncome'] + frame['CoapplicantIncome']
    frame['Total_Income_log'] = np.log(frame['Total_Income'])
    frame['EMI'] = frame['LoanAmount'] / frame['Loan_Amount_Term']
    frame['Balance Income'] = frame['Total_Income'] - (frame['EMI'] * 1000)

# Distribution of the log-transformed total income in the training data.
sns.distplot(train['Total_Income_log'])

# The raw inputs of the derived features are removed from both frames.
raw_inputs = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
train = train.drop(raw_inputs, axis=1)
test = test.drop(raw_inputs, axis=1)
288.
289.
290.
# optimisation
# logistic regression on the engineered feature set

# axis is passed by keyword: the positional form drop('Loan_Status', 1)
# is rejected by pandas >= 2.0.
X = train.drop('Loan_Status', axis=1)
y = train.Loan_Status

kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(fold, kf.n_splits))

    # kf.split yields integer positions, so rows are selected with .iloc.
    # Label-based .loc / y[...] only gives the same rows while the index
    # happens to be the default 0..n-1 range.
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    model = LogisticRegression(random_state=1)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    # print('accuracy_score', score)  # per-fold report left disabled

# After the loop `model` is the estimator fitted on the last fold; it is
# the one used to predict the test set.
pred_test = model.predict(test)
# Class-1 probabilities for the last validation fold.  Nothing in the
# visible script reads `pred`; the name is kept available as before.
pred = model.predict_proba(xvl)[:, 1]

# NOTE(review): `submission` and `test_original` are not created in the
# lines visible here -- confirm both exist before this runs.
submission['Loan_Status'] = pred_test             # filling Loan_Status with predictions
submission['Loan_ID'] = test_original['Loan_ID']  # filling Loan_ID with test Loan_ID

# Replacing 0 and 1 with N and Y.  The result is assigned back because
# replace(..., inplace=True) on a selected column does not update the
# frame when pandas Copy-on-Write is active.
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})

# Converting submission file to .csv format
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('Log-new1.csv')
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy.

Top