Advertisement
Guest User

Untitled

a guest
Sep 19th, 2019
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.63 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import matplotlib.pyplot as plt
  4. import seaborn as sns
  5. from scipy import stats
  6. from scipy.stats import norm
  7. from pandas.plotting import scatter_matrix
  8. from sklearn import preprocessing
  9.  
  10.  
  11.  
  # Read the 'Data for consulates' sheet of the consulate statistics workbook.
  workbook_path = 'Schengen_Visa_Stats/data/raw_data/2018-consulates-schengen-visa-stats.xlsx'
  data2 = pd.read_excel(workbook_path, sheet_name='Data for consulates')

  # First look at the raw frame: its size, both ends, summary statistics and
  # the number of missing values per column (ascending).
  print(data2.shape)
  print(data2.head(20))
  print(data2.tail(20))
  print(data2.describe())
  missing_per_column = data2.isnull().sum()
  print(missing_per_column.sort_values())
  19.  
  # Almost all ATV (airport transit visa) features have many missing values
  # and are reported separately from the remaining columns, so drop them.
  atv_columns = [
      'Airport transit visas (ATVs) applied for ',
      ' ATVs issued (including multiple)',
      'Multiple ATVs issued',
      'ATVs not issued ',
      'Not issued rate for ATVs',
      'Total ATVs and uniform visas applied for',
      'Total ATVs and uniform visas issued (including multiple ATVs, MEVs and LTVs) ',
      'Total ATVs and uniform visas not issued',
      'Not issued rate for ATVs and uniform visas ',
  ]
  data2 = data2.drop(columns=atv_columns)

  # Short snake_case names for the columns that are kept.
  short_names = {
      'Schengen State': 'sch_state',
      'Country where consulate is located': 'consulate_country',
      'Consulate': 'consulate_city',
      'Uniform visas applied for': 'applications',
      'Total uniform visas issued (including MEV) \n': 'uniform_visas_issued',
      'Multiple entry uniform visas (MEVs) issued': 'mevs_issued',
      'Share of MEVs on total number of uniform visas issued': 'mevs_share',
      'Total LTVs issued': 'ltvs_issued',
      'Uniform visas not issued': 'rejected',
      'Not issued rate for uniform visas': 'rejection_rate',
  }
  data2 = data2.rename(columns=short_names)

  # Discard rows that are empty in every column.
  data2 = data2.dropna(how='all').reset_index(drop=True)

  # print(data2[data2.sch_state.isnull()])  # the last 3 rows are totals
  # Drop the rows without a Schengen state (the 3 trailing total rows).
  data2 = data2.dropna(subset=['sch_state'], axis=0).reset_index(drop=True)
  # Fill the remaining missing values with 0.
  data2 = data2.fillna(0)
  50.  
  # Check whether applications == uniform_visas_issued + ltvs_issued + rejected.
  data2['decisions'] = data2.uniform_visas_issued + data2.ltvs_issued + data2.rejected
  fewer_decisions = data2.decisions < data2.applications
  more_decisions = data2.decisions > data2.applications
  print('More applications than descisions entries =', int(fewer_decisions.sum()))
  print('More descisions than applications entries =', int(more_decisions.sum()))

  # Applications and decisions differ, so decisions are used from here on.
  # LTVs are taken out of both totals and the column itself is dropped.
  data2['decisions'] = data2.decisions - data2.ltvs_issued
  data2['applications'] = data2.applications - data2.ltvs_issued
  data2 = data2.drop(columns='ltvs_issued')

  # 0/0 (a row without any decision) gives NaN; treat it as a rate of 0.
  # The filled result is assigned back to the column: calling
  # fillna(inplace=True) on data2['rejection_rate'] may act on a temporary
  # Series and leave the frame unchanged (pandas Copy-on-Write behaviour).
  data2['rejection_rate'] = (data2.rejected / data2.decisions).fillna(0)

  # Some rows report mevs_share > 1; those values are reset to 0.
  data2.loc[data2['mevs_share'] > 1, 'mevs_share'] = 0
  66.  
  67. # data2['score'] = (1 - data2['rejection_rate']) * data2.mevs_share
  68. # min_max_scaler = preprocessing.MinMaxScaler()
  69. # data2['score'] = min_max_scaler.fit_transform(data2[['score']].values)
  70. #==================================================================================
  71. #VISUALIZE
  72. # #plot states recieving most visa applying
  73. from sklearn.preprocessing import StandardScaler
  # One entry per bar chart / score column:
  # (group key, summed column, new score column, plot title, tick font size).
  # For each entry the summed column is totalled per group, plotted in
  # ascending order, and the natural log of the group's total is stored on
  # every row of that group. A total of 0 gives -inf at this point.
  group_scores = [
      ('sch_state', 'decisions', 'state_recieve_score',
       'states recieving most visa applying', None),
      ('consulate_country', 'decisions', 'country_apply_score', None, 6),
      ('sch_state', 'rejected', 'state_reject_score', None, None),
      ('consulate_country', 'rejected', 'country_be_rejected_score', None, 6),
  ]
  for key, column, score_name, title, tick_size in group_scores:
      # Select the column before summing so only that column is aggregated.
      x = data2.groupby(key)[column].sum().sort_values()
      x.plot.bar()
      if title is not None:
          plt.title(title)
      tick_style = {'rotation': 45}
      if tick_size is not None:
          tick_style['fontsize'] = tick_size
      plt.xticks(**tick_style)
      plt.show()
      data2[score_name] = data2[key].map(np.log(x))

  # City-level scores are built the same way, without a plot.
  city_groups = data2.groupby('consulate_city')
  x = city_groups['decisions'].sum()
  x1 = city_groups['rejected'].sum()
  data2['city_apply_score'] = data2['consulate_city'].map(np.log(x))
  data2['city_be_rejected_score'] = data2['consulate_city'].map(np.log(x1))
  108.  
  def _scale_by_max(values):
      """Return `values` divided by its largest (non-NaN) element."""
      # One vectorised division; the maximum is computed a single time.
      return values / values.max()


  # Put rejected and decisions on a log scale, then scale by the largest value.
  # Counts of 0 become -inf here and are set to 0 by the replace() below.
  data2['rejected'] = _scale_by_max(np.log(data2['rejected']))
  data2['decisions'] = _scale_by_max(np.log(data2['decisions']))

  # Replace infinite values (logs of zero counts / zero group totals) with 0.
  # NOTE(review): a count of 1 also maps to 0 (log(1) == 0), so after this
  # step it cannot be told apart from a count of 0 — confirm that is acceptable.
  data2 = data2.replace([np.inf, -np.inf], 0)

  # Combined state-level and country-level scores, each scaled by its maximum.
  data2['state_score'] = _scale_by_max(
      data2['state_recieve_score'] * data2['state_reject_score'])
  data2['country_score'] = _scale_by_max(
      data2['country_apply_score'] * data2['country_be_rejected_score'])

  # Overall scores: product of the state, country and city scores, scaled
  # by the maximum of that product.
  data2['score_recieve'] = _scale_by_max(
      data2['state_recieve_score'] * data2['country_apply_score'] * data2['city_apply_score'])
  data2['score_reject'] = _scale_by_max(
      data2['state_reject_score'] * data2['country_be_rejected_score'] * data2['city_be_rejected_score'])
  128.  
  129.  
  130. # #plot correlation bettween feature
  131. # corr = data2.corr()
  132. # sns.heatmap(corr)
  133. # plt.xticks(rotation=45)
  134. # plt.show()
  135.  
  136. # #plot correlation
  137. # sns.jointplot(x=data2['mevs_share'], y=data2['rejection_rate'])
  138. # plt.show()
  139.  
  # Pairwise scatter plots of the two combined scores, rejected and decisions.
  pair_columns = ['score_recieve', 'score_reject', 'rejected', 'decisions']
  scatter_matrix(data2[pair_columns])
  plt.xticks(rotation=45)
  plt.show()
  143.  
  144. #=============================================================================
  145. # # encode sch_state, consulate_country, consulate_city
  146. # from sklearn.preprocessing import LabelEncoder
  147. # le = LabelEncoder()
  148. # for col in ['sch_state', 'consulate_country', 'consulate_city']:
  149. # le.fit(data2[col])
  150. # data2[col] = le.transform(data2[col])
  151.  
  152. #===========================================================================
  153. # # # solve with linear regerssion
  154. from sklearn import linear_model
  155. from sklearn.model_selection import train_test_split
  156. from sklearn import metrics
  157.  
  158.  
  # Features are the two combined scores; one regression target per model.
  X = data2[['score_recieve', 'score_reject']]
  Y1 = data2['decisions']
  Y2 = data2['rejected']
  rate = data2['rejection_rate']

  validation_size = 0.20
  seed = 7
  # A single call splits every array with the same shuffled row indices, so
  # features, both targets and the reference rate stay aligned.
  (X1_train, X1_test,
   Y1_train, Y1_test,
   Y2_train, Y2_test,
   rate1, rate2) = train_test_split(X, Y1, Y2, rate, test_size=validation_size, random_state=seed)
  # Both models are trained and evaluated on the same feature rows.
  X2_train, X2_test = X1_train, X1_test

  # fit() returns the estimator itself, so construction and fitting chain.
  model1 = linear_model.LinearRegression().fit(X1_train, Y1_train)
  model2 = linear_model.LinearRegression().fit(X2_train, Y2_train)
  # print('Intercept: \n', model.intercept_)
  # print('Coefficients: \n', model.coef_)
  Y1_pred = model1.predict(X1_test)
  Y2_pred = model2.predict(X2_test)
  # print('y1: ',metrics.mean_squared_error(Y1_test, Y1_pred))
  # print('y2: ',metrics.mean_squared_error(Y2_test, Y2_pred))

  # Predicted rate = predicted rejected / predicted decisions; NaN counts as 0.
  # NOTE(review): earlier in the script 'rejected' and 'decisions' are replaced
  # by log values scaled by their maximum, so this quotient is not on the same
  # scale as the raw rejection_rate it is scored against — confirm intended.
  pred_rate = Y2_pred / Y1_pred
  pred_rate = np.where(np.isnan(pred_rate), 0, pred_rate)
  print('rate',metrics.mean_squared_error(rate2, pred_rate))
  183.  
  184. # NN
  185. from keras.models import Sequential
  186. from keras.layers import Dense
  187. from keras.wrappers.scikit_learn import KerasRegressor
  188. from sklearn.preprocessing import StandardScaler
  189. # from keras import optimizers
  190.  
  def base_model():
      """Build and compile a small fully connected regression network.

      The network takes 2 input features, has two hidden ReLU layers of 20 and
      10 units and a single output unit without activation. It is compiled
      with the 'adam' optimizer and mean squared error loss.

      Returns:
          The compiled ``Sequential`` model.
      """
      model = Sequential()
      # `kernel_initializer` is the Keras 2 name of the Keras 1 `init` argument.
      model.add(Dense(20, input_shape=(2,), kernel_initializer='normal', activation='relu'))
      model.add(Dense(10, kernel_initializer='normal', activation='relu'))
      model.add(Dense(1, kernel_initializer='normal'))
      model.compile(loss='mean_squared_error', optimizer='adam')
      return model
  # One network per target, wrapped as scikit-learn style regressors.
  # `epochs` is the Keras 2 spelling of the Keras 1 `nb_epoch` argument.
  # NOTE(review): the Keras 2 scikit-learn wrapper appears to ignore
  # `nb_epoch` and train for its default number of epochs — confirm against
  # the installed Keras version.
  clf1 = KerasRegressor(build_fn=base_model, epochs=100, batch_size=2, verbose=0)
  clf2 = KerasRegressor(build_fn=base_model, epochs=100, batch_size=2, verbose=0)
  clf1.fit(X1_train, Y1_train)
  clf2.fit(X2_train, Y2_train)
  Y1_pred = clf1.predict(X1_test)
  Y2_pred = clf2.predict(X2_test)

  # Predicted rate = predicted rejected / predicted decisions. NaN quotients
  # are counted as 0, as in the linear-regression section, so that
  # mean_squared_error is not handed NaN values.
  pred_rate = Y2_pred / Y1_pred
  pred_rate = np.where(np.isnan(pred_rate), 0, pred_rate)
  # print('y1: ',metrics.mean_squared_error(Y1_test, Y1_pred))
  # print('y2: ',metrics.mean_squared_error(Y2_test, Y2_pred))
  print('rate',metrics.mean_squared_error(rate2, pred_rate))
  208.  
  209. #============================================================
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement