Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- import seaborn as sns
- from scipy import stats
- from scipy.stats import norm
- from pandas.plotting import scatter_matrix
- from sklearn import preprocessing
# Load the 2018 consulate-level Schengen visa statistics and take a first look.
data2 = pd.read_excel(
    'Schengen_Visa_Stats/data/raw_data/2018-consulates-schengen-visa-stats.xlsx',
    sheet_name='Data for consulates')
print(data2.shape)
print(data2.head(20))
print(data2.tail(20))
print(data2.describe())
print(data2.isnull().sum().sort_values())

# The airport-transit-visa (ATV) columns are mostly missing and separate from
# the uniform-visa figures analysed below, so drop them up front.
# NOTE: the spreadsheet headers contain stray leading/trailing spaces — keep
# these strings exactly as they appear in the file.
atv_columns = [
    'Airport transit visas (ATVs) applied for ',
    ' ATVs issued (including multiple)',
    'Multiple ATVs issued',
    'ATVs not issued ',
    'Not issued rate for ATVs',
    'Total ATVs and uniform visas applied for',
    'Total ATVs and uniform visas issued (including multiple ATVs, MEVs and LTVs) ',
    'Total ATVs and uniform visas not issued',
    'Not issued rate for ATVs and uniform visas ',
]
data2.drop(atv_columns, axis=1, inplace=True)
# Map the verbose spreadsheet headers onto short snake_case working names.
# (One header really ends in ' \n' in the file — keep it byte-for-byte.)
short_names = {
    'Schengen State': 'sch_state',
    'Country where consulate is located': 'consulate_country',
    'Consulate': 'consulate_city',
    'Uniform visas applied for': 'applications',
    'Total uniform visas issued (including MEV) \n': 'uniform_visas_issued',
    'Multiple entry uniform visas (MEVs) issued': 'mevs_issued',
    'Share of MEVs on total number of uniform visas issued': 'mevs_share',
    'Total LTVs issued': 'ltvs_issued',
    'Uniform visas not issued': 'rejected',
    'Not issued rate for uniform visas': 'rejection_rate',
}
data2 = data2.rename(columns=short_names)

# Drop fully empty rows first.
data2 = data2.dropna(how='all').reset_index(drop=True)
# Rows with no Schengen state are the trailing grand-total rows — drop them too.
data2 = data2.dropna(axis=0, subset=['sch_state']).reset_index(drop=True)
# Remaining gaps are treated as genuine zero counts.
data2 = data2.fillna(0)
# Sanity check: applications should equal uniform_visas_issued + ltvs_issued + rejected.
data2['decisions'] = data2.uniform_visas_issued + data2.ltvs_issued + data2.rejected
print('More applications than decisions entries =', len(data2[data2.decisions < data2.applications]))
print('More decisions than applications entries =', len(data2[data2.decisions > data2.applications]))

# Applications and decisions disagree for some consulates, so use decisions as
# the reliable volume measure, and remove the LTV component from both.
data2['decisions'] = data2.decisions - data2.ltvs_issued
data2['applications'] = data2.applications - data2.ltvs_issued
data2.drop('ltvs_issued', axis=1, inplace=True)

# Recompute the rejection rate from the cleaned counts; 0/0 yields NaN, which
# we treat as a 0% rejection rate.
# FIX: chained `data2['col'].fillna(..., inplace=True)` is deprecated and can
# silently do nothing under pandas copy-on-write — assign the result back.
data2['rejection_rate'] = data2.rejected / data2.decisions
data2['rejection_rate'] = data2['rejection_rate'].fillna(0)

# A share cannot exceed 1; values above 1 are data-entry errors — zero them out.
data2.loc[data2['mevs_share'] > 1, 'mevs_share'] = 0
#==================================================================================
# VISUALIZE
from sklearn.preprocessing import StandardScaler

# States handling the most visa decisions. The per-group totals are
# log-scaled before being mapped back onto rows, so the scores are not
# dominated by the few very large consulates.
x = data2.groupby('sch_state').sum().decisions.sort_values()
x.plot.bar()
plt.title('states receiving most visa applications')
plt.xticks(rotation=45)
plt.show()
data2['state_recieve_score'] = data2['sch_state'].map(np.log(x))

# Countries whose residents apply for the most visas.
x = data2.groupby('consulate_country').sum().decisions.sort_values()
# FIX: the original called x.plot.bar() twice here, drawing the same bars
# twice on one figure.
x.plot.bar()
plt.xticks(rotation=45, fontsize=6)
plt.show()
data2['country_apply_score'] = data2['consulate_country'].map(np.log(x))

# States rejecting the most applications.
x = data2.groupby('sch_state').sum().rejected.sort_values()
x.plot.bar()
plt.xticks(rotation=45)
plt.show()
data2['state_reject_score'] = data2['sch_state'].map(np.log(x))

# Countries whose applications are rejected the most.
x = data2.groupby('consulate_country').sum().rejected.sort_values()
x.plot.bar()
plt.xticks(rotation=45, fontsize=6)
plt.show()
data2['country_be_rejected_score'] = data2['consulate_country'].map(np.log(x))
# Per-city totals (no plots: there are far too many consulate cities to chart).
x = data2.groupby('consulate_city').sum().decisions
x1 = data2.groupby('consulate_city').sum().rejected
data2['city_apply_score'] = data2['consulate_city'].map(np.log(x))
data2['city_be_rejected_score'] = data2['consulate_city'].map(np.log(x1))

# Project rejected and decisions onto a 0-1 log scale.
# FIX: the original did `.apply(lambda v: v / col.nlargest(1))`, dividing each
# scalar by a one-element *Series* (not a scalar) and recomputing nlargest for
# every row (O(n^2)). Divide the whole column by its max instead.
data2['rejected'] = np.log(data2['rejected'])
data2['rejected'] = data2['rejected'] / data2['rejected'].max()
data2['decisions'] = np.log(data2['decisions'])
data2['decisions'] = data2['decisions'] / data2['decisions'].max()

# log(0) produced -inf above — neutralise non-finite values before modelling.
data2.replace([np.inf, -np.inf], 0, inplace=True)
# Combine the per-level log scores into composite features, each normalised
# to at most 1 by dividing by its column maximum.
# FIX: as with the log projection, the original divided inside `.apply` by
# `col.nlargest(1)` — a one-element Series, not a scalar — recomputed per row;
# vectorised division by `.max()` is both correct and O(n).
data2['state_score'] = data2['state_recieve_score'] * data2['state_reject_score']
data2['state_score'] = data2['state_score'] / data2['state_score'].max()
data2['country_score'] = data2['country_apply_score'] * data2['country_be_rejected_score']
data2['country_score'] = data2['country_score'] / data2['country_score'].max()
data2['score_recieve'] = data2['state_recieve_score'] * data2['country_apply_score'] * data2['city_apply_score']
data2['score_reject'] = data2['state_reject_score'] * data2['country_be_rejected_score'] * data2['city_be_rejected_score']
data2['score_recieve'] = data2['score_recieve'] / data2['score_recieve'].max()
data2['score_reject'] = data2['score_reject'] / data2['score_reject'].max()

# Pairwise relationships between the engineered scores and the two targets.
scatter_matrix(data2[['score_recieve', 'score_reject', 'rejected', 'decisions']])
plt.xticks(rotation=45)
plt.show()
#=============================================================================
# Baseline: two linear regressions predicting decisions and rejections from
# the engineered scores; the predicted rejection rate is their ratio.
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics

X = data2[['score_recieve', 'score_reject']]
Y1 = data2['decisions']
Y2 = data2['rejected']
rate = data2['rejection_rate']
validation_size = 0.20
seed = 7  # same seed for all three splits keeps the rows aligned

X1_train, X1_test, Y1_train, Y1_test = train_test_split(X, Y1, test_size=validation_size, random_state=seed)
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X, Y2, test_size=validation_size, random_state=seed)
rate1, rate2 = train_test_split(rate, test_size=validation_size, random_state=seed)

model1 = linear_model.LinearRegression()
model2 = linear_model.LinearRegression()
model1.fit(X1_train, Y1_train)
model2.fit(X2_train, Y2_train)
Y1_pred = model1.predict(X1_test)
Y2_pred = model2.predict(X2_test)

# Predicted rate = predicted rejections / predicted decisions.
pred_rate = Y2_pred / Y1_pred
# FIX: a zero predicted decision count yields inf (or NaN); the original only
# cleared NaN, letting inf reach the MSE. Clear every non-finite value.
pred_rate[~np.isfinite(pred_rate)] = 0
print('rate', metrics.mean_squared_error(rate2, pred_rate))
# Neural-network variant of the same two-regression setup.
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler


def base_model():
    """Build a small 2-input MLP regressor (20 -> 10 -> 1, ReLU hidden layers)."""
    model = Sequential()
    # FIX: Keras 2 renamed `init=` to `kernel_initializer=`; the old keyword
    # raises a TypeError on any current Keras.
    model.add(Dense(20, input_shape=(2,), kernel_initializer='normal', activation='relu'))
    model.add(Dense(10, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


# FIX: Keras 2 renamed `nb_epoch` to `epochs`; with the old name the wrapper
# did not train for the requested 100 epochs.
clf1 = KerasRegressor(build_fn=base_model, epochs=100, batch_size=2, verbose=0)
clf2 = KerasRegressor(build_fn=base_model, epochs=100, batch_size=2, verbose=0)
clf1.fit(X1_train, Y1_train)
clf2.fit(X2_train, Y2_train)
Y1_pred = clf1.predict(X1_test)
Y2_pred = clf2.predict(X2_test)

pred_rate = Y2_pred / Y1_pred
# Guard against division by a zero prediction (inf/NaN) before scoring —
# the original passed raw non-finite values straight into the MSE.
pred_rate[~np.isfinite(pred_rate)] = 0
print('rate', metrics.mean_squared_error(rate2, pred_rate))
#============================================================
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement