Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # coding: utf-8
- # In[2]:
- import numpy as np
- # In[3]:
- import pandas as pd
- # In[3]:
- import matplotlib.pyplot as plt
- # In[4]:
- import seaborn as sns
- # In[5]:
- get_ipython().run_line_magic('matplotlib', 'inline')
# # PreProcessing
# Load the UNSW-NB15 training set and take a first look at its structure.
UNSW = pd.read_csv('/Radhe/Projeto/The UNSW-NB15 data set description/UNSW_NB15_training-set.csv')

UNSW.head()
UNSW.head(10)
UNSW.describe()
UNSW.info()

# Attack category of the first record.
UNSW.loc[[0], ['attack_cat']]

# Positional peek at columns 1-2 (iloc slice is end-exclusive).
UNSW.iloc[:, 1:3]

# sns.pairplot(UNSW.iloc[1:, 2:2], palette='bwr', hue='class')  # use hue!

UNSW.iloc[:, 1:3]

# BUG FIX: dropna() returns a NEW DataFrame; the original discarded the
# result, so no rows were actually removed. Assign it back.
UNSW = UNSW.dropna()
# # Computing R² to check the model
# R-squared measures how close the data are to the fitted regression line
# (the coefficient of determination; "coefficient of multiple determination"
# for multiple regression). 100% means the model explains all of the
# variability of the response data around its mean.
# Logistic Regression

train = pd.read_csv('/Radhe/Projeto/The UNSW-NB15 data set description/UNSW_NB15_training-set.csv')
train.isnull()

# # Variables:
#
# Sintpkt  float  source inter-packet arrival time (mSec)               column 16
# Dintpkt  float  destination inter-packet arrival time (mSec)          column 17
# tcprtt   float  TCP setup round-trip time, sum of synack and ackdat   column 24
# synack   float  time between the SYN and the SYN_ACK packets          column 25
# ackdat   float  time between the SYN_ACK and the ACK packets          column 26
#
# Class: 'label' — binary, 0 for normal and 1 for attack records.
# It is column 44 (columns numbered 0 through 44).

# Label of the first record.
train.loc[[0], ['label']]
# train.iloc[0, 46]
# sns.countplot(x='ackdat', data=train, hue='label', palette='RdBu_r')
train.describe()

train.iloc[:, 44]  # the class column
train.iloc[:, 44].describe()

# train.iloc[:, 16]  # Sintpkt
# train.iloc[:, 17]  # Dintpkt
# train.iloc[:, 24]  # tcprtt
# train.iloc[:, 25]  # synack
# train.iloc[:, 26]  # ackdat
# sns.countplot(x='ackdat', data=train, hue='label', palette='RdBu_r')

# # Handling categorical variables
# train.iloc[:, 4]  # state
pd.get_dummies(train['state'])
# # Keeping only the columns of interest in the dataframe
# Select features by NAME instead of positional index (iloc), which breaks
# silently if the CSV column order ever changes. Per the notes above, the
# positional columns were: sinpkt=16, dinpkt=17, tcprtt=24, synack=25,
# ackdat=26, label=44 — assumed to match these names (TODO confirm against
# the CSV header).
dintpkt = train['dinpkt']
sinpkt = train['sinpkt']
tcprtt = train['tcprtt']
synack = train['synack']
ackdat = train['ackdat']
classe = train['label']  # binary class: 0 = normal, 1 = attack

# Reduce `train` to the 5 features + label.
train = pd.concat([dintpkt, sinpkt, tcprtt, synack, ackdat, classe], axis=1)
train.describe()
# # Repeating the same treatment for the test data!
teste = pd.read_csv('/Radhe/Projeto/The UNSW-NB15 data set description/UNSW_NB15_testing-set.csv')
teste.describe()

sinpkt = teste.iloc[:, 16]
dintpkt = teste.iloc[:, 17]
tcprtt = teste.iloc[:, 24]
# BUG FIX: the original never re-extracted 'synack' from the TEST set, so
# the concat below silently reused the synack Series from the TRAINING set
# (175341 rows). pd.concat aligns on index, padding the extra rows with
# NaN — the root cause of the "inconsistent numbers of samples:
# [175341, 82332]" errors noted further down.
synack = teste.iloc[:, 25]
ackdat = teste.iloc[:, 26]
classe = teste.iloc[:, 44]  # binary class: 0 = normal, 1 = attack

# Reduce `teste` to the 5 features + label, same order as `train`.
teste = pd.concat([dintpkt, sinpkt, tcprtt, synack, ackdat, classe], axis=1)
# teste
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Split features/target for train and test (the target column is 'label').
X_train, y_train = train.drop('label', axis=1), train['label']
X_teste, y_teste = teste.drop('label', axis=1), teste['label']

logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)

# BUG FIX: predicting on X_teste.dropna() silently drops rows WITHOUT
# dropping the matching labels, so `predictions` no longer lines up with
# y_teste (the "inconsistent numbers of samples" error later). Drop the
# same rows from both X and y before predicting.
complete_rows = X_teste.notna().all(axis=1)
X_teste, y_teste = X_teste[complete_rows], y_teste[complete_rows]
predictions = logmodel.predict(X_teste)
# X_teste
# # Evaluating the model
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Align lengths defensively: predictions were made on a NaN-filtered X, so
# they can be shorter than y_teste. Using len(predictions) replaces the
# hard-coded [0:82332] slice of the original.
y_eval = y_teste[:len(predictions)]

print(classification_report(y_eval, predictions))

# Compute the confusion matrix ONCE (the original re-called
# confusion_matrix for every single cell it read).
# Layout: rows = true class (0 normal, 1 attack), cols = predicted class.
cm = confusion_matrix(y_eval, predictions)
print(cm)

Acertos = cm[0][0] + cm[1][1]  # correct predictions (main diagonal)
print(Acertos)

Erros = cm[1][0] + cm[0][1]  # wrong predictions (off-diagonal)
print(Erros)

Total_registros = Erros + Acertos  # total evaluated records
print(Total_registros)

Percentual_acertos = Acertos * 100 / Total_registros  # overall accuracy (%)
print(Percentual_acertos)

# NOTE(review): Normal/Ataques below are ROW totals (true-normal and
# true-attack record counts), so the two percentages are the class
# PROPORTIONS of the evaluated set, not per-class accuracy — kept as in
# the original.
Normal = cm[0][0] + cm[0][1]
Ataques = cm[1][0] + cm[1][1]

Percentual_Acerto_Ataques = Ataques * 100 / (Ataques + Normal)
print(Percentual_Acerto_Ataques)

Percentual_Acerto_Normal = Normal * 100 / (Ataques + Normal)
print(Percentual_Acerto_Normal)
# # Using Keras neural networks
test = pd.read_csv('/Radhe1/CEFET/MineraçãoDados_CEFET/Projeto/The UNSW-NB15 data set description/UNSW_NB15_testing-set.csv')
train = pd.read_csv('/Radhe1/CEFET/MineraçãoDados_CEFET/Projeto/The UNSW-NB15 data set description/UNSW_NB15_training-set.csv')

# The 22 numeric features fed to the network, selected by column name
# (same set the original extracted one Series at a time).
feature_cols = ['sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin',
                'smean', 'dmean', 'sjit', 'djit', 'sinpkt', 'dinpkt',
                'tcprtt', 'synack', 'ackdat', 'ct_srv_src', 'ct_srv_dst',
                'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm',
                'ct_dst_sport_ltm', 'ct_dst_src_ltm']

classe1_test = test.loc[:, 'label'].values  # binary class of the test set

# BUG FIX: the original built the test features as a plain Python LIST of
# 1-D arrays, which stacks to shape (22, n_samples) — transposed from what
# Keras expects. Build a proper (n_samples, 22) matrix instead.
teste = test.loc[:, feature_cols].values

# Reduce `train` to the 22 features + label.
classe = train.loc[:, 'label']  # the class!
train = pd.concat([train.loc[:, feature_cols], classe], axis=1)

X_train2, y_train2 = train.drop('label', axis=1), train['label']

# BUG FIX: the original assigned X_test2 = test — the RAW 45-column frame,
# label and categorical columns included. Use the 22 selected features.
X_test2, y_test2 = teste, classe1_test

# Peek at the underlying numpy array. (The original referenced X_train2
# BEFORE it was defined — a NameError when run top-to-bottom; moved here,
# after the assignment.)
X_train2.values
import keras
from keras.models import Sequential
from keras.layers import Dense

classificador_rede_neural = Sequential()

# # Hidden and output layers
# Rule of thumb for hidden units: (inputs + outputs) / 2.
# We have len(train.columns) - 1 predictor attributes and 1 class.

# BUG FIX: the number of INPUT features is len(train.columns) - 1 — the
# 'label' column is the target, not a feature. The original passed the full
# column count (features + label) as input_dim, which mismatches the
# 22-column feature matrix at fit time.
n_features = len(train.columns) - 1

# (n_features + 1) / 2 == (inputs + outputs) / 2; round() as the original.
camadas_ocultas = round((n_features + 1) / 2)
print(camadas_ocultas)

# First hidden layer (declares the input dimension).
classificador_rede_neural.add(Dense(units=camadas_ocultas, activation='relu',
                                    input_dim=n_features))
# Second hidden layer.
classificador_rede_neural.add(Dense(units=camadas_ocultas, activation='relu'))
# Output layer: the target is binary, so a single sigmoid unit
# (sigmoid because the output is binary: it yields a probability).
classificador_rede_neural.add(Dense(units=1, activation='sigmoid'))

classificador_rede_neural.compile(optimizer='adam',
                                  loss='binary_crossentropy',
                                  metrics=['accuracy'])

# BUG FIX: train on the TRAINING data. The original fit on X_test2/y_test2
# and its own comment flagged it ("é treino: trocar" = "this is training
# data: swap").
classificador_rede_neural.fit(X_train2, y_train2, batch_size=10, epochs=100)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement