Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
# In[2]:
import pandas as pd
# In[3]:
import matplotlib.pyplot as plt
# In[4]:
import seaborn as sns

# # Getting the data
# Read UNSW-NB15_1.csv into a DataFrame called UNSW1.
# In[81]:
# The file is read with header=None, so pandas labels the columns 0..N-1.
# A dtype keyed by the name "srcip" therefore matches no column and has no
# effect; the column has to be addressed by position instead.
# NOTE(review): column 0 is assumed to be srcip -- confirm against the
# dataset's feature description file.
UNSW1 = pd.read_csv(
    '/Radhe1/CEFET/MineraçãoDados_CEFET/Projeto/The UNSW-NB15 data set description/UNSW-NB15_1.csv',
    dtype={0: object},
    header=None,
)

# # Checking the first rows: head()
# In[82]:
UNSW1.head(20)

# # Using info() and describe() on the DataFrame
# In[83]:
UNSW1.info()
# In[84]:
UNSW1.describe()
# # Exploratory data analysis and pre-processing
#
# In[85]:
# sns.pairplot(UNSW1, palette='bwr')  # TODO: use hue
# In[86]:
UNSW11 = pd.read_csv('/home/govinda/Desktop/UNSW-NB15_1_ed.csv')
# In[87]:
UNSW11.describe()
# In[88]:
UNSW11['class']  # the class column
# In[89]:
# sns.pairplot(UNSW11.iloc[:, 1:2], palette='bwr', hue='class')  # TODO: use hue
# In[90]:
UNSW11.info()
# In[91]:
UNSW11.head()
# In[92]:
UNSW11
# In[93]:
# Keep the NaN-free frame so the row counts below come from the data itself
# rather than from numbers copied by hand out of a previous cell's output.
UNSW11_complete = UNSW11.dropna()
UNSW11_complete
# In[94]:
# Number of rows that contain at least one NaN.
print("dados 'Nan'", len(UNSW11) - len(UNSW11_complete))
# # Inspecting the training and testing files shipped with the dataset
TRAIN_CSV = '/home/sseg/Desktop/CEFET/UNSW_NB15_training-set.csv'
TEST_CSV = '/home/sseg/Desktop/CEFET/UNSW_NB15_testing-set.csv'

# In[6]:
train = pd.read_csv(TRAIN_CSV)
# In[7]:
train
# In[8]:
train.dropna()
# Notebook observation: no NaN values were found in the training set.
# In[10]:
train.describe()
# In[13]:
train.info()

# In[14]:
test = pd.read_csv(TEST_CSV)
# In[15]:
test
# In[19]:
test.dropna()
# Notebook observation: no NaN values were found in the testing set.
# In[20]:
test.describe()
# In[21]:
test.info()
# In[23]:
test.head()

# # Handling categorical variables
#
# In[24]:
train.iloc[:, 4]  # state: a categorical variable
# In[25]:
# One-hot view of the categorical column, shown for inspection only; the
# result is not written back to `train`.
state_column = train['state']
pd.get_dummies(state_column)
train.loc[[0], ['label']]
# # Keeping only the columns of interest (features)
#
# A single column list drives both frames, so the training and testing sets
# cannot drift apart the way two hand-maintained lists can.
# Per the original notes, some columns are named differently in the dataset
# description: smean/dmean are "smeansz"/"dmeansz" and sinpkt/dinpkt are
# "Sintpkt"/"Dintpkt".
FEATURES = [
    'sload', 'dload',
    'spkts', 'dpkts',
    'swin', 'dwin',
    'smean', 'dmean',
    'sjit', 'djit',
    'sinpkt', 'dinpkt',
    'tcprtt', 'synack', 'ackdat',
    'ct_srv_src', 'ct_srv_dst',
    'ct_dst_ltm', 'ct_src_ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
]
TARGET = 'label'  # the class column

# In[51]:
# Selecting with a list returns a new DataFrame whose columns follow the
# list order: the features first, the class last.
train = train[FEATURES + [TARGET]]
# In[52]:
test = test[FEATURES + [TARGET]]
# In[53]:
train.describe()
# In[54]:
test.describe()
# # Applying logistic regression
# In[57]:
from sklearn.linear_model import LogisticRegression
# In[58]:
X_train, y_train = train.drop('label', axis=1), train['label']
# In[59]:
X_test, y_test = test.drop('label', axis=1), test['label']
# In[60]:
import pickle  # persist the fitted model so it is not retrained on every run

MODEL_PATH = 'logmodel_persistente.p'

# In[70]:
# Load the cached model when it exists; otherwise fit a new one and cache it.
# The value returned by pickle.load is the model -- the file handle is only
# the transport and is closed by the `with` block.
try:
    # SECURITY: unpickling can execute arbitrary code. Only load a file that
    # this script itself wrote.
    with open(MODEL_PATH, 'rb') as logmodel_persistente:
        logmodel = pickle.load(logmodel_persistente)
except FileNotFoundError:
    logmodel = LogisticRegression()
    logmodel.fit(X_train, y_train)
    # Write the cache only after a successful fit, so an existing model file
    # is never truncated by a failed run.
    with open(MODEL_PATH, 'wb') as logmodel_persistente:
        pickle.dump(logmodel, logmodel_persistente)

# In[74]:
predictions = logmodel.predict(X_test)
# # Evaluating the model
# In[173]:
from sklearn.metrics import classification_report
# In[174]:
# Build the report for the test-set predictions, then print it.
report = classification_report(y_test, predictions)
print(report)
#
Advertisement
Add Comment
Please, Sign In to add comment