Advertisement
Guest User

Untitled

a guest
Jun 16th, 2018
1,026
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 10.41 KB | None | 0 0
  1.  
  2. # coding: utf-8
  3.  
  4. # In[2]:
  5.  
  6.  
  7. import numpy as np
  8.  
  9.  
  10. # In[3]:
  11.  
  12.  
  13. import pandas as pd
  14.  
  15.  
  16. # In[3]:
  17.  
  18.  
  19. import matplotlib.pyplot as plt
  20.  
  21.  
  22. # In[4]:
  23.  
  24.  
  25. import seaborn as sns
  26.  
  27.  
  28. # In[5]:
  29.  
  30.  
  31. get_ipython().run_line_magic('matplotlib', 'inline')
  32.  
  33.  
  34. # # PreProcessing
  35.  
  36. # In[7]:
  37.  
  38.  
  39. UNSW = pd.read_csv('/Radhe/Projeto/The UNSW-NB15 data set description/UNSW_NB15_training-set.csv')
  40.  
  41.  
  42. # In[7]:
  43.  
  44.  
  45. UNSW.head()
  46.  
  47.  
  48. # In[8]:
  49.  
  50.  
  51. UNSW.head(10)
  52.  
  53.  
  54. # In[9]:
  55.  
  56.  
  57. UNSW.describe()
  58.  
  59.  
  60. # In[10]:
  61.  
  62.  
  63. UNSW.info()
  64.  
  65.  
  66. # In[11]:
  67.  
  68.  
  69. UNSW.loc[[0],['attack_cat']]
  70.  
  71.  
  72. # In[8]:
  73.  
  74.  
  75. UNSW.iloc[:,1:3]
  76.  
  77.  
  78. # In[13]:
  79.  
  80.  
  81. #sns.pairplot(UNSW.iloc[1:,2:2],palette='bwr',hue = 'class') #usar hue!
  82.  
  83.  
  84. # In[14]:
  85.  
  86.  
  87. UNSW.iloc[:,1:3]
  88.  
  89.  
  90. # In[15]:
  91.  
  92.  
  93. UNSW.dropna()
  94.  
  95.  
  96. # # Calcular R² para verificar o modelo
  97.  
# R-squared is a statistical measure of how close the data are to the fitted regression line. It is also known as the coefficient of determination, or, for multiple regression, the coefficient of multiple determination. A value of 100% indicates that the model explains all the variability of the response data around its mean.
  99.  
  100. # Regressão Logística
  101.  
  102. # In[9]:
  103.  
  104.  
  105. train = pd.read_csv('/Radhe/Projeto/The UNSW-NB15 data set description/UNSW_NB15_training-set.csv')
  106.  
  107.  
  108. # In[17]:
  109.  
  110.  
  111. train.isnull()
  112.  
  113.  
  114. # # Variaveis:
  115. #
  116. # Sintpkt   Float   Source interpacket arrival time (mSec) : coluna 16
  117. # Dintpkt   Float   Destination interpacket arrival time (mSec) coluna 17
  118. # tcprtt    Float   TCP connection setup round-trip time, the sum of synack and ackdat. coluna 24
  119. # synack    Float   TCP connection setup time, the time between the SYN and the SYN_ACK packets. coluna 25
  120. # ackdat    Float   TCP connection setup time, the time between the SYN_ACK and the ACK packets. coluna 26
  121. #
  122.  
  123. # Classe: Label binary  0 for normal and 1 for attack records
  124. # é a coluna 44: de 0 a 44
  125. #
  126.  
  127. # In[10]:
  128.  
  129.  
  130. train.loc[[0],['label']]
  131.  
  132.  
  133. # In[19]:
  134.  
  135.  
  136. #train.iloc[0,46]
  137.  
  138.  
  139. # In[20]:
  140.  
  141.  
  142. #sns.countplot(x='ackdat',data=train,hue='label',palette='RdBu_r')
  143.  
  144.  
  145. # In[21]:
  146.  
  147.  
  148. train.describe()
  149.  
  150.  
  151. # In[11]:
  152.  
  153.  
  154. train.iloc[:,44] #classe
  155.  
  156.  
  157. # In[23]:
  158.  
  159.  
  160. train.iloc[:,44].describe()
  161.  
  162.  
  163. # In[24]:
  164.  
  165.  
  166. #train.iloc[:,16] #Sintpkt
  167.  
  168.  
  169. # In[25]:
  170.  
  171.  
  172. #train.iloc[:,17] #Dintpkt
  173.  
  174.  
  175. # In[26]:
  176.  
  177.  
  178. #train.iloc[:,24] #tcprtt
  179.  
  180.  
  181. # In[27]:
  182.  
  183.  
  184. #train.iloc[:,25] #synack
  185.  
  186.  
  187. # In[28]:
  188.  
  189.  
  190. #train.iloc[:,26] #ackdat
  191.  
  192.  
  193. # In[29]:
  194.  
  195.  
  196. #sns.countplot(x='ackdat',data=train,hue='label',palette='RdBu_r')
  197.  
  198.  
  199. # # Tratando variáveis categóricas
  200.  
  201. # In[30]:
  202.  
  203.  
  204. #train.iloc[:,4] #state
  205.  
  206.  
  207. # In[12]:
  208.  
  209.  
  210. pd.get_dummies(train['state'])
  211.  
  212.  
  213. # # #Deixando o dataframe apenas com as colunas que interessam:
  214.  
  215. # In[13]:
  216.  
  217.  
  218. dintpkt = train.iloc[:,17]
  219.  
  220.  
  221. # In[14]:
  222.  
  223.  
  224. sinpkt = train.iloc[:,16]
  225.  
  226.  
  227. # In[15]:
  228.  
  229.  
  230. tcprtt = train.iloc[:,24]
  231.  
  232.  
  233. # In[16]:
  234.  
  235.  
  236. synack = train.iloc[:,25]
  237.  
  238.  
  239. # In[17]:
  240.  
  241.  
  242. ackdat = train.iloc[:,26]
  243.  
  244.  
  245. # In[18]:
  246.  
  247.  
  248. classe=train.iloc[:,44] #classe
  249.  
  250.  
  251. # In[19]:
  252.  
  253.  
  254. train = pd.concat([dintpkt,sinpkt,tcprtt,synack,ackdat,classe],axis=1)
  255.  
  256.  
  257. # In[39]:
  258.  
  259.  
  260. train.describe()
  261.  
  262.  
  263. # # Repetindo o tratamento com os dados de teste!
  264.  
  265. # In[20]:
  266.  
  267.  
  268. teste = pd.read_csv('/Radhe/Projeto/The UNSW-NB15 data set description/UNSW_NB15_testing-set.csv')
  269.  
  270.  
  271. # In[41]:
  272.  
  273.  
  274. teste.describe()
  275.  
  276.  
  277. # In[21]:
  278.  
  279.  
  280. sinpkt = teste.iloc[:,16]
  281.  
  282.  
  283. # In[22]:
  284.  
  285.  
  286. dintpkt = teste.iloc[:,17]
  287.  
  288.  
  289. # In[23]:
  290.  
  291.  
  292. tcprtt = teste.iloc[:,24]
  293.  
  294.  
  295. # In[24]:
  296.  
  297.  
  298. ackdat = teste.iloc[:,26]
  299.  
  300.  
  301. # In[25]:
  302.  
  303.  
  304. classe=teste.iloc[:,44] #classe
  305.  
  306.  
  307. # In[26]:
  308.  
  309.  
  310. teste =  pd.concat([dintpkt,sinpkt,tcprtt,synack,ackdat,classe],axis=1)
  311.  
  312.  
  313. # In[48]:
  314.  
  315.  
  316. #teste
  317.  
  318.  
  319. # In[29]:
  320.  
  321.  
  322. from sklearn.linear_model import LogisticRegression
  323.  
  324. from sklearn.model_selection import train_test_split
  325. # In[34]:
  326.  
  327.  
  328. X_train,y_train =train.drop('label',axis=1),train['label']
  329.  
  330.  
  331. # In[35]:
  332.  
  333.  
  334. X_teste,y_teste =teste.drop('label',axis=1),teste['label']
  335.  
  336.  
  337. # In[36]:
  338.  
  339.  
  340. logmodel = LogisticRegression()
  341.  
  342.  
  343. # In[38]:
  344.  
  345.  
  346. logmodel.fit(X_train,y_train)
  347.  
  348.  
  349. # In[39]:
  350.  
  351.  
  352. predictions = logmodel.predict(X_teste.dropna())
  353.  
  354.  
  355. # In[66]:
  356.  
  357.  
  358. #X_teste.dropna()
  359.  
  360.  
  361. # # Avaliando o modelo
  362.  
  363. # In[40]:
  364.  
  365.  
  366. from sklearn.metrics import classification_report
  367.  
  368.  
  369. # In[53]:
  370.  
  371.  
  372. print(classification_report(y_teste[0:82332],predictions[0:82332])) # #ver o erro -> Found input variables with inconsistent numbers of samples: [175341, 82332]
  373.  
  374.  
  375. # In[44]:
  376.  
  377.  
  378. from sklearn.metrics import confusion_matrix
  379.  
  380.  
  381. # In[55]:
  382.  
  383.  
  384. print(confusion_matrix(y_teste[0:82332],predictions)) #ver o erro -> Found input variables with inconsistent numbers of samples: [175341, 82332]
  385.  
  386.  
  387. # In[91]:
  388.  
  389.  
  390. Acertos = confusion_matrix(y_teste[0:82332],predictions)[0][0] + confusion_matrix(y_teste[0:82332],predictions)[1][1]
  391.  
  392.  
  393. # In[92]:
  394.  
  395.  
  396. print(Acertos)
  397.  
  398.  
  399. # In[93]:
  400.  
  401.  
  402. Erros = confusion_matrix(y_teste[0:82332],predictions)[1][0] + confusion_matrix(y_teste[0:82332],predictions)[0][1]
  403.  
  404.  
  405. # In[94]:
  406.  
  407.  
  408. print(Erros)
  409.  
  410.  
  411. # In[95]:
  412.  
  413.  
  414. Total_registros = Erros + Acertos
  415.  
  416.  
  417. # In[64]:
  418.  
  419.  
  420. print(Total_registros)
  421.  
  422.  
  423. # In[96]:
  424.  
  425.  
  426. Percentual_acertos = Acertos*100/Total_registros
  427.  
  428.  
  429. # In[68]:
  430.  
  431.  
  432. print(Percentual_acertos)
  433.  
  434.  
  435. # In[97]:
  436.  
  437.  
  438. Normal = confusion_matrix(y_teste[0:82332],predictions)[0][0] + confusion_matrix(y_teste[0:82332],predictions)[0][1]
  439.  
  440.  
  441. # In[98]:
  442.  
  443.  
  444. Ataques = confusion_matrix(y_teste[0:82332],predictions)[1][0] +confusion_matrix(y_teste[0:82332],predictions)[1][1]
  445.  
  446.  
  447. # In[99]:
  448.  
  449.  
  450. Percentual_Acerto_Ataques = Ataques*100/(Ataques + Normal)
  451.  
  452.  
  453. # In[100]:
  454.  
  455.  
  456. print(Percentual_Acerto_Ataques)
  457.  
  458.  
  459. # In[101]:
  460.  
  461.  
  462. Percetual_Acerto_Normal = Normal*100/(Ataques + Normal)
  463.  
  464.  
  465. # In[102]:
  466.  
  467.  
  468. print(Percetual_Acerto_Normal)
  469.  
  470.  
  471. # # Usando Keras Redes Neurais
  472.  
  473. # In[110]:
  474.  
  475.  
  476. test =pd.read_csv('/Radhe1/CEFET/MineraçãoDados_CEFET/Projeto/The UNSW-NB15 data set description/UNSW_NB15_testing-set.csv')
  477.  
  478.  
  479. # In[7]:
  480.  
  481.  
  482. train = pd.read_csv('/Radhe1/CEFET/MineraçãoDados_CEFET/Projeto/The UNSW-NB15 data set description/UNSW_NB15_training-set.csv')
  483.  
  484.  
  485. # In[111]:
  486.  
  487.  
  488. sload1 =test.loc[:,'sload'].values #Sload
  489.  
  490. dload1 =test.loc[:,'dload'].values #dload
  491.  
  492. spkts1 = test.loc[:,'spkts'].values #Spkts
  493.  
  494. dpkts1 = test.loc[:,'dpkts'].values #Dpkts
  495.  
  496. swin1 = test.loc[:,'swin'].values #swin
  497.  
  498. dwin1 = test.loc[:,'dwin'].values #dwin
  499.  
  500. smean1 = test.loc[:,'smean'].values #smeansz
  501.  
  502. dmean1 = test.loc[:,'dmean'].values #dmeansz
  503.  
  504. sjit1 = test.loc[:,'sjit'].values #Sjit
  505.  
  506. djit1 = test.loc[:,'djit'].values #Djit
  507.  
  508. sinpkt1 = test.loc[:,'sinpkt'].values #Sintpkt
  509.  
  510. dinpkt1 = test.loc[:,'dinpkt'].values #Dintpkt
  511.  
  512.  
  513. tcprtt1 = test.loc[:,'tcprtt'].values #tcprtt
  514.  
  515. synack1 = test.loc[:,'synack'].values #synack
  516.  
  517. ackdat1 = test.loc[:,'ackdat'].values #ackdat
  518.  
  519. ct_srv_src1 = test.loc[:,'ct_srv_src'].values #ct_srv_src
  520.  
  521.  
  522. ct_srv_dst1 = test.loc[:,'ct_srv_dst'].values #ct_srv_dst
  523.  
  524. ct_dst_ltm1 = test.loc[:,'ct_dst_ltm'].values #ct_dst_ltm
  525.  
  526. ct_src_ltm1 = test.loc[:,'ct_src_ltm'].values #ct_src_ltm
  527.  
  528. ct_src_dport_ltm1 = test.loc[:,'ct_src_dport_ltm'].values #ct_src_dport_ltm
  529.  
  530.  
  531. ct_dst_sport_ltm1 = test.loc[:,'ct_dst_sport_ltm'].values #ct_dst_sport_ltm
  532.  
  533.  
  534. ct_dst_src_ltm1 = test.loc[:,'ct_dst_src_ltm'].values #ct_dst_src_ltm
  535.  
  536.  
  537. classe1_test = test.loc[:,'label'].values#A classe!
  538.  
  539.  
  540. # In[102]:
  541.  
  542.  
  543. #test = pd.concat([sload1,dload1,spkts1,dpkts1,swin1,dwin1,smean1,dmean1,sjit1,djit1,sinpkt1,dinpkt1,tcprtt1,synack1,ackdat1,ct_srv_src1,ct_srv_dst1,ct_dst_ltm1,ct_src_ltm1,ct_src_dport_ltm1,ct_dst_sport_ltm1,ct_dst_src_ltm1,classe1],axis=1)
  544.  
  545.  
  546. # In[112]:
  547.  
  548.  
  549. teste =[sload1,dload1,spkts1,dpkts1,swin1,dwin1,smean1,dmean1,sjit1,djit1,sinpkt1,dinpkt1,tcprtt1,synack1,ackdat1,ct_srv_src1,ct_srv_dst1,ct_dst_ltm1,ct_src_ltm1,ct_src_dport_ltm1,ct_dst_sport_ltm1,ct_dst_src_ltm1]
  550.  
  551.  
  552. # In[113]:
  553.  
  554.  
  555. #test
  556.  
  557.  
  558. # In[13]:
  559.  
  560.  
  561. sload =train.loc[:,'sload'] #Sload
  562.  
  563. dload =train.loc[:,'dload'] #dload
  564.  
  565. spkts = train.loc[:,'spkts'] #Spkts
  566.  
  567. dpkts = train.loc[:,'dpkts'] #Dpkts
  568.  
  569. swin= train.loc[:,'swin'] #swin
  570.  
  571. dwin = train.loc[:,'dwin'] #dwin
  572.  
  573. smean = train.loc[:,'smean'] #smeansz
  574.  
  575. dmean = train.loc[:,'dmean'] #dmeansz
  576.  
  577. sjit = train.loc[:,'sjit'] #Sjit
  578.  
  579. djit = train.loc[:,'djit'] #Djit
  580.  
  581. sinpkt = train.loc[:,'sinpkt'] #Sintpkt
  582.  
  583. dinpkt = train.loc[:,'dinpkt'] #Dintpkt
  584.  
  585.  
  586. tcprtt = train.loc[:,'tcprtt'] #tcprtt
  587.  
  588. synack = train.loc[:,'synack'] #synack
  589.  
  590. ackdat = train.loc[:,'ackdat'] #ackdat
  591.  
  592. ct_srv_src = train.loc[:,'ct_srv_src'] #ct_srv_src
  593.  
  594.  
  595. ct_srv_dst = train.loc[:,'ct_srv_dst'] #ct_srv_dst
  596.  
  597. ct_dst_ltm = train.loc[:,'ct_dst_ltm'] #ct_dst_ltm
  598.  
  599. ct_src_ltm = train.loc[:,'ct_src_ltm'] #ct_src_ltm
  600.  
  601. ct_src_dport_ltm = train.loc[:,'ct_src_dport_ltm'] #ct_src_dport_ltm
  602.  
  603.  
  604. ct_dst_sport_ltm = train.loc[:,'ct_dst_sport_ltm'] #ct_dst_sport_ltm
  605.  
  606.  
  607. ct_dst_src_ltm = train.loc[:,'ct_dst_src_ltm'] #ct_dst_src_ltm
  608.  
  609.  
  610. classe = train.loc[:,'label']#A classe!
  611.  
  612.  
  613. # In[16]:
  614.  
  615.  
  616. train = pd.concat([sload,dload,spkts,dpkts,swin,dwin,smean,dmean,sjit,djit,sinpkt,dinpkt,tcprtt,synack,ackdat,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,classe],axis=1)
  617.  
  618.  
  619. # In[127]:
  620.  
  621.  
  622. X_train2.values
  623.  
  624.  
  625. # In[130]:
  626.  
  627.  
  628. X_train2,y_train2 =train.drop('label',axis=1),train['label']
  629.  
  630. X_test2,y_test2 =test,classe1_test
  631.  
  632.  
  633. # In[18]:
  634.  
  635.  
  636. import keras
  637.  
  638.  
  639. # In[21]:
  640.  
  641.  
  642. from keras.models import Sequential
  643.  
  644.  
  645. # In[23]:
  646.  
  647.  
  648. from keras.layers import Dense
  649.  
  650.  
  651. # In[115]:
  652.  
  653.  
  654. classificador_rede_neural = Sequential()
  655.  
  656.  
  657. # In[69]:
  658.  
  659.  
  660.  
  661.  
  662.  
  663. # # Camadas Ocultas e de Saída
  664.  
  665. # camadas ocultas = (entradas + saídas)/2 #estimando o numero de neurônios em camada oculta
  666. #
  667. # temos:len(train.columns) - 1   atributos previsores
  668. #
  669. # 1 classe
  670.  
  671. # In[34]:
  672.  
  673.  
  674. #len(train.columns)
  675.  
  676.  
  677. # In[43]:
  678.  
  679.  
  680. camadas_ocultas = round(len(train.columns)/2)
  681.  
  682.  
  683. # In[44]:
  684.  
  685.  
  686. print(camadas_ocultas)
  687.  
  688.  
  689. # In[51]:
  690.  
  691.  
  692. classificador_rede_neural.add(Dense(units=camadas_ocultas, activation='relu',input_dim =len(train.columns) ))#primeira camada
  693.  
  694.  
  695. # In[53]:
  696.  
  697.  
  698. classificador_rede_neural.add(Dense(units=camadas_ocultas, activation='relu' ))#segunda camada
  699.  
  700.  
  701. # In[54]:
  702.  
  703.  
  704. classificador_rede_neural.add(Dense(units=1, activation='sigmoid' ))#camada de saída. a saída é binária, logo units=1
  705.  
  706.  
  707. # In[55]:
  708.  
  709.  
  710. #sigmoid pq a saida é binaria: gera uma probabilidade
  711.  
  712.  
  713. # In[56]:
  714.  
  715.  
  716. classificador_rede_neural.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
  717.  
  718.  
  719. # In[122]:
  720.  
  721.  
  722. classificador_rede_neural.fit(X_test2,y_test2,batch_size=10,epochs =100)#é treino: trocar
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement