Guest User

Untitled

a guest
Jun 21st, 2018
315
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.29 KB | None | 0 0
  1.  
  2. import numpy as np
  3.  
  4.  
  5. # In[2]:
  6.  
  7.  
  8. import pandas as pd
  9.  
  10.  
  11. # In[3]:
  12.  
  13.  
  14. import matplotlib.pyplot as plt
  15.  
  16.  
  17. # In[4]:
  18.  
  19.  
  20. import seaborn as sns
  21.  
  22.  
  23. # # Obtendo os dados
  24.  
  25. # Lendo o arquivo UNSW-NB15_1.csv e gravando-o em um DataFrame chamado UNSW1
  26.  
  27. # In[81]:
  28.  
  29.  
  30. UNSW1 = pd.read_csv('/Radhe1/CEFET/MineraçãoDados_CEFET/Projeto/The UNSW-NB15 data set description/UNSW-NB15_1.csv',dtype={"srcip":object ,},header=None)
  31.  
  32.  
  33. # # Verificando o cabeçalho: head()
  34.  
  35. # In[82]:
  36.  
  37.  
  38. UNSW1.head(20)
  39.  
  40.  
  41. # # Usando info() e describe() no dataframe:
  42.  
  43. # In[83]:
  44.  
  45.  
  46. UNSW1.info()
  47.  
  48.  
  49. # In[84]:
  50.  
  51.  
  52. UNSW1.describe()
  53.  
  54.  
  55. # # Análise de dados exploratória e Pre-processamento
  56. #
  57.  
  58. # In[85]:
  59.  
  60.  
  61. #sns.pairplot(UNSW1,palette='bwr') #usar hue!
  62.  
  63.  
  64. # In[86]:
  65.  
  66.  
  67. UNSW11 = pd.read_csv('/home/govinda/Desktop/UNSW-NB15_1_ed.csv')
  68.  
  69.  
  70. # In[87]:
  71.  
  72.  
  73. UNSW11.describe()
  74.  
  75.  
  76. # In[88]:
  77.  
  78.  
  79. UNSW11['class']#a classe!
  80.  
  81.  
  82. # In[89]:
  83.  
  84.  
  85. #sns.pairplot(UNSW11.iloc[:,1:2],palette='bwr',hue = 'class') #usar hue!
  86.  
  87.  
  88. # In[90]:
  89.  
  90.  
  91. UNSW11.info()
  92.  
  93.  
  94. # In[91]:
  95.  
  96.  
  97. UNSW11.head()
  98.  
  99.  
  100. # In[92]:
  101.  
  102.  
  103. UNSW11
  104.  
  105.  
  106. # In[93]:
  107.  
  108.  
  109. UNSW11.dropna()
  110.  
  111.  
  112. # In[94]:
  113.  
  114.  
  115. print("dados 'Nan'",700001 - 22215)
  116.  
  117.  
  118. # # Verificando os arquivos de treino e teste fornecidos junto ao dataset
  119.  
  120. # In[6]:
  121.  
  122.  
  123. train = pd.read_csv('/home/sseg/Desktop/CEFET/UNSW_NB15_training-set.csv')
  124.  
  125.  
  126. # In[7]:
  127.  
  128.  
  129. train
  130.  
  131.  
  132. # In[8]:
  133.  
  134.  
  135. train.dropna()
  136.  
  137.  
  138. # Não há dados "Nan" no conjunto de treino!
  139.  
  140. # In[10]:
  141.  
  142.  
  143. train.describe()
  144.  
  145.  
  146. # In[13]:
  147.  
  148.  
  149. train.info()
  150.  
  151.  
  152. # In[14]:
  153.  
  154.  
  155. test =pd.read_csv('/home/sseg/Desktop/CEFET/UNSW_NB15_testing-set.csv')
  156.  
  157.  
  158. # In[15]:
  159.  
  160.  
  161. test
  162.  
  163.  
  164. # In[19]:
  165.  
  166.  
  167. test.dropna()
  168.  
  169.  
  170. # Podemos observar que não há dados "Nan" no conjunto de teste!
  171.  
  172. # In[20]:
  173.  
  174.  
  175. test.describe()
  176.  
  177.  
  178. # In[21]:
  179.  
  180.  
  181. test.info()
  182.  
  183.  
  184. # In[23]:
  185.  
  186.  
  187. test.head()
  188.  
  189.  
  190. # # Tratando variáveis categóricas
  191. #
  192.  
  193. # In[24]:
  194.  
  195.  
  196. train.iloc[:,4] #state : é uma variável categórica
  197.  
  198.  
  199. # In[25]:
  200.  
  201.  
  202. pd.get_dummies(train['state'])
  203.  
  204.  
  205. train.loc[[0],['label']]
  206.  
  207.  
  208.  
  209. sload =train.loc[:,'sload'] #Sload
  210.  
  211.  
  212. # In[28]:
  213.  
  214.  
  215. dload =train.loc[:,'dload'] #dload
  216.  
  217.  
  218. # In[29]:
  219.  
  220.  
  221. spkts = train.loc[:,'spkts'] #Spkts
  222.  
  223.  
  224. # In[30]:
  225.  
  226.  
  227. dpkts = train.loc[:,'dpkts'] #Dpkts
  228.  
  229.  
  230. # In[31]:
  231.  
  232.  
  233. swin = train.loc[:,'swin'] #swin
  234.  
  235.  
  236. # In[32]:
  237.  
  238.  
  239. dwin = train.loc[:,'dwin'] #dwin
  240.  
  241.  
  242. # In[33]:
  243.  
  244.  
  245. smean = train.loc[:,'smean'] #smeansz
  246.  
  247.  
  248. # In[34]:
  249.  
  250.  
  251. dmean = train.loc[:,'dmean'] #dmeansz
  252.  
  253.  
  254. # In[35]:
  255.  
  256.  
  257. sjit = train.loc[:,'sjit'] #Sjit
  258.  
  259.  
  260. # In[36]:
  261.  
  262.  
  263. djit = train.loc[:,'djit'] #Djit
  264.  
  265.  
  266. # In[37]:
  267.  
  268.  
  269. sinpkt = train.loc[:,'sinpkt'] #Sintpkt
  270.  
  271.  
  272. # In[38]:
  273.  
  274.  
  275. dinpkt = train.loc[:,'dinpkt'] #Dintpkt
  276.  
  277.  
  278. # In[39]:
  279.  
  280.  
  281. tcprtt = train.loc[:,'tcprtt'] #tcprtt
  282.  
  283.  
  284. # In[40]:
  285.  
  286.  
  287. synack = train.loc[:,'synack'] #synack
  288.  
  289.  
  290. # In[41]:
  291.  
  292.  
  293. ackdat = train.loc[:,'ackdat'] #ackdat
  294.  
  295.  
  296. # In[42]:
  297.  
  298.  
  299. ct_srv_src = train.loc[:,'ct_srv_src'] #ct_srv_src
  300.  
  301.  
  302. # In[43]:
  303.  
  304.  
  305. ct_srv_dst = train.loc[:,'ct_srv_dst'] #ct_srv_dst
  306.  
  307.  
  308. # In[44]:
  309.  
  310.  
  311. ct_dst_ltm = train.loc[:,'ct_dst_ltm'] #ct_dst_ltm
  312.  
  313.  
  314. # In[45]:
  315.  
  316.  
  317. ct_src_ltm = train.loc[:,'ct_src_ltm'] #ct_src_ltm
  318.  
  319.  
  320. # In[46]:
  321.  
  322.  
  323. ct_src_dport_ltm = train.loc[:,'ct_src_dport_ltm'] #ct_src_dport_ltm
  324.  
  325.  
  326. # In[47]:
  327.  
  328.  
  329. ct_dst_sport_ltm = train.loc[:,'ct_dst_sport_ltm'] #ct_dst_sport_ltm
  330.  
  331.  
  332. # In[48]:
  333.  
  334.  
  335. ct_dst_src_ltm = train.loc[:,'ct_dst_src_ltm'] #ct_dst_src_ltm
  336.  
  337.  
  338. # In[49]:
  339.  
  340.  
  341. classe = train.loc[:,'label']#A classe!
  342.  
  343.  
  344. # # Teste:
  345.  
  346. # In[50]:
  347.  
  348.  
  349. sload1 =test.loc[:,'sload'] #Sload
  350.  
  351. dload1 =test.loc[:,'dload'] #dload
  352.  
  353. spkts1 = test.loc[:,'spkts'] #Spkts
  354.  
  355. dpkts1 = test.loc[:,'dpkts'] #Dpkts
  356.  
  357. swin1 = test.loc[:,'swin'] #swin
  358.  
  359. dwin1 = test.loc[:,'dwin'] #dwin
  360.  
  361. smean1 = test.loc[:,'smean'] #smeansz
  362.  
  363. dmean1 = test.loc[:,'dmean'] #dmeansz
  364.  
  365. sjit1 = test.loc[:,'sjit'] #Sjit
  366.  
  367. djit1 = test.loc[:,'djit'] #Djit
  368.  
  369. sinpkt1 = test.loc[:,'sinpkt'] #Sintpkt
  370.  
  371. dinpkt1 = test.loc[:,'dinpkt'] #Dintpkt
  372.  
  373.  
  374. tcprtt1 = test.loc[:,'tcprtt'] #tcprtt
  375.  
  376. synack1 = test.loc[:,'synack'] #synack
  377.  
  378. ackdat1 = test.loc[:,'ackdat'] #ackdat
  379.  
  380. ct_srv_src1 = test.loc[:,'ct_srv_src'] #ct_srv_src
  381.  
  382.  
  383. ct_srv_dst1 = test.loc[:,'ct_srv_dst'] #ct_srv_dst
  384.  
  385. ct_dst_ltm1 = test.loc[:,'ct_dst_ltm'] #ct_dst_ltm
  386.  
  387. ct_src_ltm1 = test.loc[:,'ct_src_ltm'] #ct_src_ltm
  388.  
  389. ct_src_dport_ltm1 = test.loc[:,'ct_src_dport_ltm'] #ct_src_dport_ltm
  390.  
  391.  
  392. ct_dst_sport_ltm1 = test.loc[:,'ct_dst_sport_ltm'] #ct_dst_sport_ltm
  393.  
  394.  
  395. ct_dst_src_ltm1 = test.loc[:,'ct_dst_src_ltm'] #ct_dst_src_ltm
  396.  
  397.  
  398. classe1 = test.loc[:,'label']#A classe!
  399.  
  400.  
  401. # # #Deixando o dataframe apenas com as colunas que interessam (features):
  402. #
  403.  
  404. # In[51]:
  405.  
  406.  
  407. train = pd.concat([sload,dload,spkts,dpkts,swin,dwin,smean,dmean,sjit,djit,sinpkt,dinpkt,tcprtt,synack,ackdat,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,classe],axis=1)
  408.  
  409.  
  410. # In[52]:
  411.  
  412.  
  413. test = pd.concat([sload1,dload1,spkts1,dpkts1,swin1,dwin1,smean1,dmean1,sjit1,djit1,sinpkt1,dinpkt1,tcprtt1,synack1,ackdat1,ct_srv_src1,ct_srv_dst1,ct_dst_ltm1,ct_src_ltm1,ct_src_dport_ltm1,ct_dst_sport_ltm1,ct_dst_src_ltm1,classe1],axis=1)
  414.  
  415.  
  416. # In[53]:
  417.  
  418.  
  419. train.describe()
  420.  
  421.  
  422. # In[54]:
  423.  
  424.  
  425. test.describe()
  426.  
  427.  
  428. # # Aplicando a regressão logística
  429.  
  430. # In[57]:
  431.  
  432.  
  433. from sklearn.linear_model import LogisticRegression
  434.  
  435.  
  436. # In[58]:
  437.  
  438.  
  439. X_train,y_train =train.drop('label',axis=1),train['label']
  440.  
  441.  
  442. # In[59]:
  443.  
  444.  
  445. X_test,y_test =test.drop('label',axis=1),test['label']
  446.  
  447.  
  448. # In[144]:
  449.  
  450.  
  451. #logmodel = LogisticRegression()
  452.  
  453.  
  454. # In[60]:
  455.  
  456.  
  457. import pickle  #salvar o modelo para não treinar de novo
  458.  
  459.  
  460. # In[70]:
  461.  
  462.  
  463. logmodel_persistente = open('logmodel_persistente.p',  'rb')
  464.  
  465.  
  466. # In[71]:
  467.  
  468.  
  469. pickle.load(logmodel_persistente)
  470.  
  471.  
  472. # In[72]:
  473.  
  474.  
  475. logmodel = logmodel_persistente
  476.  
  477.  
  478. # In[73]:
  479.  
  480.  
  481. #logmodel.fit(X_train,y_train) #ja salvei com o pickle
  482.  
  483.  
  484. # In[74]:
  485.  
  486.  
  487. predictions = logmodel.predict(X_test)
  488.  
  489.  
  490. # In[166]:
  491.  
  492.  
  493. import pickle  #salvar o modelo para não treinar de novo
  494.  
  495.  
  496. # In[167]:
  497.  
  498.  
  499. logmodel_persistente = open('logmodel_persistente.p',  'wb')
  500.  
  501.  
  502. # In[168]:
  503.  
  504.  
  505. pickle.dump(logmodel, logmodel_persistente)
  506.  
  507.  
  508. # In[169]:
  509.  
  510.  
  511. logmodel_persistente.close()
  512.  
  513.  
  514. # # Avaliando o modelo
  515.  
  516. # In[173]:
  517.  
  518.  
  519. from sklearn.metrics import classification_report
  520.  
  521.  
  522. # In[174]:
  523.  
  524.  
  525. print(classification_report(y_test,predictions))
  526.  
  527.  
  528. #
Advertisement
Add Comment
Please, Sign In to add comment