#%%

#%%

################################# ANECHOIC CHAMBER ############################################

import pandas as pd  # Pandas dataframe
from scapy.layers.dot11 import Dot11ProbeReq, Dot11ProbeResp, Dot11Beacon, Dot11AssoResp, RadioTap, Dot11Elt
from scapy.packet import ls
from scapy.utils import rdpcap
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import numpy as np
from raw_data.oui import mac_lookup
import os
from datetime import datetime
from sklearn.cluster import DBSCAN

#%%

MAX_SN = 4096  # Max value for the 802.11 sequence number

def extractSN(sc):
    # "Normalize" to a four-digit hexadecimal number, then drop the last hex
    # digit (the 4-bit fragment number): the remaining 12 bits are the
    # 802.11 sequence number.
    hexSC = '0' * (4 - len(hex(sc)[2:])) + hex(sc)[2:]
    sn = int(hexSC[:-1], 16)
    return sn
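
# The 16-bit Sequence Control field packs a 4-bit fragment number (low bits)
# and a 12-bit sequence number (high bits); dropping the last hex digit is
# equivalent to a 4-bit right shift. A quick sanity check:
assert extractSN(0x00A5) == 0x00A5 >> 4  # seq 10, fragment 5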


'''
addr1   Destination MAC address
addr2   Source MAC address of the sender
addr3   MAC address of the Access Point
'''

#%%

def pcapng_to_dataframe(pcap):
    packet = []
    ap = []
    mac_in_chiaro = []
    access_point = []
    time_list = [p.time for p in pcap]
    time_list = np.array(time_list)
    time_threshold = time_list.min() + (15 * 60)
    for pkt in pcap:
        ##### skip the first 15 minutes of the capture ############
        if pkt.time > time_threshold:
            # if pkt.haslayer(Dot11Beacon) or pkt.haslayer(Dot11ProbeResp) or pkt.haslayer(Dot11AssoResp):
            if pkt.haslayer(Dot11Beacon):
                if hasattr(pkt, 'addr2'):
                    ap.append(pkt.addr2)
                    # print("source address :", pkt.addr2)
                if hasattr(pkt, 'addr1') and pkt.addr1 != 'ff:ff:ff:ff:ff:ff':
                    mac_in_chiaro.append(pkt.addr1)
                    # print("mac in chiaro: ", pkt.addr1)
                if hasattr(pkt, 'addr3'):
                    access_point.append(pkt.addr3)
                    # print("Access Point address :", pkt.addr3)

            if pkt.haslayer(Dot11ProbeReq):
                mac = pkt.addr2
                seq = extractSN(pkt.SC)
                power = pkt.dBm_AntSignal
                # print("pkt", mac)
                # Walk the chain of Dot11Elt layers: one row per information element
                while pkt:
                    if all(k in pkt.fields for k in ('ID', 'len', 'info')):
                        packet.append([mac, seq, pkt.fields['ID'], pkt.fields['len'], power])
                    pkt = pkt.payload

    pkts_df = pd.DataFrame(packet)
    pkts_df = pkts_df.drop_duplicates()
    pkts_df = pkts_df.sort_values(by=[0, 1])
    pkts_df.columns = ['mac', 'seq', 'id', 'len', 'power']

    ap = set(ap)

    # print(len(pkts_df.mac.unique()))

    # Discard frames whose source MAC belongs to a known access point
    pkts_df = pkts_df[~pkts_df.mac.isin(ap)]

    # print(len(pkts_df.mac.unique()))

    # pkts_df = pkts_df[pkts_df.id != 221]  # Wi-Fi Protected Access
    # pkts_df = pkts_df[pkts_df.id != 0]
    # pkts_df = pkts_df[pkts_df.id != 238]
    # prova = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 42, 48, 50]
    # pkts_df = pkts_df[pkts_df.id.isin(prova)]

    return ap, mac_in_chiaro, pkts_df


#%%

# SKIP! SKIP! SKIP! SKIP! SKIP! SKIP! SKIP! SKIP! SKIP! SKIP! SKIP! SKIP!
###################### DATASET DESCRIPTION ###########################

my_dir = "raw_data/anecoica_20_01/"  # capture directory (assumed; the original pointed at a single file)

for filename in os.listdir(my_dir):
    print("File capture : ", filename)
    filename = os.path.join(my_dir, filename)

    pcap = rdpcap(filename)

    ap, mac_in_chiaro, df = pcapng_to_dataframe(pcap)

    # FILE SIZE
    file_dimension = os.path.getsize(filename)
    print('File dimension :', round(file_dimension / 1024, 2), "KB")

    # CAPTURE DURATION  # 3600 s added to compensate for the one-hour clock offset between Linux/Windows
    first = round(pcap[0].time + 3600, 2)
    last = round(pcap[len(pcap) - 1].time + 3600, 2)
    first_dateTime = datetime.fromtimestamp(first)
    last_dateTime = datetime.fromtimestamp(last)
    print("Start capture =", first_dateTime)
    print("Stop capture =", last_dateTime)
    print("Capture Length :", round(last - first, 2), "sec -", round((last - first) / 60, 2), "min")

    print("Number of packets : ", len(df))
    print("Number of AP mac : ", len(ap))
    print("Number of mac_in_chiaro : ", len(mac_in_chiaro))
    print("Number of mac : ", len(df.mac.unique()))

    ## Randomized or not (test on the locally administered bit)
    df['random'] = df['mac'].apply(lambda x: (bin(int(x[:2], 16))[2:].zfill(8)[6:] in ['10', '11']))

    print("Non-randomized MACs: ", len(df[df.random == False].mac.unique()))
    print("Randomized MACs: ", len(df[df.random == True].mac.unique()))
    print("\n")
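
#%%

# The 'random' flag above tests the two low bits of the first MAC octet:
# bit 1 is the IEEE "locally administered" (U/L) bit, which randomized MACs
# set, so '10' (local, unicast) or '11' (local, multicast) means randomized.
# A small illustration on a made-up address:
example_mac = 'da:a1:19:00:00:01'  # hypothetical, locally administered
print(bin(int(example_mac[:2], 16))[2:].zfill(8)[6:])  # -> '10'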

#%%

filename = "raw_data/anecoica_20_01/01_prova.pcapng"

print("File capture : ", filename)

pcap = rdpcap(filename)

#%%

ap, mac_in_chiaro, df = pcapng_to_dataframe(pcap)

#%%

############################ TEST WITH TIME AND SEQ ################################

def derandomization_with_seq(pcap):
    # Same extraction as pcapng_to_dataframe, but each row also keeps the
    # packet timestamp so sequence numbers can be followed over time.
    packet = []
    ap = []
    mac_in_chiaro = []
    access_point = []
    time_list = [p.time for p in pcap]
    time_list = np.array(time_list)
    time_threshold = time_list.min() + (15 * 60)
    for pkt in pcap:
        ##### skip the first 15 minutes of the capture ############
        if pkt.time > time_threshold:
            # if pkt.haslayer(Dot11Beacon) or pkt.haslayer(Dot11ProbeResp) or pkt.haslayer(Dot11AssoResp):
            if pkt.haslayer(Dot11Beacon):
                if hasattr(pkt, 'addr2'):
                    ap.append(pkt.addr2)
                    # print("source address :", pkt.addr2)
                if hasattr(pkt, 'addr1') and pkt.addr1 != 'ff:ff:ff:ff:ff:ff':
                    mac_in_chiaro.append(pkt.addr1)
                    # print("mac in chiaro: ", pkt.addr1)
                if hasattr(pkt, 'addr3'):
                    access_point.append(pkt.addr3)
                    # print("Access Point address :", pkt.addr3)

            if pkt.haslayer(Dot11ProbeReq):
                mac = pkt.addr2
                seq = extractSN(pkt.SC)
                power = pkt.dBm_AntSignal
                # print("pkt", mac)
                while pkt:
                    if all(k in pkt.fields for k in ('ID', 'len', 'info')):
                        packet.append([mac, seq, pkt.fields['ID'], pkt.fields['len'], power, pkt.time])
                    pkt = pkt.payload

    pkts_df = pd.DataFrame(packet)
    pkts_df = pkts_df.drop_duplicates()
    pkts_df = pkts_df.sort_values(by=[0, 1])
    pkts_df.columns = ['mac', 'seq', 'id', 'len', 'power', 'time']

    ap = set(ap)

    # print(len(pkts_df.mac.unique()))

    pkts_df = pkts_df[~pkts_df.mac.isin(ap)]

    # print(len(pkts_df.mac.unique()))

    # pkts_df = pkts_df[pkts_df.id != 221]  # Wi-Fi Protected Access
    # pkts_df = pkts_df[pkts_df.id != 0]
    # pkts_df = pkts_df[pkts_df.id != 238]
    # prova = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 42, 48, 50]
    # pkts_df = pkts_df[pkts_df.id.isin(prova)]

    return ap, mac_in_chiaro, pkts_df

ap, mac_in_chiaro, df = derandomization_with_seq(pcap)

#%%

import matplotlib.pyplot as plt

df_prova = df.drop(columns=['id', 'len', 'power']).iloc[0:50, :].drop_duplicates()

sns.catplot(x='time', y='seq', hue='mac', data=df_prova)
plt.show()

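# Reading the plot: the 802.11 sequence counter typically keeps incrementing
# across MAC rotations, so probe requests from different randomized addresses
# that continue one seq-vs-time trajectory likely come from the same device.
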
#%%

############################ ID ELEMENT SELECTION ################################

# pkts_df = pkts_df[pkts_df.id != 221]  # Wi-Fi Protected Access
# pkts_df = pkts_df[pkts_df.id != 0]
# pkts_df = pkts_df[pkts_df.id != 238]
# prova = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 42, 48, 50]
# pkts_df = pkts_df[pkts_df.id.isin(prova)]


#%%


# FILE SIZE
file_dimension = os.path.getsize(filename)
print('File dimension :', round(file_dimension / 1024, 2), "KB")

# CAPTURE DURATION  # 3600 s added to compensate for the one-hour clock offset between Linux/Windows
first = round(pcap[0].time + 3600, 2)
last = round(pcap[len(pcap) - 1].time + 3600, 2)
first_dateTime = datetime.fromtimestamp(first)
last_dateTime = datetime.fromtimestamp(last)
print("Start capture =", first_dateTime)
print("Stop capture =", last_dateTime)
print("Capture Length :", round(last - first, 2), "sec -", round((last - first) / 60, 2), "min")

print("Number of packets : ", len(df))
print("Number of AP mac : ", len(ap))
print("Number of mac_in_chiaro : ", len(mac_in_chiaro))
print("Number of mac : ", len(df.mac.unique()))

## Randomized or not (test on the locally administered bit)
df['random'] = df['mac'].apply(lambda x: (bin(int(x[:2], 16))[2:].zfill(8)[6:] in ['10', '11']))

print("Non-randomized MACs: ", len(df[df.random == False].mac.unique()))
print("Randomized MACs: ", len(df[df.random == True].mac.unique()))


#%%

df_norand = df[df.random == False]
df_rand = df[df.random == True]

print("Non-randomized: \n", df_norand.sample(10))


#%%

######################### NON-RANDOMIZED DATASET OVERVIEW ##################################

df_norand_vendor = pd.DataFrame(df_norand)
df_norand_vendor['vendor'] = [str(mac_lookup(x)) for x in df_norand_vendor.mac]

print(df_norand_vendor.head())

sns.catplot(y='vendor', kind="count", data=df_norand_vendor)

#%%

######################### RANDOMIZED DATASET OVERVIEW ##################################


df_rand_vendor = pd.DataFrame(df_rand)
df_rand_vendor['vendor'] = [str(mac_lookup(x)) for x in df_rand_vendor.mac]

print(df_rand_vendor.head())

sns.catplot(y='vendor', kind="count", data=df_rand_vendor)

#%%

print(df_rand.drop(columns=['seq', 'power', 'time', 'random']).iloc[0:50, :].drop_duplicates())


#%%

# print(df_test_seq[(df_test_seq.id == 50 & df_test_seq.len == 0) | (df_test_seq.id == 0 & df_test_seq.len == 0)].drop(columns=['seq', 'power', 'time']).iloc[0:50, :].drop_duplicates())

print(df_rand.loc[(df_rand['id'] == 50) & (df_rand['len'] == 4)].mac.unique())

mac_list = df_rand.loc[(df_rand['id'] == 50) & (df_rand['len'] == 4)].mac.unique()

#%%

df_mac_list = df_rand.loc[(df_rand['id'] == 50) & (df_rand['len'] == 4)]

#%%

sns.catplot(x='time', y='seq', hue='mac', data=df_mac_list)


#%%
df = df_rand

for mac in df.mac.unique():
    # print(mac, df.sort_values(by='id')[df.mac == mac][['id', 'len']])
    df_temp = df.sort_values(by='id')[df.mac == mac][['seq', 'id', 'len', 'power']]
    df_temp = df_temp.drop_duplicates()
    # print(mac, "\n", df_temp.values)

# Count how many rows there are for each information-element id
df_temp = df.groupby(df['id']).count()

# Discard the ids that occur only once in the whole capture
df = df[~df['id'].isin(df_temp[df_temp.mac.isin([1])].index)]

df_temp = df[['mac', 'id', 'len', 'power']].sort_values(by=['mac', 'id', 'len', 'power']).drop_duplicates(keep='first')
print(df_temp.groupby('mac').count().sort_values(by='id'))

print(df_temp)

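#%%

# Toy check of the id-frequency filter above (values made up): an id seen
# only once across the capture cannot help group MACs, so it gets dropped.
demo_ids = pd.DataFrame({'mac': ['a', 'b', 'c'], 'id': [1, 1, 7]})
counts = demo_ids.groupby('id').count()
print(counts[counts.mac == 1].index.tolist())  # -> [7]
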
#%%

total = []
for mac in df_temp.mac.unique():
    lista = []
    for id in df_temp.id.unique():
        # print(df_temp[(df_temp['id'] == id) & (df_temp['mac'] == mac)].len)
        if df_temp[(df_temp['id'] == id) & (df_temp['mac'] == mac)].len.empty:
            lista.append(0)
        else:
            lista.append(df_temp.loc[(df_temp['id'] == id) & (df_temp['mac'] == mac), 'len'].values)
    seq_mean = df_temp[df_temp['mac'] == mac].mean(numeric_only=True)  # computed but currently unused
    total.append(lista)

#%%

df_new = pd.DataFrame(total)
df_new.index = df_temp.mac.unique()
df_new.columns = ["id_" + str(x) for x in df_temp.id.unique()]

print(df_new.head())

#%%

# view the dataset
print(df_new.head(3))

df_new_expanded = df_new

for column in df_new.columns:
    # expand each df_new column (cells are a scalar 0 or an array of IE lengths) into its own dataframe
    tags = df_new[column].apply(pd.Series)
    if tags.shape[1] > 1:
        # rename each variable in tags after its parent column
        tags = tags.rename(columns=lambda x: column + "_" + str(x)).fillna(0)
        # print("\n TAGS: \n", tags.head(3))
        # join the tags dataframe back to the original dataframe
        # (the concat leaves out the original, unexpanded column)
        df_new_expanded = pd.concat([df_new_expanded.loc[:, df_new_expanded.columns != column], tags[:]], axis=1)
        # print("\n df: \n", df_new_expanded.head(3))

df_new_expanded = df_new_expanded.astype(int)

print(df_new_expanded.head(3))

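#%%

# A toy illustration of the expansion above (names and values made up): a
# cell holding an array of IE lengths becomes one scalar column per entry.
demo = pd.DataFrame({'id_50': [np.array([4, 6]), 0]}, index=['mac_a', 'mac_b'])
print(demo['id_50'].apply(pd.Series).rename(columns=lambda x: 'id_50_' + str(x)).fillna(0))
# -> columns id_50_0 and id_50_1, one row per MAC
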
#%%

################### DROPPING COLUMNS WHERE VALUES ARE ALL EQUAL ####################

df_ichis = df_new_expanded

print(df_ichis.head(3))
print("\n\nN_FEATURES = ", len(df_ichis.columns))

nunique = df_ichis.apply(pd.Series.nunique)
cols_to_drop = nunique[nunique == 1].index
df_ichis = df_ichis.drop(cols_to_drop, axis=1)

print("\n\nN_FEATURES = ", len(df_ichis.columns))


print(df_ichis)



#%%

############### KEEPING ONLY COLUMNS WITH MORE THAN 23 NON-ZERO VALUES ##################

indici = np.argwhere(np.count_nonzero(df_ichis, axis=0) > 23)
idxcolumn = np.hstack(indici)

print(idxcolumn)
print(df_ichis.iloc[:, idxcolumn])
df_tantesperanze = df_ichis.iloc[:, idxcolumn]

#%%

f = df_tantesperanze.iloc[:, 0]

#%%

sns.distplot(f[f != 0], bins=10)


#%%

for column in df_tantesperanze.columns:
    f = df_tantesperanze.loc[:, column]
    sns.distplot(f)
    f.plot.hist(bins=12, alpha=0.5)


#%%

################################## DBSCAN PARAMETER TEST ####################################
df_ichis = df_tantesperanze

X = StandardScaler().fit_transform(df_ichis)

print("\n\nN_FEATURES = ", len(df_ichis.columns))
print("\n\nN_SAMPLES = ", len(df_ichis))

for min_samples in range(2, 6, 1):
    for eps in range(1, 100, 1):
        eps = eps / 100
        # Compute DBSCAN
        db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_
        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        # print("\nPARAMETERS: eps = {:f} min_samples = {:f}".format(eps, min_samples))
        if n_clusters_ > 12:
            print("N_CLUSTERS = ", n_clusters_)
            print("Parameters: eps = {eps} min_samples = {min_samples}".format(eps=eps, min_samples=min_samples))

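#%%

# A common heuristic for choosing eps without a brute-force sweep (a sketch,
# not part of the original analysis): plot each point's distance to its k-th
# neighbour (self included), sorted; the "knee" of the curve suggests eps.
from sklearn.neighbors import NearestNeighbors

k = 2  # tie this to min_samples
nn = NearestNeighbors(n_neighbors=k).fit(X)
distances, _ = nn.kneighbors(X)
plt.plot(np.sort(distances[:, k - 1]))
plt.ylabel("distance to {}-th neighbour".format(k))
plt.show()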

#%%

############################ OPTICS ###########################################

from sklearn.cluster import OPTICS, cluster_optics_dbscan
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt


df_ichis = df_tantesperanze

X = StandardScaler().fit_transform(df_ichis)


clust = OPTICS(min_samples=2, xi=0.1, min_cluster_size=0.001)

# Run the fit
clust.fit(X)
# labels for two DBSCAN cuts of the reachability plot (eps_1 and eps_2)
eps_1 = 2
labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=eps_1)
eps_2 = 0.1
labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=eps_2)

space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

plt.figure(figsize=(20, 14))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1, 0])
ax3 = plt.subplot(G[1, 1])
ax4 = plt.subplot(G[1, 2])


import random

n_clusters = len(set(clust.labels_)) - (1 if -1 in clust.labels_ else 0)
# Reachability plot, one random colour per cluster
for klass in range(0, n_clusters):
    r = random.randint(0, 255)
    g = random.randint(0, 255)
    b = random.randint(0, 255)
    rgb = (r / 255, g / 255, b / 255)  # matplotlib expects an RGB tuple in [0, 1]
    Xk = space[labels == klass]
    Rk = reachability[labels == klass]
    ax1.plot(Xk, Rk, '.', color=rgb, alpha=0.3)
ax1.plot(space[labels == -1], reachability[labels == -1], 'k.', alpha=0.3)
ax1.plot(space, np.full_like(space, float(eps_1), dtype=float), 'k-', alpha=0.5)
ax1.plot(space, np.full_like(space, float(eps_2), dtype=float), 'k-.', alpha=0.5)
ax1.set_ylabel('Reachability (epsilon distance)')
ax1.set_title('Reachability Plot')
ax1.set_ylim(-0.1, 2)


# OPTICS
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
    Xk = X[clust.labels_ == klass]
    ax2.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax2.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], 'k+', alpha=0.1)
n_clusters = len(set(clust.labels_)) - (1 if -1 in clust.labels_ else 0)
ax2.set_title('cluster {} Automatic Clustering\nOPTICS xi'.format(n_clusters))

# DBSCAN cut at eps_1
colors = ['g', 'greenyellow', 'olive', 'r', 'b', 'c']
for klass, color in zip(range(0, 6), colors):
    Xk = X[labels_050 == klass]
    ax3.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3, marker='.')
ax3.plot(X[labels_050 == -1, 0], X[labels_050 == -1, 1], 'k+', alpha=0.1)
n_clusters = len(set(labels_050)) - (1 if -1 in labels_050 else 0)
ax3.set_title('cluster {} Clustering at {} epsilon cut\nDBSCAN'.format(n_clusters, eps_1))

# DBSCAN cut at eps_2
colors = ['g.', 'm.', 'y.', 'c.']
for klass, color in zip(range(0, 4), colors):
    Xk = X[labels_200 == klass]
    ax4.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax4.plot(X[labels_200 == -1, 0], X[labels_200 == -1, 1], 'k+', alpha=0.1)
n_clusters = len(set(labels_200)) - (1 if -1 in labels_200 else 0)
ax4.set_title('cluster {} Clustering at {} epsilon cut\nDBSCAN'.format(n_clusters, eps_2))
plt.tight_layout()
plt.savefig('f2')  # save after tight_layout so the file uses the adjusted layout
plt.show()

#%%

from sklearn.cluster import OPTICS, cluster_optics_dbscan

df_ichis = df_tantesperanze

X = StandardScaler().fit_transform(df_ichis)

np.seterr(divide='ignore')

for min_cluster_size in range(1, 10, 1):
    min_cluster_size = min_cluster_size / 10
    for xi in range(1, 100, 1):
        xi = xi / 100
        clust = OPTICS(min_samples=2, xi=xi, min_cluster_size=min_cluster_size)

        # Run the fit
        clust.fit(X)
        eps_1 = 2
        labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                           core_distances=clust.core_distances_,
                                           ordering=clust.ordering_, eps=eps_1)
        eps_2 = 0.1
        labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
                                           core_distances=clust.core_distances_,
                                           ordering=clust.ordering_, eps=eps_2)

        n_clusters = len(set(clust.labels_)) - (1 if -1 in clust.labels_ else 0)

        if n_clusters == 14:
            # print("N_CLUSTERS = ", n_clusters)
            print("Parameters: xi = {xi} min_cluster_size = {min_cluster_size}".format(xi=xi, min_cluster_size=min_cluster_size))

#%%


########################### PCA ######################################
df_ichis = df_tantesperanze

pca = PCA(n_components=2)
print(df_ichis.head(3))

X = StandardScaler().fit_transform(df_ichis)
principal_components = pca.fit_transform(X)
df_pca = pd.DataFrame(data=principal_components, columns=['component1', 'component2'])

print(df_pca.head(5))

etichette_optics = clust.labels_
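
# Worth checking how much variance the two components actually retain
# (explained_variance_ratio_ is sklearn's standard PCA attribute):
print("Explained variance ratio:", pca.explained_variance_ratio_)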

#%%

df_pca.plot()

sns.scatterplot(x='component1', y='component2', data=df_pca)

#%%

plt.scatter(df_pca['component1'], df_pca['component2'], c=etichette_optics, s=50, alpha=0.5)
# plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
# plt.ylim(-5, 25)
# plt.xlim(-5, 40)

#%%

############################# K-MEANS ####################################################

from pandas import DataFrame
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Data = {'x': [25,34,22,27,33,33,31,22,35,34,67,54,57,43,50,57,59,52,65,47,49,48,35,33,44,45,38,43,51,46],
#         'y': [79,51,53,78,59,74,73,57,69,75,51,32,40,47,53,36,35,58,59,50,25,20,14,12,20,5,29,27,8,7]
#        }
#
# df_kmeans = DataFrame(Data, columns=['x', 'y'])


print(df_pca.head(5))

df_kmeans = df_pca

# Fit in the PCA plane so the centroids share the axes of the scatter plot below
kmeans = KMeans(n_clusters=23).fit(df_kmeans)
centroids = kmeans.cluster_centers_
print(centroids)
print(kmeans.labels_)


plt.scatter(df_kmeans['component1'], df_kmeans['component2'], c=kmeans.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
plt.ylim(-5, 25)

plt.xlim(-5, 40)

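#%%

# n_clusters=23 above is hand-picked. A quick elbow check (a sketch, not part
# of the original analysis) can sanity-check that choice in the PCA plane:
inertias = []
ks = range(2, 30)
for k in ks:
    inertias.append(KMeans(n_clusters=k, n_init=10).fit(df_kmeans).inertia_)
plt.plot(list(ks), inertias, marker='o')
plt.xlabel('k')
plt.ylabel('inertia')
plt.show()
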
#%%

time_list = [p.time for p in pcap]

time_list = np.array(time_list)

min_time = time_list.min()
time_threshold = time_list.min() + (5 * 60)


#%%

[str(mac_lookup(x)) for x in ap]