# Untitled

# -*- coding: utf-8 -*-
import csv
import math

import numpy as np

# Module-level bookkeeping shared by the clustering routines in this file.
# VISITED accumulates every point that has already been processed;
# NOISE accumulates points whose eps-neighbourhood held fewer than minPts points.
VISITED, NOISE = [], []


def euclid_distance(p, q):
    """Return the Euclidean (L2) distance between points ``p`` and ``q``.

    Both arguments may be anything ``np.subtract`` accepts (lists, tuples,
    arrays). The result is a plain Python ``float``.
    """
    delta = np.subtract(p, q)
    squared_length = np.sum(delta * delta)
    return math.sqrt(squared_length)

def are_neighbors(p, q, eps):
    """Tell whether ``p`` and ``q`` lie strictly closer than ``eps`` to each other.

    The comparison is strict: two points exactly ``eps`` apart are *not*
    considered neighbours.
    """
    separation = euclid_distance(p, q)
    return separation < eps

def region_query(dataset, point, eps):
    """Collect every element of ``dataset`` that is a neighbour of ``point``.

    Elements are returned in dataset order as a new list. ``point`` itself is
    included whenever it appears in ``dataset`` and ``eps`` is positive,
    because its distance to itself is zero.
    """
    return [
        candidate
        for candidate in dataset
        if are_neighbors(candidate, point, eps)
    ]

def is_visited(point):
    """Return True when a point equal to ``point`` is recorded in ``VISITED``.

    Equality is value-based (``np.array_equal``), not identity-based, so two
    distinct rows holding the same coordinates count as the same point.
    """
    return any(np.array_equal(point, seen) for seen in VISITED)

def cluster_contains(clusters, point):
    """Report whether ``point`` already belongs to any cluster in ``clusters``.

    ``clusters`` is a mapping whose values are iterables of points. Membership
    is decided by value with ``np.array_equal`` rather than by identity.
    """
    return any(
        np.array_equal(member, point)
        for members in clusters.values()
        for member in members
    )

def expand_cluster(dataset, point, clusters, neighborPoints, Cluster_ID, eps, minPts):
    """Grow cluster ``Cluster_ID`` outward from the core point ``point``.

    Args:
        dataset: Iterable of all points; searched by ``region_query``.
        point: The core point that seeds the cluster.
        clusters: Mapping of cluster id -> list of member points. The entry
            for ``Cluster_ID`` must already exist; it is extended in place.
        neighborPoints: Initial eps-neighbourhood of ``point``. It is only
            read; the expansion works on a private copy.
        Cluster_ID: Key in ``clusters`` of the cluster being grown.
        eps: Neighbourhood radius passed to ``region_query``.
        minPts: Minimum neighbourhood size for a point to be expanded further.

    Side effects:
        Appends newly processed points to the module-level ``VISITED`` list
        and removes from the module-level ``NOISE`` list any point that ends
        up in the cluster.
    """
    members = clusters[Cluster_ID]
    members.append(point)

    # Breadth-first work queue. A private list walked by index replaces the
    # original "extend the list a for-loop is iterating" trick, so the
    # caller's list is never mutated behind its back.
    pending = list(neighborPoints)
    position = 0
    while position < len(pending):
        neighbor = pending[position]
        position += 1

        if not is_visited(neighbor):
            VISITED.append(neighbor)
            newNeighbors = region_query(dataset, neighbor, eps)
            # Only sufficiently dense neighbours propagate the expansion.
            if len(newNeighbors) >= minPts:
                pending.extend(newNeighbors)

        if not cluster_contains(clusters, neighbor):
            members.append(neighbor)
            # A point recorded as noise earlier can turn out to be a border
            # point of this cluster; drop it from NOISE so it is not counted
            # twice. Slice assignment keeps the same global list object, and
            # np.array_equal avoids the ambiguous truth value that
            # list.remove() would hit when comparing arrays.
            NOISE[:] = [
                outlier for outlier in NOISE
                if not np.array_equal(outlier, neighbor)
            ]

def DBSCAN(dataset, eps, minPts):
    """Cluster ``dataset`` with DBSCAN and print a short summary.

    Args:
        dataset: Iterable of points (e.g. rows of a 2-D array) supporting
            ``len()``.
        eps: Neighbourhood radius used by ``region_query``.
        minPts: Minimum number of points (the point itself included, when
            ``eps`` is positive) in an eps-neighbourhood for a point to seed
            a cluster.

    Returns:
        dict mapping consecutive integer cluster ids (starting at 0) to lists
        of the points assigned to each cluster.

    Side effects:
        Clears and then repopulates the module-level ``VISITED`` and ``NOISE``
        lists, and prints per-cluster sizes and totals to stdout.

    Note:
        Visited/cluster membership is tested by value, so duplicate rows in
        ``dataset`` are treated as a single point; the printed totals can
        therefore be smaller than ``len(dataset)``.
    """
    # Start from a clean slate: without this a second call would inherit the
    # previous run's visited/noise records and silently skip or miscount points.
    VISITED.clear()
    NOISE.clear()

    Cluster_ID = 0
    clusters = dict()
    for point in dataset:
        if is_visited(point):
            continue
        VISITED.append(point)
        neighbor_points = region_query(dataset, point, eps)
        if len(neighbor_points) < minPts:
            # Provisionally noise; expand_cluster may still absorb it later.
            NOISE.append(point)
        else:
            clusters[Cluster_ID] = []
            expand_cluster(dataset, point, clusters, neighbor_points, Cluster_ID, eps, minPts)
            Cluster_ID = Cluster_ID + 1

    # Summary report (accumulator renamed so the builtin sum() is not shadowed).
    clustered_total = 0
    for index, cluster in enumerate(clusters.values()):
        print('Cluster '+ repr(index) + ' contains: ' + repr(len(cluster)))
        clustered_total += len(cluster)
    print('There has been ' + repr(len(NOISE)) + ' noise points')
    print('Sum of cluster\'s length is ' + repr(clustered_total))
    print('Together with noise\'s ' + repr(clustered_total + len(NOISE)))
    print('Total length of dataset is: ' + repr(len(dataset)))
    return clusters

def _load_csv_columns(path, first_column, last_column):
    """Read columns ``[first_column, last_column)`` of a headered CSV as floats.

    The first record is treated as a header and skipped; an empty file simply
    yields an empty array. Blank rows are ignored so they cannot produce a
    ragged array.

    Returns:
        A NumPy array with one row per data record.

    Raises:
        ValueError: If a selected cell cannot be converted to ``float``.
        OSError: If the file cannot be opened.
    """
    with open(path, newline='') as csvFile:
        reader = csv.reader(csvFile, delimiter=',')
        next(reader, None)  # skip the header without failing on an empty file
        rows = [
            [float(cell) for cell in row[first_column:last_column]]
            for row in reader
            if row
        ]
    return np.array(rows)


def main():
    """Load the demo datasets and run DBSCAN on the Iris measurements.

    Expects ``iris.csv`` and ``yeast.csv`` in the current working directory.
    Columns 1-4 of the Iris file and columns 0-6 of the yeast file are read
    as floats.
    """
    IrisDataSet = _load_csv_columns('iris.csv', 1, 5)

    # The yeast data is loaded, as in the original script, but not clustered;
    # pass YeastDataSet to DBSCAN below to cluster it instead.
    YeastDataSet = _load_csv_columns('yeast.csv', 0, 7)

    eps = 0.5
    min_points = 5

    print('Parameters: eps=' + repr(eps) + ', minPts=' + repr(min_points))
    DBSCAN(IrisDataSet, eps, min_points)
# Run the clustering demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()