Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import csv
- import math
- import numpy as np
# Points that DBSCAN / expand_cluster have already processed; membership is
# tested by value (np.array_equal) in is_visited().
VISITED: list = []
# Points whose eps-neighbourhood was too small to seed a cluster.
NOISE: list = []
def euclid_distance(p, q):
    """Return the Euclidean (L2) distance between ``p`` and ``q``.

    Both arguments are handed to ``np.subtract``, so any pair of array-likes
    that NumPy can subtract element-wise is accepted. The result comes from
    ``math.sqrt`` and is therefore a plain Python ``float``.
    """
    delta = np.subtract(p, q)
    squared_total = np.square(delta).sum()
    return math.sqrt(squared_total)
def are_neighbors(p, q, eps):
    """Tell whether ``p`` and ``q`` lie strictly closer together than ``eps``.

    The comparison is strict: two points exactly ``eps`` apart are *not*
    neighbours.
    """
    distance = euclid_distance(p, q)
    return distance < eps
def region_query(dataset, point, eps):
    """Return the elements of ``dataset`` that are eps-neighbours of ``point``.

    Elements are returned in dataset order. The neighbourhood test is
    delegated to ``are_neighbors``, so ``point`` itself is included whenever it
    occurs in ``dataset`` and ``eps`` is positive.
    """
    return [
        candidate
        for candidate in dataset
        if are_neighbors(candidate, point, eps)
    ]
def is_visited(point):
    """Return True when ``point`` equals (by value) an entry of ``VISITED``.

    Equality is decided by ``np.array_equal``, so two distinct rows holding
    the same coordinates are indistinguishable here.
    """
    return any(np.array_equal(point, seen) for seen in VISITED)
def cluster_contains(clusters, point):
    """Return True when ``point`` is a member of any cluster in ``clusters``.

    ``clusters`` maps a cluster id to a list of points; membership is checked
    by value with ``np.array_equal``.
    """
    return any(
        np.array_equal(member, point)
        for members in clusters.values()
        for member in members
    )
def expand_cluster(dataset, point, clusters, neighborPoints, Cluster_ID, eps, minPts):
    """Grow cluster ``Cluster_ID`` outward from the core point ``point``.

    Args:
        dataset: Iterable of all points; searched by ``region_query``.
        point: The core point that seeds the cluster.
        clusters: Dict mapping cluster id to a list of member points.
            ``clusters[Cluster_ID]`` must already exist; it is appended to.
        neighborPoints: The eps-neighbourhood of ``point``. It is only read;
            the expansion runs on a private copy.
        Cluster_ID: Key of the cluster being grown.
        eps: Neighbourhood radius forwarded to ``region_query``.
        minPts: Minimum neighbourhood size for a point to count as a core
            point whose neighbours are explored in turn.

    Side effects:
        Appends newly reached points to the module-level ``VISITED`` list and
        to ``clusters[Cluster_ID]``.
    """
    # Local import: the module header does not import collections.
    from collections import deque

    clusters[Cluster_ID].append(point)

    # The original extended ``neighborPoints`` while iterating over it. A
    # private FIFO queue visits the points in exactly the same order without
    # mutating the caller's list or keeping already-processed entries alive.
    pending = deque(neighborPoints)
    while pending:
        neighbor = pending.popleft()
        if not is_visited(neighbor):
            VISITED.append(neighbor)
            newNeighbors = region_query(dataset, neighbor, eps)
            if len(newNeighbors) >= minPts:
                # ``neighbor`` is itself a core point: explore its
                # neighbourhood as well.
                pending.extend(newNeighbors)
        # Already-visited points (e.g. provisional noise) still join the
        # cluster as long as no cluster has claimed them yet.
        if not cluster_contains(clusters, neighbor):
            clusters[Cluster_ID].append(neighbor)
def DBSCAN(dataset, eps, minPts):
    """Cluster ``dataset`` with DBSCAN and print a short summary.

    Args:
        dataset: Iterable of points (``len()`` must be supported). Points are
            identified by value, so rows with identical coordinates are
            treated as a single point.
        eps: Neighbourhood radius.
        minPts: Minimum number of eps-neighbours (the point itself included)
            required for a point to seed or extend a cluster.

    Returns:
        Dict mapping consecutive integer cluster ids (starting at 0) to the
        list of points assigned to that cluster.

    Side effects:
        Resets and then fills the module-level ``VISITED`` and ``NOISE``
        lists, and prints per-cluster sizes and totals to stdout.
    """
    # Start every run from a clean slate. Without this, a second call finds
    # every point already in VISITED and returns no clusters at all.
    VISITED.clear()
    NOISE.clear()

    Cluster_ID = 0
    clusters = {}
    for point in dataset:
        if is_visited(point):
            continue
        VISITED.append(point)
        NeighborPoints = region_query(dataset, point, eps)
        if len(NeighborPoints) < minPts:
            # Provisional: the point may still be absorbed as a border point.
            NOISE.append(point)
            continue
        clusters[Cluster_ID] = []
        expand_cluster(dataset, point, clusters, NeighborPoints, Cluster_ID, eps, minPts)
        Cluster_ID += 1

    # A point first marked as noise can later be pulled into a cluster as a
    # border point; drop those so they are not counted twice below.
    NOISE[:] = [
        candidate for candidate in NOISE
        if not cluster_contains(clusters, candidate)
    ]

    clustered_total = 0
    for index, cluster in enumerate(clusters.values()):
        print('Cluster ' + repr(index) + ' contains: ' + repr(len(cluster)))
        clustered_total += len(cluster)
    print('There has been ' + repr(len(NOISE)) + ' noise points')
    print('Sum of cluster\'s length is ' + repr(clustered_total))
    print('Together with noise\'s ' + repr(clustered_total + len(NOISE)))
    print('Total length of dataset is: ' + repr(len(dataset)))
    return clusters
def _load_float_columns(path, start, stop):
    """Read columns ``[start, stop)`` of a headed CSV file as a float array."""
    with open(path, newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        # Skip the header through the reader (not the raw file handle) and
        # tolerate an empty file instead of raising StopIteration.
        next(reader, None)
        rows = [
            [float(cell) for cell in row[start:stop]]
            for row in reader
            if row  # csv.reader yields [] for blank lines
        ]
    return np.array(rows)


def main():
    """Load the Iris and Yeast CSV files and run DBSCAN on the Iris data.

    Reads ``iris.csv`` (columns 1-4) and ``yeast.csv`` (columns 0-6) from the
    current working directory, prints the parameters in use and then the
    summary produced by ``DBSCAN``.
    """
    IrisDataSet = _load_float_columns('iris.csv', 1, 5)
    # The Yeast data is loaded but not clustered below; the read is kept so
    # the script touches the same input files as before.
    YeastDataSet = _load_float_columns('yeast.csv', 0, 7)

    eps = 0.5
    min_points = 5
    print('Parameters: eps=' + repr(eps) + ', minPts=' + repr(min_points))
    DBSCAN(IrisDataSet, eps, min_points)
# Script entry point: run the demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement