Hello. DBSCAN The algorithm is [^ 1], a type of data clustering, and neighborhood search (example / c0fadee57b03f9f08d22)) is based on a combination of the [Disjoint-set Data Structure](https://ja.wikipedia.org/wiki/Disjoint-set Data Structure) method.
I myself read the scikit-learn source [^ 2] and wrote the DBSCAN algorithm (two types, depth-first type and breadth-first type) almost by copying.
Just in case, I checked whether it matches the result by scikit-learn.
$ ./dbscan.py
True : depth-first
True : breadth-first
dbscan.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import numpy as np
import sklearn.cluster
from scipy import spatial
def dbscandepth(neigh, minp):
ilabel = 0
labels = [-1 for _ in neigh] # initialized as noise
for i in range(len(neigh)):
if labels[i] >= 0 or len(neigh[i]) < minp: # visited or noise
continue
candid = [i]
while len(candid) > 0:
i = candid.pop()
if len(neigh[i]) >= minp: # core only
for j in neigh[i]:
if labels[j] < 0:
candid.append(j)
labels[j] = ilabel
ilabel += 1
return labels
def dbscanbreadth(neigh, minp):
def expandcandid(candid):
candid = filter(lambda j: len(neigh[j]) >= minp, candid) # core only
if candid != []:
candid = [neigh[j] for j in candid]
candid = list(set.union(*map(set, candid)))
candid = filter(lambda j: labels[j] < 0, candid)
return candid
ilabel = 0
labels = [-1 for _ in neigh] # initialized as noise
for i, candid in enumerate(neigh):
if labels[i] >= 0 or len(candid) < minp: # visited or noise
continue
while len(candid) > 0:
candid = expandcandid(candid)
for j in candid:
labels[j] = ilabel
ilabel += 1
return labels
def main():
minp = 4
radius = 0.1
points = np.random.uniform(size=(100,2))
labels_sklearn = list(sklearn.cluster.DBSCAN(radius, minp).fit(points).labels_ # scikit-learn
tree = spatial.cKDTree(points)
neigh = [tree.query_ball_point(p, radius) for p in points] #Neighborhood search
labels = dbscandepth(neigh, minp)
print(labels == labels_sklearn, ": depth first")
labels = dbscanbreadth(neigh, minp)
print(labels == labels_sklearn, ": breadth first")
return 0
if __name__ == '__main__':
main()
[^ 1]: There is also an article here: "DBSCAN Blues" [^ 2]: Sources are _dbscan_inner.pyx and [dbscan_.py](https:: //github.com/scikit-learn/scikit-learn/blob/21e63aaf01404f4eb3e7ae54ef9f71e8bb905f6b/sklearn/cluster/dbscan_.py)