Back
Relating clustering stability to properties of cluster boundaries
In this paper, we investigate stability-based methods for cluster model selection, in particular to select the number K of clusters. The scenario under consideration is that clustering is performed by minimizing a certain clustering quality function, and that a unique global minimizer exists. On the one hand we show that stability can be upper bounded by certain properties of the optimal clustering, namely by the mass in a small tube around the cluster boundaries. On the other hand, we provide counterexamples which show that a reverse statement is not true in general. Finally, we give some examples and arguments why, from a theoretic point of view, using clustering stability in a high sample setting can be problematic. It can be seen that distribution-free guarantees bounding the difference between the finite sample stability and the true stability cannot exist, unless one makes strong assumptions on the underlying distribution.
@inproceedings{5105, title = {Relating clustering stability to properties of cluster boundaries}, journal = {Proceedings of the 21st Annual Conference on Learning Theory (COLT 2008)}, booktitle = {COLT 2008}, abstract = {In this paper, we investigate stability-based methods for cluster model selection, in particular to select the number K of clusters. The scenario under consideration is that clustering is performed by minimizing a certain clustering quality function, and that a unique global minimizer exists. On the one hand we show that stability can be upper bounded by certain properties of the optimal clustering, namely by the mass in a small tube around the cluster boundaries. On the other hand, we provide counterexamples which show that a reverse statement is not true in general. Finally, we give some examples and arguments why, from a theoretic point of view, using clustering stability in a high sample setting can be problematic. It can be seen that distribution-free guarantees bounding the difference between the finite sample stability and the true stability cannot exist, unless one makes strong assumptions on the underlying distribution.}, pages = {379-390}, editors = {Servedio, R. A., T. Zhang}, publisher = {Omnipress}, organization = {Max-Planck-Gesellschaft}, school = {Biologische Kybernetik}, address = {Madison, WI, USA}, month = jul, year = {2008}, slug = {5105}, author = {Ben-David, S. and von Luxburg, U.}, month_numeric = {7} }