Perform clustering — fuzzyclara • fuzzyclara

Function to perform a cluster analysis in a hard or fuzzy way. The clustering can be based either on a common dissimilarity metric or on a self-defined distance function.

Usage

fuzzyclara(
  data,
  clusters = 5,
  metric = "euclidean",
  algorithm = "clara",
  samples = 10,
  sample_size = NULL,
  max_neighbors = 100,
  num_local = 10,
  type = "hard",
  cores = 1,
  seed = 1234,
  m = 1.5,
  verbose = 1,
  scale = TRUE,
  build = FALSE,
  ...
)

Arguments

data: data.frame or matrix to be clustered
clusters: Number of clusters. Defaults to 5.
metric: A character specifying a predefined dissimilarity metric (like "euclidean" or "manhattan") or a self-defined dissimilarity function. Defaults to "euclidean". Will be passed as argument method to proxy::dist, so check ?proxy::dist for full details.
algorithm: One of c("clara","clarans")
samples: Number of subsamples (only if algorithm = "clara")
sample_size: Number of observations belonging to a sample. If NULL (default), the minimum of nrow(data) and 40 + clusters * 2 is used as sample size. (only if algorithm = "clara")
max_neighbors: Maximum number of randomized medoid searches with each cluster (only if algorithm = "clarans")
num_local: Number of clustering iterations (only if algorithm = "clarans")
type: One of c("hard","fuzzy"), specifying the type of clustering to be performed.
cores: Number of cores used for computation. cores > 1 implies a parallel call. Defaults to 1.
seed: Random number seed. Defaults to 1234.
m: Fuzziness exponent (only for type = "fuzzy"), which has to be a numeric value of at least 1. Defaults to 1.5.
verbose: Can be set to integers between 0 and 2 to control the level of detail of the printed diagnostic messages. Higher numbers lead to more detailed messages. Defaults to 1.
scale: Scale numeric variables before distance matrix calculation? Default TRUE
build: Additional build algorithm to choose initial medoids (only relevant for type = "fuzzy"). Defaults to FALSE.
...: Additional arguments passed to the main clustering algorithm (pam or vegclust) and to proxy::dist for the calculation of the distance matrix

Value

Object of class "fuzzyclara"

Details

If the clustering is run on multiple cores, the verbose messages are written to a file clustering_progress.log (if verbose > 0).

References

Kaufman, L., and Rousseeuw, P. J. (1986). Clustering large data sets. Pattern Recognition in Practice, 425–437.

Ng, R. T., and Han, J. (2002). CLARANS: A method for clustering objects for spatial data mining. IEEE transactions on knowledge and data engineering, 14(5), 1003–1016. doi:10.1109/tkde.2002.1033770.

Examples

 
# Hard clustering
cc_hard <- fuzzyclara(data        = USArrests,
                      clusters    = 3,
                      metric      = "euclidean",
                      samples     = 1,
                      sample_size = NULL,
                      type        = "hard",
                      seed        = 3526,
                      verbose     = 0)
cc_hard
#> Clustering results
#> 
#> Medoids
#> [1] "New Mexico"    "Oklahoma"      "New Hampshire"
#> 
#> Clustering
#>  [1] 2 2 2 3 2 2 3 3 2 2 3 1 2 3 1 3 3 2 1 2 3 2 1 2 3 3 3 2 1 3 2 2 2 1 3 3 3 3
#> [39] 3 2 1 2 2 3 1 3 3 1 1 3
#> 
#> Minimum average distance
#> [1] 1.180717

# Fuzzy clustering
cc_fuzzy <- fuzzyclara(data        = USArrests,
                       clusters    = 3,
                       metric      = "euclidean",
                       samples     = 1,
                       sample_size = NULL,
                       type        = "fuzzy",
                       m           = 2,
                       seed        = 3526,
                       verbose     = 0)
cc_fuzzy
#> Clustering results
#> 
#> Medoids
#> [1] "Oklahoma"  "Arizona"   "Tennessee"
#> 
#> Clustering
#>  [1] 3 3 1 2 1 1 2 2 1 3 2 2 1 2 2 2 2 3 2 1 2 1 2 3 2 2 2 1 2 2 1 1 3 2 2 2 2 2
#> [39] 2 3 2 3 3 2 2 2 2 2 2 2
#> 
#> Minimum average weighted distance
#> [1] 1.94242
#> 
#> Membership scores
#>                 Cluster1  Cluster2  Cluster3
#> Alabama        0.2040878 0.2391714 0.5567409
#> Alaska         0.3373655 0.2726496 0.3899849
#> Arizona        1.0000000 0.0000000 0.0000000
#> Arkansas       0.2075892 0.3966215 0.3957893
#> California     0.5401685 0.2248051 0.2350264
#> Colorado       0.4475538 0.2744007 0.2780455
#> Connecticut    0.2348136 0.5280016 0.2371848
#> Delaware       0.2906227 0.4701428 0.2392345
#> Florida        0.4443412 0.2316682 0.3239905
#> Georgia        0.2091524 0.2149396 0.5759081
#> Hawaii         0.2482766 0.4883161 0.2634073
#> Idaho          0.2209589 0.5129169 0.2661242
#> Illinois       0.4666698 0.2739684 0.2593617
#> Indiana        0.1344369 0.6694262 0.1961369
#> Iowa           0.2311216 0.4905457 0.2783327
#> Kansas         0.1310680 0.6999444 0.1689876
#> Kentucky       0.1917648 0.4401893 0.3680459
#> Louisiana      0.2560625 0.2412981 0.5026393
#> Maine          0.2396947 0.4695769 0.2907285
#> Maryland       0.4281216 0.2306369 0.3412416
#> Massachusetts  0.2682437 0.5043343 0.2274220
#> Michigan       0.4467571 0.2192029 0.3340400
#> Minnesota      0.2158369 0.5379562 0.2462069
#> Mississippi    0.2484292 0.2817416 0.4698292
#> Missouri       0.2669546 0.3898602 0.3431852
#> Montana        0.1922866 0.5233027 0.2844107
#> Nebraska       0.1814854 0.5935543 0.2249603
#> Nevada         0.4372644 0.2469118 0.3158237
#> New Hampshire  0.2351771 0.4821286 0.2826942
#> New Jersey     0.3025875 0.4474846 0.2499279
#> New Mexico     0.4736616 0.2098122 0.3165261
#> New York       0.4959333 0.2489337 0.2551329
#> North Carolina 0.2984813 0.2995500 0.4019686
#> North Dakota   0.2525175 0.4409055 0.3065770
#> Ohio           0.1722044 0.6264071 0.2013885
#> Oklahoma       0.0000000 1.0000000 0.0000000
#> Oregon         0.2597840 0.4842455 0.2559705
#> Pennsylvania   0.1733916 0.6187011 0.2079073
#> Rhode Island   0.2938264 0.4548769 0.2512968
#> South Carolina 0.2521289 0.2569116 0.4909595
#> South Dakota   0.2294820 0.4627901 0.3077278
#> Tennessee      0.0000000 0.0000000 1.0000000
#> Texas          0.3315450 0.2977964 0.3706587
#> Utah           0.2550652 0.5204090 0.2245258
#> Vermont        0.2537642 0.4173744 0.3288614
#> Virginia       0.1470128 0.6016305 0.2513568
#> Washington     0.2420740 0.5403595 0.2175666
#> West Virginia  0.2356115 0.4301945 0.3341939
#> Wisconsin      0.2298126 0.5057011 0.2644864
#> Wyoming        0.1652925 0.6041126 0.2305949

# Fuzzy clustering with self-defined distance function
dist_function <- function(x, y) {
  # Euclidean distance: square root of the summed squared
  # element-wise differences between x and y.
  squared_diff <- (x - y)^2
  sqrt(sum(squared_diff))
}

cc_dist <- fuzzyclara(data        = USArrests,
                      clusters    = 3,
                      metric      = dist_function,
                      samples     = 1,
                      sample_size = NULL,
                      type        = "fuzzy",
                      m           = 2,
                      seed        = 3526,
                      verbose     = 0)
cc_dist
#> Clustering results
#> 
#> Medoids
#> [1] "Oklahoma"  "Arizona"   "Tennessee"
#> 
#> Clustering
#>  [1] 3 3 1 2 1 1 2 2 1 3 2 2 1 2 2 2 2 3 2 1 2 1 2 3 2 2 2 1 2 2 1 1 3 2 2 2 2 2
#> [39] 2 3 2 3 3 2 2 2 2 2 2 2
#> 
#> Minimum average weighted distance
#> [1] 1.94242
#> 
#> Membership scores
#>                 Cluster1  Cluster2  Cluster3
#> Alabama        0.2040878 0.2391714 0.5567409
#> Alaska         0.3373655 0.2726496 0.3899849
#> Arizona        1.0000000 0.0000000 0.0000000
#> Arkansas       0.2075892 0.3966215 0.3957893
#> California     0.5401685 0.2248051 0.2350264
#> Colorado       0.4475538 0.2744007 0.2780455
#> Connecticut    0.2348136 0.5280016 0.2371848
#> Delaware       0.2906227 0.4701428 0.2392345
#> Florida        0.4443412 0.2316682 0.3239905
#> Georgia        0.2091524 0.2149396 0.5759081
#> Hawaii         0.2482766 0.4883161 0.2634073
#> Idaho          0.2209589 0.5129169 0.2661242
#> Illinois       0.4666698 0.2739684 0.2593617
#> Indiana        0.1344369 0.6694262 0.1961369
#> Iowa           0.2311216 0.4905457 0.2783327
#> Kansas         0.1310680 0.6999444 0.1689876
#> Kentucky       0.1917648 0.4401893 0.3680459
#> Louisiana      0.2560625 0.2412981 0.5026393
#> Maine          0.2396947 0.4695769 0.2907285
#> Maryland       0.4281216 0.2306369 0.3412416
#> Massachusetts  0.2682437 0.5043343 0.2274220
#> Michigan       0.4467571 0.2192029 0.3340400
#> Minnesota      0.2158369 0.5379562 0.2462069
#> Mississippi    0.2484292 0.2817416 0.4698292
#> Missouri       0.2669546 0.3898602 0.3431852
#> Montana        0.1922866 0.5233027 0.2844107
#> Nebraska       0.1814854 0.5935543 0.2249603
#> Nevada         0.4372644 0.2469118 0.3158237
#> New Hampshire  0.2351771 0.4821286 0.2826942
#> New Jersey     0.3025875 0.4474846 0.2499279
#> New Mexico     0.4736616 0.2098122 0.3165261
#> New York       0.4959333 0.2489337 0.2551329
#> North Carolina 0.2984813 0.2995500 0.4019686
#> North Dakota   0.2525175 0.4409055 0.3065770
#> Ohio           0.1722044 0.6264071 0.2013885
#> Oklahoma       0.0000000 1.0000000 0.0000000
#> Oregon         0.2597840 0.4842455 0.2559705
#> Pennsylvania   0.1733916 0.6187011 0.2079073
#> Rhode Island   0.2938264 0.4548769 0.2512968
#> South Carolina 0.2521289 0.2569116 0.4909595
#> South Dakota   0.2294820 0.4627901 0.3077278
#> Tennessee      0.0000000 0.0000000 1.0000000
#> Texas          0.3315450 0.2977964 0.3706587
#> Utah           0.2550652 0.5204090 0.2245258
#> Vermont        0.2537642 0.4173744 0.3288614
#> Virginia       0.1470128 0.6016305 0.2513568
#> Washington     0.2420740 0.5403595 0.2175666
#> West Virginia  0.2356115 0.4301945 0.3341939
#> Wisconsin      0.2298126 0.5057011 0.2644864
#> Wyoming        0.1652925 0.6041126 0.2305949

# Hard clustering with other distance function
cc_manh <- fuzzyclara(data        = USArrests,
                      clusters    = 3,
                      metric      = "manhattan",
                      samples     = 1,
                      sample_size = NULL,
                      type        = "hard",
                      seed        = 3526,
                      verbose     = 0)

cc_manh
#> Clustering results
#> 
#> Medoids
#> [1] "New Mexico"    "Oklahoma"      "New Hampshire"
#> 
#> Clustering
#>  [1] 2 2 2 3 2 2 1 3 2 2 3 1 2 3 1 3 3 2 1 2 3 2 1 2 3 3 3 2 1 3 2 2 2 1 3 3 3 3
#> [39] 3 2 1 2 2 3 1 3 3 1 1 3
#> 
#> Minimum average distance
#> [1] 2.011671

# Hard clustering with Minkowski distance
# In order to specify arguments of the distance metric (e. g. p for
# Minkowski distance), 
# you can use a self-defined distance function.

dist_mink <- function(x, y) {
  # Wrap both observations in a list and let proxy::dist compute the
  # Minkowski distance between them with the exponent fixed at p = 1.
  observations <- list(x, y)
  proxy::dist(observations, method = "minkowski", p = 1)
}
cc_mink <- fuzzyclara(data        = USArrests,
                      clusters    = 3,
                      metric      = dist_mink,
                      samples     = 1,
                      sample_size = NULL,
                      type        = "hard",
                      seed        = 3526,
                      verbose     = 0)
cc_mink
#> Clustering results
#> 
#> Medoids
#> [1] "New Mexico"    "Oklahoma"      "New Hampshire"
#> 
#> Clustering
#>  [1] 2 2 2 3 2 2 1 3 2 2 3 1 2 3 1 3 3 2 1 2 3 2 1 2 3 3 3 2 1 3 2 2 2 1 3 3 3 3
#> [39] 3 2 1 2 2 3 1 3 3 1 1 3
#> 
#> Minimum average distance
#> [1] 2.011671