Skip to contents

This document gives an overview of the functionality provided by the R package fuzzyclara.

Clustering

Hard clustering

# Hard clustering of USArrests into three clusters with the Euclidean metric.
cc_hard <- fuzzyclara(
  data        = USArrests,
  clusters    = 3,
  metric      = "euclidean",
  samples     = 1,
  sample_size = NULL,
  type        = "hard",
  seed        = 3526, # fixed seed; presumably makes the printed output reproducible
  verbose     = 0
)
# Print the fitted clustering object.
cc_hard
## Clustering results
## 
## Medoids
## [1] "New Mexico"    "Oklahoma"      "New Hampshire"
## 
## Clustering
##  [1] 2 2 2 3 2 2 3 3 2 2 3 1 2 3 1 3 3 2 1 2 3 2 1 2 3 3 3 2 1 3 2 2 2 1 3 3 3 3
## [39] 3 2 1 2 2 3 1 3 3 1 1 3
## 
## Minimum average distance
## [1] 1.180717

Fuzzy clustering

# Fuzzy clustering of USArrests into three clusters with the Euclidean metric.
cc_fuzzy <- fuzzyclara(
  data        = USArrests,
  clusters    = 3,
  metric      = "euclidean",
  samples     = 1,
  sample_size = NULL,
  type        = "fuzzy",
  m           = 2, # presumably the fuzziness exponent; confirm in ?fuzzyclara
  seed        = 3526,
  verbose     = 0
)
# Print the fitted clustering object, including the membership scores.
cc_fuzzy
## Clustering results
## 
## Medoids
## [1] "Oklahoma"  "Arizona"   "Tennessee"
## 
## Clustering
##  [1] 3 3 1 2 1 1 2 2 1 3 2 2 1 2 2 2 2 3 2 1 2 1 2 3 2 2 2 1 2 2 1 1 3 2 2 2 2 2
## [39] 2 3 2 3 3 2 2 2 2 2 2 2
## 
## Minimum average weighted distance
## [1] 1.94242
## 
## Membership scores
##                 Cluster1  Cluster2  Cluster3
## Alabama        0.2040878 0.2391714 0.5567409
## Alaska         0.3373655 0.2726496 0.3899849
## Arizona        1.0000000 0.0000000 0.0000000
## Arkansas       0.2075892 0.3966215 0.3957893
## California     0.5401685 0.2248051 0.2350264
## Colorado       0.4475538 0.2744007 0.2780455
## Connecticut    0.2348136 0.5280016 0.2371848
## Delaware       0.2906227 0.4701428 0.2392345
## Florida        0.4443412 0.2316682 0.3239905
## Georgia        0.2091524 0.2149396 0.5759081
## Hawaii         0.2482766 0.4883161 0.2634073
## Idaho          0.2209589 0.5129169 0.2661242
## Illinois       0.4666698 0.2739684 0.2593617
## Indiana        0.1344369 0.6694262 0.1961369
## Iowa           0.2311216 0.4905457 0.2783327
## Kansas         0.1310680 0.6999444 0.1689876
## Kentucky       0.1917648 0.4401893 0.3680459
## Louisiana      0.2560625 0.2412981 0.5026393
## Maine          0.2396947 0.4695769 0.2907285
## Maryland       0.4281216 0.2306369 0.3412416
## Massachusetts  0.2682437 0.5043343 0.2274220
## Michigan       0.4467571 0.2192029 0.3340400
## Minnesota      0.2158369 0.5379562 0.2462069
## Mississippi    0.2484292 0.2817416 0.4698292
## Missouri       0.2669546 0.3898602 0.3431852
## Montana        0.1922866 0.5233027 0.2844107
## Nebraska       0.1814854 0.5935543 0.2249603
## Nevada         0.4372644 0.2469118 0.3158237
## New Hampshire  0.2351771 0.4821286 0.2826942
## New Jersey     0.3025875 0.4474846 0.2499279
## New Mexico     0.4736616 0.2098122 0.3165261
## New York       0.4959333 0.2489337 0.2551329
## North Carolina 0.2984813 0.2995500 0.4019686
## North Dakota   0.2525175 0.4409055 0.3065770
## Ohio           0.1722044 0.6264071 0.2013885
## Oklahoma       0.0000000 1.0000000 0.0000000
## Oregon         0.2597840 0.4842455 0.2559705
## Pennsylvania   0.1733916 0.6187011 0.2079073
## Rhode Island   0.2938264 0.4548769 0.2512968
## South Carolina 0.2521289 0.2569116 0.4909595
## South Dakota   0.2294820 0.4627901 0.3077278
## Tennessee      0.0000000 0.0000000 1.0000000
## Texas          0.3315450 0.2977964 0.3706587
## Utah           0.2550652 0.5204090 0.2245258
## Vermont        0.2537642 0.4173744 0.3288614
## Virginia       0.1470128 0.6016305 0.2513568
## Washington     0.2420740 0.5403595 0.2175666
## West Virginia  0.2356115 0.4301945 0.3341939
## Wisconsin      0.2298126 0.5057011 0.2644864
## Wyoming        0.1652925 0.6041126 0.2305949

Clustering with a self-defined distance function and other distance functions

# Euclidean distance between two observations x and y:
# the square root of the summed squared element-wise differences.
dist_function <- function(x, y) {
  squared_diff <- (x - y)^2
  sqrt(sum(squared_diff))
}

# Fuzzy clustering where the metric is the self-defined function
# dist_function instead of a metric name.
cc_dist <- fuzzyclara(
  data        = USArrests,
  clusters    = 3,
  metric      = dist_function,
  samples     = 1,
  sample_size = NULL,
  type        = "fuzzy",
  m           = 2,
  seed        = 3526,
  verbose     = 0
)
# Print the fitted clustering object.
cc_dist
## Clustering results
## 
## Medoids
## [1] "Oklahoma"  "Arizona"   "Tennessee"
## 
## Clustering
##  [1] 3 3 1 2 1 1 2 2 1 3 2 2 1 2 2 2 2 3 2 1 2 1 2 3 2 2 2 1 2 2 1 1 3 2 2 2 2 2
## [39] 2 3 2 3 3 2 2 2 2 2 2 2
## 
## Minimum average weighted distance
## [1] 1.94242
## 
## Membership scores
##                 Cluster1  Cluster2  Cluster3
## Alabama        0.2040878 0.2391714 0.5567409
## Alaska         0.3373655 0.2726496 0.3899849
## Arizona        1.0000000 0.0000000 0.0000000
## Arkansas       0.2075892 0.3966215 0.3957893
## California     0.5401685 0.2248051 0.2350264
## Colorado       0.4475538 0.2744007 0.2780455
## Connecticut    0.2348136 0.5280016 0.2371848
## Delaware       0.2906227 0.4701428 0.2392345
## Florida        0.4443412 0.2316682 0.3239905
## Georgia        0.2091524 0.2149396 0.5759081
## Hawaii         0.2482766 0.4883161 0.2634073
## Idaho          0.2209589 0.5129169 0.2661242
## Illinois       0.4666698 0.2739684 0.2593617
## Indiana        0.1344369 0.6694262 0.1961369
## Iowa           0.2311216 0.4905457 0.2783327
## Kansas         0.1310680 0.6999444 0.1689876
## Kentucky       0.1917648 0.4401893 0.3680459
## Louisiana      0.2560625 0.2412981 0.5026393
## Maine          0.2396947 0.4695769 0.2907285
## Maryland       0.4281216 0.2306369 0.3412416
## Massachusetts  0.2682437 0.5043343 0.2274220
## Michigan       0.4467571 0.2192029 0.3340400
## Minnesota      0.2158369 0.5379562 0.2462069
## Mississippi    0.2484292 0.2817416 0.4698292
## Missouri       0.2669546 0.3898602 0.3431852
## Montana        0.1922866 0.5233027 0.2844107
## Nebraska       0.1814854 0.5935543 0.2249603
## Nevada         0.4372644 0.2469118 0.3158237
## New Hampshire  0.2351771 0.4821286 0.2826942
## New Jersey     0.3025875 0.4474846 0.2499279
## New Mexico     0.4736616 0.2098122 0.3165261
## New York       0.4959333 0.2489337 0.2551329
## North Carolina 0.2984813 0.2995500 0.4019686
## North Dakota   0.2525175 0.4409055 0.3065770
## Ohio           0.1722044 0.6264071 0.2013885
## Oklahoma       0.0000000 1.0000000 0.0000000
## Oregon         0.2597840 0.4842455 0.2559705
## Pennsylvania   0.1733916 0.6187011 0.2079073
## Rhode Island   0.2938264 0.4548769 0.2512968
## South Carolina 0.2521289 0.2569116 0.4909595
## South Dakota   0.2294820 0.4627901 0.3077278
## Tennessee      0.0000000 0.0000000 1.0000000
## Texas          0.3315450 0.2977964 0.3706587
## Utah           0.2550652 0.5204090 0.2245258
## Vermont        0.2537642 0.4173744 0.3288614
## Virginia       0.1470128 0.6016305 0.2513568
## Washington     0.2420740 0.5403595 0.2175666
## West Virginia  0.2356115 0.4301945 0.3341939
## Wisconsin      0.2298126 0.5057011 0.2644864
## Wyoming        0.1652925 0.6041126 0.2305949

You can also use other distance functions from the proxy package, such as Gower or Manhattan. To specify additional arguments of the distance metric (e.g. p for the Minkowski distance), you can use a self-defined distance function.

# Hard clustering of USArrests with the Manhattan metric, selected by name.
cc_manh <- fuzzyclara(
  data        = USArrests,
  clusters    = 3,
  metric      = "manhattan",
  samples     = 1,
  sample_size = NULL,
  type        = "hard",
  seed        = 3526,
  verbose     = 0
)
# Print the fitted clustering object.
cc_manh
## Clustering results
## 
## Medoids
## [1] "New Mexico"    "Oklahoma"      "New Hampshire"
## 
## Clustering
##  [1] 2 2 2 3 2 2 1 3 2 2 3 1 2 3 1 3 3 2 1 2 3 2 1 2 3 3 3 2 1 3 2 2 2 1 3 3 3 3
## [39] 3 2 1 2 2 3 1 3 3 1 1 3
## 
## Minimum average distance
## [1] 2.011671
# Minkowski distance between x and y with exponent p = 1, computed by
# proxy::dist() on the two observations wrapped in a list.
dist_mink <- function(x, y) {
  observation_pair <- list(x, y)
  proxy::dist(observation_pair, method = "minkowski", p = 1)
}
# Hard clustering of USArrests using the self-defined Minkowski distance
# (dist_mink) as the metric.
cc_mink <- fuzzyclara(
  data        = USArrests,
  clusters    = 3,
  metric      = dist_mink,
  samples     = 1,
  sample_size = NULL,
  type        = "hard",
  seed        = 3526,
  verbose     = 0
)
# Print the fitted clustering object.
cc_mink
## Clustering results
## 
## Medoids
## [1] "New Mexico"    "Oklahoma"      "New Hampshire"
## 
## Clustering
##  [1] 2 2 2 3 2 2 1 3 2 2 3 1 2 3 1 3 3 2 1 2 3 2 1 2 3 3 3 2 1 3 2 2 2 1 3 3 3 3
## [39] 3 2 1 2 2 3 1 3 3 1 1 3
## 
## Minimum average distance
## [1] 2.011671

Select optimal number of clusters

# Evaluate hard clusterings of USArrests for every cluster number from 2 to 6.
cc_number <- evaluate_cluster_numbers(
  data           = USArrests,
  clusters_range = 2:6,
  metric         = "euclidean",
  samples        = 1,
  sample_size    = NULL,
  type           = "hard",
  seed           = 3526,
  verbose        = 0
)
# Print/display the evaluation result.
cc_number

Plot of clustering results

# Enrich the USArrests dataset with two factor columns:
#   State - the state name, taken from the row names
#   Area  - the area (West, South, Midwest or Northeast) the state is assigned to
USArrests_enriched <- USArrests %>%
  mutate(
    State = as.factor(rownames(USArrests)),
    Area  = as.factor(case_when(
      State %in% c(
        "Washington", "Oregon", "California", "Nevada", "Arizona", "Idaho",
        "Montana", "Wyoming", "Colorado", "New Mexico", "Utah", "Hawaii",
        "Alaska"
      ) ~ "West",
      State %in% c(
        "Texas", "Oklahoma", "Arkansas", "Louisiana", "Mississippi",
        "Alabama", "Tennessee", "Kentucky", "Georgia", "Florida",
        "South Carolina", "North Carolina", "Virginia", "West Virginia"
      ) ~ "South",
      State %in% c(
        "Kansas", "Nebraska", "South Dakota", "North Dakota", "Minnesota",
        "Missouri", "Iowa", "Illinois", "Indiana", "Michigan", "Wisconsin",
        "Ohio"
      ) ~ "Midwest",
      State %in% c(
        "Maine", "New Hampshire", "New York", "Massachusetts",
        "Rhode Island", "Vermont", "Pennsylvania", "New Jersey",
        "Connecticut", "Delaware", "Maryland"
      ) ~ "Northeast"
    ))
  )

Boxplot

# Plot the fuzzy clustering result for the variable "Assault".
plot(
  x        = cc_fuzzy,
  data     = USArrests_enriched,
  variable = "Assault"
)

Barplot

# Plot the fuzzy clustering result for the factor variable "Area".
plot(
  x        = cc_fuzzy,
  data     = USArrests_enriched,
  variable = "Area"
)

Wordcloud

# Word clouds of the "State" variable for the fuzzy clustering result.
plot(
  x        = cc_fuzzy,
  data     = USArrests_enriched,
  variable = "State",
  type     = "wordclouds"
)

Scatterplot

# Scatterplot of Murder (x axis) against Assault (y axis) for the fuzzy
# clustering result.
plot(
  x     = cc_fuzzy,
  data  = USArrests_enriched,
  type  = "scatterplot",
  x_var = "Murder",
  y_var = "Assault"
)
## `geom_smooth()` using formula = 'y ~ x'

# Same Murder/Assault scatterplot, this time with focus = TRUE.
plot(
  x     = cc_fuzzy,
  data  = USArrests_enriched,
  type  = "scatterplot",
  x_var = "Murder",
  y_var = "Assault",
  focus = TRUE
)

PCA

# PCA plot of the fuzzy clustering result, grouped by "Area".
plot(
  x        = cc_fuzzy,
  data     = USArrests_enriched,
  type     = "pca",
  group_by = "Area"
)

# PCA plot of the fuzzy clustering result with focus = TRUE.
plot(
  x     = cc_fuzzy,
  data  = USArrests_enriched,
  type  = "pca",
  focus = TRUE
)

Silhouette

# Silhouette plot of the fuzzy clustering result; note that this call uses
# the original USArrests data rather than USArrests_enriched.
plot(
  x    = cc_fuzzy,
  data = USArrests,
  type = "silhouette"
)
## $plot

## 
## $silhouette_table
##   Cluster Size Silhouette width
## 1       1   10        0.3802710
## 2       2   31        0.3786707
## 3       3    9        0.2383694
## 
## $average_silhouette_width
## [1] 0.3537365