Visualization of clustering solution by variables — plot.fuzzyclara • fuzzyclara

Function to provide a graphical visualization of how variables are distributed across the clusters of a clustering solution

Usage

# S3 method for class 'fuzzyclara'
plot(
  x,
  data,
  type = NULL,
  variable = NULL,
  na.omit = FALSE,
  membership_threshold = 0,
  sample_percentage = 1,
  plot_membership_scores = FALSE,
  seed = 42,
  ...
)

Arguments

x: An object of class "fuzzyclara"
data: data.frame or matrix used for clustering
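type, variable: type is the type of plot. One of c("barplot", "boxplot", "wordclouds", "silhouette", "pca", "scatterplot", "parallel"). Defaults to NULL, which plots either a barplot or a boxplot, depending on the class of variable. variable is the name of the variable in data to be plotted (e.g. "Assault"). Defaults to NULL.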
na.omit: Should missing values be excluded for plotting? Defaults to FALSE.
membership_threshold: Threshold for fuzzy clustering observations to be plotted. Must be a number between 0 and 1. Defaults to 0.
sample_percentage: Percentage value that indicates what percentage of observations should be randomly selected for representation in the plot. Must be a number between 0 and 1. Defaults to 1.
plot_membership_scores: Boolean value indicating whether the cluster membership scores for the observations should be indicated through the line transparency (TRUE) or not (FALSE). Defaults to FALSE.
seed: Random number seed (needed for clara_wordcloud and clara_parallel). Defaults to 42.
...: Further arguments for internal plot functions. For each type there is an internal plot function. See for example ?clara_pca.

Value

Clustering plot

Examples


# Prepare data for example (enrich the USArrests dataset by area and state)
library(dplyr)
#> 
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#> 
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#> 
#>     intersect, setdiff, setequal, union
# Add two factor columns to USArrests:
#   State - the state name, taken from the row names
#   Area  - the region ("West", "South", "Midwest" or "Northeast") that the
#           state is assigned to by the lookup below
USArrests_enriched <- USArrests %>%
  mutate(State = as.factor(rownames(USArrests))) %>%
  mutate(
    Area = as.factor(case_when(
      State %in% c(
        "Washington", "Oregon", "California", "Nevada", "Arizona", "Idaho",
        "Montana", "Wyoming", "Colorado", "New Mexico", "Utah", "Hawaii",
        "Alaska"
      ) ~ "West",
      State %in% c(
        "Texas", "Oklahoma", "Arkansas", "Louisiana", "Mississippi",
        "Alabama", "Tennessee", "Kentucky", "Georgia", "Florida",
        "South Carolina", "North Carolina", "Virginia", "West Virginia"
      ) ~ "South",
      State %in% c(
        "Kansas", "Nebraska", "South Dakota", "North Dakota", "Minnesota",
        "Missouri", "Iowa", "Illinois", "Indiana", "Michigan", "Wisconsin",
        "Ohio"
      ) ~ "Midwest",
      State %in% c(
        "Maine", "New Hampshire", "New York", "Massachusetts", "Rhode Island",
        "Vermont", "Pennsylvania", "New Jersey", "Connecticut", "Delaware",
        "Maryland"
      ) ~ "Northeast"
    ))
  )
# Determine the clusters that will be plotted: a hard clustering of the
# original USArrests data into three clusters
cc_hard <- fuzzyclara(
  data = USArrests,
  clusters = 3,
  metric = "euclidean",
  samples = 1,
  sample_size = NULL,
  type = "hard",
  seed = 3526,
  verbose = 0
)
# Print the resulting clustering object
cc_hard
#> Clustering results
#> 
#> Medoids
#> [1] "New Mexico"    "Oklahoma"      "New Hampshire"
#> 
#> Clustering
#>  [1] 2 2 2 3 2 2 3 3 2 2 3 1 2 3 1 3 3 2 1 2 3 2 1 2 3 3 3 2 1 3 2 2 2 1 3 3 3 3
#> [39] 3 2 1 2 2 3 1 3 3 1 1 3
#> 
#> Minimum average distance
#> [1] 1.180717

# Fuzzy counterpart of the clustering above: same data, metric, number of
# clusters and seed, but with type = "fuzzy" and m = 2
cc_fuzzy <- fuzzyclara(
  data = USArrests,
  clusters = 3,
  metric = "euclidean",
  samples = 1,
  sample_size = NULL,
  type = "fuzzy",
  m = 2,
  seed = 3526,
  verbose = 0
)
# Print the resulting clustering object (includes the membership scores)
cc_fuzzy
#> Clustering results
#> 
#> Medoids
#> [1] "Oklahoma"  "Arizona"   "Tennessee"
#> 
#> Clustering
#>  [1] 3 3 1 2 1 1 2 2 1 3 2 2 1 2 2 2 2 3 2 1 2 1 2 3 2 2 2 1 2 2 1 1 3 2 2 2 2 2
#> [39] 2 3 2 3 3 2 2 2 2 2 2 2
#> 
#> Minimum average weighted distance
#> [1] 1.94242
#> 
#> Membership scores
#>                 Cluster1  Cluster2  Cluster3
#> Alabama        0.2040878 0.2391714 0.5567409
#> Alaska         0.3373655 0.2726496 0.3899849
#> Arizona        1.0000000 0.0000000 0.0000000
#> Arkansas       0.2075892 0.3966215 0.3957893
#> California     0.5401685 0.2248051 0.2350264
#> Colorado       0.4475538 0.2744007 0.2780455
#> Connecticut    0.2348136 0.5280016 0.2371848
#> Delaware       0.2906227 0.4701428 0.2392345
#> Florida        0.4443412 0.2316682 0.3239905
#> Georgia        0.2091524 0.2149396 0.5759081
#> Hawaii         0.2482766 0.4883161 0.2634073
#> Idaho          0.2209589 0.5129169 0.2661242
#> Illinois       0.4666698 0.2739684 0.2593617
#> Indiana        0.1344369 0.6694262 0.1961369
#> Iowa           0.2311216 0.4905457 0.2783327
#> Kansas         0.1310680 0.6999444 0.1689876
#> Kentucky       0.1917648 0.4401893 0.3680459
#> Louisiana      0.2560625 0.2412981 0.5026393
#> Maine          0.2396947 0.4695769 0.2907285
#> Maryland       0.4281216 0.2306369 0.3412416
#> Massachusetts  0.2682437 0.5043343 0.2274220
#> Michigan       0.4467571 0.2192029 0.3340400
#> Minnesota      0.2158369 0.5379562 0.2462069
#> Mississippi    0.2484292 0.2817416 0.4698292
#> Missouri       0.2669546 0.3898602 0.3431852
#> Montana        0.1922866 0.5233027 0.2844107
#> Nebraska       0.1814854 0.5935543 0.2249603
#> Nevada         0.4372644 0.2469118 0.3158237
#> New Hampshire  0.2351771 0.4821286 0.2826942
#> New Jersey     0.3025875 0.4474846 0.2499279
#> New Mexico     0.4736616 0.2098122 0.3165261
#> New York       0.4959333 0.2489337 0.2551329
#> North Carolina 0.2984813 0.2995500 0.4019686
#> North Dakota   0.2525175 0.4409055 0.3065770
#> Ohio           0.1722044 0.6264071 0.2013885
#> Oklahoma       0.0000000 1.0000000 0.0000000
#> Oregon         0.2597840 0.4842455 0.2559705
#> Pennsylvania   0.1733916 0.6187011 0.2079073
#> Rhode Island   0.2938264 0.4548769 0.2512968
#> South Carolina 0.2521289 0.2569116 0.4909595
#> South Dakota   0.2294820 0.4627901 0.3077278
#> Tennessee      0.0000000 0.0000000 1.0000000
#> Texas          0.3315450 0.2977964 0.3706587
#> Utah           0.2550652 0.5204090 0.2245258
#> Vermont        0.2537642 0.4173744 0.3288614
#> Virginia       0.1470128 0.6016305 0.2513568
#> Washington     0.2420740 0.5403595 0.2175666
#> West Virginia  0.2356115 0.4301945 0.3341939
#> Wisconsin      0.2298126 0.5057011 0.2644864
#> Wyoming        0.1652925 0.6041126 0.2305949
                                    
# Boxplot (type is left at its default NULL, so the plot type is chosen from
# the class of the variable)
plot(
  x = cc_hard,
  data = USArrests_enriched,
  variable = "Assault"
)


# Barplot (again via the default type, this time for a factor variable)
plot(
  x = cc_hard,
  data = USArrests_enriched,
  variable = "Area"
)


# Wordcloud of the state names, with an explicit seed
plot(
  x = cc_hard,
  data = USArrests_enriched,
  variable = "State",
  type = "wordclouds",
  seed = 123
)


# Scatterplot of two variables
plot(
  x = cc_hard,
  data = USArrests_enriched,
  type = "scatterplot",
  x_var = "Murder",
  y_var = "Assault"
)
#> `geom_smooth()` using formula = 'y ~ x'


# Scatterplot showing the membership probability for the fuzzy clustering
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "scatterplot",
  x_var = "Murder",
  y_var = "Assault",
  focus = TRUE
)


# Same scatterplot, restricted to a single cluster
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "scatterplot",
  x_var = "Murder",
  y_var = "Assault",
  focus = TRUE,
  focus_clusters = c(1)
)


# PCA plot of the hard clustering, grouped by area
plot(
  x = cc_hard,
  data = USArrests_enriched,
  type = "pca",
  group_by = "Area"
)


# PCA plot showing the membership probability for all clusters ...
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "pca",
  focus = TRUE
)


# ... and for one selected cluster only
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "pca",
  focus = TRUE,
  focus_clusters = c(1)
)


# Silhouette plot (uses the original USArrests data)
plot(
  x = cc_hard,
  data = USArrests,
  type = "silhouette"
)
#> $plot

#> 
#> $silhouette_table
#>   Cluster Size Silhouette width
#> 1       1   10        0.4604416
#> 2       2   19        0.2757843
#> 3       3   21        0.2797126
#> 
#> $average_silhouette_width
#> [1] 0.3143656
#> 
 
# Fuzzy clustering plots using a threshold for the membership scores.
# PCA plot with threshold 0 ...
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "pca",
  variable = "Assault",
  membership_threshold = 0
)


# ... and with threshold 0.5
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "pca",
  variable = "Assault",
  membership_threshold = 0.5
)


# Scatterplot with threshold 0
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "scatterplot",
  x_var = "Murder",
  y_var = "Assault",
  membership_threshold = 0
)
#> `geom_smooth()` using formula = 'y ~ x'


# Scatterplot with threshold 0.5
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "scatterplot",
  x_var = "Murder",
  y_var = "Assault",
  membership_threshold = 0.5
)
#> `geom_smooth()` using formula = 'y ~ x'
#> Warning: NaNs produced
#> Warning: no non-missing arguments to max; returning -Inf


# Scatterplot with threshold 0.5 and plot_all_fuzzy = TRUE
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "scatterplot",
  x_var = "Murder",
  y_var = "Assault",
  membership_threshold = 0.5,
  plot_all_fuzzy = TRUE
)
#> `geom_smooth()` using formula = 'y ~ x'
#> Warning: NaNs produced
#> Warning: no non-missing arguments to max; returning -Inf

  
# PCA plots of the fuzzy clustering grouped by area: threshold 0 ...
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "pca",
  group_by = "Area",
  membership_threshold = 0
)


# ... threshold 0.5 ...
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "pca",
  group_by = "Area",
  membership_threshold = 0.5
)


# ... and threshold 0.5 with plot_all_fuzzy = TRUE
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "pca",
  group_by = "Area",
  membership_threshold = 0.5,
  plot_all_fuzzy = TRUE
)


# Parallel plot using all observations, with membership scores shown
plot(
  x = cc_fuzzy,
  data = USArrests_enriched,
  type = "parallel",
  sample_percentage = 1,
  plot_membership_scores = TRUE
)