Skip to content

Commit 46c7694

Browse files
committed
New cluster objective based on minimal cluster size
1 parent b546ef6 commit 46c7694

3 files changed

Lines changed: 41 additions & 15 deletions

File tree

R/cluster_analysis.R

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
#'
55
#' @param dpp The output of [dynamic_per_pixel()]
66
#' @param kMax Tha maximum number of clusters to be tested
7-
#' @param explanation_objective The a threshold between 0 and 1 for the ratio
8-
#' between Within-Cluster sum of squares and Between-Cluster sum of squares.
7+
#' @param minimum_clusterSize The minimum size of a cluster allowed. By default
8+
#' 1% of all moving average pixels, need to be in a cluster.
99
#' @param plot_result Logical
1010
#' @param correlate_first If TRUE, Pearson correlation between pixels will be
1111
#' used for clustering instead of Index values. This results in a clustering
@@ -28,28 +28,27 @@
2828
#' @export
2929
#'
3030
best_nk <- function(
31-
dpp, kMax = 10, explanation_objective = 0.98, plot_result = TRUE, correlate_first = FALSE
31+
dpp, kMax = 10, minimum_clusterSize = 0.01, plot_result = TRUE, correlate_first = FALSE
3232
){
3333
y <- prepare_for_clustering(
3434
moving_averages_matrix = dpp$moving_averages[,-1],
3535
correlate_first = correlate_first,
3636
maxPixels = 10000
3737
)
3838

39-
wcss <- c()
40-
withinBetween <- 0
4139
i <- 1
42-
while(withinBetween < explanation_objective){
40+
kmeansOutput <- kmeans(t(y), centers = i, iter.max = 20, nstart = 2)
41+
wcss <-withinBetween <- kmeansOutput$betweenss / kmeansOutput$totss
42+
while(all(kmeansOutput$size > minimum_clusterSize * ncol(y))){
4343
if(i > kMax){
4444
break
4545
}
46+
i <- i + 1
4647
cat(paste0("Calculation of ", i, ifelse(i == 1, " cluster", " clusters"), " ... \n"))
47-
4848
kmeansOutput <- kmeans(t(y), centers = i, iter.max = 20, nstart = 2)
49-
49+
5050
withinBetween <- kmeansOutput$betweenss / kmeansOutput$totss
5151
wcss <- c(wcss, withinBetween)
52-
i <- i + 1
5352
}
5453

5554
par(mar = c(4.1, 4.1, 4.1, 2.1))
@@ -65,7 +64,6 @@ best_nk <- function(
6564
best_nCluster <- length(wcss)
6665

6766
abline(v = best_nCluster)
68-
abline(h = explanation_objective * 100, lty = "dotted")
6967

7068
tp <- ifelse(
7169
abs(best_nCluster - par("xaxp")[2]) < abs(best_nCluster - par("xaxp")[1]),
@@ -107,7 +105,7 @@ pixel_clusters <- function(
107105
keep <- 1:365
108106
if(!whole_dynamic){
109107
proportionToRemove <- 100000 / ncol(dpp$moving_averages)
110-
i_d <- round(1 / proportionToRemove)
108+
i_d <- ceiling(1 / proportionToRemove)
111109
if(i_d > 10){
112110
i_d <- 10
113111
}
@@ -149,10 +147,17 @@ pixel_clusters <- function(
149147
#' rows of the moving averages of one index as created by [dynamic_per_pixel()]
150148
#' @param correlate_first If TRUE, Pearson correlation between pixels will be
151149
#' used for clustering instead of Index values. This results in a clustering
152-
#' based on different shapes of the dynamic.
150+
#' based on different shapes of the dynamic.
153151
#' @param maxPixels The maximum numbers of pixel used for cluster analysis.
154152
#' Leave NULL to use all pixels.
155153
#'
154+
#' @details
155+
#' If correlate first, pixels are identified as outliers and removed if there
156+
#' not at least 5% other pixels that correlate at least 0.95. For example, given
157+
#' 1000 correlated pixels, the evaluated pixels is not an outlier if it
158+
#' correlates to >= 50 pixels >= 0.95.
159+
#'
160+
#'
156161
#' @return A matrix either with data per pixels or correlation between pixels
157162
#' used for following cluster analysis
158163
#'
@@ -189,6 +194,21 @@ prepare_for_clustering <- function(
189194
}
190195
cat(paste0("correlation of ", ncol(ts_table), " pixels ... \n"))
191196
y <- t(cor(x = ts_table, y = ts_reference, use = "pairwise.complete.obs"))
197+
198+
199+
# remove pixels that have a low correlation to almost "all" others
200+
# highCor <- apply(y, 2, quantile, p = 0.95)
201+
# )
202+
# cat(paste0(
203+
# sum(highCor < 0.95), " pixels of ", ncol(y), " pixels identified as ",
204+
# "outlier and removed ... \n")
205+
# )
206+
#
207+
# # maxCor <- apply(y, 2, max)
208+
# # maxCor >= 0.99
209+
#
210+
# sum(which(maxCor < 0.99) %in% which(highCor < 0.95))
211+
# y <- round(y[,highCor >= 0.95], 5)
192212
y <- round(y, 5)
193213
} else {
194214
y <- ts_table

man/best_nk.Rd

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/prepare_for_clustering.Rd

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)