We performed additional validation of our analytical approach and findings. First, we performed an analysis of interobserver reliability in call classification at the individual scale across multiple observers. Second, we compared visual inspection, spectrographic cross-correlation (SPCC) and random forests as similarity methods. Finally, we asked whether or not a difference in social context between the individual and higher social scales could have skewed results for monk parakeets.
rm(list = ls())
X <- c("tidyverse", "vegan", "pbapply", "dplyr", "data.table", "mclust", "effsize", "knitr")
invisible(lapply(X, library, character.only = TRUE))
theme_AcousticSpace <- function(){
theme(panel.background = element_rect(fill = "white"), plot.background = element_rect(fill = "white"),
panel.grid.major = element_line(size = 1, colour = "white"),
panel.grid.minor = element_line(size = 0.75, colour = "white"),
axis.line = element_line(size = 0.45, colour = "black"),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12))
}
Set directory paths for analysis.
in_path_st <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/visual-inspection/www"
in_path_classes <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/visual-inspection/classifications"
out_path <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/Site_Recordings"
Set general parameters.
social_scales <- c("Individual", "Pair", "Flock", "Site")
perms <- 9999
cores <- parallel::detectCores() - 2
seed <- 401
Read in .csv files of human classifications, obtained from Shiny app.
files <- list.files(pattern = ".csv", path = in_path_classes)
# files
csvs <- lapply(1:length(files), function(x){ read.csv(file.path(in_path_classes, files[x])) })
length(csvs)
## [1] 12
Here we asked how consistently human observers classified calls at the individual scale. This was the only social scale at which it was possible to calculate classification accuracy by human visual inspection.
# write a loop to extract individual scale information, assign individual IDs to classes by a majority rule and calculate classification accuracy per observer
observs <- 1:length(csvs)
classif_accur <- invisible(unlist(pblapply(observs, function(i){
tmp <- t(csvs[[i]][grep("Individual", csvs[[i]]$social_scale), grep("class_", names(csvs[[i]]))])
tmp <- data.frame(class = row.names(tmp), spectros = tmp[, 1])
# iterate over classes to assign an individual ID to each class
class_IDs <- unlist(lapply(1:nrow(tmp), function(x){
# obtain calls for the given class
calls <- as.character(tmp$spectros[x])
calls <- strsplit(calls, split = ";")[[1]]
# obtain individual IDs from the spectrogram image names
indivs <- sapply(1:length(calls), function(z){
strsplit(strsplit(calls[z], split = "INDIV-")[[1]][2], split = "_")[[1]][1]
})
if(all(!is.na(indivs))){
# find the individual with the most spectrograms assiged to this class
# this individual ID will become assigned to the given class to facilitate calculating classification accuracy
itbl <- table(indivs)
m <- max(itbl)
class_ID <- names(itbl)[which(itbl == m)]
} else {
class_ID <- NA
}
return(class_ID)
}))
# if each class could be assigned a unique individual ID
if(all(!is.na(class_IDs)) & !any(duplicated(class_IDs))){
tmp_class_IDs <- class_IDs
# if some classes were left empty
} else if(any(is.na(class_IDs))){
tmp_class_IDs <- class_IDs[!is.na(class_IDs)]
# if some classes were evenly assigned to different individuals
} else if(any(duplicated(class_IDs)) & all(!is.na(class_IDs))){
tmp_class_IDs <- class_IDs[!duplicated(class_IDs)]
}
wrong <- unlist(lapply(1:nrow(tmp), function(x){
# obtain calls for the given class
calls <- as.character(tmp$spectros[x])
calls <- strsplit(calls, split = ";")[[1]]
# obtain individual IDs from the spectrogram image names
indivs <- sapply(1:length(calls), function(z){
strsplit(strsplit(calls[z], split = "INDIV-")[[1]][2], split = "_")[[1]][1]
})
itbl <- table(indivs)
wrong <- as.vector(itbl[which(names(itbl) != tmp_class_IDs[x])])
return(wrong)
}))
wrong_class <- sum(wrong)
# extract the total number of calls classified
N <- sum(unlist(lapply(1:nrow(tmp), function(z){
calls <- as.character(tmp$spectros[z])
length(strsplit(calls, split = ";")[[1]])
})))
# calculate classification accuracy over all calls
return(((N - wrong_class)/N)*100)
})))
classif_accur
## [1] 75.00000 87.50000 75.00000 87.50000 55.55556 87.50000 50.00000
## [8] 81.25000 75.00000 75.00000 37.50000 75.00000
mean(classif_accur)
## [1] 71.81713
range(classif_accur)
## [1] 37.5 87.5
sd(classif_accur)
## [1] 15.94189
Read in extended selection tables per social scale used in Shiny app. These will determine how calls will be ordered in matrices across social scales.
# read in the extended selection tables after moving them into the Shiny app working directory
indiv_shiny_tbl <- readRDS(file.path(path = in_path_st, "indiv_scale_shiny_extended_seltable.RDS"))
pair_shiny_tbl <- readRDS(file.path(path = in_path_st, "pair_scale_shiny_extended_seltable.RDS"))
flock_shiny_tbl <- readRDS(file.path(path = in_path_st, "flock_scale_shiny_extended_seltable.RDS"))
site_shiny_tbl <- readRDS(file.path(path = in_path_st, "site_scale_shiny_extended_seltable.RDS"))
sel_tables <- list(indiv_shiny_tbl, pair_shiny_tbl, flock_shiny_tbl, site_shiny_tbl)
Write a function to convert human classifications into a list of binary matrices (one per social scale).
df2binmat <- function(X, class_prefix){
# iterate over social scales to make a list of binary matrices
mat_list <- invisible(pblapply(1:length(social_scales), function(i){
# get all calls for the given social scale
sel_table <- sel_tables[[i]]
calls_df <- data.frame(call_ID = paste(sel_table$sound.files, "-1.jpeg", sep = ""))
# now get the class labels for these calls
# first extract calls and classes for the given social scale
tmp <- droplevels(X[grep(paste("^", social_scales[i], "$", sep = ""), X$social_scale), ])
class_cols <- tmp[, grep(paste(paste("^", class_prefix, sep = ""), collapse = "|"), names(X))]
class_cols <- t(class_cols)
# then iterate over rows and turn this into a data frame with one row per call
class_df <- data.table::rbindlist(lapply(1:nrow(class_cols), function(y){
calls_tmp <- unlist(strsplit(class_cols[y, ], split = ";"))
df <- data.frame(class_label = row.names(class_cols)[y], call_ID = calls_tmp)
return(df)
}))
# iterate over rows of the calls data frame for the given social scale, pulling out class labels from the class data frame
final_df <- data.table::rbindlist(lapply(1:nrow(calls_df), function(y){
call_tmp <- calls_df[y, ]
label <- as.character(class_df$class_label[grep(paste("^", call_tmp, "$", sep = ""), class_df$call_ID)])
return(data.frame(call_ID = call_tmp, class_label = label, ID = as.character(sel_table$ID[y])))
}))
# iterate over each row in this data frame and compare the label of the given call to labels of all other calls
# this comparison is the same as asking whether or not calls were classified in the same group
# this will return a long binary vector that can be used to build the final matrix
class_comps <- unlist(lapply(1:nrow(calls_df), function(y){
as.numeric(final_df$class_label[y] == final_df$class_label)
}))
# build the final matrix of human classifications, which contains binary encoding for whether or not calls were classified in the same group per observer and social scale
human_class_mat <- matrix(class_comps, nrow = nrow(final_df), ncol = nrow(final_df), byrow = TRUE)
dimnames(human_class_mat) <- list(final_df$ID, final_df$ID)
return(human_class_mat)
}))
names(mat_list) <- social_scales
return(mat_list)
}
human_class_mats_by_observer <- pblapply(1:length(csvs), function(x){
df2binmat(X = csvs[[x]], class_prefix = "class")
})
names(human_class_mats_by_observer) <- paste("Observer", seq(1, length(human_class_mats_by_observer), 1), sep = "-")
# str(human_class_mats_by_observer)
These matrices look good. We moved on to subsetting SPCC and random forests acoustic similarity matrices for the given calls per social scale.
# read in repeatedly sampled individuals SPCC and random forests matrices
# also the SPCC and random forests matrices for higher social scales
i_path <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/Focal_Individuals"
h_path <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/Site_Recordings"
# repeatedly sampled individuals
# needs to be filtered by calls used in the human classification, which were the same calls used for random forests validation
xc_mat_i <- readRDS(file.path(i_path, "xc_mat_allfocal.RDS")) # SPCC
# str(xc_mat_i)
# this already contains only calls in the random forests validation data set (e.g. the individuals used in human classification)
# has no dimension names
rf_mat_i <- readRDS(file.path(h_path, "proxm_ms.RDS")) # random forests
# str(rf_mat_i)
# higher social scales
# these need to be filtered for pair, flock and site calls used in human classification
xc_mat_h <- readRDS(file.path(h_path, "xc_mat_sites.RDS")) # SPCC
# str(xc_mat_h)
# add dimension names to the rf_mat to facilitate filtering, these are the same as the xc_mat for higher social scales
rf_mat_h <- readRDS(file.path(h_path, "proxm_site.RDS")) # random forests
dimnames(rf_mat_h) <- dimnames(xc_mat_h)
# str(rf_mat_h)
Subset the SPCC and random forests matrices for repeatedly sampled individuals used in human classification.
sel_table <- sel_tables[[1]]
# unique(sel_table$scale) # checking
# dimnames(xc_mat_i)
# make the selection table call IDs match the SPCC matrix dimension names
call_IDs <- sel_table$sound.files
call_IDs <- gsub("WAV_", "WAV-", call_IDs)
# filter the SPCC matrix by the call_IDs in the selection table used for human classification
xc_mat_i_sub <- xc_mat_i[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_i)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_i)[[2]])]
# dimnames(xc_mat_i_sub) # checking
# replace dimnames to be friendlier for plotting
dimnames(xc_mat_i_sub) <- list(sel_table$ID, sel_table$ID)
dim(xc_mat_i_sub)
# filter the random forests matrix by the call_IDs in the selection table used for human classification
# first give the random forests matrix dimension names
rf_dim_nms <- dimnames(xc_mat_i)[[1]][grep("RAW|ZW8|-BIRD3|-BIRD5", dimnames(xc_mat_i)[[1]])]
dimnames(rf_mat_i) <- list(rf_dim_nms, rf_dim_nms)
rf_mat_i_sub <- rf_mat_i[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_i)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_i)[[2]])]
# dimnames(rf_mat_i_sub) # checking
# replace dimnames to be friendlier for plotting
dimnames(rf_mat_i_sub) <- list(sel_table$ID, sel_table$ID)
# make the binary matrix of group identity at this social scale
group_IDs <- sapply(1:nrow(sel_table), function(x){
as.numeric(sel_table$ID[x] == sel_table$ID)
})
binmat_indiv_ID <- matrix(group_IDs, byrow = TRUE, nrow = nrow(sel_table), ncol = nrow(sel_table))
dimnames(binmat_indiv_ID) <- list(sel_table$ID, sel_table$ID)
# binmat_indiv_ID # checking
Read in the selection table and metadata for calls at higher social scales.
ccs <- read.csv(file.path(h_path, "site-level_calls_preprocessed_final.csv"), header = TRUE)
Subset the SPCC and random forests matrices for pair calls used in human classification.
sel_table <- sel_tables[[2]]
# unique(sel_table$scale) # checking
# check out call IDs between the human classification selection table and the SPCC matrix
# sel_table$sound.files
# dimnames(xc_mat_h)[[1]]
# make the selection table call IDs match the SPCC matrix dimension names
call_IDs <- sel_table$sound.files
call_IDs <- gsub("WAV_", "WAV-", call_IDs)
# filter the SPCC matrix by the call_IDs in the selection table used for human classification
xc_mat_h_pairs <- xc_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[2]])]
# dimnames(xc_mat_h_pairs) # checking
# replace dimnames to be friendlier for plotting
dimnames(xc_mat_h_pairs) <- list(sel_table$ID, sel_table$ID)
# repeat filtering for the random forests matrix
rf_mat_h_pairs <- rf_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[2]])]
# dimnames(rf_mat_h_pairs) # checking
# replace dimnames to be friendlier for plotting
dimnames(rf_mat_h_pairs) <- list(sel_table$ID, sel_table$ID)
# make the binary matrix of group identity at this social scale
group_IDs <- sapply(1:nrow(sel_table), function(x){
as.numeric(sel_table$ID[x] == sel_table$ID)
})
binmat_pair_ID <- matrix(group_IDs, byrow = TRUE, nrow = nrow(sel_table), ncol = nrow(sel_table))
dimnames(binmat_pair_ID) <- list(sel_table$ID, sel_table$ID)
# binmat_pair_ID # checking
Subset the SPCC and random forests matrices for flock calls used in human classification.
sel_table <- sel_tables[[3]]
# unique(sel_table$scale) # checking
# make the selection table call IDs match the SPCC matrix dimension names
call_IDs <- sel_table$sound.files
call_IDs <- gsub("WAV_", "WAV-", call_IDs)
# filter the SPCC matrix by the call_IDs in the selection table used for human classification
xc_mat_h_flocks <- xc_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[2]])]
# dimnames(xc_mat_h_flocks) # checking
# replace dimnames to be friendlier for plotting
dimnames(xc_mat_h_flocks) <- list(sel_table$ID, sel_table$ID)
# repeat filtering for the random forests matrix
rf_mat_h_flocks <- rf_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[2]])]
# dimnames(rf_mat_h_flocks) # checking
# replace dimnames to be friendlier for plotting
dimnames(rf_mat_h_flocks) <- list(sel_table$ID, sel_table$ID)
# make the binary matrix of group identity at this social scale
group_IDs <- sapply(1:nrow(sel_table), function(x){
as.numeric(sel_table$ID[x] == sel_table$ID)
})
binmat_flock_ID <- matrix(group_IDs, byrow = TRUE, nrow = nrow(sel_table), ncol = nrow(sel_table))
dimnames(binmat_flock_ID) <- list(sel_table$ID, sel_table$ID)
# binmat_flock_ID # checking
Subset the SPCC and random forests matrices for site calls used in human classification. Also, make a binary matrix of group identity for these site-scale calls.
sel_table <- sel_tables[[4]]
# unique(sel_table$scale) # checking
# make the selection table call IDs match the SPCC matrix dimension names
call_IDs <- sel_table$sound.files
call_IDs <- gsub("WAV_", "WAV-", call_IDs)
# filter the SPCC matrix by the call_IDs in the selection table used for human classification
xc_mat_h_sites <- xc_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[2]])]
# dimnames(xc_mat_h_sites) # checking
# replace dimnames to be friendlier for plotting
dimnames(xc_mat_h_sites) <- list(sel_table$ID, sel_table$ID)
# repeat filtering for the random forests matrix
rf_mat_h_sites <- rf_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[2]])]
# dimnames(rf_mat_h_sites) # checking
# replace dimnames to be friendlier for plotting
dimnames(rf_mat_h_sites) <- list(sel_table$ID, sel_table$ID)
# make the binary matrix of group identity at this social scale
group_IDs <- sapply(1:nrow(sel_table), function(x){
as.numeric(sel_table$ID[x] == sel_table$ID)
})
binmat_site_ID <- matrix(group_IDs, byrow = TRUE, nrow = nrow(sel_table), ncol = nrow(sel_table))
dimnames(binmat_site_ID) <- list(sel_table$ID, sel_table$ID)
# binmat_site_ID # checking
The subsetted SPCC and random forests matrices look good, as do the binary matrices of group identity across social scales. We used a model-based clustering approach to ask how well clustering algorithms classified calls based on similarity matrices per method.
# x <- 2
human_mat_list <- invisible(pblapply(1:length(social_scales), function(x){
ss <- social_scales[x]
# extract human classification matrices across observers for this social scale
human_class_mats <- lapply(1:length(human_class_mats_by_observer), function(x){
human_class_mats_by_observer[[x]][[ss]]
})
# add matrices for this scale together across observers
# yields a matrix in which each cell indicates how often calls were classified together across observers
human_class_mat <- Reduce('+', human_class_mats)
# scale the matrix values between 0 and 1 to be as similar as possible to the SPCC and random forests matrices
human_class_mat <- apply(human_class_mat, MARGIN = 2, FUN = function(X) (X - min(X))/diff(range(X)))
return(human_class_mat)
}))
names(human_mat_list) <- social_scales
SPCC_mat_list <- list(xc_mat_i_sub, xc_mat_h_pairs, xc_mat_h_flocks, xc_mat_h_sites)
names(SPCC_mat_list) <- social_scales
rf_mat_list <- list(rf_mat_i_sub, rf_mat_h_pairs, rf_mat_h_flocks, rf_mat_h_sites)
names(rf_mat_list) <- social_scales
asim_method <- c("Visual Inspection", "SPCC", "Random Forests")
asim_mat_list <- list(human_mat_list, SPCC_mat_list, rf_mat_list)
names(asim_mat_list) <- asim_method
Iterate over similarity methods and social scales to: 1) perform t-SNE per similarity method, 2) perform model-based clustering per simiarity method, 3) make a data frame with the new embeddings as well as metadata for plotting, including the model-based clustering classes.
df_list <- invisible(pblapply(1:length(asim_method), function(i){
asim_mats <- asim_mat_list[[asim_method[i]]]
# iterate over social scales
df_list <- lapply(1:length(social_scales), function(x){
ss <- social_scales[x]
# convert the similarity matrix to a distance matrix
# will be used for t-SNE and model-based clustering
dist_mat <- stats::as.dist(1 - asim_mats[[x]], diag = TRUE, upper = TRUE)
# perform t-SNE
# the optimized perplexity for focal individual feature extraction was too large for this t-SNE visualization, so we used the maximum perplexity possible given the dimensions of the distance matrix
plx <- (dim(asim_mats[[x]])[1]/3) - 1
# visualize with t-SNE in 2 dimensions
tmp <- list()
tmp <- lapply(1:10, function(n){
Rtsne::Rtsne(dist_mat, dims = 2, pca = FALSE, max_iter = 5000, perplexity = plx, is_distance = TRUE, theta = 0.0)
})
# str(tmp)
KL <- sapply(1:length(tmp), function(x){
tmp[[x]]$itercosts[100]
}, simplify = TRUE)
# overwrite tmp with the solution that minimized KL
tsne <- tmp[[which(KL == min(KL))]]
# perform model-based clustering while limiting the possible number of clusters to the true number of groups in each call data set
set.seed(seed)
mBIC_res <- mclust::Mclust(data = dist_mat, G = 1:4)
# make a data frame with output
df <- data.frame(asim_method = asim_method[i], social_scale = ss, X = tsne$Y[, 1], Y = tsne$Y[, 2], true_group = as.numeric(factor(dimnames(asim_mats[[x]])[[1]])), cluster = mBIC_res$classification)
return(df)
})
return(rbindlist(df_list))
}))
clustering_df <- rbindlist(df_list)
clustering_df$true_group <- factor(clustering_df$true_group)
clustering_df$cluster <- factor(clustering_df$cluster)
str(clustering_df)
saveRDS(clustering_df, file.path(out_path, "clustering_df.RDS"))
Here, find the number of calls misclassified per true group across social scales and similarity methods. The resulting data frame will be used for labels in the faceted plot below.
clustering_df <- readRDS(file.path(out_path, "clustering_df.RDS"))
# make a new column of numeric group ID
clustering_df$true_group_num <- as.numeric(clustering_df$true_group)
clustering_df$asim_method <- as.character(clustering_df$asim_method)
clustering_df$asim_method <- factor(clustering_df$asim_method, levels = c("Visual Inspection", "SPCC", "Random Forests"))
asim_method <- c("Visual Inspection", "SPCC", "Random Forests")
clusters <- 4 # each social scale has 4 clusters
# number of calls per group per social scale
num_spectros <- c(4, 2, 3, 4)
# iterate over similarity methods
df_list <- invisible(pblapply(1:length(asim_method), function(i){
# iterate over social scales
df_list <- lapply(1:length(social_scales), function(x){
am <- asim_method[i]
ss <- social_scales[x]
# subset data and count observations per cluster identified by model-based clustering
clust_df <- clustering_df %>%
filter(as.character(asim_method) == am) %>%
filter(social_scale == ss) %>%
droplevels()
# setting parameters for placing text labels in faceted plot below
if(am == "Visual Inspection" & ss == "Individual"){
X <- -Inf
Y <- -Inf
} else if(am == "SPCC" & ss == "Pair"){
X <- Inf
Y <- -Inf
} else if(am == "Random Forests" & ss == "Pair"){
X <- Inf
Y <- -Inf
} else if(am == "Visual Inspection" & ss == "Flock"){
X <- -Inf
Y <- -Inf
} else {
X <- -Inf
Y <- -Inf
}
# iterate over clusters to find the number of incorrectly assigned calls per cluster
incorr <- unlist(lapply(1:clusters, function(y){
tmp <- clust_df %>%
filter(cluster == y) %>%
droplevels()
if(nrow(tmp) < num_spectros[x] & length(unique(tmp$true_group)) == 1){
incorr <- num_spectros[x] - nrow(tmp)
} else if(nrow(tmp) > num_spectros[x] & length(unique(tmp$true_group)) > 1){
incorr <- nrow(tmp) - num_spectros[x]
} else if(nrow(tmp) <= num_spectros[x] & length(unique(tmp$true_group)) > 1){
incorr <- abs(length(unique(tmp$true_group)) - length(unique(tmp$cluster)))
# correctly assigned
} else if(nrow(tmp) == num_spectros[x] & length(unique(tmp$true_group)) == 1){
incorr <- 0
# if no calls were assigned to this cluster, these incorrect assignments will be picked up when evaluating other clusters
} else if(nrow(tmp) == 0){
incorr <- 0
}
return(incorr)
}))
incorr <- (sum(unlist(incorr))/nrow(clust_df)*100)
percent_corr <- paste(round(100 - incorr, digits = 2), "%", sep = "")
df <- data.frame(asim_method = am, social_scale = ss, percent_corr = percent_corr, X = X, Y = Y)
return(df)
})
return(rbindlist(df_list))
}))
corr_df <- rbindlist(df_list)
corr_df
## asim_method social_scale percent_corr X Y
## 1: Visual Inspection Individual 87.5% -Inf -Inf
## 2: Visual Inspection Pair 75% -Inf -Inf
## 3: Visual Inspection Flock 58.33% -Inf -Inf
## 4: Visual Inspection Site 87.5% -Inf -Inf
## 5: SPCC Individual 87.5% -Inf -Inf
## 6: SPCC Pair 75% Inf -Inf
## 7: SPCC Flock 66.67% -Inf -Inf
## 8: SPCC Site 43.75% -Inf -Inf
## 9: Random Forests Individual 87.5% -Inf -Inf
## 10: Random Forests Pair 75% Inf -Inf
## 11: Random Forests Flock 50% -Inf -Inf
## 12: Random Forests Site 68.75% -Inf -Inf
Make a faceted plot by smilarity method and social scales of the true group identity versus model-based clustering results. This is Supplementary Figure 6 in supplementary materials.
# initialize aesthetics
n <- 12
cols <- c(topo.colors(n, alpha = 1)[2], terrain.colors(n, alpha = 1)[2], heat.colors(n, alpha = 1)[1], heat.colors(n, alpha = 1)[5])
shps <- c(0, 21, 24, 6)
# find the min and max per level of grouping variables
x_maxes <- tapply(clustering_df$X, list(clustering_df$social_scale, clustering_df$asim_method), max, simplify = TRUE)
x_mins <- tapply(clustering_df$X, list(clustering_df$social_scale, clustering_df$asim_method), min)
y_maxes <- tapply(clustering_df$Y, list(clustering_df$social_scale, clustering_df$asim_method), max, simplify = TRUE)
y_mins <- tapply(clustering_df$Y, list(clustering_df$social_scale, clustering_df$asim_method), min)
# find a good x and y buffer per combo of factor levels
x_buf <- tapply(clustering_df$X, list(clustering_df$social_scale, clustering_df$asim_method), function(x){ return(max(x)/3) }, simplify = TRUE)
x_buf
## Visual Inspection SPCC Random Forests
## Individual 52.69264 67.08727 25.08714
## Pair 75.17353 52.03722 79.10235
## Flock 68.02524 55.50820 196.42636
## Site 42.31252 23.08736 47.10373
y_buf <- tapply(clustering_df$Y, list(clustering_df$social_scale, clustering_df$asim_method), function(x){ return(max(x)/3) }, simplify = TRUE)
y_buf
## Visual Inspection SPCC Random Forests
## Individual 41.18420 36.37380 22.63860
## Pair 131.72327 43.23936 269.69292
## Flock 63.33374 56.66033 393.91938
## Site 51.56966 44.52101 45.91808
blank_data <- data.frame(asim_method = rep(corr_df$asim_method, 2), social_scale = rep(corr_df$social_scale, 2), x = c(as.vector(x_mins) - as.vector(x_buf), as.vector(x_maxes) + as.vector(x_buf)), y = c(as.vector(y_mins) - as.vector(y_buf), as.vector(y_maxes) + as.vector(y_buf)))
# make graphic
gg <- ggplot(data = clustering_df, aes(x = X, y = Y)) +
# yields nice strip text on top and right but cannot yield truly free axes
# facet_grid(rows = vars(social_scale), cols = vars(asim_method), scales = "free", shrink = TRUE, space = "fixed") +
facet_wrap(vars(social_scale, asim_method), ncol = length(unique(clustering_df$asim_method)), scales = "free") +
# labeller = label_values
geom_point(aes(shape = true_group), fill = gray.colors(n, alpha = 1)[2], size = 3.5) +
# overlay circular points for clusters
geom_point(aes(color = cluster), shape = 19, size = 2) +
scale_shape_manual(values = shps) +
guides(shape = guide_legend(title = "True Group"), color = guide_legend(title = "Cluster")) +
xlab("t-SNE Dimension 1") + ylab("t-SNE Dimension 2") +
geom_blank(data = blank_data, aes(x = x, y = y)) +
expand_limits(y = 0) + scale_y_continuous(expand = c(0, 0)) +
geom_text(data = corr_df, aes(label = percent_corr, vjust = -0.45), hjust = 0.1*c(-1, -1, -1, -1, -1, 11, -1, -1, -1, 11, -1, -1), size = 4, fontface = "bold") +
# scale_color_manual(colors = cols) +
theme(panel.background = element_rect(fill = "white", color = "black"), plot.background = element_rect(fill = "white"), panel.grid.major = element_line(size = 1, colour = "white"), panel.grid.minor = element_line(size = 0.75, colour = "white"), axis.line = element_line(size = 0.5, colour = "black"), axis.title = element_text(size = 14), axis.text = element_text(size = 10), strip.text = element_text(size = 11, face = "bold"), legend.position = "top", legend.title = element_text(size = 12), legend.text = element_text(size = 12)) +
ggtitle("")
gg
# ggsave(file.path(out_path, "SupplementaryFigure6_CompareAcousticMethods.jpeg"), units = "in", height = 8.5, width = 6.9, dpi = 300)
# extract cluster colors from this plot for future reference
# p <- ggplot_build(gg)
# unique(p$data[[2]]["colour"])