We performed additional validation of our analytical approach and findings. First, we performed an analysis of interobserver reliability in call classification at the individual scale across multiple observers. Second, we compared visual inspection, spectrographic cross-correlation (SPCC) and random forests as similarity methods. Finally, we asked whether or not a difference in social context between the individual and higher social scales could have skewed results for monk parakeets.

rm(list = ls())

X <- c("tidyverse", "vegan", "pbapply", "dplyr", "data.table", "mclust", "effsize", "knitr")

invisible(lapply(X, library, character.only = TRUE))

theme_AcousticSpace <- function(){
  theme(panel.background = element_rect(fill = "white"), plot.background = element_rect(fill = "white"),
        panel.grid.major = element_line(size = 1, colour = "white"),
        panel.grid.minor = element_line(size = 0.75, colour = "white"),
        axis.line = element_line(size = 0.45, colour = "black"),
        axis.title = element_text(size = 14),
        axis.text = element_text(size = 12))
}

Set directory paths for analysis.

in_path_st <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/visual-inspection/www"

in_path_classes <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/visual-inspection/classifications"

out_path <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/Site_Recordings"

Set general parameters.

social_scales <- c("Individual", "Pair", "Flock", "Site")
perms <- 9999
cores <- parallel::detectCores() - 2
seed <- 401

Read in .csv files of human classifications, obtained from Shiny app.

files <- list.files(pattern = ".csv", path = in_path_classes)
# files

csvs <- lapply(1:length(files), function(x){ read.csv(file.path(in_path_classes, files[x])) })
length(csvs)

## [1] 12

Inter-observer Reliability

Here we asked how consistently human observers classified calls at the individual scale. This was the only social scale at which it was possible to calculate classification accuracy by human visual inspection.

# write a loop to extract individual scale information, assign individual IDs to classes by a majority rule and calculate classification accuracy per observer
observs <- 1:length(csvs)

classif_accur <- invisible(unlist(pblapply(observs, function(i){

  tmp <- t(csvs[[i]][grep("Individual", csvs[[i]]$social_scale), grep("class_", names(csvs[[i]]))])
  tmp <- data.frame(class = row.names(tmp), spectros = tmp[, 1])

  # iterate over classes to assign an individual ID to each class
  class_IDs <- unlist(lapply(1:nrow(tmp), function(x){

    # obtain calls for the given class
    calls <- as.character(tmp$spectros[x])
    calls <- strsplit(calls, split = ";")[[1]]

    # obtain individual IDs from the spectrogram image names
    indivs <- sapply(1:length(calls), function(z){
      strsplit(strsplit(calls[z], split = "INDIV-")[[1]][2], split = "_")[[1]][1]
    })

    if(all(!is.na(indivs))){
      # find the individual with the most spectrograms assiged to this class
    # this individual ID will become assigned to the given class to facilitate calculating classification accuracy
      itbl <- table(indivs)
      m <- max(itbl)
      class_ID <- names(itbl)[which(itbl == m)]
    } else {
      class_ID <- NA
    }

    return(class_ID)

  }))


  # if each class could be assigned a unique individual ID
  if(all(!is.na(class_IDs)) & !any(duplicated(class_IDs))){

    tmp_class_IDs <- class_IDs

  # if some classes were left empty
  } else if(any(is.na(class_IDs))){

    tmp_class_IDs <- class_IDs[!is.na(class_IDs)]

  # if some classes were evenly assigned to different individuals
  } else if(any(duplicated(class_IDs)) & all(!is.na(class_IDs))){

    tmp_class_IDs <- class_IDs[!duplicated(class_IDs)]

  }

   wrong <- unlist(lapply(1:nrow(tmp), function(x){

      # obtain calls for the given class
      calls <- as.character(tmp$spectros[x])
      calls <- strsplit(calls, split = ";")[[1]]

      # obtain individual IDs from the spectrogram image names
      indivs <- sapply(1:length(calls), function(z){
       strsplit(strsplit(calls[z], split = "INDIV-")[[1]][2], split = "_")[[1]][1]
      })

      itbl <- table(indivs)

      wrong <- as.vector(itbl[which(names(itbl) != tmp_class_IDs[x])])
      return(wrong)

    }))
  wrong_class <- sum(wrong)

  # extract the total number of calls classified
  N <- sum(unlist(lapply(1:nrow(tmp), function(z){
    calls <- as.character(tmp$spectros[z])
    length(strsplit(calls, split = ";")[[1]])
  })))

  # calculate classification accuracy over all calls
  return(((N - wrong_class)/N)*100)

})))

Classification Accuracy

classif_accur

##  [1] 75.00000 87.50000 75.00000 87.50000 55.55556 87.50000 50.00000
##  [8] 81.25000 75.00000 75.00000 37.50000 75.00000

mean(classif_accur)

## [1] 71.81713

range(classif_accur)

## [1] 37.5 87.5

sd(classif_accur)

## [1] 15.94189

Comparing Similarity Methods

Pre-process Shiny Visual Inspection Results

Read in extended selection tables per social scale used in Shiny app. These will determine how calls will be ordered in matrices across social scales.

# read in the extended selection tables after moving them into the Shiny app working directory
indiv_shiny_tbl <- readRDS(file.path(path = in_path_st, "indiv_scale_shiny_extended_seltable.RDS"))

pair_shiny_tbl <- readRDS(file.path(path = in_path_st, "pair_scale_shiny_extended_seltable.RDS"))

flock_shiny_tbl <- readRDS(file.path(path = in_path_st, "flock_scale_shiny_extended_seltable.RDS"))

site_shiny_tbl <- readRDS(file.path(path = in_path_st, "site_scale_shiny_extended_seltable.RDS"))

sel_tables <- list(indiv_shiny_tbl, pair_shiny_tbl, flock_shiny_tbl, site_shiny_tbl)

Write a function to convert human classifications into a list of binary matrices (one per social scale).

df2binmat <- function(X, class_prefix){

  # iterate over social scales to make a list of binary matrices
  mat_list <- invisible(pblapply(1:length(social_scales), function(i){

    # get all calls for the given social scale
    sel_table <- sel_tables[[i]]
    calls_df <- data.frame(call_ID = paste(sel_table$sound.files, "-1.jpeg", sep = ""))

    # now get the class labels for these calls
    # first extract calls and classes for the given social scale
     tmp <- droplevels(X[grep(paste("^", social_scales[i], "$", sep = ""), X$social_scale), ])
    class_cols <- tmp[, grep(paste(paste("^", class_prefix, sep = ""), collapse = "|"), names(X))]
    class_cols <- t(class_cols)

    # then iterate over rows and turn this into a data frame with one row per call
    class_df <- data.table::rbindlist(lapply(1:nrow(class_cols), function(y){
      calls_tmp <- unlist(strsplit(class_cols[y, ], split = ";"))
      df <- data.frame(class_label = row.names(class_cols)[y], call_ID = calls_tmp)
      return(df)
    }))

    # iterate over rows of the calls data frame for the given social scale, pulling out class labels from the class data frame
    final_df <- data.table::rbindlist(lapply(1:nrow(calls_df), function(y){

      call_tmp <- calls_df[y, ]
      label <- as.character(class_df$class_label[grep(paste("^", call_tmp, "$", sep = ""), class_df$call_ID)])

      return(data.frame(call_ID = call_tmp, class_label = label, ID = as.character(sel_table$ID[y])))

    }))

    # iterate over each row in this data frame and compare the label of the given call to labels of all other calls
    # this comparison is the same as asking whether or not calls were classified in the same group
    # this will return a long binary vector that can be used to build the final matrix
    class_comps <- unlist(lapply(1:nrow(calls_df), function(y){
      as.numeric(final_df$class_label[y] == final_df$class_label)
    }))

    # build the final matrix of human classifications, which contains binary encoding for whether or not calls were classified in the same group per observer and social scale
    human_class_mat <- matrix(class_comps, nrow = nrow(final_df), ncol = nrow(final_df), byrow = TRUE)
    dimnames(human_class_mat) <- list(final_df$ID, final_df$ID)

    return(human_class_mat)

  }))

  names(mat_list) <- social_scales
  return(mat_list)

}

human_class_mats_by_observer <- pblapply(1:length(csvs), function(x){
  df2binmat(X = csvs[[x]], class_prefix = "class")
})

names(human_class_mats_by_observer) <- paste("Observer", seq(1, length(human_class_mats_by_observer), 1), sep = "-")
# str(human_class_mats_by_observer)

These matrices look good. We moved on to subsetting SPCC and random forests acoustic similarity matrices for the given calls per social scale.

# read in repeatedly sampled individuals SPCC and random forests matrices
# also the SPCC and random forests matrices for higher social scales
i_path <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/Focal_Individuals"
h_path <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/Site_Recordings"

# repeatedly sampled individuals
# needs to be filtered by calls used in the human classification, which were the same calls used for random forests validation
xc_mat_i <- readRDS(file.path(i_path, "xc_mat_allfocal.RDS")) # SPCC
# str(xc_mat_i)

# this already contains only calls in the random forests validation data set (e.g. the individuals used in human classification)
 # has no dimension names
rf_mat_i <- readRDS(file.path(h_path, "proxm_ms.RDS")) # random forests
# str(rf_mat_i)

# higher social scales
# these need to be filtered for pair, flock and site calls used in human classification
xc_mat_h <- readRDS(file.path(h_path, "xc_mat_sites.RDS")) # SPCC
# str(xc_mat_h)

# add dimension names to the rf_mat to facilitate filtering, these are the same as the xc_mat for higher social scales
rf_mat_h <- readRDS(file.path(h_path, "proxm_site.RDS")) # random forests
dimnames(rf_mat_h) <- dimnames(xc_mat_h)
# str(rf_mat_h)

Individual Scale

Subset the SPCC and random forests matrices for repeatedly sampled individuals used in human classification.

sel_table <- sel_tables[[1]]
# unique(sel_table$scale) # checking
# dimnames(xc_mat_i)

# make the selection table call IDs match the SPCC matrix dimension names
call_IDs <- sel_table$sound.files
call_IDs <- gsub("WAV_", "WAV-", call_IDs)

# filter the SPCC matrix by the call_IDs in the selection table used for human classification
xc_mat_i_sub <- xc_mat_i[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_i)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_i)[[2]])]
# dimnames(xc_mat_i_sub) # checking

# replace dimnames to be friendlier for plotting
dimnames(xc_mat_i_sub) <- list(sel_table$ID, sel_table$ID)
dim(xc_mat_i_sub)

# filter the random forests matrix by the call_IDs in the selection table used for human classification
# first give the random forests matrix dimension names
rf_dim_nms <- dimnames(xc_mat_i)[[1]][grep("RAW|ZW8|-BIRD3|-BIRD5", dimnames(xc_mat_i)[[1]])]
dimnames(rf_mat_i) <- list(rf_dim_nms, rf_dim_nms)

rf_mat_i_sub <- rf_mat_i[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_i)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_i)[[2]])]
# dimnames(rf_mat_i_sub) # checking

# replace dimnames to be friendlier for plotting
dimnames(rf_mat_i_sub) <- list(sel_table$ID, sel_table$ID)

# make the binary matrix of group identity at this social scale
group_IDs <- sapply(1:nrow(sel_table), function(x){
  as.numeric(sel_table$ID[x] == sel_table$ID)
})

binmat_indiv_ID <- matrix(group_IDs, byrow = TRUE, nrow = nrow(sel_table), ncol = nrow(sel_table))
dimnames(binmat_indiv_ID) <- list(sel_table$ID, sel_table$ID)
# binmat_indiv_ID # checking

Read in the selection table and metadata for calls at higher social scales.

ccs <- read.csv(file.path(h_path, "site-level_calls_preprocessed_final.csv"), header = TRUE)

Pair Scale

Subset the SPCC and random forests matrices for pair calls used in human classification.

sel_table <- sel_tables[[2]]
# unique(sel_table$scale) # checking

# check out call IDs between the human classification selection table and the SPCC matrix
# sel_table$sound.files
# dimnames(xc_mat_h)[[1]]

# make the selection table call IDs match the SPCC matrix dimension names
call_IDs <- sel_table$sound.files
call_IDs <- gsub("WAV_", "WAV-", call_IDs)

# filter the SPCC matrix by the call_IDs in the selection table used for human classification
xc_mat_h_pairs <- xc_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[2]])]
# dimnames(xc_mat_h_pairs) # checking

# replace dimnames to be friendlier for plotting
dimnames(xc_mat_h_pairs) <- list(sel_table$ID, sel_table$ID)

# repeat filtering for the random forests matrix
rf_mat_h_pairs <- rf_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[2]])]
# dimnames(rf_mat_h_pairs) # checking

# replace dimnames to be friendlier for plotting
dimnames(rf_mat_h_pairs) <- list(sel_table$ID, sel_table$ID)

# make the binary matrix of group identity at this social scale
group_IDs <- sapply(1:nrow(sel_table), function(x){
  as.numeric(sel_table$ID[x] == sel_table$ID)
})

binmat_pair_ID <- matrix(group_IDs, byrow = TRUE, nrow = nrow(sel_table), ncol = nrow(sel_table))
dimnames(binmat_pair_ID) <- list(sel_table$ID, sel_table$ID)
# binmat_pair_ID # checking

Flock Scale

Subset the SPCC and random forests matrices for flock calls used in human classification.

sel_table <- sel_tables[[3]]
# unique(sel_table$scale) # checking

# make the selection table call IDs match the SPCC matrix dimension names
call_IDs <- sel_table$sound.files
call_IDs <- gsub("WAV_", "WAV-", call_IDs)

# filter the SPCC matrix by the call_IDs in the selection table used for human classification
xc_mat_h_flocks <- xc_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[2]])]
# dimnames(xc_mat_h_flocks) # checking

# replace dimnames to be friendlier for plotting
dimnames(xc_mat_h_flocks) <- list(sel_table$ID, sel_table$ID)

# repeat filtering for the random forests matrix
rf_mat_h_flocks <- rf_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[2]])]
# dimnames(rf_mat_h_flocks) # checking

# replace dimnames to be friendlier for plotting
dimnames(rf_mat_h_flocks) <- list(sel_table$ID, sel_table$ID)

# make the binary matrix of group identity at this social scale
group_IDs <- sapply(1:nrow(sel_table), function(x){
  as.numeric(sel_table$ID[x] == sel_table$ID)
})

binmat_flock_ID <- matrix(group_IDs, byrow = TRUE, nrow = nrow(sel_table), ncol = nrow(sel_table))
dimnames(binmat_flock_ID) <- list(sel_table$ID, sel_table$ID)
# binmat_flock_ID # checking

Site Scale

Subset the SPCC and random forests matrices for site calls used in human classification. Also, make a binary matrix of group identity for these site-scale calls.

sel_table <- sel_tables[[4]]
# unique(sel_table$scale) # checking

# make the selection table call IDs match the SPCC matrix dimension names
call_IDs <- sel_table$sound.files
call_IDs <- gsub("WAV_", "WAV-", call_IDs)

# filter the SPCC matrix by the call_IDs in the selection table used for human classification
xc_mat_h_sites <- xc_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[2]])]
# dimnames(xc_mat_h_sites) # checking

# replace dimnames to be friendlier for plotting
dimnames(xc_mat_h_sites) <- list(sel_table$ID, sel_table$ID)

# repeat filtering for the random forests matrix
rf_mat_h_sites <- rf_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[2]])]
# dimnames(rf_mat_h_sites) # checking

# replace dimnames to be friendlier for plotting
dimnames(rf_mat_h_sites) <- list(sel_table$ID, sel_table$ID)

# make the binary matrix of group identity at this social scale
group_IDs <- sapply(1:nrow(sel_table), function(x){
  as.numeric(sel_table$ID[x] == sel_table$ID)
})

binmat_site_ID <- matrix(group_IDs, byrow = TRUE, nrow = nrow(sel_table), ncol = nrow(sel_table))
dimnames(binmat_site_ID) <- list(sel_table$ID, sel_table$ID)
# binmat_site_ID # checking

The subsetted SPCC and random forests matrices look good, as do the binary matrices of group identity across social scales. We used a model-based clustering approach to ask how well clustering algorithms classified calls based on similarity matrices per method.

# x <- 2
human_mat_list <- invisible(pblapply(1:length(social_scales), function(x){

  ss <- social_scales[x]

  # extract human classification matrices across observers for this social scale
  human_class_mats <- lapply(1:length(human_class_mats_by_observer), function(x){
    human_class_mats_by_observer[[x]][[ss]]
  })

  # add matrices for this scale together across observers
  # yields a matrix in which each cell indicates how often calls were classified together across observers
  human_class_mat <- Reduce('+', human_class_mats)

  # scale the matrix values between 0 and 1 to be as similar as possible to the SPCC and random forests matrices
  human_class_mat <- apply(human_class_mat, MARGIN = 2, FUN = function(X) (X - min(X))/diff(range(X)))

  return(human_class_mat)

}))
names(human_mat_list) <- social_scales

SPCC_mat_list <- list(xc_mat_i_sub, xc_mat_h_pairs, xc_mat_h_flocks, xc_mat_h_sites)
names(SPCC_mat_list) <- social_scales

rf_mat_list <- list(rf_mat_i_sub, rf_mat_h_pairs, rf_mat_h_flocks, rf_mat_h_sites)
names(rf_mat_list) <- social_scales

asim_method <- c("Visual Inspection", "SPCC", "Random Forests")
asim_mat_list <- list(human_mat_list, SPCC_mat_list, rf_mat_list)
names(asim_mat_list) <- asim_method

t-SNE and Model-based Clustering

Iterate over similarity methods and social scales to: 1) perform t-SNE per similarity method, 2) perform model-based clustering per simiarity method, 3) make a data frame with the new embeddings as well as metadata for plotting, including the model-based clustering classes.

df_list <- invisible(pblapply(1:length(asim_method), function(i){

  asim_mats <- asim_mat_list[[asim_method[i]]]

  # iterate over social scales
  df_list <- lapply(1:length(social_scales), function(x){

    ss <- social_scales[x]

    # convert the similarity matrix to a distance matrix
    # will be used for t-SNE and model-based clustering
    dist_mat <- stats::as.dist(1 - asim_mats[[x]], diag = TRUE, upper = TRUE)

    # perform t-SNE

    # the optimized perplexity for focal individual feature extraction was too large for this t-SNE visualization, so we used the maximum perplexity possible given the dimensions of the distance matrix
    plx <- (dim(asim_mats[[x]])[1]/3) - 1

    # visualize with t-SNE in 2 dimensions
    tmp <- list()
    tmp <- lapply(1:10, function(n){
      Rtsne::Rtsne(dist_mat, dims = 2, pca = FALSE, max_iter = 5000, perplexity = plx, is_distance = TRUE, theta = 0.0)
    })
    # str(tmp)

    KL <- sapply(1:length(tmp), function(x){
      tmp[[x]]$itercosts[100]
    }, simplify = TRUE)

    # overwrite tmp with the solution that minimized KL
    tsne <- tmp[[which(KL == min(KL))]]

    # perform model-based clustering while limiting the possible number of clusters to the true number of groups in each call data set
    set.seed(seed)
    mBIC_res <- mclust::Mclust(data = dist_mat, G = 1:4)

    # make a data frame with output
    df <- data.frame(asim_method = asim_method[i], social_scale = ss, X = tsne$Y[, 1], Y = tsne$Y[, 2], true_group = as.numeric(factor(dimnames(asim_mats[[x]])[[1]])), cluster = mBIC_res$classification)

    return(df)

  })

  return(rbindlist(df_list))

}))

clustering_df <- rbindlist(df_list)
clustering_df$true_group <- factor(clustering_df$true_group)
clustering_df$cluster <- factor(clustering_df$cluster)
str(clustering_df)

saveRDS(clustering_df, file.path(out_path, "clustering_df.RDS"))

Here, find the number of calls misclassified per true group across social scales and similarity methods. The resulting data frame will be used for labels in the faceted plot below.

clustering_df <- readRDS(file.path(out_path, "clustering_df.RDS"))

# make a new column of numeric group ID
clustering_df$true_group_num <- as.numeric(clustering_df$true_group)

clustering_df$asim_method <- as.character(clustering_df$asim_method)

clustering_df$asim_method <- factor(clustering_df$asim_method, levels = c("Visual Inspection", "SPCC", "Random Forests"))

asim_method <- c("Visual Inspection", "SPCC", "Random Forests")
clusters <- 4 # each social scale has 4 clusters

# number of calls per group per social scale
num_spectros <- c(4, 2, 3, 4)

# iterate over similarity methods
df_list <- invisible(pblapply(1:length(asim_method), function(i){

  # iterate over social scales
  df_list <- lapply(1:length(social_scales), function(x){

    am <- asim_method[i]
    ss <- social_scales[x]

    # subset data and count observations per cluster identified by model-based clustering
    clust_df <- clustering_df %>%
      filter(as.character(asim_method) == am) %>%
      filter(social_scale == ss) %>%
      droplevels()

    # setting parameters for placing text labels in faceted plot below
    if(am == "Visual Inspection" & ss == "Individual"){
       X <- -Inf
       Y <- -Inf
    } else if(am == "SPCC" & ss == "Pair"){
       X <- Inf
       Y <- -Inf
    } else if(am == "Random Forests" & ss == "Pair"){
       X <- Inf
       Y <- -Inf
    } else if(am == "Visual Inspection" & ss == "Flock"){
       X <- -Inf
       Y <- -Inf
    } else {
      X <- -Inf
      Y <- -Inf
    }

    # iterate over clusters to find the number of incorrectly assigned calls per cluster
    incorr <- unlist(lapply(1:clusters, function(y){

      tmp <- clust_df %>%
        filter(cluster == y) %>%
        droplevels()

      if(nrow(tmp) < num_spectros[x] & length(unique(tmp$true_group)) == 1){

        incorr <- num_spectros[x] - nrow(tmp)

      } else if(nrow(tmp) > num_spectros[x] & length(unique(tmp$true_group)) > 1){

        incorr <- nrow(tmp) - num_spectros[x]

      } else if(nrow(tmp) <= num_spectros[x] & length(unique(tmp$true_group)) > 1){

        incorr <- abs(length(unique(tmp$true_group)) - length(unique(tmp$cluster)))

        # correctly assigned
      } else if(nrow(tmp) == num_spectros[x] & length(unique(tmp$true_group)) == 1){

        incorr <- 0

        # if no calls were assigned to this cluster, these incorrect assignments will be picked up when evaluating other clusters
      } else if(nrow(tmp) == 0){

        incorr <- 0

      }

      return(incorr)

    }))

    incorr <- (sum(unlist(incorr))/nrow(clust_df)*100)
    percent_corr <- paste(round(100 - incorr, digits = 2), "%", sep = "")

    df <- data.frame(asim_method = am, social_scale = ss, percent_corr = percent_corr, X = X, Y = Y)
    return(df)

  })

  return(rbindlist(df_list))

}))

corr_df <- rbindlist(df_list)
corr_df

##           asim_method social_scale percent_corr    X    Y
##  1: Visual Inspection   Individual        87.5% -Inf -Inf
##  2: Visual Inspection         Pair          75% -Inf -Inf
##  3: Visual Inspection        Flock       58.33% -Inf -Inf
##  4: Visual Inspection         Site        87.5% -Inf -Inf
##  5:              SPCC   Individual        87.5% -Inf -Inf
##  6:              SPCC         Pair          75%  Inf -Inf
##  7:              SPCC        Flock       66.67% -Inf -Inf
##  8:              SPCC         Site       43.75% -Inf -Inf
##  9:    Random Forests   Individual        87.5% -Inf -Inf
## 10:    Random Forests         Pair          75%  Inf -Inf
## 11:    Random Forests        Flock          50% -Inf -Inf
## 12:    Random Forests         Site       68.75% -Inf -Inf

Supplementary Figure 6

Make a faceted plot by smilarity method and social scales of the true group identity versus model-based clustering results. This is Supplementary Figure 6 in supplementary materials.

# initialize aesthetics
n <- 12
cols <- c(topo.colors(n, alpha = 1)[2], terrain.colors(n, alpha = 1)[2], heat.colors(n, alpha = 1)[1], heat.colors(n, alpha = 1)[5])
shps <- c(0, 21, 24, 6)

# find the min and max per level of grouping variables
x_maxes <- tapply(clustering_df$X, list(clustering_df$social_scale, clustering_df$asim_method), max, simplify = TRUE)

x_mins <- tapply(clustering_df$X, list(clustering_df$social_scale, clustering_df$asim_method), min)

y_maxes <- tapply(clustering_df$Y, list(clustering_df$social_scale, clustering_df$asim_method), max, simplify = TRUE)

y_mins <- tapply(clustering_df$Y, list(clustering_df$social_scale, clustering_df$asim_method), min)

# find a good x and y buffer per combo of factor levels
x_buf <- tapply(clustering_df$X, list(clustering_df$social_scale, clustering_df$asim_method), function(x){ return(max(x)/3) }, simplify = TRUE)
x_buf

##            Visual Inspection     SPCC Random Forests
## Individual          52.69264 67.08727       25.08714
## Pair                75.17353 52.03722       79.10235
## Flock               68.02524 55.50820      196.42636
## Site                42.31252 23.08736       47.10373

y_buf <- tapply(clustering_df$Y, list(clustering_df$social_scale, clustering_df$asim_method), function(x){ return(max(x)/3) }, simplify = TRUE)
y_buf

##            Visual Inspection     SPCC Random Forests
## Individual          41.18420 36.37380       22.63860
## Pair               131.72327 43.23936      269.69292
## Flock               63.33374 56.66033      393.91938
## Site                51.56966 44.52101       45.91808

blank_data <- data.frame(asim_method = rep(corr_df$asim_method, 2), social_scale = rep(corr_df$social_scale, 2), x = c(as.vector(x_mins) - as.vector(x_buf), as.vector(x_maxes) + as.vector(x_buf)), y = c(as.vector(y_mins) - as.vector(y_buf), as.vector(y_maxes) + as.vector(y_buf)))

# make graphic
gg <- ggplot(data = clustering_df, aes(x = X, y = Y)) +

  # yields nice strip text on top and right but cannot yield truly free axes
  # facet_grid(rows = vars(social_scale), cols = vars(asim_method), scales = "free", shrink = TRUE, space = "fixed") +

  facet_wrap(vars(social_scale, asim_method), ncol = length(unique(clustering_df$asim_method)), scales = "free") +

  # labeller = label_values

  geom_point(aes(shape = true_group), fill = gray.colors(n, alpha = 1)[2], size = 3.5) +

  # overlay circular points for clusters
  geom_point(aes(color = cluster), shape = 19, size = 2) +

  scale_shape_manual(values = shps) +

  guides(shape = guide_legend(title = "True Group"), color = guide_legend(title = "Cluster")) +

  xlab("t-SNE Dimension 1") + ylab("t-SNE Dimension 2") +

  geom_blank(data = blank_data, aes(x = x, y = y)) +

  expand_limits(y = 0) + scale_y_continuous(expand = c(0, 0)) +

  geom_text(data = corr_df, aes(label = percent_corr, vjust = -0.45), hjust = 0.1*c(-1, -1, -1, -1, -1, 11, -1, -1, -1, 11, -1, -1), size = 4, fontface = "bold") +

  # scale_color_manual(colors = cols) +

  theme(panel.background = element_rect(fill = "white", color = "black"), plot.background = element_rect(fill = "white"), panel.grid.major = element_line(size = 1, colour = "white"), panel.grid.minor = element_line(size = 0.75, colour = "white"), axis.line = element_line(size = 0.5, colour = "black"), axis.title = element_text(size = 14), axis.text = element_text(size = 10), strip.text = element_text(size = 11, face = "bold"), legend.position = "top", legend.title = element_text(size = 12), legend.text = element_text(size = 12)) +

    ggtitle("")

gg

# ggsave(file.path(out_path, "SupplementaryFigure6_CompareAcousticMethods.jpeg"), units = "in", height = 8.5, width = 6.9, dpi = 300)

# extract cluster colors from this plot for future reference
# p <- ggplot_build(gg)
# unique(p$data[[2]]["colour"])

Adressing Differences in Social Context

Here, we asked whether there was a difference in acoustic convergence when individuals sampled for higher social scales were recorded while flying alone or in a social context. We compared the SPCC distances between these calls in a permutation test. We performed permutations for sites at which we recorded lone individuals.

Permutation Tests and Effect Sizes

Prepare for the permutation test by identifying individuals in lone and social contexts at higher social scales.

# read in the higher social scale SPCC matrix
xc_mat_h <- readRDS(file.path(h_path, "xc_mat_sites.RDS"))
# str(xc_mat_h)

# convert to a distance matrix
xc_mat_h <- 1 - xc_mat_h

# find calls of individuals that were sampled flying alone for the site social scale
table(ccs$Group_size)

site_indivs <- droplevels(ccs[ccs$Group_size == 1 & !is.na(ccs$Group_size), ])
# str(site_indivs)

# at which sites did we record these birds?
sites <- names(table(site_indivs$General_site))
sites

# add a call_ID column
site_indivs$call_ID <- paste(as.character(site_indivs$sound.files), site_indivs$selec, sep = "-")

# find calls of individuals flying in pairs or flocks of 3-4 birds
# also filter by the sites in which the lone individuals were recorded
site_social <- droplevels(ccs[ccs$Group_size > 1 & ccs$Group_size < 5 & !is.na(ccs$Group_size) & grepl(paste(paste("^", sites, "$", sep = ""), collapse = "|"), ccs$General_site), ])
# str(site_social)

table(site_social$General_site)

# add a call_ID column
site_social$call_ID <- paste(as.character(site_social$sound.files), site_social$selec, sep = "-")

# retain only sites that were sampled for both the lone individuals and the social individuals
sites <- sites[which(sites %in% site_social$General_site)]

# mean(table(droplevels(ccs$General_site[grep(paste(sites, collapse = "|"), ccs$General_site)])))

Iterate over the sites above and extract the distances to all other calls at that site for the lone calls and the social calls, then perform the permuatation test per site.

iteratns <- 1000

perm_test_list <- invisible(pblapply(1:length(sites), function(i){

  # first filter the SPCC matrix by the calls for the given site
  xc_tmp <- xc_mat_h[grep(paste(paste("_", sites[i], "_", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[1]]), grep(paste(paste("_", sites[i], "_", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[2]])]

  # extract distances to all other calls for the lone individuals
  xc_tmp_lone <- xc_tmp[, grep(paste(paste("^", site_indivs$call_ID[grep(sites[i], site_indivs$call_ID)], "$", sep = ""), collapse = "|"), dimnames(xc_tmp)[[2]])]

  # make sure to exclude calls compared to themselves
  xc_tmp_lone <- xc_tmp_lone[xc_tmp_lone != 0]

  # extract distances to all other calls for the social individuals
  xc_tmp_social <- xc_tmp[, grep(paste(paste("^", site_social$call_ID[grep(sites[i], site_social$call_ID)], "$", sep = ""), collapse = "|"), dimnames(xc_tmp)[[2]])]

  # make sure to exclude calls compared to themselves
  xc_tmp_social <- xc_tmp_social[xc_tmp_social != 0]

  # initialize the observed differences between the distance for lone or social calls
  obs_diff <- abs(mean(xc_tmp_lone) - mean(xc_tmp_social))

  # initialize the length of the lone call distances
  # this will serve as the sample size for the permuted lone and social distance per iteration
  nl <- length(xc_tmp_lone)

  # combine the SPCC values for lone and social individuals
  all_xc_vals <- c(xc_tmp_lone, xc_tmp_social)

  df_list <- lapply(1:iteratns, function(x){

    # randomly sample distances without replacement to represent the lone distances
  rsamp_l <- sample(all_xc_vals, size = nl, replace = FALSE)

  # randomly sample distances without replacement to represent the social distances
  rsamp_s <- sample(all_xc_vals, size = nl, replace = FALSE)

  # save the mean of each permuted group
  df <- data.frame(site = sites[i], iteratn = x, mean_bw = abs(mean(rsamp_l) - mean(rsamp_s)))

  return(df)

  })

  df <- rbindlist(df_list)

  # iterate over sites to calculate p-values
  p_higher <- length(which(df$mean_bw > obs_diff))/iteratns
  p_lower <- length(which(df$mean_bw < obs_diff))/iteratns

  # calculate the effect size of the observed difference
  effect <- cohen.d(d = xc_tmp_lone, f = xc_tmp_social, pooled = TRUE, hedges.correction = TRUE, conf.level = 0.95)
  # str(effect)

  final_df <- data.frame(site = sites[i], perm_sample_size = nl, num_higher = length(which(df$mean_bw > obs_diff)), num_lower = length(which(df$mean_bw < obs_diff)), p_higher = p_higher, p_lower = p_lower, effect_size = round(effect$estimate, digits = 2), effect_size_95CI_lower = round(effect$conf.int[["lower"]], digits = 2), effect_size_95CI_upper = round(effect$conf.int[["upper"]], digits = 2))

  return(final_df)



}))

perm_test_df <- rbindlist(perm_test_list)
str(perm_test_df)

saveRDS(perm_test_df, file.path(out_path, "perm_test_df_1.RDS"))

I’m interested in whether or not the repeatedly sampled individuals at site 1145 show similar observed and expected differences between lone and social contexts. Group size was not recorded for all 1145 calls, but there 5 calls recorded from pairs flying together.

This permutation test will be somewhat different, because here, the number of repeatedly sampled calls at the individual scale outweighs those of the site scale recorded in a confirmed social context.

Prepare for the permutation test with calls from site 1145, including repeatedly sampled individuals.

# read in the matrix of SPCC similarity for repeatedly sampled individuals
xc_mat_i <- readRDS(file.path(i_path, "xc_mat_allfocal.RDS"))
# str(xc_mat_i)

# convert to a distance matrix
xc_mat_i <- 1 - xc_mat_i

# filter the individual scale by site 1145
xc_mat_i_1145 <- xc_mat_i[grep("_1145-", dimnames(xc_mat_i)[[1]]), grep("_1145", dimnames(xc_mat_i)[[2]])]
# str(xc_mat_i_1145)

# replace dimnames of each matrix with individual IDs
rsindiv_IDs <- sapply(1:length(dimnames(xc_mat_i_1145)[[1]]), function(x){

  if(grepl("UNM", dimnames(xc_mat_i_1145)[[1]][x])){
    strsplit(strsplit(dimnames(xc_mat_i_1145)[[1]][x], split = "_")[[1]][5], split = "-")[[1]][3]
  } else {
    strsplit(strsplit(dimnames(xc_mat_i_1145)[[1]][x], split = "_")[[1]][5], split = "-")[[1]][2]
  }

})

# rsindiv_IDs

dimnames(xc_mat_i_1145) <- list(rsindiv_IDs, rsindiv_IDs)
# str(xc_mat_i_1145)

# for every call for the repeatedly sampled individuals, find the distance to all other calls for all other individuals except that same individual
indivs <- unique(rsindiv_IDs)
# indivs

df_list <- invisible(pblapply(1:length(indivs), function(x){

   # exclude calls from the same individual
  tmp <- xc_mat_i_1145[grep(indivs[x], dimnames(xc_mat_i_1145)[[1]]), -grep(indivs[x], dimnames(xc_mat_i_1145)[[2]])]

  # iterate over calls and return distances to all other calls of all other individuals while excluding calls from the same individual
  df_list <- lapply(1:length(dimnames(tmp)[[1]]), function(i){
    vals <- tmp[i, ]
    df <- data.frame(ID = indivs[x], dists = vals)
    return(df)
  })

  return(rbindlist(df_list))

}))

indivs_df <- rbindlist(df_list)
# str(indivs_df)

# read in the SPCC matrix for higher social scales
xc_mat_h <- readRDS(file.path(h_path, "xc_mat_sites.RDS"))
# str(xc_mat_h)

# convert to a distance matrix
xc_mat_h <- 1 - xc_mat_h

# for the higher social scale, filter by the calls at 1145 recorded in a confirmed social context
social_1145 <- droplevels(ccs[grepl("1145", ccs$General_site) & ccs$Group_size > 1 & !is.na(ccs$Group_size), ])

social_1145$call_ID <- paste(social_1145$sound.files, social_1145$selec, sep = "-")

# obtain the SPCC distances from the social context calls at 1145 to all other calls at 1145
xc_mat_h_1145 <- xc_mat_h[grep(paste("_", "1145", "-", sep = ""), dimnames(xc_mat_h)[[1]]), grep(paste(paste("^", social_1145$call_ID, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[2]])]
# str(xc_mat_h_1145)

# make sure to exclude any calls compared to themselves
xc_social_1145 <- xc_mat_h_1145[xc_mat_h_1145 != 0]
# xc_social_1145

Perform the permutation test by permuting the SPCC distances among calls in the repeatedly sampled individual and social (1145 birds recorded flying in pairs) contexts.

# initialize the observed differences between the distance for lone or social calls
obs_diff <- abs(mean(indivs_df$dists) - mean(xc_social_1145))
obs_diff

# initialize the length of the social call distances
# this will serve aas the sample size for the permuted individual and social distance per iteration
ns <- length(xc_social_1145)

# combine the SPCC values for lone and social individuals
all_xc_vals <- c(indivs_df$dists, xc_social_1145)

iteratns <- 1000

perm_test_list2 <- invisible(pblapply(1:iteratns, function(x){

  # randomly sample distances without replacement to represent the lone distances
  rsamp_l <- sample(all_xc_vals, size = ns, replace = FALSE)

  # randomly sample distances without replacement to represent the social distances
  rsamp_s <- sample(all_xc_vals, size = ns, replace = FALSE)

  # save the mean of each permuted group
  df <- data.frame(site = "1145", iteratn = x, mean_bw = abs(mean(rsamp_l) - mean(rsamp_s)))

  return(df)

}))

perm_test_df2 <- rbindlist(perm_test_list2)

# calculate p-values for whether or not the permuted difference was significantly greater or less than the observed difference
p_higher <- length(which(perm_test_df2$mean_bw > obs_diff))/iteratns
p_lower <- length(which(perm_test_df2$mean_bw < obs_diff))/iteratns

# calculate the effect size of the observed difference
effect <- cohen.d(d = indivs_df$dists, f = xc_social_1145, pooled = TRUE, hedges.correction = TRUE, conf.level = 0.95)
# str(effect)

final_df <- data.frame(site = "1145", perm_sample_size = ns, num_higher = length(which(perm_test_df2$mean_bw > obs_diff)), num_lower = length(which(perm_test_df2$mean_bw < obs_diff)), p_higher = p_higher, p_lower = p_lower, effect_size = round(effect$estimate, digits = 2), effect_size_95CI_lower = round(effect$conf.int[["lower"]], digits = 2), effect_size_95CI_upper = round(effect$conf.int[["upper"]], digits = 2))

saveRDS(final_df, file.path(out_path, "perm_test_df_2.RDS"))

Permutation Test Results

Combine results from permutation tests and calculate p-values to determine whether or not the permuted difference was significantly greater or less than the observed difference between social contexts per site.

perm1 <- readRDS(file.path(out_path, "perm_test_df_1.RDS"))
str(perm1)

perm2 <- readRDS(file.path(out_path, "perm_test_df_2.RDS"))
str(perm2)

all_perms <- rbind(perm1, perm2)
kable(all_perms)

saveRDS(all_perms, file.path(out_path, "overall_perm_test_df.RDS"))

We used a Bonferroni correction for multiple comparisons. We evaluated significance of the permutation test p-values using the adjusted alpha reported below.

all_perms <- readRDS(file.path(out_path, "overall_perm_test_df.RDS"))

# the total number of comparisons for this permutation test, which includes the total number of sites used by the p-value comparisons (higher and lower than expected):
nrow(all_perms)*2

## [1] 42

# the adjusted alpha by a Bonferroni correction:
alpha <- 0.05

adj_alpha <- round(alpha/(nrow(all_perms)*2), digits = 4)
adj_alpha

## [1] 0.0012

Supplementary Table 4

This is Supplementary Table 4 in supplementary materials.

tbl <- all_perms

names(tbl) <- c("Site", "Sample_Size", "Higher", "Lower", "p-higher", "p-lower", "Effect_Size", "95_CI_L", "95_CI_U")

kable(tbl, digits = 3, align = "c")

Site	Sample_Size	Higher	Lower	p-higher	p-lower	Effect_Size	95_CI_L	95_CI_U
ARAZ	28	183	817	0.183	0.817	0.33	-0.08	0.74
CHAC	11	328	672	0.328	0.672	-0.37	-1.04	0.29
CISN	135	69	931	0.069	0.931	0.20	0.01	0.39
ELTE	66	594	406	0.594	0.406	0.09	-0.18	0.35
FAGR	18	256	744	0.256	0.744	-0.29	-0.92	0.35
GOLF	105	0	1000	0.000	1.000	0.60	0.30	0.89
INES-03	14	295	705	0.295	0.705	0.39	-0.16	0.94
INES-04	16	445	555	0.445	0.555	0.23	-0.34	0.81
INES-07	8	786	214	0.786	0.214	0.10	-0.80	1.00
INES-08	26	342	658	0.342	0.658	0.25	-0.14	0.65
KIYU	14	290	710	0.290	0.710	-0.36	-1.00	0.28
LENA	72	0	1000	0.000	1.000	-0.58	-0.87	-0.30
OJOS	44	155	845	0.155	0.845	-0.28	-0.61	0.05
PAVO	96	0	1000	0.000	1.000	-0.39	-0.65	-0.13
PEIX	18	49	951	0.049	0.951	0.64	0.15	1.13
PFER-01	33	368	632	0.368	0.632	0.21	-0.16	0.57
PIED	60	2	998	0.002	0.998	0.49	0.20	0.78
QUEB	45	308	692	0.308	0.692	-0.19	-0.54	0.16
ROSA	14	285	715	0.285	0.715	0.39	-0.16	0.94
VALI	22	867	133	0.867	0.133	-0.05	-0.51	0.42
1145	60	29	971	0.029	0.971	0.40	0.15	0.66

Effect Sizes for Acoustic Convergence at the Individual Scale

We also calculated the effect sizes for acoustic convergence within versus among repeatedly sampled individuals (e.g. the effect size of acoustic convergence at the individual scale, or individual signatures). This served as a baseline for assessing the strength of the effect of social context (lone versus social) on acoustic convergence (see above).

indiv_es <- invisible(pblapply(1:length(indivs), function(x){

  # SPCC distances within the given individual and all others
  tmp_in <- xc_mat_i_1145[grep(indivs[x], dimnames(xc_mat_i_1145)[[1]]), grep(indivs[x], dimnames(xc_mat_i_1145)[[2]])]
  tmp_in <- tmp_in[lower.tri(tmp_in)]

   # SPCC distances between the given individual and all others
  tmp_bw <- as.vector(xc_mat_i_1145[grep(indivs[x], dimnames(xc_mat_i_1145)[[1]]), -grep(indivs[x], dimnames(xc_mat_i_1145)[[2]])])

  # calculate the effect size of the observed difference
  effect <- cohen.d(d = tmp_bw, f = tmp_in, pooled = TRUE, hedges.correction = TRUE, conf.level = 0.95)
  effect

  return(data.frame(indiv = indivs[x], effect_size = effect$estimate, es_95CI_lower = effect$conf.int[["lower"]], es_95CI_upper = effect$conf.int[["upper"]]))

}))

indiv_es <- rbindlist(indiv_es)

Supplementary Table 5

This is Supplementary Table 5 in supplementary materials.

tbl <- indiv_es

names(tbl) <- c("Repeatedly Sampled Individual", "Effect Size", "95_CI_L", "95_CI_U")

kable(tbl, digits = 2, align = "c")

Repeatedly Sampled Individual	Effect Size	95_CI_L	95_CI_U
BIRD2	0.98	0.84	1.12
BIRD3	4.29	3.59	4.99
BIRD4	0.75	0.51	0.98
BIRD1	2.06	1.92	2.20
AAT	2.51	2.23	2.79

What was the mean and standard deviation of the effect sizes across repeatedly sampled individuals?

mean(indiv_es$effect_size)

## [1] 2.11747

sd(indiv_es$effect_size)

## [1] 1.417682

Mean and standard deviation of the effect sizes in observed differences between social context over sites? There were no significant p-values for lower permuted acoustic convergence between the lone and social contexts compared to the observed difference.

mean(abs(all_perms$effect_size[all_perms$p_higher < adj_alpha]))

## [1] 0.5233333

sd(abs(all_perms$effect_size[all_perms$p_higher < adj_alpha]))

## [1] 0.1159023

Supplementary Code 5:

Additional Validation of Our Analytical Approach and Findings

XXX^1, XXX², XXX¹ ¹XXX, ²XXX
^XXX@gmail.com

13 July, 2019

Inter-observer Reliability

Classification Accuracy

Comparing Similarity Methods

Pre-process Shiny Visual Inspection Results

Individual Scale

Pair Scale

Flock Scale

Site Scale

t-SNE and Model-based Clustering

Supplementary Figure 6

Supplementary Code 5:

Additional Validation of Our Analytical Approach and Findings

XXX1*, XXX2, XXX1 1XXX, 2XXX *XXX@gmail.com

13 July, 2019

Inter-observer Reliability

Classification Accuracy

Comparing Similarity Methods

Pre-process Shiny Visual Inspection Results

Individual Scale

Pair Scale

Flock Scale

Site Scale

t-SNE and Model-based Clustering

Supplementary Figure 6

Adressing Differences in Social Context

Permutation Tests and Effect Sizes

Permutation Test Results

Supplementary Table 4

Effect Sizes for Acoustic Convergence at the Individual Scale

Supplementary Table 5

XXX^1, XXX², XXX¹ ¹XXX, ²XXX
^XXX@gmail.com