Chapter 6 Contradictory objectives diagnostic
<- "2023-12-28-phylo-sampling-diag"
experiment_slug
<- paste0(
working_directory "experiments/",
experiment_slug,"/analysis/"
)
if (exists("bookdown_wd_prefix")) {
<- paste0(
working_directory
bookdown_wd_prefix,
working_directory
) }
6.1 Dependencies
library(tidyverse)
library(cowplot)
library(RColorBrewer)
library(khroma)
library(rstatix)
library(knitr)
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")
print(version)
## _
## platform aarch64-apple-darwin20
## arch aarch64
## os darwin20
## system aarch64, darwin20
## status
## major 4
## minor 2.1
## year 2022
## month 06
## day 23
## svn rev 82513
## language R
## version.string R version 4.2.1 (2022-06-23)
## nickname Funny-Looking Kid
6.2 Setup
# Configure our default graphing theme
theme_set(theme_cowplot())
# Create a directory to store plots
<- paste0(working_directory, "plots/")
plot_directory dir.create(plot_directory, showWarnings=FALSE)
# Constants
<- "contradictory-objectives" focal_diagnostic
6.2.1 Load experiment summary data
<- paste0(working_directory, "data/aggregate.csv")
summary_data_loc <- read_csv(summary_data_loc) summary_data
## Rows: 1080 Columns: 58
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): DIAGNOSTIC, EVAL_FIT_EST_MODE, EVAL_MODE, SELECTION, STOP_MODE
## dbl (53): ACCURACY, CREDIT, DIAGNOSTIC_DIMENSIONALITY, EVAL_MAX_PHYLO_SEARCH...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- summary_data %>%
summary_data mutate(
evals_per_gen = case_when(
== "cohort-full-compete" ~ 1.0 / NUM_COHORTS,
EVAL_MODE == "cohort" ~ 1.0 / NUM_COHORTS,
EVAL_MODE == "down-sample" ~ TEST_DOWNSAMPLE_RATE,
EVAL_MODE == "full" ~ 1.0,
EVAL_MODE == "indiv-rand-sample" ~ TEST_DOWNSAMPLE_RATE,
EVAL_MODE == "phylo-informed-sample" ~ TEST_DOWNSAMPLE_RATE
EVAL_MODE
),EVAL_FIT_EST_MODE = case_when(
== "ancestor-opt" ~ "ancestor",
EVAL_FIT_EST_MODE == "relative-opt" ~ "relative",
EVAL_FIT_EST_MODE .default = EVAL_FIT_EST_MODE
),.keep = "all"
%>%
) mutate(
eval_label = case_when(
# Clean up down-sample label
== "down-sample" & EVAL_FIT_EST_MODE != "none" ~ paste("down-sample", EVAL_FIT_EST_MODE, sep="-"),
EVAL_MODE .default = EVAL_MODE
),%>%
) mutate(
evals_per_gen = as.factor(evals_per_gen),
DIAGNOSTIC = as.factor(DIAGNOSTIC),
SELECTION = as.factor(SELECTION),
EVAL_MODE = as.factor(EVAL_MODE),
NUM_COHORTS = as.factor(NUM_COHORTS),
TEST_DOWNSAMPLE_RATE = as.factor(TEST_DOWNSAMPLE_RATE),
EVAL_FIT_EST_MODE = factor(
EVAL_FIT_EST_MODE,levels = c(
"none",
"ancestor",
"relative"
),labels = c(
"None",
"Ancestor",
"Relative"
)
)
)
# Grab just the contradictory objectives data
<- filter(
con_obj_summary_data
summary_data,== "contradictory-objectives"
DIAGNOSTIC )
6.2.2 Load experiment time series data
<- paste0(working_directory, "data/time_series.csv")
ts_data_loc <- read_csv(ts_data_loc) ts_data
## Rows: 108000 Columns: 28
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): DIAGNOSTIC, EVAL_FIT_EST_MODE, EVAL_MODE, SELECTION
## dbl (24): NUM_COHORTS, SEED, TEST_DOWNSAMPLE_RATE, ave_depth, deleterious_st...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- ts_data %>%
ts_data mutate(
evals_per_gen = case_when(
== "cohort-full-compete" ~ 1.0 / NUM_COHORTS,
EVAL_MODE == "cohort" ~ 1.0 / NUM_COHORTS,
EVAL_MODE == "down-sample" ~ TEST_DOWNSAMPLE_RATE,
EVAL_MODE == "full" ~ 1.0,
EVAL_MODE == "indiv-rand-sample" ~ TEST_DOWNSAMPLE_RATE,
EVAL_MODE == "phylo-informed-sample" ~ TEST_DOWNSAMPLE_RATE
EVAL_MODE
),EVAL_FIT_EST_MODE = case_when(
== "ancestor-opt" ~ "ancestor",
EVAL_FIT_EST_MODE == "relative-opt" ~ "relative",
EVAL_FIT_EST_MODE .default = EVAL_FIT_EST_MODE
),.keep = "all"
%>%
) mutate(
eval_label = case_when(
== "down-sample" & EVAL_FIT_EST_MODE != "none" ~ paste("down-sample", EVAL_FIT_EST_MODE, sep="-"),
EVAL_MODE .default = EVAL_MODE
)%>%
) mutate(
evals_per_gen = as.factor(evals_per_gen),
DIAGNOSTIC = as.factor(DIAGNOSTIC),
SELECTION = as.factor(SELECTION),
EVAL_MODE = as.factor(EVAL_MODE),
NUM_COHORTS = as.factor(NUM_COHORTS),
TEST_DOWNSAMPLE_RATE = as.factor(TEST_DOWNSAMPLE_RATE),
EVAL_FIT_EST_MODE = factor(
EVAL_FIT_EST_MODE,levels = c(
"none",
"ancestor",
"relative"
),labels = c(
"None",
"Ancestor",
"Relative"
)
)
)
# Grab just the contradictory objectives data
<- ts_data %>%
con_obj_ts_data filter(DIAGNOSTIC == "contradictory-objectives")
Summarize time series data:
<- ts_data %>%
ts_summary_data group_by(SEED, DIAGNOSTIC, SELECTION, evals_per_gen, eval_label) %>%
summarize(
n = n(),
avg_num_unique_selected = mean(num_unique_selected),
total_optimal_trait_coverage_loss = sum(optimal_trait_coverage_loss)
)
## `summarise()` has grouped output by 'SEED', 'DIAGNOSTIC', 'SELECTION',
## 'evals_per_gen'. You can override using the `.groups` argument.
6.2.3 Plotting helper functions
The following function assist with exploratory plotting of different measurements from summary and time series data. Note that for these plots, standard lexicase reference is rendered at equivalent number of generations (instead of evaluations).
<- function(data, diagnostic, selection, response) {
build_plot_summary_data <- data %>% filter(DIAGNOSTIC == diagnostic)
diag_data
<- median(
full_median filter(
diag_data,== "full" & SELECTION == selection
eval_label
)[[response]]
)
<- diag_data %>%
plot filter(
!= "full" & SELECTION == selection
eval_label %>%
) ggplot(
aes_string(
x = "eval_label",
y = response,
fill = "eval_label"
)+
) geom_hline(
yintercept = full_median,
size = 1.0,
alpha = 0.7,
color = "black",
linetype="dashed"
+
) geom_flat_violin(
position = position_nudge(x = .2, y = 0),
alpha = .8,
adjust = 1.5
+
) geom_point(
mapping = aes(color = eval_label),
position = position_jitter(width = .15),
size = .5,
alpha = 0.8
+
) geom_boxplot(
width = .1,
outlier.shape = NA,
alpha = 0.5
+
) scale_y_continuous(
# limits = c(-0.5, 100)
+
) scale_fill_bright() +
scale_color_bright() +
facet_grid(
~ evals_per_gen,
SELECTION # nrow=2,
labeller = label_both
+
) theme(
legend.position = "none",
axis.text.x = element_text(
angle = 30,
hjust = 1
),panel.border = element_rect(color = "gray", size = 2)
)
return(plot)
}
<- function(
build_plot_time_series_single_sampling
data,
diagnostic,
selection,
sampling_level,
response
) {
<- data %>% filter(
diag_data == diagnostic &
DIAGNOSTIC == selection &
SELECTION == sampling_level
evals_per_gen %>%
) mutate(
sampling_level_label = sampling_level
)
<- data %>% filter(
full_diag_data == diagnostic & SELECTION == selection & eval_label == "full"
DIAGNOSTIC %>%
) mutate(
# Ensure that median line will sit in same facet
sampling_level_label = sampling_level
)
<- diag_data %>%
plot filter(
!= "full"
eval_label %>%
) ggplot(
aes_string(
x = "ts_step",
# x = "evaluations",
y = {{ response }}
)+
) stat_summary(
geom = "line",
fun = mean,
aes(
color = eval_label
)+
) stat_summary(
geom = "ribbon",
fun.data = "mean_cl_boot",
fun.args = list(conf.int = 0.95),
alpha = 0.2,
linetype = 0,
aes(
color = eval_label,
fill = eval_label
)+
) scale_fill_bright() +
scale_color_bright() +
# facet_wrap(
# ~ sampling_level_label,
# ncol = 1,
# labeller = label_both
# ) +
theme(
legend.position = "right"
+
) stat_summary(
data = full_diag_data,
geom = "line",
fun = median,
linetype = "dashed",
color = "black"
)
return(plot)
}
<- function(
build_plot_time_series
data,
diagnostic,
selection,
response
) {# Build 1% sampling plot and 10% sampling plot
<- data %>% build_plot_time_series_single_sampling(
p_01
diagnostic,
selection,"0.01",
response
)<- data %>% build_plot_time_series_single_sampling(
p_10
diagnostic,
selection,"0.1",
response
)
<- ggdraw() +
title draw_label(
paste0(diagnostic, " - ", selection),
fontface = 'bold',
x = 0,
hjust = 0
+
) theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 7)
)
<- plot_grid(
plot
title,+ labs(title = "1% subsampling") + theme(legend.position = "none"),
p_01 + labs(title = "10% subsampling") + theme(legend.position = "bottom"),
p_10 nrow = 3,
ncol = 1,
rel_heights = c(0.075, 1, 1)
)
return(plot)
}
6.3 Population-wide satisfactory trait coverage
6.3.1 Final - Lexicase selection
<- summary_data %>% build_plot_summary_data(
p
focal_diagnostic,"lexicase",
"pop_optimal_trait_coverage"
)ggsave(
filename = paste0(plot_directory, "con-obj-sat-cov-final-lex.pdf"),
plot = p + labs(title = "Contradictory objectives - Lexicase selection"),
width = 15,
height = 10
)
6.3.1.1 Statistics
First, we’ll create a table of median / mean values for easy reference.
%>%
con_obj_summary_data group_by(DIAGNOSTIC, SELECTION, evals_per_gen, eval_label) %>%
summarize(
cov_median = median(pop_optimal_trait_coverage),
cov_mean = mean(pop_optimal_trait_coverage),
replicates = n()
%>%
) kable()
## `summarise()` has grouped output by 'DIAGNOSTIC', 'SELECTION', 'evals_per_gen'.
## You can override using the `.groups` argument.
DIAGNOSTIC | SELECTION | evals_per_gen | eval_label | cov_median | cov_mean | replicates |
---|---|---|---|---|---|---|
contradictory-objectives | lexicase | 0.01 | down-sample | 1.0 | 1.00 | 20 |
contradictory-objectives | lexicase | 0.01 | down-sample-ancestor | 2.5 | 3.25 | 20 |
contradictory-objectives | lexicase | 0.01 | indiv-rand-sample | 8.0 | 8.20 | 20 |
contradictory-objectives | lexicase | 0.01 | phylo-informed-sample | 8.5 | 8.90 | 20 |
contradictory-objectives | lexicase | 0.1 | down-sample | 1.0 | 1.00 | 20 |
contradictory-objectives | lexicase | 0.1 | down-sample-ancestor | 17.5 | 18.15 | 20 |
contradictory-objectives | lexicase | 0.1 | indiv-rand-sample | 24.5 | 24.10 | 20 |
contradictory-objectives | lexicase | 0.1 | phylo-informed-sample | 24.0 | 24.25 | 20 |
contradictory-objectives | lexicase | 1 | full | 38.0 | 37.85 | 20 |
contradictory-objectives | tournament | 0.01 | down-sample | 1.0 | 1.00 | 20 |
contradictory-objectives | tournament | 0.01 | down-sample-ancestor | 1.0 | 1.00 | 20 |
contradictory-objectives | tournament | 0.01 | indiv-rand-sample | 1.0 | 1.00 | 20 |
contradictory-objectives | tournament | 0.01 | phylo-informed-sample | 1.0 | 1.00 | 20 |
contradictory-objectives | tournament | 0.1 | down-sample | 1.0 | 1.00 | 20 |
contradictory-objectives | tournament | 0.1 | down-sample-ancestor | 1.0 | 1.00 | 20 |
contradictory-objectives | tournament | 0.1 | indiv-rand-sample | 1.0 | 1.00 | 20 |
contradictory-objectives | tournament | 0.1 | phylo-informed-sample | 1.0 | 1.00 | 20 |
contradictory-objectives | tournament | 1 | full | 1.0 | 1.00 | 20 |
Next, we run a Kruskal-Wallis test to check for differences.
For these tests, we only compare within a single subsampling level (evals_per_gen
) and within the same selection scheme.
<- con_obj_summary_data %>%
kw_test filter(eval_label != "full") %>%
group_by(SELECTION, evals_per_gen) %>%
kruskal_test(pop_optimal_trait_coverage ~ eval_label) %>%
mutate(sig = (p < 0.05)) %>%
unite(
"comparison_group",
SELECTION,
evals_per_gen,sep = "_",
remove = FALSE
)kable(kw_test)
comparison_group | SELECTION | evals_per_gen | .y. | n | statistic | df | p | method | sig |
---|---|---|---|---|---|---|---|---|---|
lexicase_0.01 | lexicase | 0.01 | pop_optimal_trait_coverage | 80 | 58.24682 | 3 | 0 | Kruskal-Wallis | TRUE |
lexicase_0.1 | lexicase | 0.1 | pop_optimal_trait_coverage | 80 | 62.11510 | 3 | 0 | Kruskal-Wallis | TRUE |
tournament_0.01 | tournament | 0.01 | pop_optimal_trait_coverage | 80 | NaN | 3 | NaN | Kruskal-Wallis | NA |
tournament_0.1 | tournament | 0.1 | pop_optimal_trait_coverage | 80 | NaN | 3 | NaN | Kruskal-Wallis | NA |
# Grab group names of significant comparisons
<- filter(kw_test, p < 0.05)$comparison_group
sig_kw_groups
<- con_obj_summary_data %>%
wrs_test unite(
"comparison_group",
SELECTION,
evals_per_gen,sep = "_",
remove = FALSE
%>%
) filter(
!= "full" & comparison_group %in% sig_kw_groups
eval_label %>%
) group_by(SELECTION, evals_per_gen) %>%
pairwise_wilcox_test(pop_optimal_trait_coverage ~ eval_label) %>%
adjust_pvalue(method = "holm") %>%
add_significance("p.adj")
kable(wrs_test)
SELECTION | evals_per_gen | .y. | group1 | group2 | n1 | n2 | statistic | p | p.adj | p.adj.signif |
---|---|---|---|---|---|---|---|---|---|---|
lexicase | 0.01 | pop_optimal_trait_coverage | down-sample | down-sample-ancestor | 20 | 20 | 50 | 3.20e-06 | 1.91e-05 | **** |
lexicase | 0.01 | pop_optimal_trait_coverage | down-sample | indiv-rand-sample | 20 | 20 | 0 | 0.00e+00 | 1.00e-07 | **** |
lexicase | 0.01 | pop_optimal_trait_coverage | down-sample | phylo-informed-sample | 20 | 20 | 0 | 0.00e+00 | 1.00e-07 | **** |
lexicase | 0.01 | pop_optimal_trait_coverage | down-sample-ancestor | indiv-rand-sample | 20 | 20 | 37 | 9.80e-06 | 2.93e-05 | **** |
lexicase | 0.01 | pop_optimal_trait_coverage | down-sample-ancestor | phylo-informed-sample | 20 | 20 | 28 | 3.20e-06 | 1.91e-05 | **** |
lexicase | 0.01 | pop_optimal_trait_coverage | indiv-rand-sample | phylo-informed-sample | 20 | 20 | 181 | 6.14e-01 | 1.00e+00 | ns |
lexicase | 0.1 | pop_optimal_trait_coverage | down-sample | down-sample-ancestor | 20 | 20 | 0 | 0.00e+00 | 1.00e-07 | **** |
lexicase | 0.1 | pop_optimal_trait_coverage | down-sample | indiv-rand-sample | 20 | 20 | 0 | 0.00e+00 | 1.00e-07 | **** |
lexicase | 0.1 | pop_optimal_trait_coverage | down-sample | phylo-informed-sample | 20 | 20 | 0 | 0.00e+00 | 1.00e-07 | **** |
lexicase | 0.1 | pop_optimal_trait_coverage | down-sample-ancestor | indiv-rand-sample | 20 | 20 | 31 | 4.90e-06 | 1.95e-05 | **** |
lexicase | 0.1 | pop_optimal_trait_coverage | down-sample-ancestor | phylo-informed-sample | 20 | 20 | 24 | 1.90e-06 | 1.32e-05 | **** |
lexicase | 0.1 | pop_optimal_trait_coverage | indiv-rand-sample | phylo-informed-sample | 20 | 20 | 203 | 9.46e-01 | 1.00e+00 | ns |
6.3.2 Over time - lexicase selection
<- ts_data %>% build_plot_time_series(
p
focal_diagnostic,"lexicase",
"pop_optimal_trait_coverage"
)ggsave(
filename = paste0(plot_directory, "con-obj-sat-cov-ts-lex.pdf"),
plot = p,
width = 15,
height = 10
)
6.3.3 Final - Tournament selection
<- summary_data %>% build_plot_summary_data(
p
focal_diagnostic,"tournament",
"pop_optimal_trait_coverage"
)ggsave(
filename = paste0(plot_directory, "con-obj-sat-cov-final-tourn.pdf"),
plot = p + labs(title = "Contradictory objectives - Tournament selection"),
width = 15,
height = 10
)
6.4 MRCA changes
<- summary_data %>% build_plot_summary_data(
p
focal_diagnostic,"lexicase",
"phylo_mrca_changes"
)ggsave(
filename = paste0(plot_directory, "con-obj-mrca-chgs-final-lex.pdf"),
plot = p + labs(title = "Contradictory objectives - Lexicase selection"),
width = 15,
height = 10
)
6.5 Mean genotype deleterious steps
<- summary_data %>% build_plot_summary_data(
p
focal_diagnostic,"lexicase",
"phylo_mean_genoetype_deleterious_steps"
)ggsave(
filename = paste0(plot_directory, "con-obj-del-steps-final-lex.pdf"),
plot = p + labs(title = "Contradictory objectives - Lexicase selection"),
width = 15,
height = 10
)
6.6 Mean genotype pairwise distance
<- summary_data %>% build_plot_summary_data(
p
focal_diagnostic,"lexicase",
"phylo_mean_genotype_pairwise_distance"
)ggsave(
filename = paste0(plot_directory, "con-obj-pw-dist-final-lex.pdf"),
plot = p + labs(title = "Contradictory objectives - Lexicase selection"),
width = 15,
height = 10
)
6.7 Number unique individual selected
build_plot_summary_data(
ts_summary_data,
focal_diagnostic,"lexicase",
"avg_num_unique_selected"
)
6.8 Manuscript figures
Time series graphs don’t add a ton here, so just final graphs.
<- function(
build_final_score_manuscript_plot
selection,
subsample_rate
) {
# Extract median values for max aggregate score at same evaluation level as sampling regimes
<- max(
max_eval filter(con_obj_summary_data, evals_per_gen == subsample_rate)$evaluations
)<- as.numeric(
full_eval_steps levels(
as.factor(
filter(con_obj_summary_data, eval_label == "full" & evaluations >= max_eval)$evaluations # nolint: line_length_linter.
)
)
)<- full_eval_steps[which.min( full_eval_steps - max_eval )]
full_eval <- median(
full_median_score_evals filter(
con_obj_summary_data,== selection & eval_label == "full" & evaluations == full_eval
SELECTION $pop_optimal_trait_coverage
)
)
<- con_obj_summary_data %>%
plot filter(
!= "full" &
eval_label == selection &
SELECTION == subsample_rate
evals_per_gen %>%
) ggplot(
aes(
x = eval_label,
y = pop_optimal_trait_coverage,
fill = eval_label
)+
) geom_hline(
yintercept = full_median_score_evals,
size = 1.0,
alpha = 0.7,
color = "black",
linetype="dashed"
+
) geom_flat_violin(
position = position_nudge(x = .2, y = 0),
alpha = .8,
adjust = 1.5
+
) geom_point(
mapping = aes(color = eval_label),
position = position_jitter(width = .15),
size = .5,
alpha = 0.8
+
) geom_boxplot(
width = .1,
outlier.shape = NA,
alpha = 0.5
+
) scale_y_continuous(
name = "Satisfactory trait coverage",
limits = c(0, 100)
+
) scale_x_discrete(
name = "Subsampling regime",
breaks = c("down-sample", "down-sample-ancestor", "indiv-rand-sample", "phylo-informed-sample"),
labels = c("DS\n(no est.)", "DS+EST", "IRS", "ABS")
+
) scale_fill_bright() +
scale_color_bright() +
theme(
legend.position = "none",
# axis.text.x = element_text(
# angle = 30,
# hjust = 1
# ),
)return(plot)
}
Build end-of-run plots (fixed number of evaluations)
<- build_final_score_manuscript_plot(
plot_final_lex_01 "lexicase",
"0.01"
)<- build_final_score_manuscript_plot(
plot_final_lex_10 "lexicase",
"0.1"
)
Combine into single figure
<- plot_grid(
lex_fig +
plot_final_lex_01 # labs(
# title = "1% subsampling"
# ) +
theme(
plot.margin = margin(1, 0, 0, 0, "cm")
),+
plot_final_lex_10 # labs(
# title = "10% subsampling"
# ) +
theme(
axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
plot.margin = margin(1, 0, 0, 1, "cm")
),nrow = 1,
ncol = 2,
align = "h",
labels = c("a) 1% subsampling", "b) 10% subsampling"),
rel_widths = c(1, 1)
) lex_fig
save_plot(
filename = paste0(plot_directory, "2023-12-28-con-obj-lex-fig.pdf"),
plot = lex_fig,
base_width = 7,
base_height = 4,
dpi = 600
)