Significance tests

To be sure that MAP-Elites is helping us explore a meaningfully large portion of the range of possible programs we could evolve with a given representation, we want to show that the distribution of our metrics among solutions to the problem is different among different selection schemes.

First let’s load in the data

library(readr)
library(dplyr)
library(ggplot2)
library(dgof)
data <- read_csv("../data/scopegp_trait_50000_data.csv")
data$problem <- as.factor(data$problem)
data$selection_method <- as.factor(data$selection_method)

# We're not including the symbolic regression problem (Dow) because nothing solved it
data <- data %>% filter(problem!="SYMREG")
data <- data %>% filter(problem!="COLLATZ")

Make sure we have all the data:

de_dup <- data %>% distinct(run_id, .keep_all = TRUE)
summary(de_dup)
##      run_id       agent_id        selection_method     problem   
##  Min.   :  1   Min.   :0.000000   LEX  :118        COLLATZ :  0  
##  1st Qu.:118   1st Qu.:0.000000   MAPE :109        LOGIC   :120  
##  Median :366   Median :0.000000   RAND :120        SMALLEST:119  
##  Mean   :331   Mean   :0.006452   TOURN:118        SQUARES :120  
##  3rd Qu.:542   3rd Qu.:0.000000                    SUM     :106  
##  Max.   :660   Max.   :1.000000                    SYMREG  :  0  
##      update      instruction_entropy  scope_count        fitness      
##  Min.   :50000   Min.   :0.0000      Min.   : 1.000   Min.   :     0  
##  1st Qu.:50000   1st Qu.:0.9186      1st Qu.: 1.000   1st Qu.:     0  
##  Median :50000   Median :2.9223      Median : 2.000   Median :  1208  
##  Mean   :50000   Mean   :2.6366      Mean   : 2.796   Mean   : 35828  
##  3rd Qu.:50000   3rd Qu.:4.4177      3rd Qu.: 4.000   3rd Qu.: 10000  
##  Max.   :50000   Max.   :4.6840      Max.   :12.000   Max.   :200000  
##  scope_count_bin   inst_ent_bin   
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 3.000  
##  Median : 1.000   Median :10.000  
##  Mean   : 1.796   Mean   : 8.871  
##  3rd Qu.: 3.000   3rd Qu.:15.000  
##  Max.   :11.000   Max.   :16.000

Now let’s see if the distributions are different using a Kolmogrov-Smirnov test

single_problem <- function(df, prob, cutoff) {
  lex <- data %>% filter(problem==prob, selection_method=="LEX", fitness>=cutoff)
  mape <- data %>% filter(problem==prob, selection_method=="MAPE", fitness>=cutoff)
  rand <- data %>% filter(problem==prob, selection_method=="RAND", fitness>=cutoff)
  tourn <- data %>% filter(problem==prob, selection_method=="TOURN")
  
  if (length(mape$instruction_entropy) == 0) {
    print("MAPE DID NOT SOLVE PROBLEM")
    return()
  }
  
  if (length(lex$instruction_entropy) > 0) {
    print("MAPE vs. Lex instruction entropy")
    print(ks.test(mape$instruction_entropy, lex$instruction_entropy))
    print("MAPE vs. Lex scope count")
    print(ks.test(mape$scope_count, lex$scope_count))
  } else {
    print("Lexicase did not solve problem")
  }
  
  if (length(rand$instruction_entropy) > 0) {
    print("MAPE vs. Rand instruction entropy")
    print(ks.test(mape$instruction_entropy, rand$instruction_entropy))
    print("MAPE vs. Rand scope count")
    print(ks.test(mape$scope_count, rand$scope_count))
  } else {
    print("Random did not solve problem")
  }
  
  if (length(tourn$instruction_entropy) > 0) {
    print("MAPE vs. Tourn instruction entropy")
    print(ks.test(mape$instruction_entropy, tourn$instruction_entropy))
    print("MAPE vs. Tourn scope count")
    print(ks.test(mape$scope_count, tourn$scope_count))
  } else {
    print("Tournament did not solve problem")
  }
  
}

probs = c("LOGIC", "SQUARES", "SMALLEST", "SUM")
min_fs = c(10, 10000, 200000, 200000)

for (i in 1:length(probs)) {
  print(probs[i])
  single_problem(data, probs[i], min_fs[i])
}
## [1] "LOGIC"
## [1] "MAPE vs. Lex instruction entropy"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$instruction_entropy and lex$instruction_entropy
## D = 0.22445, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "MAPE vs. Lex scope count"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$scope_count and lex$scope_count
## D = 0.60791, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "Random did not solve problem"
## [1] "MAPE vs. Tourn instruction entropy"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$instruction_entropy and tourn$instruction_entropy
## D = 0.14578, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "MAPE vs. Tourn scope count"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$scope_count and tourn$scope_count
## D = 0.68851, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "SQUARES"
## [1] "MAPE vs. Lex instruction entropy"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$instruction_entropy and lex$instruction_entropy
## D = 0.26963, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "MAPE vs. Lex scope count"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$scope_count and lex$scope_count
## D = 0.78875, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "Random did not solve problem"
## [1] "MAPE vs. Tourn instruction entropy"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$instruction_entropy and tourn$instruction_entropy
## D = 0.20966, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "MAPE vs. Tourn scope count"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$scope_count and tourn$scope_count
## D = 0.70846, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "SMALLEST"
## [1] "MAPE vs. Lex instruction entropy"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$instruction_entropy and lex$instruction_entropy
## D = 0.39574, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "MAPE vs. Lex scope count"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$scope_count and lex$scope_count
## D = 0.70962, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "Random did not solve problem"
## [1] "MAPE vs. Tourn instruction entropy"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$instruction_entropy and tourn$instruction_entropy
## D = 0.22254, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "MAPE vs. Tourn scope count"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$scope_count and tourn$scope_count
## D = 0.73713, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "SUM"
## [1] "MAPE vs. Lex instruction entropy"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$instruction_entropy and lex$instruction_entropy
## D = 0.54654, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "MAPE vs. Lex scope count"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$scope_count and lex$scope_count
## D = 0.51001, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "Random did not solve problem"
## [1] "MAPE vs. Tourn instruction entropy"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$instruction_entropy and tourn$instruction_entropy
## D = 0.47622, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "MAPE vs. Tourn scope count"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  mape$scope_count and tourn$scope_count
## D = 0.5927, p-value < 2.2e-16
## alternative hypothesis: two-sided

That was a lot of comparisons so we should do a bonferroni correction for multiple comparisons. Unfortunately, these p-values are so low that R is just reporting “0” if we try to probe deeper into that p < 2.2e-16. To perform our bonferroni correction, we will round our p-values up (which is safe to do because it only makes our test more conservative) to 2.2e-16 (0.00000000000000022). Because we performed 48 hypothesis tests, a Bonferroni correction would lower our original alpha value of .05 to .05/48 = .001. Thus, p-values must be less than this value for us to consider them significant. Since 0.00000000000000022 is well below .001, we can safely say that all non-map-elites distributions are significantly different from all map-elites distributions.

Visually comparing univariate distributions

Let’s plot the distributions we were just comparing, as a sanity check:

filtered_data = data.frame()
for (i in 1:length(probs)) {
  filtered_data <- rbind(filtered_data, data %>% filter(problem==probs[i], selection_method!="RAND", fitness>=min_fs[i]))
  filtered_data <- rbind(filtered_data, data %>% filter(problem==probs[i], selection_method=="RAND"))
}
filtered_data$selection_method <- factor(filtered_data$selection_method, c("MAPE", "TOURN", "LEX", "RAND"))
levels(filtered_data$selection_method) <- c("MAP-Elites", "Tournament", "Lexicase", "Random")
levels(filtered_data$problem) <- c("Collatz", "Logic", "Smallest", "Squares", "Sum", "Symreg")

ggplot(filtered_data, aes(instruction_entropy, colour = selection_method)) +
  geom_freqpoly(stat = "density", bw=.2) + theme_classic() + facet_wrap(~problem) +
  scale_x_continuous("Instruction entropy") + scale_y_continuous("Density") + scale_color_discrete("Selection method") +
  theme(legend.position = "bottom", axis.title = element_text(size=18), panel.grid.major = element_blank(),
        axis.text = element_text(size=14), strip.text = element_text(size=18), legend.text = element_text(size=14),
         legend.title = element_text(size=14), panel.grid.minor = element_blank())

ggsave("inst_ent_density.pdf", width = 8, height=8)

ggplot(filtered_data, aes(scope_count, colour = selection_method)) +
  geom_freqpoly(stat="density", bw=.5)  + theme_classic() + facet_wrap(~problem) +
  scale_x_continuous("Scope count") + scale_y_continuous("Density") + scale_color_discrete("Selection method")+
  theme(legend.position = "bottom", axis.title = element_text(size=18), panel.grid.major = element_blank(),
        axis.text = element_text(size=14), strip.text = element_text(size=18), legend.text = element_text(size=14), 
        legend.title = element_text(size=14), panel.grid.minor = element_blank())

ggsave("scope_count_density.pdf", width = 8, height=8)

MAP-Elites is clearly exploring a wider range of program structures than the other selection schemes.

Heat maps of relationships between scope count and instruction entropy

Let’s look at how these two metrics relate to each other.

ggplot(data=filtered_data, aes(x=scope_count_bin, y=inst_ent_bin, z=fitness)) + stat_summary_2d(binwidth = 1, fun = function(x) length(x)) + theme_classic() + facet_grid(problem~selection_method) + scale_fill_gradient("Solution count", trans="log10") +
  scale_y_continuous("Instruction entropy", breaks=c(0,15), labels=c(0,4.7)) + scale_x_continuous("Scope count") +
  theme(legend.position = "bottom", axis.title = element_text(size=18),
        axis.text = element_text(size=14), strip.text = element_text(size=16), legend.text = element_text(size=10),
         legend.title = element_text(size=14), panel.grid.major = element_blank(),
         panel.grid.minor = element_blank())

ggsave("heatmaps.pdf", width = 8, height=8)