#!/usr/bin/env Rscript
library(ggplot2);
library(plyr);
# get __dirname and load ./_cli.R
args = commandArgs(trailingOnly = F);
dirname = dirname(sub("--file=", "", args[grep("--file", args)]));
source(paste0(dirname, '/_cli.R'), chdir=T);
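
# _cli.R (loaded above) is assumed to parse the command-line flags and expose
# them as the args.options list used below.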
if (!is.null(args.options$help) ||
    (!is.null(args.options$plot) && args.options$plot == TRUE)) {
  stop("usage: cat file.csv | Rscript compare.R
  --help show this message
  --plot filename save plot to filename");
}
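
# Example invocation (the file and plot names are placeholders; --plot is
# optional):
#   cat file.csv | Rscript compare.R --plot compare-plot.png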
plot.filename = args.options$plot;
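
# The CSV on stdin must provide the columns used below: a character `binary`
# column with the values "old"/"new", character `filename` and `configuration`
# columns, and a numeric `rate` column. The colClasses vector assumes three
# character columns followed by two numeric ones.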
dat = read.csv(
  file('stdin'),
  colClasses=c('character', 'character', 'character', 'numeric', 'numeric')
);
dat = data.frame(dat);
dat$nameTwoLines = paste0(dat$filename, '\n', dat$configuration);
dat$name = paste0(dat$filename, dat$configuration);
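
# nameTwoLines (filename and configuration on separate lines) is used as the
# x-axis label in the plot below; name is the plain key the statistics table
# is grouped by.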
# Create a box plot
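# (one box per benchmark configuration; the rate distributions of the "old"
# and "new" binaries are distinguished by fill colour)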
if (!is.null(plot.filename)) {
  p = ggplot(data=dat);
  p = p + geom_boxplot(aes(x=nameTwoLines, y=rate, fill=binary));
  p = p + ylab("rate of operations (higher is better)");
  p = p + xlab("benchmark");
  p = p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5));
  ggsave(plot.filename, p);
}
# Computes the shared standard error, as used in the Welch t-test.
welch.sd = function (old.rate, new.rate) {
  old.se.squared = var(old.rate) / length(old.rate)
  new.se.squared = var(new.rate) / length(new.rate)
  return(sqrt(old.se.squared + new.se.squared))
}
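
# For reference: welch.sd(old, new) is sqrt(var(old)/n_old + var(new)/n_new),
# the standard error of the difference between the two sample means and the
# denominator of Welch's t statistic.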
# Calculate the improvement confidence interval. The interval is divided by
# old.mu and not new.mu, because old.mu is the baseline that the mean
# improvement is measured relative to.
confidence.interval = function (shared.se, old.mu, w, risk) {
  interval = qt(1 - (risk / 2), w$parameter) * shared.se;
  return(sprintf("±%.2f%%", (interval / old.mu) * 100))
}
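
# A small worked example with made-up numbers: for shared.se = 2, old.mu = 100
# and w$parameter = 10 degrees of freedom, qt(1 - 0.05 / 2, 10) is about 2.23,
# so confidence.interval(2, 100, w, 0.05) would print roughly "±4.46%".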
# Print a table with results
statistics = ddply(dat, "name", function(subdat) {
  old.rate = subset(subdat, binary == "old")$rate;
  new.rate = subset(subdat, binary == "new")$rate;

  # Calculate improvement for the "new" binary compared with the "old" binary
  old.mu = mean(old.rate);
  new.mu = mean(new.rate);
  improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100));

  r = list(
    confidence = "NA",
    improvement = improvement,
    "accuracy (*)" = "NA",
    "(**)" = "NA",
    "(***)" = "NA"
  );

  # Check if there is enough data to calculate the p-value
  if (length(old.rate) > 1 && length(new.rate) > 1) {
    # Perform a statistical test to see if there actually is a difference in
    # performance.
    w = t.test(rate ~ binary, data=subdat);
    shared.se = welch.sd(old.rate, new.rate)

    # Add user-friendly stars to the table. There should be at least one star
    # before you can say that there is an improvement.
    confidence = '';
    if (w$p.value < 0.001) {
      confidence = '***';
    } else if (w$p.value < 0.01) {
      confidence = '**';
    } else if (w$p.value < 0.05) {
      confidence = '*';
    }

    r = list(
      confidence = confidence,
      improvement = improvement,
      "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05),
      "(**)" = confidence.interval(shared.se, old.mu, w, 0.01),
      "(***)" = confidence.interval(shared.se, old.mu, w, 0.001)
    );
  }

  return(data.frame(r, check.names=FALSE));
});
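
# Note: t.test() defaults to var.equal = FALSE, so the test above is a Welch
# two-sample t-test; welch.sd() computes the matching shared standard error.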
# Set the benchmark names as the row.names to left-align them in the printed
# table
row.names(statistics) = statistics$name;
statistics$name = NULL;
options(width = 200);
print(statistics);
cat("\n")
cat(sprintf(
"Be aware that when doing many comparisons the risk of a false-positive
result increases. In this case there are %d comparisons; you can thus
expect the following number of false-positive results:
  %.2f false positives, when considering a 5%% risk acceptance (*, **, ***),
  %.2f false positives, when considering a 1%% risk acceptance (**, ***),
  %.2f false positives, when considering a 0.1%% risk acceptance (***)
",
nrow(statistics),
nrow(statistics) * 0.05,
nrow(statistics) * 0.01,
nrow(statistics) * 0.001))
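
# The counts printed above are simply the number of comparisons multiplied by
# the risk level, i.e. the false positives to expect if none of the benchmarks
# actually changed.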