#!/usr/bin/env Rscript
library(ggplot2);
library(plyr);
# get __dirname and load ./_cli.R
args = commandArgs(trailingOnly = F);
dirname = dirname(sub("--file=", "", args[grep("--file", args)]));
source(paste0(dirname, '/_cli.R'), chdir=T);
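
# _cli.R (loaded above) is assumed to parse the command-line flags and expose
# them as the args.options list used below.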
if (!is.null(args.options$help) ||
    (!is.null(args.options$plot) && args.options$plot == TRUE)) {
  stop("usage: cat file.csv | Rscript compare.R
  --help show this message
  --plot filename save plot to filename");
}
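
# Example invocation (the file and plot names are placeholders; --plot is
# optional):
#   cat file.csv | Rscript compare.R --plot compare-plot.png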
plot.filename = args.options$plot;
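
# The CSV on stdin must provide the columns used below: a character `binary`
# column with the values "old"/"new", character `filename` and `configuration`
# columns, and a numeric `rate` column. The colClasses vector assumes three
# character columns followed by two numeric ones.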
dat = read.csv(
  file('stdin'),
  colClasses=c('character', 'character', 'character', 'numeric', 'numeric')
);
dat = data.frame(dat);
dat$nameTwoLines = paste0(dat$filename, '\n', dat$configuration);
dat$name = paste0(dat$filename, dat$configuration);
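
# nameTwoLines (filename and configuration on separate lines) is used as the
# x-axis label in the plot below; name is the plain key the statistics table
# is grouped by.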
# Create a box plot
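# (one box per benchmark configuration; the rate distributions of the "old"
# and "new" binaries are distinguished by fill colour)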
if (!is.null(plot.filename)) {
  p = ggplot(data=dat);
  p = p + geom_boxplot(aes(x=nameTwoLines, y=rate, fill=binary));
  p = p + ylab("rate of operations (higher is better)");
  p = p + xlab("benchmark");
  p = p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5));
  ggsave(plot.filename, p);
}
# Computes the shared standard error, as used in the Welch t-test.
welch.sd = function (old.rate, new.rate) {
  old.se.squared = var(old.rate) / length(old.rate)
  new.se.squared = var(new.rate) / length(new.rate)
  return(sqrt(old.se.squared + new.se.squared))
}
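
# For reference: welch.sd(old, new) is sqrt(var(old)/n_old + var(new)/n_new),
# the standard error of the difference between the two sample means and the
# denominator of Welch's t statistic.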
# Calculate the improvement confidence interval. The interval is divided by
# old.mu and not new.mu, because old.mu is the baseline that the mean
# improvement is measured relative to.
confidence.interval = function (shared.se, old.mu, w, risk) {
  interval = qt(1 - (risk / 2), w$parameter) * shared.se;
  return(sprintf("±%.2f%%", (interval / old.mu) * 100))
}
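
# A small worked example with made-up numbers: for shared.se = 2, old.mu = 100
# and w$parameter = 10 degrees of freedom, qt(1 - 0.05 / 2, 10) is about 2.23,
# so confidence.interval(2, 100, w, 0.05) would print roughly "±4.46%".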
# Print a table with results
statistics = ddply(dat, "name", function(subdat) {
  old.rate = subset(subdat, binary == "old")$rate;
  new.rate = subset(subdat, binary == "new")$rate;

  # Calculate improvement for the "new" binary compared with the "old" binary
  old.mu = mean(old.rate);
  new.mu = mean(new.rate);
  improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu * 100));

  r = list(
    confidence = "NA",
    improvement = improvement,
    "accuracy (*)" = "NA",
    "(**)" = "NA",
    "(***)" = "NA"
  );

  # Check if there is enough data to calculate the p-value
  if (length(old.rate) > 1 && length(new.rate) > 1) {
    # Perform a statistical test to see if there actually is a difference in
    # performance.
    w = t.test(rate ~ binary, data=subdat);
    shared.se = welch.sd(old.rate, new.rate)

    # Add user-friendly stars to the table. There should be at least one star
    # before you can say that there is an improvement.
    confidence = '';
    if (w$p.value < 0.001) {
      confidence = '***';
    } else if (w$p.value < 0.01) {
      confidence = '**';
    } else if (w$p.value < 0.05) {
      confidence = '*';
    }

    r = list(
      confidence = confidence,
      improvement = improvement,
      "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05),
      "(**)" = confidence.interval(shared.se, old.mu, w, 0.01),
      "(***)" = confidence.interval(shared.se, old.mu, w, 0.001)
    );
  }

  return(data.frame(r, check.names=FALSE));
});
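
# Note: t.test() defaults to var.equal = FALSE, so the test above is a Welch
# two-sample t-test; welch.sd() computes the matching shared standard error.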
# Set the benchmark names as the row.names to left-align them in the printed
# table
row.names(statistics) = statistics$name;
statistics$name = NULL;
options(width = 200);
print(statistics);
cat("\n")
cat(sprintf(
"Be aware that when doing many comparisons the risk of a false-positive
result increases. In this case there are %d comparisons; you can thus
expect the following number of false-positive results:
  %.2f false positives, when considering a 5%% risk acceptance (*, **, ***),
  %.2f false positives, when considering a 1%% risk acceptance (**, ***),
  %.2f false positives, when considering a 0.1%% risk acceptance (***)
",
nrow(statistics),
nrow(statistics) * 0.05,
nrow(statistics) * 0.01,
nrow(statistics) * 0.001))
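
# The counts printed above are simply the number of comparisons multiplied by
# the risk level, i.e. the false positives to expect if none of the benchmarks
# actually changed.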