# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Do statistical tests on benchmark results.
# This script requires the libraries rjson, R.utils, ggplot2 and data.table.
# Install them prior to running.

# To use the script, first get some benchmark results, for example via
# tools/run_perf.py ../v8-perf/benchmarks/Octane2.1/Octane2.1-TF.json
#  --outdir=out/x64.release-on --outdir-secondary=out/x64.release-off
# --json-test-results=results-on.json
# --json-test-results-secondary=results-off.json
# then run this script
# Rscript statistics-for-json.R results-on.json results-off.json ~/SVG
# to produce graphs in the directory given as the third argument (~/SVG here)
# and get the output of the statistical tests on stdout.


suppressMessages(library("rjson"))       # for fromJSON
suppressMessages(library("R.utils"))     # for printf
suppressMessages(library("ggplot2"))     # for plotting
suppressMessages(library("data.table"))  # less broken than data.frame

# Significance level used for every test and for colouring the summary plot.
significance_level <- 0.05

# shapiro.test() needs at least three observations, so traces with fewer
# results on either side are skipped instead of aborting the whole run.
min_sample_size <- 3L

# Return the results of the trace at position `index` of a parsed result
# file as an integer vector, or an empty vector if there is no such trace.
# NOTE: as.integer() truncates fractional scores towards zero.
trace_scores <- function(run, index) {
  if (index > length(run$traces)) {
    return(integer(0))
  }
  as.integer(run$traces[[index]]$results)
}

# Evaluate a statistical test; on error print the reason and return NULL so
# that one degenerate benchmark (e.g. constant scores) does not stop the run.
attempt_test <- function(test_label, test_call) {
  tryCatch(test_call, error = function(e) {
    printf("  %s failed: %s\n", test_label, conditionMessage(e))
    NULL
  })
}

# Run a Shapiro-Wilk test on `scores` and print the verdict.
# The null hypothesis is that the sample comes from a normal distribution;
# a small p-value is evidence against it. See [wikipedia:Shapiro-Wilk-Test].
report_normality <- function(label, scores) {
  result <- attempt_test(sprintf("%s Shapiro-Wilk test", label),
                         shapiro.test(scores))
  if (is.null(result)) {
    return(invisible(NULL))
  }
  verdict <- if (result$p.value < significance_level) {
    "not normally"
  } else {
    "normally"
  }
  printf("  %s scores look %s distributed (W=%.4f, p=%.4f)\n",
         label, verdict, result$statistic, result$p.value)
  invisible(result)
}

# Write a 50-bin histogram of `scores` to `file_name`.
save_histogram <- function(scores, axis_label, file_name) {
  histogram <- ggplot(data = data.frame(x = scores), aes(x)) +
    theme_bw() +
    geom_histogram(bins = 50) +
    ylab("Points") +
    xlab(axis_label)
  ggsave(filename = file_name, plot = histogram, width = 7, height = 7)
}

# Compare the patched and unpatched result files trace by trace (traces are
# matched by position), print the test results, and write one histogram pair
# per benchmark plus a summary plot of the estimated effects to output_path.
run_statistics <- function(patch_file, nopatch_file, output_path) {
  patch <- fromJSON(file = patch_file)
  nopatch <- fromJSON(file = nopatch_file)
  if (!dir.exists(output_path)) {
    dir.create(output_path, recursive = TRUE)
  }

  effect_rows <- list()
  sample_size <- 0L

  for (i in seq_along(patch$traces)) {
    test_name <- patch$traces[[i]]$graphs[[2]]
    printf("%s\n", test_name)

    patch_res <- trace_scores(patch, i)
    nopatch_res <- trace_scores(nopatch, i)
    if (min(length(patch_res), length(nopatch_res)) < min_sample_size) {
      printf("  Skipped: need at least %d results per run (got %d and %d)\n",
             min_sample_size, length(patch_res), length(nopatch_res))
      next
    }
    sample_size <- length(patch_res)

    report_normality("Patched", patch_res)
    report_normality("Unpatched", nopatch_res)

    save_histogram(patch_res, test_name,
                   sprintf("%s/%s.svg", output_path, test_name))
    save_histogram(nopatch_res, test_name,
                   sprintf("%s/%s-before.svg", output_path, test_name))

    # The Wilcoxon rank-sum test; its estimate is the location shift
    # between the patched and the unpatched scores.
    mww <- attempt_test(
      "Wilcoxon U-test",
      wilcox.test(patch_res, nopatch_res, conf.int = TRUE, exact = TRUE)
    )
    if (!is.null(mww)) {
      printf(paste("  Wilcoxon U-test W=%.4f, p=%.4f,",
                   "confidence interval [%.1f, %.1f],",
                   "est. effect size %.1f \n"),
             mww$statistic, mww$p.value,
             mww$conf.int[1], mww$conf.int[2], mww$estimate)
      effect_rows[[length(effect_rows) + 1L]] <- list(
        L = mww$conf.int[1],
        R = mww$conf.int[2],
        E = unname(mww$estimate),
        p.value = unname(mww$p.value),
        yL = test_name,
        # isTRUE() maps an undefined (NaN) p-value to "not significant".
        p.value.sig = isTRUE(mww$p.value < significance_level)
      )
    }

    # Welch t-test
    welch <- attempt_test("Welch t-test",
                          t.test(patch_res, nopatch_res, paired = FALSE))
    if (!is.null(welch)) {
      printf(paste("  Welch t-test t=%.4f, df = %.2f, p=%.4f,",
                   "confidence interval [%.1f, %.1f], mean diff %.1f \n"),
             welch$statistic, welch$parameter, welch$p.value,
             welch$conf.int[1], welch$conf.int[2],
             welch$estimate[1] - welch$estimate[2])
    }
  }

  if (length(effect_rows) == 0) {
    printf("No benchmark could be analysed; skipping the summary plot.\n")
    return(invisible(NULL))
  }

  effects <- rbindlist(effect_rows)
  effects <- effects[order(effects$E)]
  # A factor with both levels keeps colours and legend labels attached to
  # the right outcome even when only one outcome occurs in the data.
  plot_data <- cbind(
    x = seq_len(nrow(effects)),
    effects,
    significance = factor(effects$p.value.sig,
                          levels = c(FALSE, TRUE),
                          labels = c("not significant", "significant"))
  )

  speedup <- ggplot(plot_data, aes(x = x, y = E, colour = significance)) +
    # L and R are the lower and upper bounds of the confidence interval.
    geom_errorbar(aes(ymin = L, ymax = R), colour = "black") +
    geom_point(size = 4) +
    scale_x_discrete(limits = plot_data$yL,
                     name = paste("Benchmark, n=", sample_size)) +
    theme_bw() +
    geom_hline(yintercept = 0) +
    ylab("Est. Effect Size in Points") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    scale_colour_manual(
      name = sprintf("Statistical Significance (MWW, p < %.2f)",
                     significance_level),
      values = c("not significant" = "red", "significant" = "green"),
      drop = FALSE
    ) +
    theme(legend.justification = c(0, 1), legend.position = c(0, 1))
  print(speedup)
  ggsave(filename = sprintf("%s/speedup-estimates.svg", output_path),
         plot = speedup, width = 7, height = 7)
  invisible(plot_data)
}

args <- commandArgs(TRUE)
if (length(args) != 3) {
  printf(paste("usage: Rscript %%this_script patched-results.json",
               "unpatched-results.json output-directory\n"))
} else {
  run_statistics(args[1], args[2], args[3])
}
114