# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Do statistical tests on benchmark results
# This script requires the libraries rjson, R.utils, ggplot2 and data.table
# Install them prior to running

# To use the script, first get some benchmark results, for example via
# tools/run_perf.py ../v8-perf/benchmarks/Octane2.1/Octane2.1-TF.json
#  --outdir=out/x64.release-on --outdir-secondary=out/x64.release-off
#  --json-test-results=results-on.json
#  --json-test-results-secondary=results-off.json
# then run this script
# Rscript statistics-for-json.R results-on.json results-off.json ~/SVG
# to produce graphs (and get stdio output of statistical tests).


suppressMessages(library("rjson"))      # for fromJSON
suppressMessages(library("R.utils"))    # for printf
suppressMessages(library("ggplot2"))    # for plotting
suppressMessages(library("data.table")) # less broken than data.frame

# Threshold below which a p-value is reported as significant. The legend title
# of the summary plot spells out the same number, so keep the two in sync.
significance_level <- 0.05

# Write a 50-bin histogram of `scores` to the file `path`.
#   scores: vector of benchmark scores.
#   label:  text used for the x axis.
#   path:   output file name; the extension selects the format used by ggsave.
save_histogram <- function(scores, label, path) {
  histogram <- ggplot(data = data.frame(x = scores), aes(x)) +
    theme_bw() +
    geom_histogram(bins = 50) +
    ylab("Points") +
    xlab(label)
  ggsave(filename = path, plot = histogram, width = 7, height = 7)
}

# Run a Shapiro-Wilk test on `scores` and print the verdict, prefixed by
# `label`.
#
# The Shapiro-Wilk test indicates whether data is not likely to come from a
# normal distribution. The smaller p, the more likely the sample was not drawn
# from a normal distribution. See [wikipedia:Shapiro-Wilk-Test].
#
# If shapiro.test() signals an error, the reason is printed and the script
# carries on with the remaining tests instead of aborting.
report_normality <- function(label, scores) {
  result <- tryCatch(shapiro.test(scores), error = function(e) e)
  if (inherits(result, "error")) {
    printf("  %s scores: normality test skipped (%s)\n",
           label, conditionMessage(result))
    return(invisible(NULL))
  }
  verdict <- if (result$p.value < significance_level) {
    "not normally"
  } else {
    "normally"
  }
  printf("  %s scores look %s distributed (W=%.4f, p=%.4f)\n",
         label, verdict, result$statistic, result$p.value)
  invisible(result)
}

args <- commandArgs(TRUE)
if (length(args) != 3) {
  # Exactly three arguments are needed: both result files and the directory
  # that receives the generated graphs.
  printf(paste("usage: Rscript statistics-for-json.R patched-results.json",
               "unpatched-results.json output-directory\n"))
} else {
  patch <- fromJSON(file = args[1])
  nopatch <- fromJSON(file = args[2])
  output_path <- args[3]
  # ggsave does not create missing directories; make sure the target exists.
  dir.create(output_path, showWarnings = FALSE, recursive = TRUE)

  # One entry per compared benchmark; bound into a single table after the loop.
  rows <- list()
  # Sample count of the most recently compared benchmark (plot annotation).
  n_samples <- 0L

  for (i in seq_along(patch$traces)) {
    test_name <- patch$traces[[i]]$graphs[[2]]
    printf("%s\n", test_name)

    # Traces are paired by position: trace i of the patched run is compared
    # against trace i of the unpatched run.
    patch_res <- as.integer(patch$traces[[i]]$results)
    nopatch_res <- if (i <= length(nopatch$traces)) {
      as.integer(nopatch$traces[[i]]$results)
    } else {
      integer(0)
    }
    if (length(patch_res) == 0 || length(nopatch_res) == 0) {
      printf("  skipped: no results in one of the two runs\n")
      next
    }
    n_samples <- length(patch_res)

    report_normality("Patched", patch_res)
    report_normality("Unpatched", nopatch_res)

    save_histogram(patch_res, test_name,
                   file.path(output_path, paste0(test_name, ".svg")))
    save_histogram(nopatch_res, test_name,
                   file.path(output_path, paste0(test_name, "-before.svg")))

    # The Wilcoxon rank-sum test
    mww <- wilcox.test(patch_res, nopatch_res, conf.int = TRUE, exact = TRUE)
    printf(paste("  Wilcoxon U-test W=%.4f, p=%.4f,",
                 "confidence interval [%.1f, %.1f],",
                 "est. effect size %.1f \n"),
           mww$statistic, mww$p.value,
           mww$conf.int[1], mww$conf.int[2], mww$estimate)
    # L/R are the lower/upper confidence bounds, E the estimated effect size
    # and yL the axis label of this benchmark in the summary plot.
    rows[[length(rows) + 1L]] <- list(
      L = mww$conf.int[1],
      R = mww$conf.int[2],
      E = unname(mww$estimate),
      p.value = unname(mww$p.value),
      yL = test_name,
      p.value.sig = mww$p.value < significance_level
    )

    # t-test
    welch <- t.test(patch_res, nopatch_res, paired = FALSE)
    printf(paste("  Welch t-test t=%.4f, df = %.2f, p=%.4f,",
                 "confidence interval [%.1f, %.1f], mean diff %.1f \n"),
           welch$statistic, welch$parameter, welch$p.value,
           welch$conf.int[1], welch$conf.int[2],
           welch$estimate[1] - welch$estimate[2])
  }

  if (length(rows) == 0) {
    printf("No benchmark had results in both runs; no summary plot written.\n")
  } else {
    df <- rbindlist(rows)
    # Order benchmarks by estimated effect size; x is the plotting position.
    df2 <- cbind(x = seq_len(nrow(df)), df[order(E), ])
    # An explicit factor pins colour and legend label to the test outcome, so
    # the legend stays correct even when only one outcome occurs in the data.
    df2$significance <- factor(df2$p.value.sig, levels = c(FALSE, TRUE),
                               labels = c("not significant", "significant"))

    speedup <- ggplot(df2, aes(x = x, y = E, colour = significance)) +
      geom_errorbar(aes(ymin = L, ymax = R), colour = "black") +
      geom_point(size = 4) +
      scale_x_discrete(limits = df2$yL,
                       name = paste("Benchmark, n=", n_samples)) +
      theme_bw() +
      geom_hline(yintercept = 0) +
      ylab("Est. Effect Size in Points") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
      scale_colour_manual(name = "Statistical Significance (MWW, p < 0.05)",
                          values = c("not significant" = "red",
                                     "significant" = "green")) +
      theme(legend.justification = c(0, 1), legend.position = c(0, 1))
    print(speedup)
    ggsave(filename = file.path(output_path, "speedup-estimates.svg"),
           plot = speedup, width = 7, height = 7)
  }
}