package org.openjdk;

import org.openjdk.jmh.annotations.*;

import java.util.concurrent.TimeUnit;

@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(value = 3, jvmArgsAppend = {"-Xmx1g", "-Xms1g"})
@State(Scope.Benchmark)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
public class StackTraceBench {

    @Param({"1", "10", "100", "1000"})
    private int depth;

    @Benchmark
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public StackTraceElement[] test() {
        return doDepth(depth);
    }

    private StackTraceElement[] doDepth(int depth) {
        if (depth == 0) {
            return new Exception().getStackTrace();
        }
        return doDepth(depth - 1);
    }

    /*
    === Baseline

    Benchmark                                 (depth)  Mode  Cnt      Score    Error  Units

    # Average time:
    StackTraceBench.test                            1  avgt   15      9.619 ±  0.637  us/op
    StackTraceBench.test                           10  avgt   15     14.820 ±  0.251  us/op
    StackTraceBench.test                          100  avgt   15     57.219 ±  5.149  us/op
    StackTraceBench.test                         1000  avgt   15    519.329 ± 40.075  us/op

    # Allocation rates:
    StackTraceBench.test:·gc.alloc.rate.norm        1  avgt   15   1160.005 ±  0.001  B/op
    StackTraceBench.test:·gc.alloc.rate.norm       10  avgt   15   1512.020 ±  0.001  B/op
    StackTraceBench.test:·gc.alloc.rate.norm      100  avgt   15   6576.077 ±  0.021  B/op
    StackTraceBench.test:·gc.alloc.rate.norm     1000  avgt   15  56240.660 ±  0.211  B/op

    === Coleen's patch (Java array alloc)

    http://cr.openjdk.java.net/~shade/8150778/hs-original.patch
    http://cr.openjdk.java.net/~shade/8150778/jdk-original.patch

    Benchmark                                 (depth)  Mode  Cnt      Score    Error  Units

    # Average time:
    StackTraceBench.test                            1  avgt   15      8.114 ±  0.165  us/op
    StackTraceBench.test                           10  avgt   15     11.668 ±  0.087  us/op
    StackTraceBench.test                          100  avgt   15     47.622 ±  1.137  us/op
    StackTraceBench.test                         1000  avgt   15    433.374 ± 53.341  us/op

    # Allocation rates:
    StackTraceBench.test:·gc.alloc.rate.norm        1  avgt   15   1168.014 ±  0.002  B/op
    StackTraceBench.test:·gc.alloc.rate.norm       10  avgt   15   1520.017 ±  0.005  B/op
    StackTraceBench.test:·gc.alloc.rate.norm      100  avgt   15   6584.068 ±  0.019  B/op
    StackTraceBench.test:·gc.alloc.rate.norm     1000  avgt   15  56248.392 ±  0.269  B/op

    Interpretation: with this patch, the test runs significantly faster than the JDK 9
    baseline, and allocates roughly the same amount of memory as the previous version.

    === Coleen's patch + returning the array

    http://cr.openjdk.java.net/~shade/8150778/jdk-returnArray.patch
    http://cr.openjdk.java.net/~shade/8150778/hs-returnArray-optionalNoIntern.patch

    Benchmark                                 (depth)  Mode  Cnt      Score    Error  Units

    # Average time:
    StackTraceBench.test                            1  avgt   15      8.389 ±  0.111  us/op
    StackTraceBench.test                           10  avgt   15     12.216 ±  0.321  us/op
    StackTraceBench.test                          100  avgt   15     49.620 ±  0.733  us/op
    StackTraceBench.test                         1000  avgt   15    422.063 ±  9.776  us/op

    # Allocation rates:
    StackTraceBench.test:·gc.alloc.rate.norm        1  avgt   15   1168.004 ±  0.001  B/op
    StackTraceBench.test:·gc.alloc.rate.norm       10  avgt   15   1520.005 ±  0.001  B/op
    StackTraceBench.test:·gc.alloc.rate.norm      100  avgt   15   6584.023 ±  0.002  B/op
    StackTraceBench.test:·gc.alloc.rate.norm     1000  avgt   15  56248.185 ±  0.002  B/op

    Interpretation: this patch runs marginally slower than the Java-allocated-array version,
    but still much faster than the baseline, and allocates the same amount of Java heap
    memory. Looking at the profiles, I believe the difference is native memset vs. Java
    allocation-time zeroing performance.
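    As background for the zeroing remark above, a Java-side array allocation is guaranteed
    by the JLS to hand back default-initialized (null) slots, so both approaches pay for
    zeroing, just on different code paths. A minimal stand-alone sketch (the class name
    ZeroInitDemo is made up for illustration and is not part of the benchmark):

    ```java
    public class ZeroInitDemo {
        public static void main(String[] args) {
            // A freshly allocated Java array arrives pre-zeroed: every
            // reference slot is null before anything is stored into it.
            StackTraceElement[] frames = new StackTraceElement[4];
            for (StackTraceElement frame : frames) {
                System.out.println(frame == null); // prints "true" for each slot
            }
        }
    }
    ```

    The VM-side path does the equivalent work with a native memset on the freshly
    allocated backing storage.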
    === Coleen's patch + returning the array + replacing StringTable::intern with String::create_from

    http://cr.openjdk.java.net/~shade/8150778/jdk-returnArray.patch
    http://cr.openjdk.java.net/~shade/8150778/hs-returnArray-optionalNoIntern.patch
    + uncomment the String::create_from lines

    Benchmark                                 (depth)  Mode  Cnt       Score    Error  Units

    # Average time:
    StackTraceBench.test                            1  avgt   15       6.355 ±  0.047  us/op
    StackTraceBench.test                           10  avgt   15       9.220 ±  0.316  us/op
    StackTraceBench.test                          100  avgt   15      38.389 ±  1.434  us/op
    StackTraceBench.test                         1000  avgt   15     323.706 ±  7.382  us/op

    # Allocation rates:
    StackTraceBench.test:·gc.alloc.rate.norm        1  avgt   15    4120.003 ±  0.001  B/op
    StackTraceBench.test:·gc.alloc.rate.norm       10  avgt   15    6128.004 ±  0.001  B/op
    StackTraceBench.test:·gc.alloc.rate.norm      100  avgt   15   27752.017 ±  0.002  B/op
    StackTraceBench.test:·gc.alloc.rate.norm     1000  avgt   15  243016.137 ±  0.005  B/op

    Interpretation: avoiding String interning improves single-threaded performance, because
    the critical path is shorter. However, it comes with much greater allocation pressure.
    This suggests we may want to cache some of the things the original patch interns directly
    in the Symbols: that could trim the interning costs without paying the extra allocation
    pressure.
    */
}
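As an aside, the interning trade-off discussed in the last interpretation can be seen at the
Java level too: interning yields one canonical String per distinct contents, while building
fresh Strings (as the String::create_from variant does in the VM) allocates a new object on
every call. A minimal stand-alone sketch (the class name InternDemo is made up for
illustration and is not part of the benchmark or the patches):

```java
public class InternDemo {
    public static void main(String[] args) {
        // Two fresh String objects with identical contents.
        String a = new String(new char[]{'f', 'o', 'o'});
        String b = new String(new char[]{'f', 'o', 'o'});

        System.out.println(a == b);                   // false: distinct heap objects
        System.out.println(a.intern() == b.intern()); // true: one canonical instance
    }
}
```

Interning avoids duplicate objects at the cost of a string-table lookup on the critical path;
skipping it is cheaper per call but allocates a fresh String for every frame, matching the
roughly 4x higher gc.alloc.rate.norm figures above.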