Java VM V8 Dart VM

complexity

ChaCha20

function getBlock(buffer) { var x = new Uint32Array(16); for (var i = 16; i--;) x[i] = input[i]; for (var i = 20; i > 0; i -= 2) { quarterRound(x, 0, 4, 8,12); quarterRound(x, 1, 5, 9,13); quarterRound(x, 2, 6,10,14); quarterRound(x, 3, 7,11,15); quarterRound(x, 0, 5,10,15); quarterRound(x, 1, 6,11,12); quarterRound(x, 2, 7, 8,13); quarterRound(x, 3, 4, 9,14); } for (i = 16; i--;) x[i] += input[i]; for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]); input[12]++; return buffer; }

19MB/s

function getBlock(buffer) { var x = new Uint32Array(16); for (var i = 16; i--;) x[i] = input[i]; for (var i = 20; i > 0; i -= 2) { quarterRound(x, 0, 4, 8,12); quarterRound(x, 1, 5, 9,13); quarterRound(x, 2, 6,10,14); quarterRound(x, 3, 7,11,15); quarterRound(x, 0, 5,10,15); quarterRound(x, 1, 6,11,12); quarterRound(x, 2, 7, 8,13); quarterRound(x, 3, 4, 9,14); } for (i = 16; i--;) x[i] += input[i]; for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]); input[12]++; return buffer; }

function getBlock(buffer) { var x = new Uint32Array(16); for (var i = 16; i--;) x[i] = input[i]; for (var i = 20; i > 0; i -= 2) { // quarterRound(x, 0, 4, 8,12); x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) << 16) | ((x[12] ^ x[ 0]) >>> 16); x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) << 12) | ((x[ 4] ^ x[ 8]) >>> 20); x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) << 8) | ((x[12] ^ x[ 0]) >>> 24); x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) << 7) | ((x[ 4] ^ x[ 8]) >>> 25); // ... so on ... } for (i = 16; i--;) x[i] += input[i]; for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]); input[12]++; return buffer; }

2MB/s

function U32TO8_LE(x, i, u) { x[i] = u; u >>>= 8; x[i+1] = u; u >>>= 8; x[i+2] = u; u >>>= 8; x[i+3] = u; }

function U32TO8_LE(x, i, u) { // _ _ __ __ _______ _______ _______ _______ __ __ _ _ // | || | | | | || || | | _ || || |_| || || | // |_||_| | | | || _____|| ___| | |_| || _____|| ||_||_| // | |_| || |_____ | |___ | || |_____ | | // | ||_____ || ___| | ||_____ || | // | | _____| || |___ | _ | _____| || ||_|| | // |_______||_______||_______| |__| |__||_______||_| |_| // x[i] = u; u >>>= 8; x[i+1] = u; u >>>= 8; x[i+2] = u; u >>>= 8; x[i+3] = u; }

19MB/s

function U32TO8_LE(x, i, u) { // _ _ __ __ _______ _______ _______ _______ __ __ _ _ // | || | | | | || || | | _ || || |_| || || | // |_||_| | | | || _____|| ___| | |_| || _____|| ||_||_| // | |_| || |_____ | |___ | || |_____ | | // | ||_____ || ___| | ||_____ || | // | | _____| || |___ | _ | _____| || ||_|| | // |_______||_______||_______| |__| |__||_______||_| |_| // // ^^^^^^ Don't remove this comment it makes the code 10x faster. ^^^^^^ x[i] = u; u >>>= 8; x[i+1] = u; u >>>= 8; x[i+2] = u; u >>>= 8; x[i+3] = u; }

...

we say performance

we mean jsperf.com

// split str.split("e").length - 1; // while var count = 0; while (str.length) { if (str.charAt(0) === "e") { count += 1; } str = str.slice(1); }

// split (IT'S OVER 9000K OP/S ON FF!) str.split("e").length - 1; // while var count = 0; while (str.length) { if (str.charAt(0) === "e") { count += 1; } str = str.slice(1); }

DOUBT EVERYTHING

µbench 101

the cost of OP ?

function benchmark() { var start = Date.now(); /* OP */ return (Date.now() - start); }

what if OP faster than clock?

function benchmark(N) { var start = Date.now(); for (var i = 0; i < N; i++) { /* OP */ } return (Date.now() - start) / N; }

\[C\]

\[C \times N\]

\[\frac{C \times N}{N}\]

\[\frac{C \times \cancel{N}}{\cancel{N}}\]

while (str.length) { if (str.charAt(0) === "e") { count += 1; } str = str.slice(1); }

function benchmark() { var str = "..."; var start = Date.now(); for (var i = 0; i < N; i++) { var count = 0; while (str.length) { if (str.charAt(0) === "e") { count += 1; } str = str.slice(1); } } return (Date.now() - start) / N; }

function benchmark() { var str = "..."; var start = Date.now(); for (var i = 0; i < N; i++) { var count = 0; while (str.length) { if (str.charAt(0) === "e") { count += 1; } str = str.slice(1); } } return (Date.now() - start) / N; }

repetitions start with str === ""

\[C \times N\]

\[C \times 1 + 0 \times (N - 1)\]

\[\frac{C \times 1 + 0 \times (N - 1)}{N}\]

\[\frac{C}{N}\]

\[\frac{C}{N} \approx 0\]

jsperf misuse OP should take the same time when repeatedly executed

« ~~ for the win!»

NOPE

JITs are clever

JITs optimize

as program runs

\[C \times N\]

\[C_u N_u + C_o N_o\] [unoptimized + optimized version]

\[{\sum C_{i} N_i}\] [multiple unoptimized and

optimized versions]

\[{\sum C_{i} N_i} + \color{#dc1e1a}{\frac{1}{\sqrt{2\pi}}\int C(\xi) e ^ {-\frac{\xi^2}{2}} d\xi}\] [math gets too complicated to approach analytically]

JITs are clever to measure you need to outsmart

Disclaimer : On slides I will show optimizations as source-to-source transformations. In reality compilers operate on more sophisticated representations.

function benchmark() { var i = '1561565', j; var start = Date.now(); for (var n = 0; n < N; n++) { j = ~~i; } return (Date.now() - start) / N; }

function benchmark() { var j; var start = Date.now(); for (var n = 0; n < N; n++) { j = ~~'1561565' ; } return (Date.now() - start) / N; }

function benchmark() { var j; var start = Date.now(); for (var n = 0; n < N; n++) { j = ~-1561566 ; } return (Date.now() - start) / N; }

function benchmark() { var j; var start = Date.now(); for (var n = 0; n < N; n++) { j = 1561565 ; } return (Date.now() - start) / N; }

constant propagation

«hey, I can trick the compiler!»

function benchmark() { var i = Date.now().toString(), j; // ^ not a constant any more var start = Date.now(); for (var n = 0; n < N; n++) { j = ~~i; } return (Date.now() - start) / N; }

«Wrong answer, McFly!»

function benchmark() { var i = Date.now().toString(), j; var start = Date.now(); for (var n = 0; n < N; n++) { j = ~~i; // ^^^ loop invariant } return (Date.now() - start) / N; }

function benchmark() { var i = Date.now().toString(), j; var start = Date.now(); var j0 = ~~i; // ^^^^^^^^ hoisted from the loop for (var n = 0; n < N; n++) { j = j0; } return (Date.now() - start) / N; }

loop invariant code motion

function benchmark() { var i = Date.now().toString(), j; var start = Date.now(); var j0 = ~~i; for (var n = 0; n < N; n++) { j = j0; } return (Date.now() - start) / N; }

function benchmark() { var i = Date.now().toString(), j ; var start = Date.now(); var j0 = ~~i; for (var n = 0; n < N; n++) { j = j0 ; } return (Date.now() - start) / N; }

function benchmark() { var i = Date.now().toString(), j ; var start = Date.now(); var j0 = ~~i; for (var n = 0; n < N; n++) { j = j0 ; } return (Date.now() - start) / N; }

function benchmark() { var i = Date.now().toString() , j ; var start = Date.now(); var j0 = ~~i; for (var n = 0; n < N; n++) { j = j0 ; } return (Date.now() - start) / N; }

function benchmark() { var start = Date.now(); for (var n = 0; n < N; n++) { /* sound of silence */ } return (Date.now() - start) / N; }

function benchmark() { var start = Date.now(); /* * sound of silence */ return (Date.now() - start) / N; }

dead code elimination

// Remember this one? // split (IT'S OVER 9000K OP/S ON FF!) str.split("e").length - 1; // it's full of dead code.

optimizer eats µbenchmarks

for breakfast

chew proof µbenchmarks?

do not try

to write them

verify results no constants no loop invariants no dead code

function benchmark() { var i = Date.now().toString(), i1 = Date.now().toString(), t; var j; var start = Date.now(); for (var n = 0; n < N; n++) { t = i; i = i1; i1 = t; j = ~~i; } if (i != j || i1 != j) throw "whoa?"; return (Date.now() - start) / N; }

not bad

(potentially) still not enough

for (var n = 0; n < N; n++) { t = i; i = i1; i1 = t; j = ~~i; }

for (var n = 0; n < (N / 2) ; n++) { t = i; i = i1; i1 = t; j = ~~i; t = i; i = i1; i1 = t; j = ~~i; } if ((N % 2) === 1) { t = i; i = i1; i1 = t; j = ~~i; }

for (var n = 0; n < (N / 2); n++) { j = ~~i1; t = i1; j = ~~i; }

for (var n = 0; n < (N / 2); n++) { j = ~~i1; /* dead */ t = i1; /* dead */ j = ~~i; /* invariant */ }

loop unrolling

V8 doesn't do it but I want to induce paranoia ☺

µbench ⚔ VM

$ d8 concat.js Inside x 198,893,290 ops/sec ±1.08% Outside x 674,248,118 ops/sec ±1.08% Fastest is Outside Fastest is Outside

«quick! move all your strings out of functions»

time to look

at the code we expect both variants to be DCEd

IRHydra2 all samples from this talk

are demos within the tool

why Outside is fast(er)?

the forest hidden behind the trees

Outside is fast

at doing nothing

V8 failed to constant fold [bug!]

lets fix it!

--- src/hydrogen.cc (revision 21348) +++ src/hydrogen.cc (working copy) @@ -9639,6 +9639,14 @@ return left; } + if (FLAG_fold_constants && + left->IsConstant() && HConstant::cast(left)->HasStringValue() && + right->IsConstant() && HConstant::cast(right)->HasStringValue()) { + return AddUncasted<HStringAdd>( + left, right, allocation_mode.GetPretenureMode(), + STRING_ADD_CHECK_NONE, allocation_mode.feedback_site()); + } + // Register the dependent code with the allocation site. if (!allocation_mode.feedback_site().is_null()) { ASSERT(!graph()->info()->IsStub());

$ d8 concat.js Inside x 758,530,478 ops/sec ±1.82% Outside x 674,248,118 ops/sec ±1.14% Fastest is Inside Fastest is Inside

both are now doing nothing !

presumption of performance

reasonable code

reasonably fast

confirmation

bias

«prototype chains are slow »

var obj = Object.create( Object.create( Object.create( Object.create( Object.create({prop: 10})))));

var obj = Object.create( Object.create( Object.create( Object.create( Object.create({prop: 10}))))); // LISP tribute ^^^^^

function doManyLookups() { var counter = 0; for(var i = 0; i < 1000; i++) for(var j = 0; j < 1000; j++) for(var k = 0; k < 1000; k++) counter += obj.prop; print('In total: ' + counter); }

function lookupAndCache() { var counter = 0; var value = obj.prop; for(var i = 0; i < 1000; i++) for(var j = 0; j < 1000; j++) for(var k = 0; k < 1000; k++) counter += value; print('In total: ' + counter); }

// State of art benchmark driver. function measure(f) { var start = Date.now(); f(); var end = Date.now(); print(f.name + ' took ' + (end - start) + ' ms.'); }

measure(doManyLookups); measure(lookupAndCache);

$ node prototype.js In total: 10000000000 doManyLookups took 8243 ms. In total: 10000000000 lookupAndCache took 1058 ms. CONFIRMED

lets make it harder

- Object.create({ prop: 10 }))))); + Object.create({ get prop () { return 10 } })))));

$ node prototype.js In total: 10000000000 doManyLookups took 1082 ms. In total: 10000000000 lookupAndCache took 1061 ms.

«what kind of voodoo is this?»

very old V8 did not inline loads from data properties defined on prototypes

trying newer V8

$ d8 prototype.js In total: 10000000000 doManyLookups took 1294 ms. In total: 10000000000 lookupAndCache took 1189 ms.

$ d8 prototype.js In total: 10000000000 doManyLookups took 1294 ms. In total: 10000000000 lookupAndCache took 1189 ms.

prototype chain traversal got LICMed

... and now for something completely different

what if we run benchmark

twice ?

measure(doManyLookups); measure(doManyLookups); measure(lookupAndCache); measure(lookupAndCache);

$ d8 prototype.js | grep took doManyLookups took 1301 ms. doManyLookups took 3408 ms. lookupAndCache took 1204 ms. lookupAndCache took 3406 ms.

what just

happened here?

'In total: ' + counter type-feedback leaks upwards into the loop and causes excessive boxing of the counter variable

workaround: hide + from representation inference

- print('In total: ' + counter); + print('In total: ' + counter.toString());

$ d8 prototype.js | grep took doManyLookups took 1298 ms. doManyLookups took 1119 ms. lookupAndCache took 1188 ms. lookupAndCache took 982 ms.

now desert!

method call

vs.

function call

function mk(word) { var len = word.length; if (len > 255) return undefined; var i = len >> 2; return String.fromCharCode( (word.charCodeAt( 0) & 0x03) << 14 | (word.charCodeAt( i) & 0x03) << 12 | (word.charCodeAt( i+i) & 0x03) << 10 | (word.charCodeAt(i+i+i) & 0x03) << 8 | len ); }

Benchmark.prototype.setup = function() { function mk(word) { /* ... */ } var MK = function() { }; MK.prototype.mk = mk; var mker = new MK; };

suite .add('Function', function() { var key = mk('www.wired.com'); key = mk('www.youtube.com'); key = mk('scorecardresearch.com'); key = mk('www.google-analytics.com'); }) .add('Method', function() { var key = mker.mk('www.wired.com'); key = mker.mk('www.youtube.com'); key = mker.mk('scorecardresearch.com'); key = mker.mk('www.google-analytics.com'); })

$ d8 method-vs-function.js Function x 4,149,776 ops/sec ±0.62% Method x 682,273,122 ops/sec ±0.72% Fastest is Method Fastest is Method

method call

is faster?

keep secret

what I am going to show you now

--- a/method-function.js +++ b/method-function.js @@ -2,6 +2,9 @@ load("../benchmark.js"); Benchmark.prototype.setup = function() { + "Speed" + "your" + "JS" + "with" + + "this" + "one" + "weird" + "trick"; + function mk(word) { var len = word.length; if (len > 255) return undefined;

$ d8 method-vs-function.js Function x 695,708,197 ops/sec ±0.38% Method x 692,496,013 ops/sec ±0.29% Fastest is Function,Method Fastest is Function,Method

Function was never optimized!

— heuristics that decide when to optimize are based (among other things) on the amount of initialized inline caches — function call does not go through an IC, but method call does — Method had enough initialized ICs to trigger optimizations, but Function didn't! — Until fake + ICs were added in the setup section yada yada

measuring almost empty loop again!

can function call be faster than method call?

// for (var item in list) { ... } var sum = 0; for (var i = 0, L = arr.length; i < arr.length; ++i) { if (arr.length !== L) H.throwConcurrentModificationError(arr); var item = list[i]; sum += item; }

// for (var item in list) { ... } var sum = 0; for (var i = 0, L = arr.length; i < arr.length; ++i) { // if (arr.length !== L) // H.throwConcurrentModificationError(arr); var item = list[i]; sum += item; }

$ d8 iteration.js WithCheck x 396,792 ops/sec ±0.37% WithoutCheck x 456,653 ops/sec ±0.82% Fastest is WithoutCheck (by 18%) Fastest is WithoutCheck (by 18%)

// for (var item in list) { ... } var sum = 0; for (var i = 0, L = arr.length; i < arr.length; ++i) { if (arr.length !== L) H.throwConcurrentModificationError(arr); var item = list[i]; sum += item; }

// for (var item in list) { ... } var sum = 0; for (var i = 0, L = arr.length; i < arr.length; ++i) { if (arr.length !== L) (0,H.throwConcurrentModificationError)(arr); var item = list[i]; sum += item; }

$ d8 iteration.js WithCheck x 396,792 ops/sec ±0.37% WithoutCheck x 456,653 ops/sec ±0.82% WithHack x 462,698 ops/sec ±0.48% Fastest is WithoutCheck,WithHack Fastest is WithoutCheck,WithHack

18% speedup by replacing

o.f()

with

(0,o.f)()

in the code that never executes

algorithms first µbenchmarks last

THANK YOU

WAIT! what about the first example?

function getBlock(buffer) { var x = new Uint32Array(16); for (var i = 16; i--;) x[i] = input[i]; for (var i = 20; i > 0; i -= 2) { // quarterRound(x, 0, 4, 8,12); x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) << 16) | ((x[12] ^ x[ 0]) >>> 16); x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) << 12) | ((x[ 4] ^ x[ 8]) >>> 20); x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) << 8) | ((x[12] ^ x[ 0]) >>> 24); x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) << 7) | ((x[ 4] ^ x[ 8]) >>> 25); // ... so on ... } for (i = 16; i--;) x[i] += input[i]; for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]); input[12]++; return buffer; }

repetitive deopt is always V8 bug

— inlining U32TO8_LE used to break safe-uint32 analysis — adding long comment into U32TO8_LE disables inlining — there are more sane workarounds

function getBlock(buffer) { var x = new Uint32Array(16); for (var i = 16; i--;) x[i] = input[i]; for (var i = 20; i > 0; i -= 2) { // ... } for (i = 16; i--;) x[i] += input[i]; for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]|0); // immediately truncate to int32 ^^ input[12]++; return buffer; }

never assume that a language feature has to be slow