Fast 64-bit integers for Scala.js

Scala.js logo

Sébastien Doeraene @sjrdoeraene

September 28, 2017 -- VM Meetup

LAMP, lamp.epfl.ch
École polytechnique fédérale de Lausanne
 

EPFL logo             Scala logo

Navigation tip: if charts do not display properly, force a re-layout in your browser (resize, toggle full screen, maximize/minimize, etc.)

Motivation

Problem

  • JavaScript only has Doubles
  • Thanks to asm.js and its encoding: (a + b) | 0
    it basically also has Ints
  • But Longs? Nope.

Shape of the solution

  • A class RuntimeLong provided by the compiler
  • Each primitive operation is rerouted to a method call

class RuntimeLong {
  constructor(lo, hi) {
    this.lo = lo;
    this.hi = hi;
  }
  or(b) {
    return new RuntimeLong(
      this.lo | b.lo, this.hi | b.hi);
  }
}
    

How bad is it?

  • 1 to 2 orders of magnitude slower than Ints
  • Some implementations disregard correctness
    (Ceylon, Emscripten under a flag, Kotlin until recently)
  • Switching from the spec'ed Long-based implementation of java.util.Random to a hand-optimized (but equivalent) one made the difference between unusably slow and unnoticeably fast for a user

State of the art

GWT -- data representation

  • 3 Ints l, m and h with bits 0--21, 22-43 and 44--63
  • Allows to streamline some carry propagation

public static LongEmul add(LongEmul a, LongEmul b) {
  int sum0 = a.l + b.l;
  int sum1 = a.m + b.m + (sum0 >> BITS);
  int sum2 = a.h + b.h + (sum1 >> BITS);
  return create(sum0 & MASK, sum1 & MASK, sum2 & MASK_2);
}
    

TeaVM and Kotlin -- data representation

  • 2 Ints lo and hi with bits 0--31 and 32--63
  • Streamlines bitwise operations, but complicates carry

function Long_add(a, b) {
    var a_lolo = a.lo & 0xFFFF;
    // ...
    var b_hihi = b.hi >>> 16;
    var lolo = (a_lolo + b_lolo) | 0;
    var lohi = (a_lohi + b_lohi + (lolo >> 16)) | 0;
    var hilo = (a_hilo + b_hilo + (lohi >> 16)) | 0;
    var hihi = (a_hihi + b_hihi + (hilo >> 16)) | 0;
    return new Long(
        (lolo & 0xFFFF) | ((lohi & 0xFFFF) << 16),
        (hilo & 0xFFFF) | ((hihi & 0xFFFF) << 16));
}
    

TeaVM -- fast paths with Doubles

  • Rely on fast operations on Doubles to speed up operations with small operands

function Long_add(a, b) {
  if (a.hi === (a.lo >> 31) && b.hi === (b.lo >> 31)) {
    return Long_fromNumber(a.lo + b.lo);
  } else if (Math.abs(a.hi) < Long_MAX_NORMAL &&
             Math.abs(b.hi) < Long_MAX_NORMAL) {
    return Long_fromNumber(
        Long_toNumber(a) + Long_toNumber(b));
  }
  // slow-path
}
    

Which is fastest?

  • Micro-benchmarks to measure GWT, TeaVM and Kotlin

Which is fastest?

  • Xor: TeaVM
  • Addition: Kotlin; but TeaVM without its "fast path" becomes faster
  • Multiplication: TeaVM (GWT first by a small margin on Chrome)
  • Division: sometimes GWT (divisor ~= dividend), sometimes TeaVM (divisor << dividend)
  • toString: GWT

Let's make a Best of Breed for Scala.js

  • Data representation of TeaVM
  • Implementations of the operations taken from the best for each

Improving the user-land implementation

Shift


def <<(n: Int): RuntimeLong = {
  val n1 = n & 63
  if (n1 == 0)
    this
  else if (n1 < 32)
    new RuntimeLong(lo << n1,
        (lo >>> (32 - n1)) | (hi << n1))
  else if (n1 == 32)
    new RuntimeLong(0, lo)
  else
    new RuntimeLong(0, lo << (n1 - 32))
}
    

Shift: removing branches


def <<(n: Int): RuntimeLong = {
  if ((n & 32) == 0)
    new RuntimeLong(lo << n,
        (lo >>> 1 >>> (31-n)) | (hi << n))
  else
    new RuntimeLong(0, lo << n)
}
    

Shift

  • Is it still correct?
  • Mechanically verified using Predicate-Qualified Types

Addition: textbook algorithm


def +(b: RuntimeLong): RuntimeLong = {
  val lo = this.lo + b.lo
  val hi = adc(this.hi, b.hi)
  new RuntimeLong(lo, hi)
}
    

Addition: implementing adc


def +(b: RuntimeLong): RuntimeLong = {
  val lo = this.lo + b.lo
  val hi =
    if (overflow_occurred_in_lo_+)
      this.hi + b.hi + 1
    else
      this.hi + b.hi
  new RuntimeLong(lo, hi)
}
    

Addition: implementing adc (2)


def +(b: RuntimeLong): RuntimeLong = {
  val lo = this.lo + b.lo
  val hi =
    if (lo ^ 0x80000000 < this.lo ^ 0x80000000)
      this.hi + b.hi + 1
    else
      this.hi + b.hi
  new RuntimeLong(lo, hi)
}
    

Multiplication

  • In TeaVM and Kotlin: requires 10 primitive Int multiplications
  • This is necessary to prevent intermediate multiplications from overflowing
  • What if we exploit the overflows instead?

def *(b: RuntimeLong): RuntimeLong = {
  val alo = this.lo
  val blo = b.lo
  val a0 = alo & 0xffff
  val a1 = alo >>> 16
  val b0 = blo & 0xffff
  val b1 = blo >>> 16

  val a0b0 = a0 * b0
  val a1b0 = a1 * b0
  val a0b1 = a0 * b1
  val lo = a0b0 + ((a1b0 + a0b1) << 16)
  val c1part = (a0b0 >>> 16) + a0b1
  val hi = {
    alo*b.hi + a.hi*blo + a1 * b1 +
    (c1part >>> 16) +
    (((c1part & 0xffff) + a1b0) >>> 16)
  }
  new RuntimeLong(lo, hi)
}
    

Multiplication

Division: GWT's core loop


def divModHelper(a: RuntimeLong, b: RuntimeLong,
    isDivide: Boolean): RuntimeLong = {
  var shift = numberOfLeadingZeros(b) -
    numberOfLeadingZeros(a)
  var bShift = b << shift
  var r = a
  var q = new RuntimeLong(0, 0)
  while (shift >= 0 && r != 0) {
    if (unsigned_>=(r, bShift)) {
      r -= bShift
      q |= (1L << shift)
    }
    shift -= 1
    bShift >>>= 1
  }
  if (isDivide) q
  else r
}
    

Division: GWT's core loop -- invariant


/* Invariants:
 *   q >= 0
 *   If shift >= 0:
 *     bShift == b << shift == b * 2^shift
 *     0 <= r < 2 * bShift
 *   Else:
 *     0 <= r < b
 *   q * b + r == a
 */
    

Division: short-cut once r < 2^53


while (shift >= 0 && !isUnsignedSafeDouble(r)) {
  ...
}
// q * b + r == a (per the invariant)
if (unsigned_>=(r, b)) {
  val rDbl = asUnsignedSafeDouble(r)
  val bDbl = asUnsignedSafeDouble(b)
  if (isDivide)
    q + fromUnsignedSafeDouble(rDbl / bDbl)
  else
    fromUnsignedSafeDouble(rDbl % bDbl)
} else {
  if (isDivide) q
  else r
}
    

Division: short-cut once r < 2^53 (2)

  • From the invariant: q * b + r == a
  • We also need 0 <= r < b
  • If r >= b, we finish with a double divison
  • Let q′ = r/b and r′ = r%b
  • We have q′·b + r′ == r and 0 <= r′ < b
  • Therefore q·b + (q′·b + r′) == a
  • And then (q + q′)·b + r′ == a

Conversion to string

  • Skip 😌

Optimizations

Optimizations

  • Inlining + scalar replacement
  • Simplification of Int operations

Inlining + scalar replacement

  • Force inlining of all the RuntimeLong methods
    except division and toString
  • Apply scalar replacement to all RuntimeLong
  • Upon escape, don't bail, instead allocate a new instance on the fly
    (OK because identity of RuntimeLong is not observable)

Simplification of Int operations

  • From the as simple and generic as 0 & x == 0
  • To the pretty specialized
    (x & 0xffff) >>> 16 == 0
  • To the pretty specialized and non-trivial
    (x >> c1) >> c2 == x >> ((c1 + c2) & 31)
    if (c1 & 31) + (c2 & 31) < 32,
    == x >> 31 otherwise

Example: multiply by a small constant


val result = x * 10
    

val result = x * 10
    

val result = 10 * x
    

val result = new RuntimeLong(10, 0).*(x)
    

val a = new RuntimeLong(10, 0)
val alo = a.lo
val blo = x.lo

val a0 = alo & 0xffff
val a1 = alo >>> 16
val b0 = blo & 0xffff
val b1 = blo >>> 16

val a0b0 = a0 * b0
val a1b0 = a1 * b0
val a0b1 = a0 * b1

val lo = a0b0 + ((a1b0 + a0b1) << 16)

val c1part = (a0b0 >>> 16) + a0b1
val hi = {
  alo * x.hi +
  a.hi * blo +
  a1 * b1 +
  (c1part >>> 16) +
  (((c1part & 0xffff) + a1b0) >>> 16)
}

val result = new RuntimeLong(lo, hi)
    

val a$lo = 10
val a$hi = 0
// x in (x$lo, x$hi)
val alo = a$lo
val blo = x$lo

val a0 = alo & 0xffff
val a1 = alo >>> 16
val b0 = blo & 0xffff
val b1 = blo >>> 16

val a0b0 = a0 * b0
val a1b0 = a1 * b0
val a0b1 = a0 * b1

val lo = a0b0 + ((a1b0 + a0b1) << 16)

val c1part = (a0b0 >>> 16) + a0b1
val hi = {
  alo * x$hi +
  a$hi * blo +
  a1 * b1 +
  (c1part >>> 16) +
  (((c1part & 0xffff) + a1b0) >>> 16)
}

// result in (lo, hi)
    

// x in (x$lo, x$hi)
val a0 = 10 & 0xffff
val a1 = 10 >>> 16
val b0 = x$lo & 0xffff
val b1 = x$lo >>> 16

val a0b0 = a0 * b0
val a1b0 = a1 * b0
val a0b1 = a0 * b1

val lo = a0b0 + ((a1b0 + a0b1) << 16)

val c1part = (a0b0 >>> 16) + a0b1
val hi = {
  10 * x$hi +
  0 * x$lo +
  a1 * b1 +
  (c1part >>> 16) +
  (((c1part & 0xffff) + a1b0) >>> 16)
}

// result in (lo, hi)
    

// x in (x$lo, x$hi)
val a0 = 10
val a1 = 0
val b0 = x$lo & 0xffff
val b1 = x$lo >>> 16

val a0b0 = 10 * b0
val a1b0 = 0 * b0 // multiplication disappears
val a0b1 = 10 * b1

val lo = a0b0 + ((a1b0 + a0b1) << 16)

val c1part = (a0b0 >>> 16) + a0b1
val hi = {
  10 * x$hi +
  0 * x$lo + // multiplication disappears
  0 * b1 + // multiplication disappears
  (c1part >>> 16) +
  (((c1part & 0xffff) + a1b0) >>> 16)
}

// result in (lo, hi)
    

// x in (x$lo, x$hi)
val b0 = x$lo & 0xffff
val b1 = x$lo >>> 16

val a0b0 = 10 * b0
val a0b1 = 10 * b1

val lo = a0b0 + ((0 + a0b1) << 16)

val c1part = (a0b0 >>> 16) + a0b1
val hi = {
  10 * x$hi +
  (c1part >>> 16) +
  (((c1part & 0xffff) + 0) >>> 16)
}

// result in (lo, hi)
    

// x in (x$lo, x$hi)
val b0 = x$lo & 0xffff
val b1 = x$lo >>> 16

val a0b0 = 10 * b0
val a0b1 = 10 * b1

val lo = a0b0 + (a0b1 << 16)

val c1part = (a0b0 >>> 16) + a0b1
val hi = {
  10 * x$hi +
  (c1part >>> 16)
}

// result in (lo, hi)
    

// x in (x$lo, x$hi)
val b0 = x$lo & 0xffff
val b1 = x$lo >>> 16
val a0b0 = 10 * b0
val a0b1 = 10 * b1
val lo = a0b0 + (a0b1 << 16)
val c1part = (a0b0 >>> 16) + a0b1
val hi = 10 * x$hi + (c1part >>> 16)
// result in (lo, hi)
    

Putting everything together

Scala.js logo

scala-js.org