[llvm] 542523a - [WebAssembly] Emulate v128.const efficiently

Thomas Lively via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 2 00:28:14 PDT 2020


Author: Thomas Lively
Date: 2020-10-02T00:28:06-07:00
New Revision: 542523a61a21c13e7f244bcf821b0fdeb8c6bb24

URL: https://github.com/llvm/llvm-project/commit/542523a61a21c13e7f244bcf821b0fdeb8c6bb24
DIFF: https://github.com/llvm/llvm-project/commit/542523a61a21c13e7f244bcf821b0fdeb8c6bb24.diff

LOG: [WebAssembly] Emulate v128.const efficiently

v128.const was recently implemented in V8, but until it rolls into Chrome
stable, we can't enable it in the WebAssembly backend without breaking origin
trial users. So far we have been lowering build_vectors that would otherwise
have become v128.const into a splat followed by a sequence of replace_lane
instructions that initialize each lane individually. That produces large and
inefficient code, so this patch introduces new logic to lower integer vector
constants to a single i64x2.splat where possible, with at most a single
i64x2.replace_lane following it if necessary.
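
As a rough standalone sketch of the packing scheme (not the LLVM code itself;
it assumes a little-endian host, whereas the patch proper uses
llvm::support::endian::byte_swap to enforce little-endian packing), the lane
constants are packed into two uint64_t halves alongside per-byte masks of the
constant lanes, and one half can be splatted whenever it agrees with the
other half's constant lanes:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      // Hypothetical input: the lanes of <4 x i32> <1, 2, 1, 2>, plus a
      // flag for which lanes are constant (an undef lane would be false).
      uint32_t Lanes[4] = {1, 2, 1, 2};
      bool IsConst[4] = {true, true, true, true};

      // Pack lane values and per-byte constant masks into two i64 halves.
      uint64_t I64s[2] = {0, 0};
      uint64_t ConstLaneMasks[2] = {0, 0};
      for (int I = 0; I < 4; ++I) {
        if (!IsConst[I])
          continue;
        std::memcpy(reinterpret_cast<uint8_t *>(I64s) + I * 4, &Lanes[I], 4);
        uint32_t Mask = ~uint32_t(0);
        std::memcpy(reinterpret_cast<uint8_t *>(ConstLaneMasks) + I * 4,
                    &Mask, 4);
      }

      // One half suffices if it matches the other half on all of the other
      // half's constant lanes; halves with disjoint constant lanes can
      // simply be OR-ed together.
      bool FirstHalfSufficient = (I64s[0] & ConstLaneMasks[1]) == I64s[1];
      bool SecondHalfSufficient = (I64s[1] & ConstLaneMasks[0]) == I64s[0];
      bool CombinedSufficient = (ConstLaneMasks[0] & ConstLaneMasks[1]) == 0;

      uint64_t Splatted = SecondHalfSufficient ? I64s[1]
                          : CombinedSufficient ? (I64s[0] | I64s[1])
                                               : I64s[0];
      bool NeedReplaceLane =
          !FirstHalfSufficient && !SecondHalfSufficient && !CombinedSufficient;
      std::printf("i64x2.splat %#llx%s\n", (unsigned long long)Splatted,
                  NeedReplaceLane ? ", then i64x2.replace_lane 1" : "");
      return 0;
    }

For <i32 1, i32 2, i32 1, i32 2> both halves pack to 0x0000000200000001
(8589934593), which is exactly the i64.const the new tests below expect
before the i64x2.splat.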

Adapted from a patch authored by @omnisip.

Differential Revision: https://reviews.llvm.org/D88591

Added: 
    

Modified: 
    llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
    llvm/test/CodeGen/WebAssembly/simd-build-vector.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 425f8b86c9fb..8474e50ea42f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -30,6 +30,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
@@ -1565,6 +1566,7 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
     };
   } else if (NumConstantLanes >= NumSplatLanes &&
              Subtarget->hasUnimplementedSIMD128()) {
+    // If we support v128.const, emit it directly
     SmallVector<SDValue, 16> ConstLanes;
     for (const SDValue &Lane : Op->op_values()) {
       if (IsConstant(Lane)) {
@@ -1576,11 +1578,67 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
       }
     }
     Result = DAG.getBuildVector(VecT, DL, ConstLanes);
-    IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
+    IsLaneConstructed = [&IsConstant](size_t _, const SDValue &Lane) {
       return IsConstant(Lane);
     };
-  }
-  if (!Result) {
+  } else if (NumConstantLanes >= NumSplatLanes && VecT.isInteger()) {
+    // Otherwise, if this is an integer vector, pack the lane values together so
+    // we can construct the 128-bit constant from a pair of i64s using a splat
+    // followed by at most one i64x2.replace_lane. Also keep track of the lanes
+    // that actually matter so we can avoid the replace_lane in more cases.
+    std::array<uint64_t, 2> I64s({0, 0});
+    std::array<uint64_t, 2> ConstLaneMasks({0, 0});
+    uint8_t *I64Bytes = reinterpret_cast<uint8_t *>(I64s.data());
+    uint8_t *MaskBytes = reinterpret_cast<uint8_t *>(ConstLaneMasks.data());
+    unsigned I = 0;
+    size_t ByteStep = VecT.getScalarSizeInBits() / 8;
+    for (const SDValue &Lane : Op->op_values()) {
+      if (IsConstant(Lane)) {
+        using llvm::support::little;
+        using llvm::support::endian::byte_swap;
+        // The endianness of the compiler matters here. We want to enforce
+        // little endianness so that the bytes of a smaller integer type will
+        // occur first in the uint64_t.
+        auto *Const = cast<ConstantSDNode>(Lane.getNode());
+        uint64_t Val = byte_swap(Const->getLimitedValue(), little);
+        uint8_t *ValPtr = reinterpret_cast<uint8_t *>(&Val);
+        std::copy(ValPtr, ValPtr + ByteStep, I64Bytes + I * ByteStep);
+        uint64_t Mask = uint64_t(-1LL);
+        uint8_t *MaskPtr = reinterpret_cast<uint8_t *>(&Mask);
+        std::copy(MaskPtr, MaskPtr + ByteStep, MaskBytes + I * ByteStep);
+      }
+      ++I;
+    }
+    // Check whether all constant lanes in the second half of the vector are
+    // equivalent in the first half or vice versa to determine whether splatting
+    // either side will be sufficient to materialize the constant. As a special
+    // case, if the first and second halves have no constant lanes in common, we
+    // can just combine them.
+    bool FirstHalfSufficient = (I64s[0] & ConstLaneMasks[1]) == I64s[1];
+    bool SecondHalfSufficient = (I64s[1] & ConstLaneMasks[0]) == I64s[0];
+    bool CombinedSufficient = (ConstLaneMasks[0] & ConstLaneMasks[1]) == 0;
+
+    uint64_t Splatted;
+    if (SecondHalfSufficient) {
+      Splatted = I64s[1];
+    } else if (CombinedSufficient) {
+      Splatted = I64s[0] | I64s[1];
+    } else {
+      Splatted = I64s[0];
+    }
+
+    Result = DAG.getSplatBuildVector(MVT::v2i64, DL,
+                                     DAG.getConstant(Splatted, DL, MVT::i64));
+    if (!FirstHalfSufficient && !SecondHalfSufficient && !CombinedSufficient) {
+      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i64, Result,
+                           DAG.getConstant(I64s[1], DL, MVT::i64),
+                           DAG.getConstant(1, DL, MVT::i32));
+    }
+    Result = DAG.getBitcast(VecT, Result);
+    IsLaneConstructed = [&IsConstant](size_t _, const SDValue &Lane) {
+      return IsConstant(Lane);
+    };
+  } else {
     // Use a splat, but possibly a load_splat
     LoadSDNode *SplattedLoad;
     if ((SplattedLoad = dyn_cast<LoadSDNode>(SplatValue)) &&
@@ -1593,11 +1651,14 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
     } else {
       Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
     }
-    IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
+    IsLaneConstructed = [&SplatValue](size_t _, const SDValue &Lane) {
       return Lane == SplatValue;
     };
   }
 
+  assert(Result);
+  assert(IsLaneConstructed);
+
   // Add replace_lane instructions for any unhandled values
   for (size_t I = 0; I < Lanes; ++I) {
     const SDValue &Lane = Op->getOperand(I);

diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
index 43cfa97933f8..afd7375d146a 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
@@ -8,12 +8,73 @@
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
+; CHECK-LABEL:  emulated_const_trivial_splat:
+; CHECK-NEXT:   .functype       emulated_const_trivial_splat () -> (v128)
+; SIMD-VM-NEXT: i64.const       $push0=, 8589934593
+; SIMD-VM-NEXT: i64x2.splat     $push1=, $pop0
+; SIMD-VM-NEXT: return  $pop1
+; UNIMP: v128.const
+define <4 x i32> @emulated_const_trivial_splat() {
+  ret <4 x i32> <i32 1, i32 2, i32 1, i32 2>
+}
+
+; CHECK-LABEL:  emulated_const_first_sufficient:
+; CHECK-NEXT:   .functype       emulated_const_first_sufficient () -> (v128)
+; SIMD-VM-NEXT: i64.const       $push0=, 8589934593
+; SIMD-VM-NEXT: i64x2.splat     $push1=, $pop0
+; SIMD-VM-NEXT: return  $pop1
+; UNIMP: v128.const
+define <4 x i32> @emulated_const_first_sufficient() {
+  ret <4 x i32> <i32 1, i32 2, i32 undef, i32 2>
+}
+
+; CHECK-LABEL:  emulated_const_second_sufficient:
+; CHECK-NEXT:   .functype       emulated_const_second_sufficient () -> (v128)
+; SIMD-VM-NEXT: i64.const       $push0=, 8589934593
+; SIMD-VM-NEXT: i64x2.splat     $push1=, $pop0
+; SIMD-VM-NEXT: return  $pop1
+; UNIMP: v128.const
+define <4 x i32> @emulated_const_second_sufficient() {
+  ret <4 x i32> <i32 1, i32 undef, i32 1, i32 2>
+}
+
+; CHECK-LABEL:  emulated_const_combined_sufficient:
+; CHECK-NEXT:   .functype       emulated_const_combined_sufficient () -> (v128)
+; SIMD-VM-NEXT: i64.const       $push0=, 8589934593
+; SIMD-VM-NEXT: i64x2.splat     $push1=, $pop0
+; SIMD-VM-NEXT: return  $pop1
+; UNIMP: v128.const
+define <4 x i32> @emulated_const_combined_sufficient() {
+  ret <4 x i32> <i32 1, i32 undef, i32 undef, i32 2>
+}
+
+; CHECK-LABEL:  emulated_const_either_sufficient:
+; CHECK-NEXT:   .functype       emulated_const_either_sufficient () -> (v128)
+; SIMD-VM-NEXT: i64.const       $push0=, 1
+; SIMD-VM-NEXT: i64x2.splat     $push1=, $pop0
+; SIMD-VM-NEXT: return  $pop1
+; UNIMP: v128.const
+define <4 x i32> @emulated_const_either_sufficient() {
+  ret <4 x i32> <i32 1, i32 undef, i32 1, i32 undef>
+}
+
+; CHECK-LABEL:  emulated_const_neither_sufficient:
+; CHECK-NEXT:   .functype       emulated_const_neither_sufficient () -> (v128)
+; SIMD-VM-NEXT: i64.const       $push0=, 8589934593
+; SIMD-VM-NEXT: i64x2.splat     $push1=, $pop0
+; SIMD-VM-NEXT: i64.const       $push2=, 17179869184
+; SIMD-VM-NEXT: i64x2.replace_lane      $push3=, $pop1, 1, $pop2
+; SIMD-VM-NEXT: return  $pop3
+define <4 x i32> @emulated_const_neither_sufficient() {
+  ret <4 x i32> <i32 1, i32 2, i32 undef, i32 4>
+}
+
 ; CHECK-LABEL: same_const_one_replaced_i16x8:
 ; CHECK-NEXT:  .functype       same_const_one_replaced_i16x8 (i32) -> (v128)
 ; UNIMP-NEXT:  v128.const      $push[[L0:[0-9]+]]=, 42, 42, 42, 42, 42, 0, 42, 42
 ; UNIMP-NEXT:  i16x8.replace_lane      $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0
 ; UNIMP-NEXT:  return          $pop[[L1]]
-; SIMD-VM: i16x8.splat
+; SIMD-VM: i64x2.splat
 define <8 x i16> @same_const_one_replaced_i16x8(i16 %x) {
   %v = insertelement
     <8 x i16> <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>,
@@ -27,7 +88,7 @@ define <8 x i16> @same_const_one_replaced_i16x8(i16 %x) {
 ; UNIMP-NEXT:  v128.const      $push[[L0:[0-9]+]]=, 1, -2, 3, -4, 5, 0, 7, -8
 ; UNIMP-NEXT:  i16x8.replace_lane      $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0
 ; UNIMP-NEXT:  return          $pop[[L1]]
-; SIMD-VM: i16x8.splat
+; SIMD-VM: i64x2.splat
 define <8 x i16> @different_const_one_replaced_i16x8(i16 %x) {
   %v = insertelement
     <8 x i16> <i16 1, i16 -2, i16 3, i16 -4, i16 5, i16 -6, i16 7, i16 -8>,
@@ -68,7 +129,7 @@ define <4 x float> @different_const_one_replaced_f32x4(float %x) {
 ; CHECK-NEXT:  .functype       splat_common_const_i32x4 () -> (v128)
 ; UNIMP-NEXT:  v128.const      $push[[L0:[0-9]+]]=, 0, 3, 3, 1
 ; UNIMP-NEXT:  return          $pop[[L0]]
-; SIMD-VM: i32x4.splat
+; SIMD-VM: i64x2.splat
 define <4 x i32> @splat_common_const_i32x4() {
   ret <4 x i32> <i32 undef, i32 3, i32 3, i32 1>
 }
@@ -206,7 +267,7 @@ define <16 x i8> @mashup_swizzle_i8x16(<16 x i8> %src, <16 x i8> %mask, i8 %spla
 ; UNIMP:       i8x16.replace_lane
 ; UNIMP:       i8x16.replace_lane
 ; UNIMP:       return
-; SIMD-VM: i8x16.splat
+; SIMD-VM: i64x2.splat
 define <16 x i8> @mashup_const_i8x16(<16 x i8> %src, <16 x i8> %mask, i8 %splatted) {
   ; swizzle 0
   %m0 = extractelement <16 x i8> %mask, i32 0


        

