[llvm] 291101a - [WebAssembly] Optimize vector shift using a splat value from outside block

Thomas Lively via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 25 08:13:33 PDT 2023


Author: Yolanda Chen
Date: 2023-08-25T08:13:27-07:00
New Revision: 291101aa8ea56b094c8c887b1a2cf4af36ac6dea

URL: https://github.com/llvm/llvm-project/commit/291101aa8ea56b094c8c887b1a2cf4af36ac6dea
DIFF: https://github.com/llvm/llvm-project/commit/291101aa8ea56b094c8c887b1a2cf4af36ac6dea.diff

LOG: [WebAssembly] Optimize vector shift using a splat value from outside block

The vector shift instructions in WebAssembly take an i32 shift amount, while
LLVM IR requires a binary operator to use the same type for both operands, so
the scalar shift amount is splatted into a vector. When that splat is defined
in a different basic block from the shift, instruction selection (which works
one block at a time) cannot see the splat's scalar source, and the vector
shift is unrolled into scalar shifts. This patch lets the vector shift
identify the splat source value from the other block, so lowering generates
the expected WebAssembly bytecode.

Reviewed By: tlively

Differential Revision: https://reviews.llvm.org/D158399
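
For context, the pattern the new shouldSinkOperands hook recognizes is a
non-constant splat of the shift amount: an insertelement into lane 0 followed
by a zero-mask shufflevector. CodeGenPrepare consults this hook and, when it
returns true, sinks copies of the listed operands into the shift's block, so
per-block instruction selection can fold the splat into the shift's i32
amount. Below is a minimal sketch of that pattern match, using
llvm::PatternMatch as the patch does (the helper name getSplatScalar is
illustrative and not part of the patch):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/PatternMatch.h"

    using namespace llvm;
    using namespace llvm::PatternMatch;

    // Illustrative helper (not part of the patch): return the scalar that
    // feeds a non-constant splat of the form
    //   %t = insertelement <N x T> poison, T %s, i32 0
    //   %v = shufflevector <N x T> %t, <N x T> poison, <N x i32> zeroinitializer
    // or nullptr if V does not have that shape. Constant splats are skipped,
    // mirroring the patch, since they need no sinking.
    static Value *getSplatScalar(Value *V) {
      if (isa<Constant>(V))
        return nullptr;
      Value *Scalar = nullptr;
      if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(Scalar), m_ZeroInt()),
                             m_Value(), m_ZeroMask())))
        return Scalar;
      return nullptr;
    }

In the patch itself, the uses pushed into Ops are the shuffle's use of the
insertelement and the shift's use of the shuffle, which is what lets both
instructions be sunk next to the vector shift.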

Added: 
    llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll

Modified: 
    llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
    llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index f48b3f4cabd0df..fd154a90edef1d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
@@ -833,6 +834,30 @@ bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
   return isa<Function>(GV) ? false : TargetLowering::isOffsetFoldingLegal(GA);
 }
 
+bool WebAssemblyTargetLowering::shouldSinkOperands(
+    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+  using namespace llvm::PatternMatch;
+
+  if (!I->getType()->isVectorTy() || !I->isShift())
+    return false;
+
+  Value *V = I->getOperand(1);
+  // We don't need to sink a constant splat.
+  if (dyn_cast<Constant>(V))
+    return false;
+
+  if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
+                         m_Value(), m_ZeroMask()))) {
+    // Sink insert
+    Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
+    // Sink shuffle
+    Ops.push_back(&I->getOperandUse(1));
+    return true;
+  }
+
+  return false;
+}
+
 EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
                                                   LLVMContext &C,
                                                   EVT VT) const {

diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index ecf5d5b1ea5dab..1d1338ab40d0e5 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -76,6 +76,8 @@ class WebAssemblyTargetLowering final : public TargetLowering {
   bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
   bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
   bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+  bool shouldSinkOperands(Instruction *I,
+                          SmallVectorImpl<Use *> &Ops) const override;
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                          EVT VT) const override;
   bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,

diff  --git a/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll
new file mode 100644
index 00000000000000..d4518d40e42986
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Test that SIMD shifts can be lowered correctly even when the shift
+; value is splatted in a different basic block.
+
+target triple = "wasm32-unknown-unknown"
+
+define void @shl_loop(ptr %a, i8 %shift, i32 %count) {
+; CHECK-LABEL: shl_loop:
+; CHECK:         .functype shl_loop (i32, i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:  .LBB0_1: # %body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    loop # label0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shl
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.set 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    i32.eqz
+; CHECK-NEXT:    br_if 0 # 0: up to label0
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    end_loop
+; CHECK-NEXT:    # fallthrough-return
+entry:
+ %t1 = insertelement <16 x i8> undef, i8 %shift, i32 0
+ %vshift = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
+ br label %body
+body:
+ %out = phi ptr [%a, %entry], [%b, %body]
+ %i = phi i32 [0, %entry], [%next, %body]
+ %v = load <16 x i8>, ptr %out, align 1
+ %r = shl <16 x i8> %v, %vshift
+ %b = getelementptr inbounds i8, ptr %out, i32 16
+ store <16 x i8> %r, ptr %b
+ %next = add i32 %i, 1
+ %i.cmp = icmp eq i32 %next, %count
+ br i1 %i.cmp, label %body, label %exit
+exit:
+ ret void
+}
+
+; Test that SIMD shifts can be lowered correctly when the shift value
+; is a phi inside the loop body.
+
+define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) {
+; CHECK-LABEL: shl_phi_loop:
+; CHECK:         .functype shl_phi_loop (i32, i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:  .LBB1_1: # %body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    loop # label1:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shl
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.set 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    i32.eqz
+; CHECK-NEXT:    br_if 0 # 0: up to label1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    end_loop
+; CHECK-NEXT:    # fallthrough-return
+entry:
+ br label %body
+body:
+ %out = phi ptr [%a, %entry], [%b, %body]
+ %i = phi i32 [0, %entry], [%next, %body]
+ %t1 = phi i8 [%shift, %entry], [%sand, %body]
+ %t2 = insertelement <16 x i8> undef, i8 %t1, i32 0
+ %vshift = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32> zeroinitializer
+ %v = load <16 x i8>, ptr %out, align 1
+ %r = shl <16 x i8> %v, %vshift
+ %b = getelementptr inbounds i8, ptr %out, i32 16
+ store <16 x i8> %r, ptr %b
+ %sand = and i8 %t1, 1
+ %next = add i32 %i, 1
+ %i.cmp = icmp eq i32 %next, %count
+ br i1 %i.cmp, label %body, label %exit
+exit:
+ ret void
+}
