[llvm] [WebAssembly] [Backend] Draft pull request for #129441 (PR #145108)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 21 19:00:37 PDT 2025
https://github.com/badumbatish updated https://github.com/llvm/llvm-project/pull/145108
>From ffb724a358564b8cdc48086c34c48c183ab9143b Mon Sep 17 00:00:00 2001
From: badumbatish <jjasmine at igalia.com>
Date: Sat, 21 Jun 2025 15:24:06 -0700
Subject: [PATCH 1/2] Precommit missed optimization test for #129441
This shows that vector.reduce.and is not well optimized for WebAssembly.
The long chain of shuffles and ands should be turned into a single all_true.
---
.../CodeGen/WebAssembly/simd-reduceand.ll | 77 +++++++++++++++++++
1 file changed, 77 insertions(+)
create mode 100644 llvm/test/CodeGen/WebAssembly/simd-reduceand.ll
diff --git a/llvm/test/CodeGen/WebAssembly/simd-reduceand.ll b/llvm/test/CodeGen/WebAssembly/simd-reduceand.ll
new file mode 100644
index 0000000000000..00beedfbd7e96
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-reduceand.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
+target triple = "wasm64"
+
+; Each i8 lane is compared against zero and sign-extended, the mask is
+; reinterpreted as <4 x i32>, and-reduced, and the scalar is tested for
+; non-zero. The CHECK lines record the shuffle/and chain emitted for it.
+; NOTE(review): after the bitcast, byte k of the reduced i32 is the AND of
+; bytes k, k+4, k+8 and k+12 of the mask, so the final `icmp ne 0` is true as
+; soon as one such byte column is entirely non-zero. That is weaker than all
+; 16 input bytes being non-zero - confirm before folding to i8x16.all_true.
+define i1 @reduce_and_to_all_true_16i8(<16 x i8> %0) {
+; CHECK-LABEL: reduce_and_to_all_true_16i8:
+; CHECK: .functype reduce_and_to_all_true_16i8 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i8x16.ne $push10=, $0, $pop0
+; CHECK-NEXT: local.tee $push9=, $0=, $pop10
+; CHECK-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK-NEXT: v128.and $push8=, $pop9, $pop1
+; CHECK-NEXT: local.tee $push7=, $0=, $pop8
+; CHECK-NEXT: i8x16.shuffle $push2=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK-NEXT: v128.and $push3=, $pop7, $pop2
+; CHECK-NEXT: i32x4.extract_lane $push4=, $pop3, 0
+; CHECK-NEXT: i32.const $push5=, 0
+; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5
+; CHECK-NEXT: return $pop6
+ %2 = icmp ne <16 x i8> %0, zeroinitializer
+ %3 = sext <16 x i1> %2 to <16 x i8>
+ %4 = bitcast <16 x i8> %3 to <4 x i32>
+ %5 = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %4)
+ %6 = icmp ne i32 %5, 0
+ ret i1 %6
+}
+
+
+; Each i32 lane is compared against zero and sign-extended to 0 / -1, the four
+; lanes are and-reduced, and the scalar is tested for non-zero, i.e. the
+; function returns true exactly when every lane of %0 is non-zero. The CHECK
+; lines record the shuffle/and chain emitted for it.
+define i1 @reduce_and_to_all_true_4i32(<4 x i32> %0) {
+; CHECK-LABEL: reduce_and_to_all_true_4i32:
+; CHECK: .functype reduce_and_to_all_true_4i32 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0
+; CHECK-NEXT: i32x4.ne $push10=, $0, $pop0
+; CHECK-NEXT: local.tee $push9=, $0=, $pop10
+; CHECK-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK-NEXT: v128.and $push8=, $pop9, $pop1
+; CHECK-NEXT: local.tee $push7=, $0=, $pop8
+; CHECK-NEXT: i8x16.shuffle $push2=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK-NEXT: v128.and $push3=, $pop7, $pop2
+; CHECK-NEXT: i32x4.extract_lane $push4=, $pop3, 0
+; CHECK-NEXT: i32.const $push5=, 0
+; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5
+; CHECK-NEXT: return $pop6
+ %2 = icmp ne <4 x i32> %0, zeroinitializer
+ %3 = sext <4 x i1> %2 to <4 x i32>
+ %4 = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %3)
+ %5 = icmp ne i32 %4, 0
+ ret i1 %5
+}
+
+
+
+; The <2 x i64> argument is first reinterpreted as <4 x i32>; the compare,
+; sign-extension and and-reduction then all operate on i32 lanes. The function
+; therefore returns true when all four 32-bit pieces of %0 are non-zero, which
+; is why the CHECK lines use i32x4 (not i64x2) operations despite the name.
+define i1 @reduce_and_to_all_true_2i64(<2 x i64> %0) {
+; CHECK-LABEL: reduce_and_to_all_true_2i64:
+; CHECK: .functype reduce_and_to_all_true_2i64 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0
+; CHECK-NEXT: i32x4.ne $push10=, $0, $pop0
+; CHECK-NEXT: local.tee $push9=, $0=, $pop10
+; CHECK-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK-NEXT: v128.and $push8=, $pop9, $pop1
+; CHECK-NEXT: local.tee $push7=, $0=, $pop8
+; CHECK-NEXT: i8x16.shuffle $push2=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK-NEXT: v128.and $push3=, $pop7, $pop2
+; CHECK-NEXT: i32x4.extract_lane $push4=, $pop3, 0
+; CHECK-NEXT: i32.const $push5=, 0
+; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5
+; CHECK-NEXT: return $pop6
+ %2 = bitcast <2 x i64> %0 to <4 x i32>
+ %3 = icmp ne <4 x i32> %2, zeroinitializer
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ %5 = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %4)
+ %6 = icmp ne i32 %5, 0
+ ret i1 %6
+}
>From 430a54b5644b80b3733d13452faca5fb08e6276d Mon Sep 17 00:00:00 2001
From: badumbatish <jjasmine at igalia.com>
Date: Sat, 21 Jun 2025 18:05:18 -0700
Subject: [PATCH 2/2] Combine and(X, shuffle(X, pow 2 mask)) to all true
Combine N = and(X, shuffle_vector(X, power-of-2 mask)) into all_true,
where X is either another such N, a setcc(v, <0>, ne), or a bitcast of such a setcc.
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 87 +++++++++++++++++++
.../CodeGen/WebAssembly/simd-reduceand.ll | 42 ++-------
2 files changed, 93 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 3cd923c0ba058..d9c2f789e2248 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -18,12 +18,14 @@
#include "WebAssemblySubtarget.h"
#include "WebAssemblyTargetMachine.h"
#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -184,6 +186,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Combine partial.reduce.add before legalization gets confused.
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+ // Combine EXTRACT VECTOR ELT of AND(AND(X, SHUFFLE(X)), SHUFFLE(...)), 0
+ // to all_true
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+
// Combine wide-vector muls, with extend inputs, to extmul_half.
setTargetDAGCombine(ISD::MUL);
@@ -3287,6 +3293,85 @@ static SDValue performSETCCCombine(SDNode *N,
return SDValue();
}
+/// Build a shuffle mask of \p NumElements entries whose leading entries are
+/// the consecutive lane indices [2^Power, 2^(Power+1)) and whose remaining
+/// entries are -1.
+///
+/// For example, with NumElements = 8:
+///   Power = 0: <1, -1, -1, -1, -1, -1, -1, -1>
+///   Power = 1: <2,  3, -1, -1, -1, -1, -1, -1>
+///   Power = 2: <4,  5,  6,  7, -1, -1, -1, -1>
+///
+/// \p Power must be non-negative and 2^(Power+1) must not exceed
+/// \p NumElements; both conditions are asserted.
+static SmallVector<int> buildMaskArrayByPower(int Power, size_t NumElements) {
+  assert(Power >= 0 && Power < 31 && "Power out of range for a lane index");
+
+  // Integer shifts give exact powers of two; floating-point pow() is both
+  // slower and needlessly goes through double.
+  const unsigned From = 1u << Power;
+  const unsigned To = From << 1;
+  assert(From < NumElements && To <= NumElements);
+
+  SmallVector<int> Res;
+  Res.reserve(NumElements);
+  for (unsigned I = From; I < To; ++I)
+    Res.push_back(I);
+  Res.resize(NumElements, -1);
+
+  return Res;
+}
+/// Match the and/shuffle chain rooted at \p N and return the vector that is
+/// compared against zero at its leaf, or an empty SDValue if \p N does not
+/// have that shape.
+///
+/// Base case (2^Power == number of elements of \p N): \p N is
+/// setcc(V, <0>, ne), or a bitcast of such a setcc; V is returned.
+/// Recursive case: \p N is and(X, shuffle(X, poison, Mask)) where Mask is
+/// buildMaskArrayByPower(Power, NumElements); X is then matched with
+/// Power + 1.
+///
+/// NOTE(review): when the setcc is only reached through a bitcast that changes
+/// the lane width, and-reducing the wider lanes does not obviously imply that
+/// every lane of V is non-zero - confirm this case is sound.
+static SDValue matchAndOfShuffle(SDNode *N, int Power) {
+  using namespace llvm::SDPatternMatch;
+
+  // Keeps the shift below well-defined.
+  if (Power < 0 || Power >= 31)
+    return SDValue();
+
+  const unsigned NumElements = N->getValueType(0).getVectorNumElements();
+  const unsigned Stride = 1u << Power;
+  if (NumElements < Stride)
+    return SDValue();
+
+  if (N->getOpcode() != ISD::AND) {
+    // A non-AND node can only be the leaf, and the leaf is only valid once
+    // the chain has covered every element.
+    if (NumElements != Stride)
+      return SDValue();
+
+    SDValue Matched;
+    const auto IsNonZeroTest = [&Matched](SDValue V) {
+      return sd_match(V, m_c_SetCC(m_Value(Matched), m_Zero(),
+                                   m_SpecificCondCode(ISD::SETNE)));
+    };
+
+    // Try for a setcc first, then for a bitcast of a setcc.
+    if (IsNonZeroTest(SDValue(N, 0)))
+      return Matched;
+
+    SDValue BitCastSrc;
+    if (sd_match(N, m_BitCast(m_Value(BitCastSrc))) &&
+        IsNonZeroTest(BitCastSrc))
+      return Matched;
+
+    return SDValue();
+  }
+
+  // An AND at this depth would need mask indices in [Stride, 2 * Stride),
+  // which do not exist in the vector; bail out instead of tripping the
+  // assertion in buildMaskArrayByPower.
+  if (2 * Stride > NumElements)
+    return SDValue();
+
+  SDValue X;
+  const SmallVector<int> Mask = buildMaskArrayByPower(Power, NumElements);
+  // m_Deferred makes the shuffle input compare equal to the other AND operand;
+  // binding the same variable twice with m_Value would accept
+  // and(A, shuffle(B, ...)) for unrelated A and B.
+  if (sd_match(N, m_And(m_Value(X),
+                        m_Shuffle(m_Deferred(X), m_VectorVT(m_Opc(ISD::POISON)),
+                                  m_SpecificMask(Mask)))))
+    return matchAndOfShuffle(X.getNode(), Power + 1);
+
+  return SDValue();
+}
+/// Fold extract_vector_elt(Chain, 0), where Chain is the and/shuffle chain
+/// recognised by matchAndOfShuffle, into the wasm all_true intrinsic applied
+/// to the vector that the chain compares against zero.
+///
+/// Returns an empty SDValue when \p N does not have that shape.
+static SDValue performExtractVecEltCombine(SDNode *N, SelectionDAG &DAG) {
+  using namespace llvm::SDPatternMatch;
+
+  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
+
+  SDValue Chain;
+  if (!sd_match(N, m_ExtractElt(m_VectorVT(m_Value(Chain)), m_Zero())))
+    return SDValue();
+
+  SDValue Matched = matchAndOfShuffle(Chain.getNode(), 0);
+  if (!Matched)
+    return SDValue();
+
+  SDLoc DL(N);
+  const EVT ResVT = N->getValueType(0);
+  SDValue AllTrue = DAG.getNode(
+      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+      {DAG.getConstant(Intrinsic::wasm_alltrue, DL, MVT::i32), Matched});
+
+  // all_true produces 0 or 1, whereas the extracted lane being replaced holds
+  // 0 or all-ones (the and-reduction of the vector compare lanes). Negate the
+  // flag so that users other than a compare against zero still observe the
+  // original value.
+  SDValue Flag = DAG.getZExtOrTrunc(AllTrue, DL, ResVT);
+  return DAG.getNode(ISD::SUB, DL, ResVT, DAG.getConstant(0, DL, ResVT), Flag);
+}
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::MUL);
@@ -3402,6 +3487,8 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return performTruncateCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
return performLowerPartialReduction(N, DCI.DAG);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return performExtractVecEltCombine(N, DCI.DAG);
case ISD::MUL:
return performMulCombine(N, DCI.DAG);
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-reduceand.ll b/llvm/test/CodeGen/WebAssembly/simd-reduceand.ll
index 00beedfbd7e96..f494691941b64 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-reduceand.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-reduceand.ll
@@ -6,18 +6,8 @@ define i1 @reduce_and_to_all_true_16i8(<16 x i8> %0) {
; CHECK-LABEL: reduce_and_to_all_true_16i8:
; CHECK: .functype reduce_and_to_all_true_16i8 (v128) -> (i32)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT: i8x16.ne $push10=, $0, $pop0
-; CHECK-NEXT: local.tee $push9=, $0=, $pop10
-; CHECK-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK-NEXT: v128.and $push8=, $pop9, $pop1
-; CHECK-NEXT: local.tee $push7=, $0=, $pop8
-; CHECK-NEXT: i8x16.shuffle $push2=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK-NEXT: v128.and $push3=, $pop7, $pop2
-; CHECK-NEXT: i32x4.extract_lane $push4=, $pop3, 0
-; CHECK-NEXT: i32.const $push5=, 0
-; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5
-; CHECK-NEXT: return $pop6
+; CHECK-NEXT: i8x16.all_true $push0=, $0
+; CHECK-NEXT: return $pop0
%2 = icmp ne <16 x i8> %0, zeroinitializer
%3 = sext <16 x i1> %2 to <16 x i8>
%4 = bitcast <16 x i8> %3 to <4 x i32>
@@ -31,18 +21,8 @@ define i1 @reduce_and_to_all_true_4i32(<4 x i32> %0) {
; CHECK-LABEL: reduce_and_to_all_true_4i32:
; CHECK: .functype reduce_and_to_all_true_4i32 (v128) -> (i32)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0
-; CHECK-NEXT: i32x4.ne $push10=, $0, $pop0
-; CHECK-NEXT: local.tee $push9=, $0=, $pop10
-; CHECK-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK-NEXT: v128.and $push8=, $pop9, $pop1
-; CHECK-NEXT: local.tee $push7=, $0=, $pop8
-; CHECK-NEXT: i8x16.shuffle $push2=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK-NEXT: v128.and $push3=, $pop7, $pop2
-; CHECK-NEXT: i32x4.extract_lane $push4=, $pop3, 0
-; CHECK-NEXT: i32.const $push5=, 0
-; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5
-; CHECK-NEXT: return $pop6
+; CHECK-NEXT: i32x4.all_true $push0=, $0
+; CHECK-NEXT: return $pop0
%2 = icmp ne <4 x i32> %0, zeroinitializer
%3 = sext <4 x i1> %2 to <4 x i32>
%4 = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %3)
@@ -56,18 +36,8 @@ define i1 @reduce_and_to_all_true_2i64(<2 x i64> %0) {
; CHECK-LABEL: reduce_and_to_all_true_2i64:
; CHECK: .functype reduce_and_to_all_true_2i64 (v128) -> (i32)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0
-; CHECK-NEXT: i32x4.ne $push10=, $0, $pop0
-; CHECK-NEXT: local.tee $push9=, $0=, $pop10
-; CHECK-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK-NEXT: v128.and $push8=, $pop9, $pop1
-; CHECK-NEXT: local.tee $push7=, $0=, $pop8
-; CHECK-NEXT: i8x16.shuffle $push2=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK-NEXT: v128.and $push3=, $pop7, $pop2
-; CHECK-NEXT: i32x4.extract_lane $push4=, $pop3, 0
-; CHECK-NEXT: i32.const $push5=, 0
-; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5
-; CHECK-NEXT: return $pop6
+; CHECK-NEXT: i32x4.all_true $push0=, $0
+; CHECK-NEXT: return $pop0
%2 = bitcast <2 x i64> %0 to <4 x i32>
%3 = icmp ne <4 x i32> %2, zeroinitializer
%4 = sext <4 x i1> %3 to <4 x i32>
More information about the llvm-commits
mailing list