[llvm] [WebAssembly] Legalize i128 to v2i64 for setcc (PR #149461)
Jasmine Tang via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 11:27:27 PDT 2025
https://github.com/badumbatish updated https://github.com/llvm/llvm-project/pull/149461
>From 45ad537e65e9b1482ebdffeec2e4ab02a669d85f Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Thu, 31 Jul 2025 14:11:24 -0700
Subject: [PATCH 1/3] [WebAssembly] Add simd support for memcmp
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 60 ++++++++++++++++++-
.../WebAssemblyTargetTransformInfo.cpp | 3 +-
.../test/CodeGen/WebAssembly/memcmp-expand.ll | 22 +++----
3 files changed, 68 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index cd434f7a331e4..ee16f7bf9133d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3383,8 +3383,61 @@ static SDValue TryMatchTrue(SDNode *N, EVT VecVT, SelectionDAG &DAG) {
return DAG.getZExtOrTrunc(Ret, DL, N->getValueType(0));
}
+static SDValue
+combineVectorSizedSetCCEquality(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const WebAssemblySubtarget *Subtarget) {
+
+ SDLoc DL(N);
+ SDValue X = N->getOperand(0);
+ SDValue Y = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT OpVT = X.getValueType();
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ SelectionDAG &DAG = DCI.DAG;
+ // We're looking for an oversized integer equality comparison.
+ if (!OpVT.isScalarInteger() || !OpVT.isByteSized() || OpVT != MVT::i128 ||
+ !Subtarget->hasSIMD128())
+ return SDValue();
+
+ // Don't perform this combine if constructing the vector will be expensive.
+ auto IsVectorBitCastCheap = [](SDValue X) {
+ X = peekThroughBitcasts(X);
+ return isa<ConstantSDNode>(X) || X.getOpcode() == ISD::LOAD;
+ };
+
+ if (!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y))
+ return SDValue();
+
+ // TODO: Not sure what's the purpose of this? I'm keeping here since RISCV has
+ // it
+ if (DCI.DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat))
+ return SDValue();
+
+ unsigned OpSize = OpVT.getSizeInBits();
+ unsigned VecSize = OpSize / 8;
+
+ EVT VecVT = EVT::getVectorVT(*DCI.DAG.getContext(), MVT::i8, VecSize);
+ EVT CmpVT = EVT::getVectorVT(*DCI.DAG.getContext(), MVT::i8, VecSize);
+
+ SDValue VecX = DAG.getBitcast(VecVT, X);
+ SDValue VecY = DAG.getBitcast(VecVT, Y);
+
+ SDValue Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, CC);
+
+ SDValue AllTrue = DAG.getZExtOrTrunc(
+ DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+ {DAG.getConstant(Intrinsic::wasm_alltrue, DL, MVT::i32), Cmp}),
+ DL, MVT::i1);
+
+ return DAG.getSetCC(DL, VT, AllTrue, DAG.getConstant(0, DL, MVT::i1), CC);
+}
+
static SDValue performSETCCCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const WebAssemblySubtarget *Subtarget) {
if (!DCI.isBeforeLegalize())
return SDValue();
@@ -3392,6 +3445,9 @@ static SDValue performSETCCCombine(SDNode *N,
if (!VT.isScalarInteger())
return SDValue();
+ if (SDValue V = combineVectorSizedSetCCEquality(N, DCI, Subtarget))
+ return V;
+
SDValue LHS = N->getOperand(0);
if (LHS->getOpcode() != ISD::BITCAST)
return SDValue();
@@ -3532,7 +3588,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BITCAST:
return performBitcastCombine(N, DCI);
case ISD::SETCC:
- return performSETCCCombine(N, DCI);
+ return performSETCCCombine(N, DCI, Subtarget);
case ISD::VECTOR_SHUFFLE:
return performVECTOR_SHUFFLECombine(N, DCI);
case ISD::SIGN_EXTEND:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 52e706514226b..08fb7586d215e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -147,7 +147,8 @@ WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
Options.AllowOverlappingLoads = true;
- // TODO: Teach WebAssembly backend about load v128.
+ if (ST->hasSIMD128())
+ Options.LoadSizes.push_back(16);
Options.LoadSizes.append({8, 4, 2, 1});
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
diff --git a/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll
index 8030438645f82..c6df6b50693fa 100644
--- a/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll
+++ b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
target triple = "wasm32-unknown-unknown"
@@ -132,19 +132,13 @@ define i1 @memcmp_expand_16(ptr %a, ptr %b) {
; CHECK-LABEL: memcmp_expand_16:
; CHECK: .functype memcmp_expand_16 (i32, i32) -> (i32)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: i64.load $push7=, 0($0):p2align=0
-; CHECK-NEXT: i64.load $push6=, 0($1):p2align=0
-; CHECK-NEXT: i64.xor $push8=, $pop7, $pop6
-; CHECK-NEXT: i32.const $push0=, 8
-; CHECK-NEXT: i32.add $push3=, $0, $pop0
-; CHECK-NEXT: i64.load $push4=, 0($pop3):p2align=0
-; CHECK-NEXT: i32.const $push11=, 8
-; CHECK-NEXT: i32.add $push1=, $1, $pop11
-; CHECK-NEXT: i64.load $push2=, 0($pop1):p2align=0
-; CHECK-NEXT: i64.xor $push5=, $pop4, $pop2
-; CHECK-NEXT: i64.or $push9=, $pop8, $pop5
-; CHECK-NEXT: i64.eqz $push10=, $pop9
-; CHECK-NEXT: return $pop10
+; CHECK-NEXT: v128.load $push1=, 0($0):p2align=0
+; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0
+; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0
+; CHECK-NEXT: i8x16.all_true $push3=, $pop2
+; CHECK-NEXT: i32.const $push4=, 1
+; CHECK-NEXT: i32.xor $push5=, $pop3, $pop4
+; CHECK-NEXT: return $pop5
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
%res = icmp eq i32 %cmp_16, 0
ret i1 %res
>From 1ae947fdb4cc4914078343fbb5dd3e1c99f80444 Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Tue, 5 Aug 2025 14:24:58 -0700
Subject: [PATCH 2/3] Addresses PR reviews
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 24 +++++--------------
.../test/CodeGen/WebAssembly/memcmp-expand.ll | 5 ++--
2 files changed, 8 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index ee16f7bf9133d..b29c54ebc0b0b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3409,30 +3409,18 @@ combineVectorSizedSetCCEquality(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
if (!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y))
return SDValue();
- // TODO: Not sure what's the purpose of this? I'm keeping here since RISCV has
- // it
- if (DCI.DAG.getMachineFunction().getFunction().hasFnAttribute(
- Attribute::NoImplicitFloat))
- return SDValue();
-
- unsigned OpSize = OpVT.getSizeInBits();
- unsigned VecSize = OpSize / 8;
-
- EVT VecVT = EVT::getVectorVT(*DCI.DAG.getContext(), MVT::i8, VecSize);
- EVT CmpVT = EVT::getVectorVT(*DCI.DAG.getContext(), MVT::i8, VecSize);
+ EVT VecVT = MVT::v16i8;
SDValue VecX = DAG.getBitcast(VecVT, X);
SDValue VecY = DAG.getBitcast(VecVT, Y);
- SDValue Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, CC);
+ SDValue Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, CC);
- SDValue AllTrue = DAG.getZExtOrTrunc(
- DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
- {DAG.getConstant(Intrinsic::wasm_alltrue, DL, MVT::i32), Cmp}),
- DL, MVT::i1);
+ SDValue AllTrue = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+ {DAG.getConstant(Intrinsic::wasm_alltrue, DL, MVT::i32), Cmp});
- return DAG.getSetCC(DL, VT, AllTrue, DAG.getConstant(0, DL, MVT::i1), CC);
+ return DAG.getSetCC(DL, VT, AllTrue, DAG.getConstant(0, DL, MVT::i32), CC);
}
static SDValue performSETCCCombine(SDNode *N,
diff --git a/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll
index c6df6b50693fa..00e87917aa173 100644
--- a/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll
+++ b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll
@@ -136,9 +136,8 @@ define i1 @memcmp_expand_16(ptr %a, ptr %b) {
; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0
; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
-; CHECK-NEXT: i32.const $push4=, 1
-; CHECK-NEXT: i32.xor $push5=, $pop3, $pop4
-; CHECK-NEXT: return $pop5
+; CHECK-NEXT: i32.eqz $push4=, $pop3
+; CHECK-NEXT: return $pop4
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
%res = icmp eq i32 %cmp_16, 0
ret i1 %res
>From 1934453a9f3bbf1e871204723d57c5bb1813cddc Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Thu, 7 Aug 2025 11:27:04 -0700
Subject: [PATCH 3/3] Addresses shortcomings, add more tests
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 8 +-
.../test/CodeGen/WebAssembly/memcmp-expand.ll | 1 -
llvm/test/CodeGen/WebAssembly/simd-setcc.ll | 89 +++++++++++++++++++
3 files changed, 96 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/WebAssembly/simd-setcc.ll
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index b29c54ebc0b0b..971b1b85634e8 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -19,6 +19,7 @@
#include "WebAssemblyTargetMachine.h"
#include "WebAssemblyUtilities.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
@@ -3394,7 +3395,13 @@ combineVectorSizedSetCCEquality(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
EVT OpVT = X.getValueType();
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ if (!isIntEqualitySetCC(CC))
+ return SDValue();
+
SelectionDAG &DAG = DCI.DAG;
+ if (DCI.DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat))
+ return SDValue();
// We're looking for an oversized integer equality comparison.
if (!OpVT.isScalarInteger() || !OpVT.isByteSized() || OpVT != MVT::i128 ||
!Subtarget->hasSIMD128())
@@ -3413,7 +3420,6 @@ combineVectorSizedSetCCEquality(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SDValue VecX = DAG.getBitcast(VecVT, X);
SDValue VecY = DAG.getBitcast(VecVT, Y);
-
SDValue Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, CC);
SDValue AllTrue = DAG.getNode(
diff --git a/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll
index 00e87917aa173..9c520ff9a4db7 100644
--- a/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll
+++ b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll
@@ -127,7 +127,6 @@ define i1 @memcmp_expand_8(ptr %a, ptr %b) {
ret i1 %res
}
-; TODO: Should be using a single load i64x2 or equivalent in bitsizes
define i1 @memcmp_expand_16(ptr %a, ptr %b) {
; CHECK-LABEL: memcmp_expand_16:
; CHECK: .functype memcmp_expand_16 (i32, i32) -> (i32)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-setcc.ll b/llvm/test/CodeGen/WebAssembly/simd-setcc.ll
new file mode 100644
index 0000000000000..eb89be344e13b
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-setcc.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+declare i32 @memcmp(ptr, ptr, i32)
+
+define i1 @setcc_load(ptr %a, ptr %b) {
+; CHECK-LABEL: setcc_load:
+; CHECK: .functype setcc_load (i32, i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.load $push1=, 0($0):p2align=0
+; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0
+; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0
+; CHECK-NEXT: i8x16.all_true $push3=, $pop2
+; CHECK-NEXT: i32.eqz $push4=, $pop3
+; CHECK-NEXT: return $pop4
+ %cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
+ %res = icmp eq i32 %cmp_16, 0
+ ret i1 %res
+}
+
+; INFO: Negative test: noimplicitfloat disables simd
+define i1 @setcc_load_should_not_vectorize(ptr %a, ptr %b) noimplicitfloat {
+; CHECK-LABEL: setcc_load_should_not_vectorize:
+; CHECK: .functype setcc_load_should_not_vectorize (i32, i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i64.load $push4=, 0($0):p2align=0
+; CHECK-NEXT: i64.load $push3=, 0($1):p2align=0
+; CHECK-NEXT: i64.xor $push5=, $pop4, $pop3
+; CHECK-NEXT: i64.load $push1=, 8($0):p2align=0
+; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0
+; CHECK-NEXT: i64.xor $push2=, $pop1, $pop0
+; CHECK-NEXT: i64.or $push6=, $pop5, $pop2
+; CHECK-NEXT: i64.eqz $push7=, $pop6
+; CHECK-NEXT: return $pop7
+ %cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
+ %res = icmp eq i32 %cmp_16, 0
+ ret i1 %res
+}
+
+define i1 @setcc_eq_const_i128(ptr %ptr) {
+; CHECK-LABEL: setcc_eq_const_i128:
+; CHECK: .functype setcc_eq_const_i128 (i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.load $push0=, 0($0)
+; CHECK-NEXT: v128.const $push1=, 6, 0
+; CHECK-NEXT: i8x16.eq $push2=, $pop0, $pop1
+; CHECK-NEXT: i8x16.all_true $push3=, $pop2
+; CHECK-NEXT: i32.eqz $push4=, $pop3
+; CHECK-NEXT: return $pop4
+ %l = load i128, ptr %ptr
+ %res = icmp eq i128 %l, 6
+ ret i1 %res
+}
+
+define i1 @setcc_ne_const_i128(ptr %ptr) {
+; CHECK-LABEL: setcc_ne_const_i128:
+; CHECK: .functype setcc_ne_const_i128 (i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.load $push0=, 0($0)
+; CHECK-NEXT: v128.const $push1=, 16, 0
+; CHECK-NEXT: i8x16.ne $push2=, $pop0, $pop1
+; CHECK-NEXT: i8x16.all_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %l = load i128, ptr %ptr
+ %res = icmp ne i128 %l, 16
+ ret i1 %res
+}
+
+; INFO: Negative test: only eq and ne work
+define i1 @setcc_slt_const_i128(ptr %ptr) {
+; CHECK-LABEL: setcc_slt_const_i128:
+; CHECK: .functype setcc_slt_const_i128 (i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i64.load $push2=, 0($0)
+; CHECK-NEXT: i64.const $push3=, 25
+; CHECK-NEXT: i64.lt_u $push4=, $pop2, $pop3
+; CHECK-NEXT: i64.load $push8=, 8($0)
+; CHECK-NEXT: local.tee $push7=, $1=, $pop8
+; CHECK-NEXT: i64.const $push0=, 0
+; CHECK-NEXT: i64.lt_s $push1=, $pop7, $pop0
+; CHECK-NEXT: i64.eqz $push5=, $1
+; CHECK-NEXT: i32.select $push6=, $pop4, $pop1, $pop5
+; CHECK-NEXT: return $pop6
+ %l = load i128, ptr %ptr
+ %res = icmp slt i128 %l, 25
+ ret i1 %res
+}
More information about the llvm-commits
mailing list