[llvm-branch-commits] [llvm] release/22.x: [PowerPC] Fix i128 vcmpequb optimization for loads with range metadata and small constants (#196801) (PR #198177)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun May 17 06:17:19 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-powerpc
Author: llvmbot
<details>
<summary>Changes</summary>
Backport 1907b586384b51be2f6b44490c46941f08ff6974
Requested by: @<!-- -->amy-kwan
---
Full diff: https://github.com/llvm/llvm-project/pull/198177.diff
2 Files Affected:
- (modified) llvm/lib/Target/PowerPC/PPCISelLowering.cpp (+28-8)
- (added) llvm/test/CodeGen/PowerPC/ppc-i128-cmp.ll (+282)
``````````diff
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index bdba040529d00..56aa33fdd4098 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -15586,17 +15586,27 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
}
// The function check a i128 load can convert to 16i8 load for Vcmpequb.
-static bool canConvertToVcmpequb(SDValue &LHS, SDValue &RHS) {
+static bool canConvertToVcmpequb(SDValue &LHS, SDValue &RHS, bool IsPPC64) {
- auto isValidForConvert = [](SDValue &Operand) {
+ auto isValidForConvert = [IsPPC64](SDValue &Operand) {
if (!Operand.hasOneUse())
return false;
if (Operand.getValueType() != MVT::i128)
return false;
- if (Operand.getOpcode() == ISD::Constant)
+ if (Operand.getOpcode() == ISD::Constant) {
+ auto *C = cast<ConstantSDNode>(Operand);
+ const APInt &Val = C->getAPIntValue();
+ // On PPC64, comparing an i128 value loaded from memory against a
+ // constant smaller than 2^16 is usually better left to scalar lowering.
+ // In that case, the compare can be lowered using xori (since xori has a
+ // 16-bit immediate field), which is cheaper than materializing a vector
+ // constant and using vcmpequb.
+ if (IsPPC64 && Val.ult(1ULL << 16))
+ return false;
return true;
+ }
auto *LoadNode = dyn_cast<LoadSDNode>(Operand);
if (!LoadNode)
@@ -15647,10 +15657,19 @@ SDValue convertTwoLoadsAndCmpToVCMPEQUB(SelectionDAG &DAG, SDNode *N,
assert(Operand.getOpcode() == ISD::LOAD && "Must be LoadSDNode here.");
auto *LoadNode = cast<LoadSDNode>(Operand);
- SDValue NewLoad =
- DAG.getLoad(MVT::v16i8, DL, LoadNode->getChain(),
- LoadNode->getBasePtr(), LoadNode->getMemOperand());
- DAG.ReplaceAllUsesOfValueWith(Operand.getValue(1), NewLoad.getValue(1));
+ // Create a new MachineMemOperand without range metadata.
+ // Range metadata is only valid for integer scalar types, not vectors.
+ // The original i128 load may have range metadata, but when we convert
+ // to v16i8, that metadata is no longer semantically valid.
+ MachineMemOperand *MMO = LoadNode->getMemOperand();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *NewMMO = MF.getMachineMemOperand(
+ MMO->getPointerInfo(), MMO->getFlags(), MMO->getSize(), MMO->getAlign(),
+ MMO->getAAInfo(), nullptr, MMO->getSyncScopeID(),
+ MMO->getSuccessOrdering(), MMO->getFailureOrdering());
+ SDValue NewLoad = DAG.getLoad(MVT::v16i8, DL, LoadNode->getChain(),
+ LoadNode->getBasePtr(), NewMMO);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LoadNode, 1), NewLoad.getValue(1));
return NewLoad;
};
@@ -15815,7 +15834,8 @@ SDValue PPCTargetLowering::combineSetCC(SDNode *N,
// This transformation replaces memcmp(a, b, 16) with two vector loads
// and one vector compare instruction.
- if (Subtarget.hasAltivec() && canConvertToVcmpequb(LHS, RHS))
+ if (Subtarget.hasAltivec() &&
+ canConvertToVcmpequb(LHS, RHS, Subtarget.isPPC64()))
return convertTwoLoadsAndCmpToVCMPEQUB(DCI.DAG, N, SDLoc(N));
}
diff --git a/llvm/test/CodeGen/PowerPC/ppc-i128-cmp.ll b/llvm/test/CodeGen/PowerPC/ppc-i128-cmp.ll
new file mode 100644
index 0000000000000..c661d7da690b4
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/ppc-i128-cmp.ll
@@ -0,0 +1,282 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -mtriple=powerpc64-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefixes=COMMON,CHECK-AIX64
+; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefixes=COMMON,CHECK-LINUX
+; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -mtriple=powerpc-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefixes=COMMON,CHECK-AIX32
+
+define i1 @test1() {
+; CHECK-AIX64-LABEL: test1:
+; CHECK-AIX64: # %bb.0: # %bb
+; CHECK-AIX64-NEXT: ld r3, 0(0)
+; CHECK-AIX64-NEXT: ld r4, 8(0)
+; CHECK-AIX64-NEXT: or r3, r4, r3
+; CHECK-AIX64-NEXT: cntlzd r3, r3
+; CHECK-AIX64-NEXT: rldicl r3, r3, 58, 63
+; CHECK-AIX64-NEXT: blr
+;
+; CHECK-LINUX-LABEL: test1:
+; CHECK-LINUX: # %bb.0: # %bb
+; CHECK-LINUX-NEXT: ld r3, 8(0)
+; CHECK-LINUX-NEXT: ld r4, 0(0)
+; CHECK-LINUX-NEXT: or r3, r4, r3
+; CHECK-LINUX-NEXT: cntlzd r3, r3
+; CHECK-LINUX-NEXT: rldicl r3, r3, 58, 63
+; CHECK-LINUX-NEXT: blr
+;
+; CHECK-AIX32-LABEL: test1:
+; CHECK-AIX32: # %bb.0: # %bb
+; CHECK-AIX32-NEXT: li r3, 0
+; CHECK-AIX32-NEXT: xxlxor vs35, vs35, vs35
+; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3
+; CHECK-AIX32-NEXT: vcmpequb. v2, v2, v3
+; CHECK-AIX32-NEXT: mfocrf r3, 2
+; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31
+; CHECK-AIX32-NEXT: blr
+bb:
+ %load = load i128, ptr null, align 16
+ %icmp = icmp eq i128 %load, 0
+ ret i1 %icmp
+}
+
+define i1 @test2() {
+; CHECK-AIX64-LABEL: test2:
+; CHECK-AIX64: # %bb.0: # %bb
+; CHECK-AIX64-NEXT: ld r4, 8(0)
+; CHECK-AIX64-NEXT: ld r3, 0(0)
+; CHECK-AIX64-NEXT: xori r4, r4, 10
+; CHECK-AIX64-NEXT: or r3, r4, r3
+; CHECK-AIX64-NEXT: cntlzd r3, r3
+; CHECK-AIX64-NEXT: rldicl r3, r3, 58, 63
+; CHECK-AIX64-NEXT: blr
+;
+; CHECK-LINUX-LABEL: test2:
+; CHECK-LINUX: # %bb.0: # %bb
+; CHECK-LINUX-NEXT: ld r4, 0(0)
+; CHECK-LINUX-NEXT: ld r3, 8(0)
+; CHECK-LINUX-NEXT: xori r4, r4, 10
+; CHECK-LINUX-NEXT: or r3, r4, r3
+; CHECK-LINUX-NEXT: cntlzd r3, r3
+; CHECK-LINUX-NEXT: rldicl r3, r3, 58, 63
+; CHECK-LINUX-NEXT: blr
+;
+; CHECK-AIX32-LABEL: test2:
+; CHECK-AIX32: # %bb.0: # %bb
+; CHECK-AIX32-NEXT: li r3, 0
+; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3
+; CHECK-AIX32-NEXT: lwz r3, L..C0(r2) # %const.0
+; CHECK-AIX32-NEXT: lxvw4x vs35, 0, r3
+; CHECK-AIX32-NEXT: vcmpequb. v2, v2, v3
+; CHECK-AIX32-NEXT: mfocrf r3, 2
+; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31
+; CHECK-AIX32-NEXT: blr
+bb:
+ %load = load i128, ptr null, align 16
+ %icmp = icmp eq i128 %load, 10
+ ret i1 %icmp
+}
+
+define i1 @test3() {
+; CHECK-AIX64-LABEL: test3:
+; CHECK-AIX64: # %bb.0: # %bb
+; CHECK-AIX64-NEXT: ld r4, 8(0)
+; CHECK-AIX64-NEXT: ld r3, 0(0)
+; CHECK-AIX64-NEXT: xori r4, r4, 65535
+; CHECK-AIX64-NEXT: or r3, r4, r3
+; CHECK-AIX64-NEXT: cntlzd r3, r3
+; CHECK-AIX64-NEXT: rldicl r3, r3, 58, 63
+; CHECK-AIX64-NEXT: blr
+;
+; CHECK-LINUX-LABEL: test3:
+; CHECK-LINUX: # %bb.0: # %bb
+; CHECK-LINUX-NEXT: ld r4, 0(0)
+; CHECK-LINUX-NEXT: ld r3, 8(0)
+; CHECK-LINUX-NEXT: xori r4, r4, 65535
+; CHECK-LINUX-NEXT: or r3, r4, r3
+; CHECK-LINUX-NEXT: cntlzd r3, r3
+; CHECK-LINUX-NEXT: rldicl r3, r3, 58, 63
+; CHECK-LINUX-NEXT: blr
+;
+; CHECK-AIX32-LABEL: test3:
+; CHECK-AIX32: # %bb.0: # %bb
+; CHECK-AIX32-NEXT: li r3, 0
+; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3
+; CHECK-AIX32-NEXT: lwz r3, L..C1(r2) # %const.0
+; CHECK-AIX32-NEXT: lxvw4x vs35, 0, r3
+; CHECK-AIX32-NEXT: vcmpequb. v2, v2, v3
+; CHECK-AIX32-NEXT: mfocrf r3, 2
+; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31
+; CHECK-AIX32-NEXT: blr
+bb:
+ %load = load i128, ptr null, align 16
+ %icmp = icmp eq i128 %load, 65535
+ ret i1 %icmp
+}
+
+define i1 @test4() {
+; CHECK-AIX64-LABEL: test4:
+; CHECK-AIX64: # %bb.0: # %bb
+; CHECK-AIX64-NEXT: li r3, 0
+; CHECK-AIX64-NEXT: lxvw4x vs34, 0, r3
+; CHECK-AIX64-NEXT: ld r3, L..C0(r2) # %const.0
+; CHECK-AIX64-NEXT: lxvd2x vs35, 0, r3
+; CHECK-AIX64-NEXT: vcmpequb. v2, v2, v3
+; CHECK-AIX64-NEXT: mfocrf r3, 2
+; CHECK-AIX64-NEXT: rlwinm r3, r3, 25, 31, 31
+; CHECK-AIX64-NEXT: blr
+;
+; CHECK-LINUX-LABEL: test4:
+; CHECK-LINUX: # %bb.0: # %bb
+; CHECK-LINUX-NEXT: li r3, 0
+; CHECK-LINUX-NEXT: lxvd2x vs34, 0, r3
+; CHECK-LINUX-NEXT: addis r3, r2, .LCPI3_0@toc@ha
+; CHECK-LINUX-NEXT: addi r3, r3, .LCPI3_0@toc@l
+; CHECK-LINUX-NEXT: lxvd2x vs35, 0, r3
+; CHECK-LINUX-NEXT: vcmpequb. v2, v2, v3
+; CHECK-LINUX-NEXT: mfocrf r3, 2
+; CHECK-LINUX-NEXT: rlwinm r3, r3, 25, 31, 31
+; CHECK-LINUX-NEXT: blr
+;
+; CHECK-AIX32-LABEL: test4:
+; CHECK-AIX32: # %bb.0: # %bb
+; CHECK-AIX32-NEXT: li r3, 0
+; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3
+; CHECK-AIX32-NEXT: lwz r3, L..C2(r2) # %const.0
+; CHECK-AIX32-NEXT: lxvw4x vs35, 0, r3
+; CHECK-AIX32-NEXT: vcmpequb. v2, v2, v3
+; CHECK-AIX32-NEXT: mfocrf r3, 2
+; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31
+; CHECK-AIX32-NEXT: blr
+bb:
+ %load = load i128, ptr null, align 16
+ %icmp = icmp eq i128 %load, 65536
+ ret i1 %icmp
+}
+
+; Test using the !range metadata
+define i1 @test5() {
+; CHECK-AIX64-LABEL: test5:
+; CHECK-AIX64: # %bb.0: # %bb
+; CHECK-AIX64-NEXT: ld r3, 0(0)
+; CHECK-AIX64-NEXT: ld r4, 8(0)
+; CHECK-AIX64-NEXT: or r3, r4, r3
+; CHECK-AIX64-NEXT: cntlzd r3, r3
+; CHECK-AIX64-NEXT: rldicl r3, r3, 58, 63
+; CHECK-AIX64-NEXT: blr
+;
+; CHECK-LINUX-LABEL: test5:
+; CHECK-LINUX: # %bb.0: # %bb
+; CHECK-LINUX-NEXT: ld r3, 8(0)
+; CHECK-LINUX-NEXT: ld r4, 0(0)
+; CHECK-LINUX-NEXT: or r3, r4, r3
+; CHECK-LINUX-NEXT: cntlzd r3, r3
+; CHECK-LINUX-NEXT: rldicl r3, r3, 58, 63
+; CHECK-LINUX-NEXT: blr
+;
+; CHECK-AIX32-LABEL: test5:
+; CHECK-AIX32: # %bb.0: # %bb
+; CHECK-AIX32-NEXT: li r3, 0
+; CHECK-AIX32-NEXT: xxlxor vs35, vs35, vs35
+; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3
+; CHECK-AIX32-NEXT: vcmpequb. v2, v2, v3
+; CHECK-AIX32-NEXT: mfocrf r3, 2
+; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31
+; CHECK-AIX32-NEXT: blr
+bb:
+ %load = load i128, ptr null, align 16, !range !0
+ %icmp = icmp eq i128 %load, 0
+ ret i1 %icmp
+}
+
+define i1 @test6() {
+; CHECK-AIX64-LABEL: test6:
+; CHECK-AIX64: # %bb.0: # %bb
+; CHECK-AIX64-NEXT: ld r4, 8(0)
+; CHECK-AIX64-NEXT: ld r3, 0(0)
+; CHECK-AIX64-NEXT: xori r4, r4, 65535
+; CHECK-AIX64-NEXT: or r3, r4, r3
+; CHECK-AIX64-NEXT: cntlzd r3, r3
+; CHECK-AIX64-NEXT: rldicl r3, r3, 58, 63
+; CHECK-AIX64-NEXT: blr
+;
+; CHECK-LINUX-LABEL: test6:
+; CHECK-LINUX: # %bb.0: # %bb
+; CHECK-LINUX-NEXT: ld r4, 0(0)
+; CHECK-LINUX-NEXT: ld r3, 8(0)
+; CHECK-LINUX-NEXT: xori r4, r4, 65535
+; CHECK-LINUX-NEXT: or r3, r4, r3
+; CHECK-LINUX-NEXT: cntlzd r3, r3
+; CHECK-LINUX-NEXT: rldicl r3, r3, 58, 63
+; CHECK-LINUX-NEXT: blr
+;
+; CHECK-AIX32-LABEL: test6:
+; CHECK-AIX32: # %bb.0: # %bb
+; CHECK-AIX32-NEXT: li r3, 0
+; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3
+; CHECK-AIX32-NEXT: lwz r3, L..C3(r2) # %const.0
+; CHECK-AIX32-NEXT: lxvw4x vs35, 0, r3
+; CHECK-AIX32-NEXT: vcmpequb. v2, v2, v3
+; CHECK-AIX32-NEXT: mfocrf r3, 2
+; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31
+; CHECK-AIX32-NEXT: blr
+bb:
+ %load = load i128, ptr null, align 16, !range !1
+ %icmp = icmp eq i128 %load, 65535
+ ret i1 %icmp
+}
+
+define i1 @test7() {
+; COMMON-LABEL: test7:
+; COMMON: # %bb.0: # %bb
+; COMMON-NEXT: li r3, 0
+; COMMON-NEXT: blr
+bb:
+ %load = load i128, ptr null, align 16, !range !1
+ %icmp = icmp eq i128 %load, 65536
+ ret i1 %icmp
+}
+
+define i1 @test8() {
+; CHECK-AIX64-LABEL: test8:
+; CHECK-AIX64: # %bb.0: # %bb
+; CHECK-AIX64-NEXT: li r3, 0
+; CHECK-AIX64-NEXT: lxvw4x vs34, 0, r3
+; CHECK-AIX64-NEXT: ld r3, L..C1(r2) # %const.0
+; CHECK-AIX64-NEXT: lxvd2x vs35, 0, r3
+; CHECK-AIX64-NEXT: vcmpequb. v2, v2, v3
+; CHECK-AIX64-NEXT: mfocrf r3, 2
+; CHECK-AIX64-NEXT: rlwinm r3, r3, 25, 31, 31
+; CHECK-AIX64-NEXT: blr
+;
+; CHECK-LINUX-LABEL: test8:
+; CHECK-LINUX: # %bb.0: # %bb
+; CHECK-LINUX-NEXT: li r3, 0
+; CHECK-LINUX-NEXT: lxvd2x vs34, 0, r3
+; CHECK-LINUX-NEXT: addis r3, r2, .LCPI7_0@toc@ha
+; CHECK-LINUX-NEXT: addi r3, r3, .LCPI7_0@toc@l
+; CHECK-LINUX-NEXT: lxvd2x vs35, 0, r3
+; CHECK-LINUX-NEXT: vcmpequb. v2, v2, v3
+; CHECK-LINUX-NEXT: mfocrf r3, 2
+; CHECK-LINUX-NEXT: rlwinm r3, r3, 25, 31, 31
+; CHECK-LINUX-NEXT: blr
+;
+; CHECK-AIX32-LABEL: test8:
+; CHECK-AIX32: # %bb.0: # %bb
+; CHECK-AIX32-NEXT: li r3, 0
+; CHECK-AIX32-NEXT: lxvw4x vs34, 0, r3
+; CHECK-AIX32-NEXT: lwz r3, L..C4(r2) # %const.0
+; CHECK-AIX32-NEXT: lxvw4x vs35, 0, r3
+; CHECK-AIX32-NEXT: vcmpequb. v2, v2, v3
+; CHECK-AIX32-NEXT: mfocrf r3, 2
+; CHECK-AIX32-NEXT: rlwinm r3, r3, 25, 31, 31
+; CHECK-AIX32-NEXT: blr
+bb:
+ %load = load i128, ptr null, align 16, !range !2
+ %icmp = icmp eq i128 %load, 65536
+ ret i1 %icmp
+}
+
+!0 = !{i128 0, i128 2}
+!1 = !{i128 0, i128 65536}
+!2 = !{i128 0, i128 65537}
``````````
</details>
https://github.com/llvm/llvm-project/pull/198177
More information about the llvm-branch-commits
mailing list