[llvm] [PowerPC] Implement a more efficient memcmp in cases where the length is known. (PR #158657)
zhijian lin via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 15 08:19:25 PDT 2025
https://github.com/diggerlin created https://github.com/llvm/llvm-project/pull/158657
For `int memcmp(const void *ptr1, const void *ptr2, size_t num);`, when the `size_t num` argument is known at compile time, we can generate better code.
For example, when we compile this:
```
#include <memory.h>
#include "altivec.h"
bool cmpeq16(const void *a, const void *b)
{
return memcmp(a, b, 16) == 0;
}
```
>From cfbf70c7ee7343d359dd4f286add8d8b8777c2e9 Mon Sep 17 00:00:00 2001
From: zhijian <zhijian at ca.ibm.com>
Date: Thu, 14 Aug 2025 20:18:18 +0000
Subject: [PATCH 1/4] implement memcmp with known fix length size.
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 60 +++++++++++++++++++
.../Target/PowerPC/PPCTargetTransformInfo.cpp | 2 +-
2 files changed, 61 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index fa104e4f69d7f..42e499aea2909 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -15556,6 +15556,66 @@ SDValue PPCTargetLowering::combineSetCC(SDNode *N,
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
+
+ if (LHS.getOpcode() == ISD::LOAD && RHS.getOpcode() == ISD::LOAD &&
+ LHS.hasOneUse() && RHS.hasOneUse() && LHS.getValueType() == MVT::i128 &&
+ RHS.getValueType() == MVT::i128) {
+ SDLoc DL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ auto *LA = dyn_cast<LoadSDNode>(LHS);
+ auto *LB = dyn_cast<LoadSDNode>(RHS);
+ if (!LA || !LB)
+ return SDValue();
+
+ // If either memory operation (LA or LB) is volatile, do not perform any
+ // optimization or transformation. Volatile operations must be preserved
+ // as written to ensure correct program behavior, so we return an empty
+ // SDValue to indicate no action.
+ if (LA->isVolatile() || LB->isVolatile())
+ return SDValue();
+
+ // Only combine loads if both use the unindexed addressing mode.
+ // PowerPC AltiVec/VMX does not support vector loads or stores with
+ // pre/post-increment addressing. Indexed modes may imply implicit pointer
+ // updates, which are not compatible with AltiVec vector instructions.
+ if (LA->getAddressingMode() != ISD::UNINDEXED ||
+ LB->getAddressingMode() != ISD::UNINDEXED)
+ return SDValue();
+
+ // Only combine loads if both are non-extending loads (ISD::NON_EXTLOAD).
+ // Extending loads (such as ISD::ZEXTLOAD or ISD::SEXTLOAD) perform zero
+ // or sign extension, which may change the loaded value's semantics and
+ // are not compatible with vector loads.
+ if (LA->getExtensionType() != ISD::NON_EXTLOAD ||
+ LB->getExtensionType() != ISD::NON_EXTLOAD)
+ return SDValue();
+ // Build new v16i8 loads using the SAME chain/base/MMO (no extra memory
+ // op).
+ SDValue LHSVec = DAG.getLoad(MVT::v16i8, DL, LA->getChain(),
+ LA->getBasePtr(), LA->getMemOperand());
+ SDValue RHSVec = DAG.getLoad(MVT::v16i8, DL, LB->getChain(),
+ LB->getBasePtr(), LB->getMemOperand());
+
+ // Replace old loads?¡¥ results (value and chain) so the old nodes die.
+ // DAG.DeleteNode(LHS.getNode());
+ // DAG.DeleteNode(RHS.getNode());
+
+ // SDValue LHSVec = DAG.getBitcast(MVT::v16i8, LHS);
+ // SDValue RHSVec = DAG.getBitcast(MVT::v16i8, RHS);
+ SDValue IntrID =
+ DAG.getTargetConstant(Intrinsic::ppc_altivec_vcmpequb_p, DL,
+ Subtarget.isPPC64() ? MVT::i64 : MVT::i32);
+ SDValue CRSel =
+ DAG.getConstant(2, DL, MVT::i32); // which CR6 predicate field
+ SDValue Ops[] = {IntrID, CRSel, LHSVec, RHSVec};
+ SDValue PredResult =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, Ops);
+ // ppc_altivec_vcmpequb_p returns 1 when two vectors are the same,
+ // so we need to invert the CC opcode.
+ return DAG.getSetCC(DL, N->getValueType(0), PredResult,
+ DAG.getConstant(0, DL, MVT::i32),
+ CC == ISD::SETNE ? ISD::SETEQ : ISD::SETNE);
+ }
}
return DAGCombineTruncBoolExt(N, DCI);
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 2fba090f2d501..93b7308f3fdab 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -439,7 +439,7 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) const {
PPCTTIImpl::TTI::MemCmpExpansionOptions
PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
TTI::MemCmpExpansionOptions Options;
- Options.LoadSizes = {8, 4, 2, 1};
+ Options.LoadSizes = {16, 8, 4, 2, 1};
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
return Options;
}
>From cc66c46f8b06def2299af656b3b953e819bad709 Mon Sep 17 00:00:00 2001
From: zhijian <zhijian at ca.ibm.com>
Date: Tue, 9 Sep 2025 15:35:04 +0000
Subject: [PATCH 2/4] delete dead code
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 42e499aea2909..5dcbd84a15633 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -15589,19 +15589,13 @@ SDValue PPCTargetLowering::combineSetCC(SDNode *N,
if (LA->getExtensionType() != ISD::NON_EXTLOAD ||
LB->getExtensionType() != ISD::NON_EXTLOAD)
return SDValue();
- // Build new v16i8 loads using the SAME chain/base/MMO (no extra memory
+ // Build new v16i8 loads using the same chain/base/MMO (no extra memory
// op).
SDValue LHSVec = DAG.getLoad(MVT::v16i8, DL, LA->getChain(),
LA->getBasePtr(), LA->getMemOperand());
SDValue RHSVec = DAG.getLoad(MVT::v16i8, DL, LB->getChain(),
LB->getBasePtr(), LB->getMemOperand());
- // Replace old loads?¡¥ results (value and chain) so the old nodes die.
- // DAG.DeleteNode(LHS.getNode());
- // DAG.DeleteNode(RHS.getNode());
-
- // SDValue LHSVec = DAG.getBitcast(MVT::v16i8, LHS);
- // SDValue RHSVec = DAG.getBitcast(MVT::v16i8, RHS);
SDValue IntrID =
DAG.getTargetConstant(Intrinsic::ppc_altivec_vcmpequb_p, DL,
Subtarget.isPPC64() ? MVT::i64 : MVT::i32);
@@ -15610,6 +15604,7 @@ SDValue PPCTargetLowering::combineSetCC(SDNode *N,
SDValue Ops[] = {IntrID, CRSel, LHSVec, RHSVec};
SDValue PredResult =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, Ops);
+
// ppc_altivec_vcmpequb_p returns 1 when two vectors are the same,
// so we need to invert the CC opcode.
return DAG.getSetCC(DL, N->getValueType(0), PredResult,
>From 46d907a4e9a51af0c7ad5229fc0e6f7bfab890b1 Mon Sep 17 00:00:00 2001
From: zhijian <zhijian at ca.ibm.com>
Date: Tue, 9 Sep 2025 15:32:03 +0000
Subject: [PATCH 3/4] modify test case based on the new functionality
---
.../memCmpUsedInZeroEqualityComparison.ll | 45 ++++++---------
llvm/test/CodeGen/PowerPC/memcmpIR.ll | 55 +++++--------------
2 files changed, 30 insertions(+), 70 deletions(-)
diff --git a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
index 1da40d46aa773..7c4cf7265ff6a 100644
--- a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
+++ b/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
@@ -35,18 +35,13 @@ define signext i32 @zeroEqualityTest02(ptr %x, ptr %y) {
define signext i32 @zeroEqualityTest01(ptr %x, ptr %y) {
; CHECK-LABEL: zeroEqualityTest01:
; CHECK: # %bb.0:
-; CHECK-NEXT: ld 5, 0(3)
-; CHECK-NEXT: ld 6, 0(4)
-; CHECK-NEXT: cmpld 5, 6
-; CHECK-NEXT: bne 0, .LBB1_2
-; CHECK-NEXT: # %bb.1: # %loadbb1
-; CHECK-NEXT: ld 5, 8(3)
-; CHECK-NEXT: ld 4, 8(4)
-; CHECK-NEXT: li 3, 0
-; CHECK-NEXT: cmpld 5, 4
-; CHECK-NEXT: beqlr 0
-; CHECK-NEXT: .LBB1_2: # %res_block
-; CHECK-NEXT: li 3, 1
+; CHECK-NEXT: lxvd2x 34, 0, 4
+; CHECK-NEXT: lxvd2x 35, 0, 3
+; CHECK-NEXT: vcmpequb. 2, 3, 2
+; CHECK-NEXT: mfocrf 3, 2
+; CHECK-NEXT: rlwinm 3, 3, 25, 31, 31
+; CHECK-NEXT: cntlzw 3, 3
+; CHECK-NEXT: srwi 3, 3, 5
; CHECK-NEXT: blr
%call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 16)
%not.tobool = icmp ne i32 %call, 0
@@ -85,7 +80,7 @@ define signext i32 @zeroEqualityTest03(ptr %x, ptr %y) {
; Validate with > 0
define signext i32 @zeroEqualityTest04() {
; CHECK-LABEL: zeroEqualityTest04:
-; CHECK: # %bb.0: # %loadbb
+; CHECK: # %bb.0:
; CHECK-NEXT: li 3, 0
; CHECK-NEXT: blr
%call = tail call signext i32 @memcmp(ptr @zeroEqualityTest02.buffer1, ptr @zeroEqualityTest02.buffer2, i64 16)
@@ -97,7 +92,7 @@ define signext i32 @zeroEqualityTest04() {
; Validate with < 0
define signext i32 @zeroEqualityTest05() {
; CHECK-LABEL: zeroEqualityTest05:
-; CHECK: # %bb.0: # %loadbb
+; CHECK: # %bb.0:
; CHECK-NEXT: li 3, 0
; CHECK-NEXT: blr
%call = tail call signext i32 @memcmp(ptr @zeroEqualityTest03.buffer1, ptr @zeroEqualityTest03.buffer2, i64 16)
@@ -109,7 +104,7 @@ define signext i32 @zeroEqualityTest05() {
; Validate with memcmp()?:
define signext i32 @equalityFoldTwoConstants() {
; CHECK-LABEL: equalityFoldTwoConstants:
-; CHECK: # %bb.0: # %loadbb
+; CHECK: # %bb.0:
; CHECK-NEXT: li 3, 1
; CHECK-NEXT: blr
%call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr @zeroEqualityTest04.buffer2, i64 16)
@@ -122,23 +117,17 @@ define signext i32 @equalityFoldOneConstant(ptr %X) {
; CHECK-LABEL: equalityFoldOneConstant:
; CHECK: # %bb.0:
; CHECK-NEXT: li 5, 1
-; CHECK-NEXT: ld 4, 0(3)
+; CHECK-NEXT: ld 4, 8(3)
+; CHECK-NEXT: ld 3, 0(3)
; CHECK-NEXT: rldic 5, 5, 32, 31
-; CHECK-NEXT: cmpld 4, 5
-; CHECK-NEXT: bne 0, .LBB6_2
-; CHECK-NEXT: # %bb.1: # %loadbb1
+; CHECK-NEXT: xor 3, 3, 5
; CHECK-NEXT: lis 5, -32768
-; CHECK-NEXT: ld 4, 8(3)
-; CHECK-NEXT: li 3, 0
; CHECK-NEXT: ori 5, 5, 1
; CHECK-NEXT: rldic 5, 5, 1, 30
-; CHECK-NEXT: cmpld 4, 5
-; CHECK-NEXT: beq 0, .LBB6_3
-; CHECK-NEXT: .LBB6_2: # %res_block
-; CHECK-NEXT: li 3, 1
-; CHECK-NEXT: .LBB6_3: # %endblock
-; CHECK-NEXT: cntlzw 3, 3
-; CHECK-NEXT: srwi 3, 3, 5
+; CHECK-NEXT: xor 4, 4, 5
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: cntlzd 3, 3
+; CHECK-NEXT: rldicl 3, 3, 58, 63
; CHECK-NEXT: blr
%call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr %X, i64 16)
%not.tobool = icmp eq i32 %call, 0
diff --git a/llvm/test/CodeGen/PowerPC/memcmpIR.ll b/llvm/test/CodeGen/PowerPC/memcmpIR.ll
index b57d2b5116b77..995ecb64d4bdd 100644
--- a/llvm/test/CodeGen/PowerPC/memcmpIR.ll
+++ b/llvm/test/CodeGen/PowerPC/memcmpIR.ll
@@ -4,48 +4,19 @@
define signext i32 @test1(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
entry:
; CHECK-LABEL: @test1(
- ; CHECK-LABEL: res_block:{{.*}}
- ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
- ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
- ; CHECK-NEXT: br label %endblock
-
- ; CHECK-LABEL: loadbb:{{.*}}
- ; CHECK: [[LOAD1:%[0-9]+]] = load i64, ptr
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
- ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
- ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
- ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]]
- ; CHECK-NEXT: br i1 [[ICMP]], label %loadbb1, label %res_block
-
- ; CHECK-LABEL: loadbb1:{{.*}}
- ; CHECK-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
- ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
- ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, ptr [[GEP1]]
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr [[GEP2]]
- ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
- ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
- ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]]
- ; CHECK-NEXT: br i1 [[ICMP]], label %endblock, label %res_block
-
+ ; CHECK: [[LOAD0:%[0-9]+]] = load i128, ptr %buffer1, align 1
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i128, ptr %buffer2, align 1
+ ; CHECK-NEXT: [[CALL1:%[0-9]+]] = call i128 @llvm.bswap.i128(i128 [[LOAD0]])
+ ; CHECK-NEXT: [[CALL2:%[0-9]+]] = call i128 @llvm.bswap.i128(i128 [[LOAD1]])
+ ; CHECK-NEXT: [[CALL3:%[0-9]+]] = call i32 @llvm.ucmp.i32.i128(i128 [[CALL1]], i128 [[CALL2]])
+ ; CHECK-NEXT: ret i32 [[CALL3]]
+
+
; CHECK-BE-LABEL: @test1(
- ; CHECK-BE-LABEL: res_block:{{.*}}
- ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
- ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
- ; CHECK-BE-NEXT: br label %endblock
-
- ; CHECK-BE-LABEL: loadbb:{{.*}}
- ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, ptr
- ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
- ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
- ; CHECK-BE-NEXT: br i1 [[ICMP]], label %loadbb1, label %res_block
-
- ; CHECK-BE-LABEL: loadbb1:{{.*}}
- ; CHECK-BE-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
- ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
- ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, ptr [[GEP1]]
- ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr [[GEP2]]
- ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
- ; CHECK-BE-NEXT: br i1 [[ICMP]], label %endblock, label %res_block
+ ; CHECK-BE: [[LOAD0:%[0-9]+]] = load i128, ptr %buffer1, align 1
+ ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i128, ptr %buffer2, align 1
+ ; CHECK-BE-NEXT: [[CALL0:%[0-9]+]] = call i32 @llvm.ucmp.i32.i128(i128 [[LOAD0]], i128 [[LOAD1]])
+ ; CHECK-BE-NEXT: ret i32 [[CALL0]]
%call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 16)
ret i32 %call
@@ -156,7 +127,7 @@ entry:
define signext i32 @test4(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
entry:
- %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 65)
+ %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 165)
ret i32 %call
}
>From bff468f678feb6aecbc47bd1d9a4d842d9710945 Mon Sep 17 00:00:00 2001
From: zhijian <zhijian at ca.ibm.com>
Date: Mon, 15 Sep 2025 15:22:00 +0000
Subject: [PATCH 4/4] add check Subtarget hasVSX check
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 104 ++++++++--------
llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll | 112 ++++--------------
llvm/test/CodeGen/PowerPC/memcmp64_fixsize.ll | 78 +++---------
3 files changed, 91 insertions(+), 203 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 5dcbd84a15633..94912e1537a80 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -15556,60 +15556,62 @@ SDValue PPCTargetLowering::combineSetCC(SDNode *N,
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
+ if (Subtarget.hasVSX()) {
+ if (LHS.getOpcode() == ISD::LOAD && RHS.getOpcode() == ISD::LOAD &&
+ LHS.hasOneUse() && RHS.hasOneUse() &&
+ LHS.getValueType() == MVT::i128 && RHS.getValueType() == MVT::i128) {
+ SDLoc DL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ auto *LA = dyn_cast<LoadSDNode>(LHS);
+ auto *LB = dyn_cast<LoadSDNode>(RHS);
+ if (!LA || !LB)
+ return SDValue();
- if (LHS.getOpcode() == ISD::LOAD && RHS.getOpcode() == ISD::LOAD &&
- LHS.hasOneUse() && RHS.hasOneUse() && LHS.getValueType() == MVT::i128 &&
- RHS.getValueType() == MVT::i128) {
- SDLoc DL(N);
- SelectionDAG &DAG = DCI.DAG;
- auto *LA = dyn_cast<LoadSDNode>(LHS);
- auto *LB = dyn_cast<LoadSDNode>(RHS);
- if (!LA || !LB)
- return SDValue();
-
- // If either memory operation (LA or LB) is volatile, do not perform any
- // optimization or transformation. Volatile operations must be preserved
- // as written to ensure correct program behavior, so we return an empty
- // SDValue to indicate no action.
- if (LA->isVolatile() || LB->isVolatile())
- return SDValue();
+ // If either memory operation (LA or LB) is volatile, do not perform any
+ // optimization or transformation. Volatile operations must be preserved
+ // as written to ensure correct program behavior, so we return an empty
+ // SDValue to indicate no action.
+ if (LA->isVolatile() || LB->isVolatile())
+ return SDValue();
- // Only combine loads if both use the unindexed addressing mode.
- // PowerPC AltiVec/VMX does not support vector loads or stores with
- // pre/post-increment addressing. Indexed modes may imply implicit pointer
- // updates, which are not compatible with AltiVec vector instructions.
- if (LA->getAddressingMode() != ISD::UNINDEXED ||
- LB->getAddressingMode() != ISD::UNINDEXED)
- return SDValue();
+ // Only combine loads if both use the unindexed addressing mode.
+ // PowerPC AltiVec/VMX does not support vector loads or stores with
+ // pre/post-increment addressing. Indexed modes may imply implicit
+ // pointer updates, which are not compatible with AltiVec vector
+ // instructions.
+ if (LA->getAddressingMode() != ISD::UNINDEXED ||
+ LB->getAddressingMode() != ISD::UNINDEXED)
+ return SDValue();
- // Only combine loads if both are non-extending loads (ISD::NON_EXTLOAD).
- // Extending loads (such as ISD::ZEXTLOAD or ISD::SEXTLOAD) perform zero
- // or sign extension, which may change the loaded value's semantics and
- // are not compatible with vector loads.
- if (LA->getExtensionType() != ISD::NON_EXTLOAD ||
- LB->getExtensionType() != ISD::NON_EXTLOAD)
- return SDValue();
- // Build new v16i8 loads using the same chain/base/MMO (no extra memory
- // op).
- SDValue LHSVec = DAG.getLoad(MVT::v16i8, DL, LA->getChain(),
- LA->getBasePtr(), LA->getMemOperand());
- SDValue RHSVec = DAG.getLoad(MVT::v16i8, DL, LB->getChain(),
- LB->getBasePtr(), LB->getMemOperand());
-
- SDValue IntrID =
- DAG.getTargetConstant(Intrinsic::ppc_altivec_vcmpequb_p, DL,
- Subtarget.isPPC64() ? MVT::i64 : MVT::i32);
- SDValue CRSel =
- DAG.getConstant(2, DL, MVT::i32); // which CR6 predicate field
- SDValue Ops[] = {IntrID, CRSel, LHSVec, RHSVec};
- SDValue PredResult =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, Ops);
-
- // ppc_altivec_vcmpequb_p returns 1 when two vectors are the same,
- // so we need to invert the CC opcode.
- return DAG.getSetCC(DL, N->getValueType(0), PredResult,
- DAG.getConstant(0, DL, MVT::i32),
- CC == ISD::SETNE ? ISD::SETEQ : ISD::SETNE);
+ // Only combine loads if both are non-extending loads
+ // (ISD::NON_EXTLOAD). Extending loads (such as ISD::ZEXTLOAD or
+ // ISD::SEXTLOAD) perform zero or sign extension, which may change the
+ // loaded value's semantics and are not compatible with vector loads.
+ if (LA->getExtensionType() != ISD::NON_EXTLOAD ||
+ LB->getExtensionType() != ISD::NON_EXTLOAD)
+ return SDValue();
+ // Build new v16i8 loads using the same chain/base/MMO (no extra memory
+ // op).
+ SDValue LHSVec = DAG.getLoad(MVT::v16i8, DL, LA->getChain(),
+ LA->getBasePtr(), LA->getMemOperand());
+ SDValue RHSVec = DAG.getLoad(MVT::v16i8, DL, LB->getChain(),
+ LB->getBasePtr(), LB->getMemOperand());
+
+ SDValue IntrID =
+ DAG.getTargetConstant(Intrinsic::ppc_altivec_vcmpequb_p, DL,
+ Subtarget.isPPC64() ? MVT::i64 : MVT::i32);
+ SDValue CRSel =
+ DAG.getConstant(2, DL, MVT::i32); // which CR6 predicate field
+ SDValue Ops[] = {IntrID, CRSel, LHSVec, RHSVec};
+ SDValue PredResult =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, Ops);
+
+ // ppc_altivec_vcmpequb_p returns 1 when two vectors are the same,
+ // so we need to invert the CC opcode.
+ return DAG.getSetCC(DL, N->getValueType(0), PredResult,
+ DAG.getConstant(0, DL, MVT::i32),
+ CC == ISD::SETNE ? ISD::SETEQ : ISD::SETNE);
+ }
}
}
diff --git a/llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll b/llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll
index f5483ad2a7c3f..7dfaac1a8ae37 100644
--- a/llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll
+++ b/llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll
@@ -14,110 +14,38 @@
define dso_local signext range(i32 0, 2) i32 @cmpeq16(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b) {
; CHECK-AIX32-P8-LABEL: cmpeq16:
; CHECK-AIX32-P8: # %bb.0: # %entry
-; CHECK-AIX32-P8-NEXT: lwz r5, 4(r3)
-; CHECK-AIX32-P8-NEXT: lwz r6, 0(r3)
-; CHECK-AIX32-P8-NEXT: lwz r7, 4(r4)
-; CHECK-AIX32-P8-NEXT: lwz r8, 0(r4)
-; CHECK-AIX32-P8-NEXT: xor r6, r6, r8
-; CHECK-AIX32-P8-NEXT: xor r5, r5, r7
-; CHECK-AIX32-P8-NEXT: or. r5, r5, r6
-; CHECK-AIX32-P8-NEXT: bne cr0, L..BB0_2
-; CHECK-AIX32-P8-NEXT: # %bb.1: # %loadbb1
-; CHECK-AIX32-P8-NEXT: lwz r5, 12(r3)
-; CHECK-AIX32-P8-NEXT: lwz r3, 8(r3)
-; CHECK-AIX32-P8-NEXT: lwz r6, 12(r4)
-; CHECK-AIX32-P8-NEXT: lwz r4, 8(r4)
-; CHECK-AIX32-P8-NEXT: xor r3, r3, r4
-; CHECK-AIX32-P8-NEXT: xor r4, r5, r6
-; CHECK-AIX32-P8-NEXT: or. r3, r4, r3
-; CHECK-AIX32-P8-NEXT: li r3, 0
-; CHECK-AIX32-P8-NEXT: beq cr0, L..BB0_3
-; CHECK-AIX32-P8-NEXT: L..BB0_2: # %res_block
-; CHECK-AIX32-P8-NEXT: li r3, 1
-; CHECK-AIX32-P8-NEXT: L..BB0_3: # %endblock
-; CHECK-AIX32-P8-NEXT: cntlzw r3, r3
-; CHECK-AIX32-P8-NEXT: rlwinm r3, r3, 27, 31, 31
+; CHECK-AIX32-P8-NEXT: lxvw4x vs34, 0, r4
+; CHECK-AIX32-P8-NEXT: lxvw4x vs35, 0, r3
+; CHECK-AIX32-P8-NEXT: vcmpequb. v2, v3, v2
+; CHECK-AIX32-P8-NEXT: mfocrf r3, 2
+; CHECK-AIX32-P8-NEXT: rlwinm r3, r3, 25, 31, 31
; CHECK-AIX32-P8-NEXT: blr
;
; CHECK-AIX32-P10-LABEL: cmpeq16:
; CHECK-AIX32-P10: # %bb.0: # %entry
-; CHECK-AIX32-P10-NEXT: lwz r5, 4(r3)
-; CHECK-AIX32-P10-NEXT: lwz r6, 0(r3)
-; CHECK-AIX32-P10-NEXT: lwz r7, 4(r4)
-; CHECK-AIX32-P10-NEXT: xor r5, r5, r7
-; CHECK-AIX32-P10-NEXT: lwz r8, 0(r4)
-; CHECK-AIX32-P10-NEXT: xor r6, r6, r8
-; CHECK-AIX32-P10-NEXT: or. r5, r5, r6
-; CHECK-AIX32-P10-NEXT: bne cr0, L..BB0_2
-; CHECK-AIX32-P10-NEXT: # %bb.1: # %loadbb1
-; CHECK-AIX32-P10-NEXT: lwz r5, 12(r3)
-; CHECK-AIX32-P10-NEXT: lwz r3, 8(r3)
-; CHECK-AIX32-P10-NEXT: lwz r6, 12(r4)
-; CHECK-AIX32-P10-NEXT: lwz r4, 8(r4)
-; CHECK-AIX32-P10-NEXT: xor r3, r3, r4
-; CHECK-AIX32-P10-NEXT: xor r4, r5, r6
-; CHECK-AIX32-P10-NEXT: or. r3, r4, r3
-; CHECK-AIX32-P10-NEXT: li r3, 0
-; CHECK-AIX32-P10-NEXT: beq cr0, L..BB0_3
-; CHECK-AIX32-P10-NEXT: L..BB0_2: # %res_block
-; CHECK-AIX32-P10-NEXT: li r3, 1
-; CHECK-AIX32-P10-NEXT: L..BB0_3: # %endblock
-; CHECK-AIX32-P10-NEXT: cntlzw r3, r3
-; CHECK-AIX32-P10-NEXT: rlwinm r3, r3, 27, 31, 31
+; CHECK-AIX32-P10-NEXT: lxv vs34, 0(r4)
+; CHECK-AIX32-P10-NEXT: lxv vs35, 0(r3)
+; CHECK-AIX32-P10-NEXT: vcmpequb. v2, v3, v2
+; CHECK-AIX32-P10-NEXT: setbc r3, 4*cr6+lt
; CHECK-AIX32-P10-NEXT: blr
;
; CHECK-LINUX32-P8-LABEL: cmpeq16:
; CHECK-LINUX32-P8: # %bb.0: # %entry
-; CHECK-LINUX32-P8-NEXT: lwz r5, 0(r3)
-; CHECK-LINUX32-P8-NEXT: lwz r6, 4(r3)
-; CHECK-LINUX32-P8-NEXT: lwz r7, 0(r4)
-; CHECK-LINUX32-P8-NEXT: lwz r8, 4(r4)
-; CHECK-LINUX32-P8-NEXT: xor r6, r6, r8
-; CHECK-LINUX32-P8-NEXT: xor r5, r5, r7
-; CHECK-LINUX32-P8-NEXT: or. r5, r5, r6
-; CHECK-LINUX32-P8-NEXT: bne cr0, .LBB0_2
-; CHECK-LINUX32-P8-NEXT: # %bb.1: # %loadbb1
-; CHECK-LINUX32-P8-NEXT: lwz r5, 8(r3)
-; CHECK-LINUX32-P8-NEXT: lwz r3, 12(r3)
-; CHECK-LINUX32-P8-NEXT: lwz r6, 8(r4)
-; CHECK-LINUX32-P8-NEXT: lwz r4, 12(r4)
-; CHECK-LINUX32-P8-NEXT: xor r3, r3, r4
-; CHECK-LINUX32-P8-NEXT: xor r4, r5, r6
-; CHECK-LINUX32-P8-NEXT: or. r3, r4, r3
-; CHECK-LINUX32-P8-NEXT: li r3, 0
-; CHECK-LINUX32-P8-NEXT: beq cr0, .LBB0_3
-; CHECK-LINUX32-P8-NEXT: .LBB0_2: # %res_block
-; CHECK-LINUX32-P8-NEXT: li r3, 1
-; CHECK-LINUX32-P8-NEXT: .LBB0_3: # %endblock
-; CHECK-LINUX32-P8-NEXT: cntlzw r3, r3
-; CHECK-LINUX32-P8-NEXT: rlwinm r3, r3, 27, 31, 31
+; CHECK-LINUX32-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LINUX32-P8-NEXT: xxswapd vs34, vs0
+; CHECK-LINUX32-P8-NEXT: lxvd2x vs0, 0, r3
+; CHECK-LINUX32-P8-NEXT: xxswapd vs35, vs0
+; CHECK-LINUX32-P8-NEXT: vcmpequb. v2, v3, v2
+; CHECK-LINUX32-P8-NEXT: mfocrf r3, 2
+; CHECK-LINUX32-P8-NEXT: rlwinm r3, r3, 25, 31, 31
; CHECK-LINUX32-P8-NEXT: blr
;
; CHECK-LINUX32-P10-LABEL: cmpeq16:
; CHECK-LINUX32-P10: # %bb.0: # %entry
-; CHECK-LINUX32-P10-NEXT: lwz r5, 0(r3)
-; CHECK-LINUX32-P10-NEXT: lwz r6, 4(r3)
-; CHECK-LINUX32-P10-NEXT: lwz r7, 0(r4)
-; CHECK-LINUX32-P10-NEXT: xor r5, r5, r7
-; CHECK-LINUX32-P10-NEXT: lwz r8, 4(r4)
-; CHECK-LINUX32-P10-NEXT: xor r6, r6, r8
-; CHECK-LINUX32-P10-NEXT: or. r5, r5, r6
-; CHECK-LINUX32-P10-NEXT: bne cr0, .LBB0_2
-; CHECK-LINUX32-P10-NEXT: # %bb.1: # %loadbb1
-; CHECK-LINUX32-P10-NEXT: lwz r5, 8(r3)
-; CHECK-LINUX32-P10-NEXT: lwz r3, 12(r3)
-; CHECK-LINUX32-P10-NEXT: lwz r6, 8(r4)
-; CHECK-LINUX32-P10-NEXT: lwz r4, 12(r4)
-; CHECK-LINUX32-P10-NEXT: xor r3, r3, r4
-; CHECK-LINUX32-P10-NEXT: xor r4, r5, r6
-; CHECK-LINUX32-P10-NEXT: or. r3, r4, r3
-; CHECK-LINUX32-P10-NEXT: li r3, 0
-; CHECK-LINUX32-P10-NEXT: beq cr0, .LBB0_3
-; CHECK-LINUX32-P10-NEXT: .LBB0_2: # %res_block
-; CHECK-LINUX32-P10-NEXT: li r3, 1
-; CHECK-LINUX32-P10-NEXT: .LBB0_3: # %endblock
-; CHECK-LINUX32-P10-NEXT: cntlzw r3, r3
-; CHECK-LINUX32-P10-NEXT: rlwinm r3, r3, 27, 31, 31
+; CHECK-LINUX32-P10-NEXT: lxv vs34, 0(r4)
+; CHECK-LINUX32-P10-NEXT: lxv vs35, 0(r3)
+; CHECK-LINUX32-P10-NEXT: vcmpequb. v2, v3, v2
+; CHECK-LINUX32-P10-NEXT: setbc r3, 4*cr6+lt
; CHECK-LINUX32-P10-NEXT: blr
entry:
%bcmp = tail call i32 @bcmp(ptr noundef nonnull dereferenceable(16) %a, ptr noundef nonnull dereferenceable(16) %b, i32 16)
diff --git a/llvm/test/CodeGen/PowerPC/memcmp64_fixsize.ll b/llvm/test/CodeGen/PowerPC/memcmp64_fixsize.ll
index 216b7638642d4..bd703b9d35cf7 100644
--- a/llvm/test/CodeGen/PowerPC/memcmp64_fixsize.ll
+++ b/llvm/test/CodeGen/PowerPC/memcmp64_fixsize.ll
@@ -14,78 +14,36 @@
define dso_local signext range(i32 0, 2) i32 @cmpeq16(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b) {
; CHECK-AIX64-32-P8-LABEL: cmpeq16:
; CHECK-AIX64-32-P8: # %bb.0: # %entry
-; CHECK-AIX64-32-P8-NEXT: ld r5, 0(r3)
-; CHECK-AIX64-32-P8-NEXT: ld r6, 0(r4)
-; CHECK-AIX64-32-P8-NEXT: cmpld r5, r6
-; CHECK-AIX64-32-P8-NEXT: bne cr0, L..BB0_2
-; CHECK-AIX64-32-P8-NEXT: # %bb.1: # %loadbb1
-; CHECK-AIX64-32-P8-NEXT: ld r5, 8(r3)
-; CHECK-AIX64-32-P8-NEXT: ld r4, 8(r4)
-; CHECK-AIX64-32-P8-NEXT: li r3, 0
-; CHECK-AIX64-32-P8-NEXT: cmpld r5, r4
-; CHECK-AIX64-32-P8-NEXT: beq cr0, L..BB0_3
-; CHECK-AIX64-32-P8-NEXT: L..BB0_2: # %res_block
-; CHECK-AIX64-32-P8-NEXT: li r3, 1
-; CHECK-AIX64-32-P8-NEXT: L..BB0_3: # %endblock
-; CHECK-AIX64-32-P8-NEXT: cntlzw r3, r3
-; CHECK-AIX64-32-P8-NEXT: srwi r3, r3, 5
+; CHECK-AIX64-32-P8-NEXT: lxvw4x vs34, 0, r4
+; CHECK-AIX64-32-P8-NEXT: lxvw4x vs35, 0, r3
+; CHECK-AIX64-32-P8-NEXT: vcmpequb. v2, v3, v2
+; CHECK-AIX64-32-P8-NEXT: mfocrf r3, 2
+; CHECK-AIX64-32-P8-NEXT: rlwinm r3, r3, 25, 31, 31
; CHECK-AIX64-32-P8-NEXT: blr
;
; CHECK-AIX64-32-P10-LABEL: cmpeq16:
; CHECK-AIX64-32-P10: # %bb.0: # %entry
-; CHECK-AIX64-32-P10-NEXT: ld r5, 0(r3)
-; CHECK-AIX64-32-P10-NEXT: ld r6, 0(r4)
-; CHECK-AIX64-32-P10-NEXT: cmpld r5, r6
-; CHECK-AIX64-32-P10-NEXT: bne cr0, L..BB0_2
-; CHECK-AIX64-32-P10-NEXT: # %bb.1: # %loadbb1
-; CHECK-AIX64-32-P10-NEXT: ld r5, 8(r3)
-; CHECK-AIX64-32-P10-NEXT: ld r4, 8(r4)
-; CHECK-AIX64-32-P10-NEXT: li r3, 0
-; CHECK-AIX64-32-P10-NEXT: cmpld r5, r4
-; CHECK-AIX64-32-P10-NEXT: beq cr0, L..BB0_3
-; CHECK-AIX64-32-P10-NEXT: L..BB0_2: # %res_block
-; CHECK-AIX64-32-P10-NEXT: li r3, 1
-; CHECK-AIX64-32-P10-NEXT: L..BB0_3: # %endblock
-; CHECK-AIX64-32-P10-NEXT: cntlzw r3, r3
-; CHECK-AIX64-32-P10-NEXT: rlwinm r3, r3, 27, 31, 31
+; CHECK-AIX64-32-P10-NEXT: lxv vs34, 0(r4)
+; CHECK-AIX64-32-P10-NEXT: lxv vs35, 0(r3)
+; CHECK-AIX64-32-P10-NEXT: vcmpequb. v2, v3, v2
+; CHECK-AIX64-32-P10-NEXT: setbc r3, 4*cr6+lt
; CHECK-AIX64-32-P10-NEXT: blr
;
; CHECK-LINUX64-P8-LABEL: cmpeq16:
; CHECK-LINUX64-P8: # %bb.0: # %entry
-; CHECK-LINUX64-P8-NEXT: ld r5, 0(r3)
-; CHECK-LINUX64-P8-NEXT: ld r6, 0(r4)
-; CHECK-LINUX64-P8-NEXT: cmpld r5, r6
-; CHECK-LINUX64-P8-NEXT: bne cr0, .LBB0_2
-; CHECK-LINUX64-P8-NEXT: # %bb.1: # %loadbb1
-; CHECK-LINUX64-P8-NEXT: ld r5, 8(r3)
-; CHECK-LINUX64-P8-NEXT: ld r4, 8(r4)
-; CHECK-LINUX64-P8-NEXT: li r3, 0
-; CHECK-LINUX64-P8-NEXT: cmpld r5, r4
-; CHECK-LINUX64-P8-NEXT: beq cr0, .LBB0_3
-; CHECK-LINUX64-P8-NEXT: .LBB0_2: # %res_block
-; CHECK-LINUX64-P8-NEXT: li r3, 1
-; CHECK-LINUX64-P8-NEXT: .LBB0_3: # %endblock
-; CHECK-LINUX64-P8-NEXT: cntlzw r3, r3
-; CHECK-LINUX64-P8-NEXT: srwi r3, r3, 5
+; CHECK-LINUX64-P8-NEXT: lxvd2x vs34, 0, r4
+; CHECK-LINUX64-P8-NEXT: lxvd2x vs35, 0, r3
+; CHECK-LINUX64-P8-NEXT: vcmpequb. v2, v3, v2
+; CHECK-LINUX64-P8-NEXT: mfocrf r3, 2
+; CHECK-LINUX64-P8-NEXT: rlwinm r3, r3, 25, 31, 31
; CHECK-LINUX64-P8-NEXT: blr
;
; CHECK-LINUX64-P10-LABEL: cmpeq16:
; CHECK-LINUX64-P10: # %bb.0: # %entry
-; CHECK-LINUX64-P10-NEXT: ld r5, 0(r3)
-; CHECK-LINUX64-P10-NEXT: ld r6, 0(r4)
-; CHECK-LINUX64-P10-NEXT: cmpld r5, r6
-; CHECK-LINUX64-P10-NEXT: bne cr0, .LBB0_2
-; CHECK-LINUX64-P10-NEXT: # %bb.1: # %loadbb1
-; CHECK-LINUX64-P10-NEXT: ld r5, 8(r3)
-; CHECK-LINUX64-P10-NEXT: ld r4, 8(r4)
-; CHECK-LINUX64-P10-NEXT: li r3, 0
-; CHECK-LINUX64-P10-NEXT: cmpld r5, r4
-; CHECK-LINUX64-P10-NEXT: beq cr0, .LBB0_3
-; CHECK-LINUX64-P10-NEXT: .LBB0_2: # %res_block
-; CHECK-LINUX64-P10-NEXT: li r3, 1
-; CHECK-LINUX64-P10-NEXT: .LBB0_3: # %endblock
-; CHECK-LINUX64-P10-NEXT: cntlzw r3, r3
-; CHECK-LINUX64-P10-NEXT: rlwinm r3, r3, 27, 31, 31
+; CHECK-LINUX64-P10-NEXT: lxv vs34, 0(r4)
+; CHECK-LINUX64-P10-NEXT: lxv vs35, 0(r3)
+; CHECK-LINUX64-P10-NEXT: vcmpequb. v2, v3, v2
+; CHECK-LINUX64-P10-NEXT: setbc r3, 4*cr6+lt
; CHECK-LINUX64-P10-NEXT: blr
entry:
%bcmp = tail call i32 @bcmp(ptr noundef nonnull dereferenceable(16) %a, ptr noundef nonnull dereferenceable(16) %b, i64 16)
More information about the llvm-commits
mailing list