[llvm] [SelectionDAG] Deal with POISON for INSERT_VECTOR_ELT/INSERT_SUBVECTOR (part 2) (PR #143103)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 6 02:58:04 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Björn Pettersson (bjope)
<details>
<summary>Changes</summary>
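Add support in isGuaranteedNotToBeUndefOrPoison (new INSERT_SUBVECTOR,
INSERT_VECTOR_ELT and SCALAR_TO_VECTOR cases) and in SimplifyDemandedVectorElts
(bypassing an insert when none of the inserted subvector's elements are
demanded) to avoid regressions seen after a previous commit fixing
https://github.com/llvm/llvm-project/issues/141034.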
---
Patch is 158.07 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143103.diff
14 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+51-13)
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+109-10)
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+4)
- (modified) llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll (+67)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll (+75-6)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll (+100-8)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll (+5-1)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll (+21)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll (+28)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll (+60-30)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll (+36-19)
- (modified) llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll (+36)
- (modified) llvm/test/CodeGen/X86/pr62286.ll (+8-6)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll (+693-597)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index aba3c0f80a024..a52edca64dbce 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22905,6 +22905,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
// Insert into out-of-bounds element is undefined.
+ // Code below relies on that we handle this special case early.
if (IndexC && VT.isFixedLengthVector() &&
IndexC->getZExtValue() >= VT.getVectorNumElements())
return DAG.getUNDEF(VT);
@@ -22915,14 +22916,29 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
return InVec;
- if (!IndexC) {
- // If this is variable insert to undef vector, it might be better to splat:
- // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
- if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
- return DAG.getSplat(VT, DL, InVal);
- return SDValue();
+ // If this is variable insert to undef vector, it might be better to splat:
+ // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
+ if (!IndexC && InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
+ return DAG.getSplat(VT, DL, InVal);
+
+ // Try to drop insert of UNDEF/POISON elements. This is also done in getNode,
+ // but we also do it as a DAG combine since for example simplifications into
+ // SPLAT_VECTOR/BUILD_VECTOR may turn poison elements into undef/zero etc, and
+ // then suddenly the InVec is guaranteed to not be poison.
+ if (InVal.isUndef()) {
+ if (IndexC && VT.isFixedLengthVector()) {
+ APInt EltMask = APInt::getOneBitSet(VT.getVectorNumElements(),
+ IndexC->getZExtValue());
+ if (DAG.isGuaranteedNotToBePoison(InVec, EltMask))
+ return InVec;
+ } else if (DAG.isGuaranteedNotToBePoison(InVec)) {
+ return InVec;
+ }
}
+ if (!IndexC)
+ return SDValue();
+
if (VT.isScalableVector())
return SDValue();
@@ -27355,18 +27371,40 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
SDValue N2 = N->getOperand(2);
uint64_t InsIdx = N->getConstantOperandVal(2);
- // If inserting an UNDEF, just return the original vector.
- if (N1.isUndef())
- return N0;
+ // If inserting an UNDEF, just return the original vector (unless it makes the
+ // result more poisonous).
+ if (N1.isUndef()){
+ if (VT.isFixedLengthVector()) {
+ unsigned SubVecNumElts = N1.getValueType().getVectorNumElements();
+ APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(),
+ InsIdx, InsIdx + SubVecNumElts);
+ if (DAG.isGuaranteedNotToBePoison(N0, EltMask))
+ return N0;
+ } else if (DAG.isGuaranteedNotToBePoison(N0))
+ return N0;
+ }
- // If this is an insert of an extracted vector into an undef vector, we can
- // just use the input to the extract if the types match, and can simplify
+ // If this is an insert of an extracted vector into an undef/poison vector, we
+ // can just use the input to the extract if the types match, and can simplify
// in some cases even if they don't.
if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N1.getOperand(1) == N2) {
+ EVT N1VT = N1.getValueType();
EVT SrcVT = N1.getOperand(0).getValueType();
- if (SrcVT == VT)
- return N1.getOperand(0);
+ if (SrcVT == VT) {
+ // Need to ensure that result isn't more poisonous if skipping both the
+ // extract+insert.
+ if (N0.getOpcode() == ISD::POISON)
+ return N1.getOperand(0);
+ if (VT.isFixedLengthVector() && N1VT.isFixedLengthVector()) {
+ unsigned SubVecNumElts = N1VT.getVectorNumElements();
+ APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(),
+ InsIdx, InsIdx + SubVecNumElts);
+ if (DAG.isGuaranteedNotToBePoison(N1.getOperand(0), ~EltMask))
+ return N1.getOperand(0);
+ } else if (DAG.isGuaranteedNotToBePoison(N1.getOperand(0)))
+ return N1.getOperand(0);
+ }
// TODO: To remove the zero check, need to adjust the offset to
// a multiple of the new src type.
if (isNullConstant(N2)) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 279c7daf71c33..fadfd35489e54 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5454,6 +5454,60 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
}
return true;
+ case ISD::INSERT_SUBVECTOR: {
+ if (Op.getValueType().isScalableVector())
+ break;
+ SDValue Src = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ uint64_t Idx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ APInt DemandedSrcElts = DemandedElts;
+ DemandedSrcElts.clearBits(Idx, Idx + NumSubElts);
+
+ if (!!DemandedSubElts &&
+ !isGuaranteedNotToBeUndefOrPoison(Sub, DemandedSubElts,
+ PoisonOnly, Depth + 1))
+ return false;
+ if (!!DemandedSrcElts &&
+ !isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrcElts,
+ PoisonOnly, Depth + 1))
+ return false;
+ return true;
+ }
+
+ case ISD::INSERT_VECTOR_ELT: {
+ SDValue InVec = Op.getOperand(0);
+ SDValue InVal = Op.getOperand(1);
+ SDValue EltNo = Op.getOperand(2);
+ EVT VT = InVec.getValueType();
+ auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
+ if (IndexC && VT.isFixedLengthVector() &&
+ IndexC->getZExtValue() < VT.getVectorNumElements()) {
+ if (DemandedElts[IndexC->getZExtValue()] &&
+ !isGuaranteedNotToBeUndefOrPoison(InVal, PoisonOnly, Depth + 1))
+ return false;
+ APInt InVecDemandedElts = DemandedElts;
+ InVecDemandedElts.clearBit(IndexC->getZExtValue());
+ if (!!InVecDemandedElts &&
+ !isGuaranteedNotToBeUndefOrPoison(InVec, InVecDemandedElts,
+ PoisonOnly, Depth + 1))
+ return false;
+ return true;
+ }
+ break;
+ }
+
+ case ISD::SCALAR_TO_VECTOR:
+ // If only demanding upper (undef) elements.
+ if (DemandedElts.ugt(1))
+ return PoisonOnly;
+ // If only demanding element 0, or only considering poison.
+ if (PoisonOnly || DemandedElts == 0)
+ return isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), PoisonOnly,
+ Depth + 1);
+ return false;
+
case ISD::SPLAT_VECTOR:
return isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), PoisonOnly,
Depth + 1);
@@ -7900,23 +7954,42 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except
// for scalable vectors where we will generate appropriate code to
// deal with out-of-bounds cases correctly.
- if (N3C && N1.getValueType().isFixedLengthVector() &&
- N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
+ if (N3C && VT.isFixedLengthVector() &&
+ N3C->getZExtValue() >= VT.getVectorNumElements())
return getUNDEF(VT);
// Undefined index can be assumed out-of-bounds, so that's UNDEF too.
if (N3.isUndef())
return getUNDEF(VT);
- // If the inserted element is an UNDEF, just use the input vector.
- if (N2.isUndef())
+ // If inserting poison, just use the input vector.
+ if (N2.getOpcode() == ISD::POISON)
return N1;
+ // Inserting undef into undef/poison is still undef.
+ if (N2.getOpcode() == ISD::UNDEF && N1.isUndef())
+ return getUNDEF(VT);
+
+ // If the inserted element is an UNDEF, just use the input vector.
+ // But not if skipping the insert could make the result more poisonous.
+ if (N2.isUndef()) {
+ if (N3C && VT.isFixedLengthVector()) {
+ APInt EltMask = APInt::getOneBitSet(VT.getVectorNumElements(),
+ N3C->getZExtValue());
+ if (isGuaranteedNotToBePoison(N1, EltMask))
+ return N1;
+ } else if (isGuaranteedNotToBePoison(N1))
+ return N1;
+ }
break;
}
case ISD::INSERT_SUBVECTOR: {
- // Inserting undef into undef is still undef.
- if (N1.isUndef() && N2.isUndef())
+ // If inserting poison, just use the input vector,
+ if (N2.getOpcode() == ISD::POISON)
+ return N1;
+
+ // Inserting undef into undef/poison is still undef.
+ if (N2.getOpcode() == ISD::UNDEF && N1.isUndef())
return getUNDEF(VT);
EVT N2VT = N2.getValueType();
@@ -7945,11 +8018,37 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (VT == N2VT)
return N2;
- // If this is an insert of an extracted vector into an undef vector, we
- // can just use the input to the extract.
+ // If this is an insert of an extracted vector into an undef/poison vector,
+ // we can just use the input to the extract. But not if skipping the
+ // extract+insert could make the result more poisonous.
if (N1.isUndef() && N2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT)
- return N2.getOperand(0);
+ N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT) {
+ if (N1.getOpcode() == ISD::POISON)
+ return N2.getOperand(0);
+ if (VT.isFixedLengthVector() && N2VT.isFixedLengthVector()) {
+ unsigned LoBit = N3->getAsZExtVal();
+ unsigned HiBit = LoBit + N2VT.getVectorNumElements();
+ APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(),
+ LoBit, HiBit);
+ if (isGuaranteedNotToBePoison(N2.getOperand(0), ~EltMask))
+ return N2.getOperand(0);
+ } else if (isGuaranteedNotToBePoison(N2.getOperand(0)))
+ return N2.getOperand(0);
+ }
+
+ // If the inserted subvector is UNDEF, just use the input vector.
+ // But not if skipping the insert could make the result more poisonous.
+ if (N2.isUndef()) {
+ if (VT.isFixedLengthVector()) {
+ unsigned LoBit = N3->getAsZExtVal();
+ unsigned HiBit = LoBit + N2VT.getVectorNumElements();
+ APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(),
+ LoBit, HiBit);
+ if (isGuaranteedNotToBePoison(N1, EltMask))
+ return N1;
+ } else if (isGuaranteedNotToBePoison(N1))
+ return N1;
+ }
break;
}
case ISD::BITCAST:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e8e820ac1f695..643ec26bba3f7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3359,6 +3359,10 @@ bool TargetLowering::SimplifyDemandedVectorElts(
APInt DemandedSrcElts = DemandedElts;
DemandedSrcElts.clearBits(Idx, Idx + NumSubElts);
+ // If none of the sub operand elements are demanded, bypass the insert.
+ if (!DemandedSubElts)
+ return TLO.CombineTo(Op, Src);
+
APInt SubUndef, SubZero;
if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, SubUndef, SubZero, TLO,
Depth + 1))
diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
index 94074d1689f6a..2c44f56316801 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
@@ -66,6 +66,36 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) {
%v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
%v.7 = insertelement <16 x i8> %v.6, i8 %a, i32 7
%v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8
+ %v.10 = insertelement <16 x i8> %v.8, i8 %a, i32 10
+ %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
+ %v.12 = insertelement <16 x i8> %v.11, i8 %a, i32 12
+ %v.13 = insertelement <16 x i8> %v.12, i8 %a, i32 13
+ %v.14 = insertelement <16 x i8> %v.13, i8 %a, i32 14
+ %v.15 = insertelement <16 x i8> %v.14, i8 %a, i32 15
+ ret <16 x i8> %v.15
+}
+
+; Similar to above, but we leave element 8 as undef. One interesting part with
+; this test case is that %a may be poison, so simply inserting %a also at
+; index 8 would make the result vector more poisonous.
+define <16 x i8> @test_insert_v16i8_insert_2_undef_base_skip8(i32 %a0) {
+; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_skip8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsr w8, w0, #5
+; CHECK-NEXT: dup.16b v0, w8
+; CHECK-NEXT: mov.b v0[5], wzr
+; CHECK-NEXT: mov.b v0[8], w8
+; CHECK-NEXT: mov.b v0[9], wzr
+; CHECK-NEXT: ret
+ %a1 = lshr exact i32 %a0, 5
+ %a = trunc i32 %a1 to i8
+ %v.0 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef> , i8 %a, i32 0
+ %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1
+ %v.2 = insertelement <16 x i8> %v.1, i8 %a, i32 2
+ %v.3 = insertelement <16 x i8> %v.2, i8 %a, i32 3
+ %v.4 = insertelement <16 x i8> %v.3, i8 %a, i32 4
+ %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
+ %v.7 = insertelement <16 x i8> %v.6, i8 %a, i32 7
%v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10
%v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
%v.12 = insertelement <16 x i8> %v.11, i8 %a, i32 12
@@ -94,6 +124,43 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a,
%v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
%v.7 = insertelement <16 x i8> %v.6, i8 %b, i32 7
%v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8
+ %v.10 = insertelement <16 x i8> %v.8, i8 %a, i32 10
+ %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
+ %v.12 = insertelement <16 x i8> %v.11, i8 %b, i32 12
+ %v.13 = insertelement <16 x i8> %v.12, i8 %a, i32 13
+ %v.14 = insertelement <16 x i8> %v.13, i8 %a, i32 14
+ %v.15 = insertelement <16 x i8> %v.14, i8 %b, i32 15
+ ret <16 x i8> %v.15
+}
+
+; Similar to above, but we leave element 8 as undef. One interesting part with
+; this test case is that %a and %b may be poison, so simply inserting %a or %b
+; at index 8 would make the result vector more poisonous.
+define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus_skip8(i32 %a0, i32 %b0) {
+; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_valeus_skip8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsr w8, w0, #5
+; CHECK-NEXT: dup.16b v0, w8
+; CHECK-NEXT: lsr w8, w1, #5
+; CHECK-NEXT: mov.b v0[2], w8
+; CHECK-NEXT: mov.b v0[5], wzr
+; CHECK-NEXT: mov.b v0[7], w8
+; CHECK-NEXT: mov.b v0[8], w8
+; CHECK-NEXT: mov.b v0[9], wzr
+; CHECK-NEXT: mov.b v0[12], w8
+; CHECK-NEXT: mov.b v0[15], w8
+; CHECK-NEXT: ret
+ %a1 = lshr exact i32 %a0, 5
+ %a = trunc i32 %a1 to i8
+ %b1 = lshr exact i32 %b0, 5
+ %b = trunc i32 %b1 to i8
+ %v.0 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef> , i8 %a, i32 0
+ %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1
+ %v.2 = insertelement <16 x i8> %v.1, i8 %b, i32 2
+ %v.3 = insertelement <16 x i8> %v.2, i8 %a, i32 3
+ %v.4 = insertelement <16 x i8> %v.3, i8 %a, i32 4
+ %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
+ %v.7 = insertelement <16 x i8> %v.6, i8 %b, i32 7
%v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10
%v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
%v.12 = insertelement <16 x i8> %v.11, i8 %b, i32 12
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
index 9efe0b33910c8..2905d707bdd09 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
@@ -37,6 +37,10 @@ define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: and z2.h, z2.h, #0x1
+; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -59,8 +63,15 @@ define void @select_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z2.h, z3.h
-; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h
-; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h
+; VBITS_GE_256-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: ptrue p1.h
+; VBITS_GE_256-NEXT: mov z5.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1
+; VBITS_GE_256-NEXT: and z5.h, z5.h, #0x1
+; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z4.h, #0
+; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z5.h, #0
+; VBITS_GE_256-NEXT: sel z0.h, p2, z0.h, z1.h
+; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
@@ -71,6 +82,10 @@ define void @select_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
+; VBITS_GE_512-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT: ptrue p1.h
+; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1
+; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
@@ -89,6 +104,10 @@ define void @select_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: and z2.h, z2.h, #0x1
+; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -107,6 +126,10 @@ define void @select_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: and z2.h, z2.h, #0x1
+; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -150,6 +173,10 @@ define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: and z2.s, z2.s, #0x1
+; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
@@ -172,8 +199,15 @@ define void @select_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, z3.s
-; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s
-; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s
+; VBITS_GE_256-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: ptrue p1.s
+; VBITS_GE_256-NEXT: mov z5.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1
+; VBITS_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/143103
More information about the llvm-commits mailing list