[llvm] Revert "[AArch64] Lower alias mask to a whilewr" (PR #120261)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 17 08:25:33 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Sam Tebbs (SamTebbs33)
Changes
Reverts llvm/llvm-project#100769
A bug in the lowering (the subtraction should be reversed) was found after merging, and it will all be replaced by #117007 anyway.
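For context, a minimal scalar model of the i8 alias mask the reverted combine matched (an illustrative sketch, not code from the patch; the orientation of `Diff` and the `laneEnabled` helper are my own, taken from the tests in the diff below, where the difference is `%b - %c`):

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of the i8 case: a lane is enabled either because the splat
// side of the OR fires (Diff < 0, i.e. the whole vector is conflict-free)
// or because the lane index falls below the element distance.
static bool laneEnabled(int64_t Diff, int64_t Lane) {
  return Diff < 0 || Lane < Diff;
}

int main() {
  // With Diff = 4, only lanes 0..3 are enabled. Reversing the subtraction
  // negates Diff and flips the answer for every lane, which is the kind of
  // error the revert message describes.
  assert(laneEnabled(4, 3) && !laneEnabled(4, 4));
  assert(laneEnabled(-4, 4)); // sign flipped: every lane reported safe
  return 0;
}
```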
---
Patch is 57.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/120261.diff
2 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (-121)
- (removed) llvm/test/CodeGen/AArch64/whilewr.ll (-1086)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 28f304100326c6..abc00fc86ee455 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1539,7 +1539,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
- setOperationAction(ISD::OR, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
@@ -14329,128 +14328,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
return ResultSLI;
}
-/// Try to lower the construction of a pointer alias mask to a WHILEWR.
-/// The mask's enabled lanes represent the elements that will not overlap across
-/// one loop iteration. This tries to match:
-/// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
-/// (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
-SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
- const AArch64Subtarget &Subtarget) {
- if (!Subtarget.hasSVE2())
- return SDValue();
- SDValue LaneMask = Op.getOperand(0);
- SDValue Splat = Op.getOperand(1);
-
- if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
- std::swap(LaneMask, Splat);
-
- if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
- LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask ||
- Splat.getOpcode() != ISD::SPLAT_VECTOR)
- return SDValue();
-
- SDValue Cmp = Splat.getOperand(0);
- if (Cmp.getOpcode() != ISD::SETCC)
- return SDValue();
-
- CondCodeSDNode *Cond = cast<CondCodeSDNode>(Cmp.getOperand(2));
-
- auto ComparatorConst = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
- if (!ComparatorConst || ComparatorConst->getSExtValue() > 0 ||
- Cond->get() != ISD::CondCode::SETLT)
- return SDValue();
- unsigned CompValue = std::abs(ComparatorConst->getSExtValue());
- unsigned EltSize = CompValue + 1;
- if (!isPowerOf2_64(EltSize) || EltSize > 8)
- return SDValue();
-
- SDValue Diff = Cmp.getOperand(0);
- if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64)
- return SDValue();
-
- if (!isNullConstant(LaneMask.getOperand(1)) ||
- (EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA))
- return SDValue();
-
- // The number of elements that alias is calculated by dividing the positive
- // difference between the pointers by the element size. An alias mask for i8
- // elements omits the division because it would just divide by 1
- if (EltSize > 1) {
- SDValue DiffDiv = LaneMask.getOperand(2);
- auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
- if (!DiffDivConst || DiffDivConst->getZExtValue() != Log2_64(EltSize))
- return SDValue();
- if (EltSize > 2) {
- // When masking i32 or i64 elements, the positive value of the
- // possibly-negative difference comes from a select of the difference if
- // it's positive, otherwise the difference plus the element size if it's
- // negative: pos_diff = diff < 0 ? (diff + 7) : diff
- SDValue Select = DiffDiv.getOperand(0);
- // Make sure the difference is being compared by the select
- if (Select.getOpcode() != ISD::SELECT_CC || Select.getOperand(3) != Diff)
- return SDValue();
- // Make sure it's checking if the difference is less than 0
- if (!isNullConstant(Select.getOperand(1)) ||
- cast<CondCodeSDNode>(Select.getOperand(4))->get() !=
- ISD::CondCode::SETLT)
- return SDValue();
- // An add creates a positive value from the negative difference
- SDValue Add = Select.getOperand(2);
- if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
- return SDValue();
- if (auto *AddConst = dyn_cast<ConstantSDNode>(Add.getOperand(1));
- !AddConst || AddConst->getZExtValue() != EltSize - 1)
- return SDValue();
- } else {
- // When masking i16 elements, this positive value comes from adding the
- // difference's sign bit to the difference itself. This is equivalent to
- // the 32 bit and 64 bit case: pos_diff = diff + sign_bit (diff)
- SDValue Add = DiffDiv.getOperand(0);
- if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
- return SDValue();
- // A logical right shift by 63 extracts the sign bit from the difference
- SDValue Shift = Add.getOperand(1);
- if (Shift.getOpcode() != ISD::SRL || Shift.getOperand(0) != Diff)
- return SDValue();
- if (auto *ShiftConst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
- !ShiftConst || ShiftConst->getZExtValue() != 63)
- return SDValue();
- }
- } else if (LaneMask.getOperand(2) != Diff)
- return SDValue();
-
- SDValue StorePtr = Diff.getOperand(0);
- SDValue ReadPtr = Diff.getOperand(1);
-
- unsigned IntrinsicID = 0;
- switch (EltSize) {
- case 1:
- IntrinsicID = Intrinsic::aarch64_sve_whilewr_b;
- break;
- case 2:
- IntrinsicID = Intrinsic::aarch64_sve_whilewr_h;
- break;
- case 4:
- IntrinsicID = Intrinsic::aarch64_sve_whilewr_s;
- break;
- case 8:
- IntrinsicID = Intrinsic::aarch64_sve_whilewr_d;
- break;
- default:
- return SDValue();
- }
- SDLoc DL(Op);
- SDValue ID = DAG.getConstant(IntrinsicID, DL, MVT::i32);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), ID,
- StorePtr, ReadPtr);
-}
-
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
- if (SDValue SV =
- tryWhileWRFromOR(Op, DAG, DAG.getSubtarget<AArch64Subtarget>()))
- return SV;
-
if (useSVEForFixedLengthVectorVT(Op.getValueType(),
!Subtarget->isNeonAvailable()))
return LowerToScalableOp(Op, DAG);
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
deleted file mode 100644
index 9f1ea850792384..00000000000000
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ /dev/null
@@ -1,1086 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
-
-define <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.b, x1, x2
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_8:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8
-; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x8
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %c14 = ptrtoint ptr %c to i64
- %b15 = ptrtoint ptr %b to i64
- %sub.diff = sub i64 %b15, %c14
- %neg.compare = icmp slt i64 %sub.diff, 0
- %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
- %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
- ret <vscale x 16 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_commutative:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.b, x1, x2
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_commutative:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8
-; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x8
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %c14 = ptrtoint ptr %c to i64
- %b15 = ptrtoint ptr %b to i64
- %sub.diff = sub i64 %b15, %c14
- %neg.compare = icmp slt i64 %sub.diff, 0
- %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
- %active.lane.mask.alias = or <vscale x 16 x i1> %.splat, %ptr.diff.lane.mask
- ret <vscale x 16 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.h, x1, x2
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_16:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: cmn x8, #1
-; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT: asr x8, x8, #1
-; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9
-; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %b14 = ptrtoint ptr %b to i64
- %c15 = ptrtoint ptr %c to i64
- %sub.diff = sub i64 %b14, %c15
- %diff = sdiv i64 %sub.diff, 2
- %neg.compare = icmp slt i64 %sub.diff, -1
- %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
- %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
- ret <vscale x 8 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.s, x1, x2
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_32:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: add x9, x8, #3
-; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmn x8, #3
-; CHECK-NOSVE2-NEXT: cset w8, lt
-; CHECK-NOSVE2-NEXT: asr x9, x9, #2
-; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9
-; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x8
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %b12 = ptrtoint ptr %b to i64
- %c13 = ptrtoint ptr %c to i64
- %sub.diff = sub i64 %b12, %c13
- %diff = sdiv i64 %sub.diff, 4
- %neg.compare = icmp slt i64 %sub.diff, -3
- %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
- %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
- ret <vscale x 4 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.d, x1, x2
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_64:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: add x9, x8, #7
-; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmn x8, #7
-; CHECK-NOSVE2-NEXT: cset w8, lt
-; CHECK-NOSVE2-NEXT: asr x9, x9, #3
-; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9
-; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x8
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %b12 = ptrtoint ptr %b to i64
- %c13 = ptrtoint ptr %c to i64
- %sub.diff = sub i64 %b12, %c13
- %diff = sdiv i64 %sub.diff, 8
- %neg.compare = icmp slt i64 %sub.diff, -7
- %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
- %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
- ret <vscale x 2 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: no_whilewr_128:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub x8, x1, x2
-; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: add x9, x8, #15
-; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csel x9, x9, x8, lt
-; CHECK-NEXT: cmn x8, #15
-; CHECK-NEXT: asr x9, x9, #4
-; CHECK-NEXT: cset w8, lt
-; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: mov z1.d, x9
-; CHECK-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
-; CHECK-NEXT: punpklo p1.h, p1.b
-; CHECK-NEXT: punpklo p0.h, p0.b
-; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: no_whilewr_128:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: index z0.d, #0, #1
-; CHECK-NOSVE2-NEXT: ptrue p0.d
-; CHECK-NOSVE2-NEXT: add x9, x8, #15
-; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmn x8, #15
-; CHECK-NOSVE2-NEXT: asr x9, x9, #4
-; CHECK-NOSVE2-NEXT: cset w8, lt
-; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT: mov z1.d, x9
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
-; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b
-; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %b12 = ptrtoint ptr %b to i64
- %c13 = ptrtoint ptr %c to i64
- %sub.diff = sub i64 %b12, %c13
- %diff = sdiv i64 %sub.diff, 16
- %neg.compare = icmp slt i64 %sub.diff, -15
- %.splatinsert = insertelement <vscale x 1 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 1 x i1> %.splatinsert, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff)
- %active.lane.mask.alias = or <vscale x 1 x i1> %ptr.diff.lane.mask, %.splat
- ret <vscale x 1 x i1> %active.lane.mask.alias
-}
-
-define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB6_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.b, x1, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.b
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB6_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.b, x8, x9
-; CHECK-NEXT: b.mi .LBB6_2
-; CHECK-NEXT: .LBB6_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_8:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB6_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: sub x9, x1, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9
-; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b
-; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB6_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB6_2
-; CHECK-NOSVE2-NEXT: .LBB6_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
- %cmp11 = icmp sgt i32 %n, 0
- br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- %c14 = ptrtoint ptr %c to i64
- %b15 = ptrtoint ptr %b to i64
- %wide.trip.count = zext nneg i32 %n to i64
- %sub.diff = sub i64 %b15, %c14
- %neg.compare = icmp slt i64 %sub.diff, 0
- %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
- %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
- %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
- %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
- %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
- %0 = zext <vscale x 16 x i1> %active.lane.mask.alias to <vscale x 16 x i8>
- %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %0)
- %2 = zext i8 %1 to i64
- br label %vector.body
-
-vector.body:
- %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
- %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
- %3 = and <vscale x 16 x i1> %active.lane.mask, %active.lane.mask.alias
- %4 = getelementptr inbounds i8, ptr %a, i64 %index
- %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
- %5 = getelementptr inbounds i8, ptr %b, i64 %index
- %wide.masked.load16 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
- %6 = add <vscale x 16 x i8> %wide.masked.load16, %wide.masked.load
- %7 = getelementptr inbounds i8, ptr %c, i64 %index
- tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %6, ptr %7, i32 1, <vscale x 16 x i1> %3)
- %index.next = add i64 %index, %2
- %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
- %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
- br i1 %8, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
- ret void
-}
-
-define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB7_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p1.h, x1, x2
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p0.h, xzr, x8
-; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: .LBB7_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loo...
[truncated]
``````````
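For readers reconstructing the removed combine from the truncated diff: below is a self-contained scalar model (my own sketch, not code from the patch) of the pattern `tryWhileWRFromOR` matched, generalised over the element size. The splat side of the OR enables every lane when the whole vector is conflict-free; the `get_active_lane_mask` side enables lanes below the element-count quotient, with the division rounding toward zero exactly as the add-the-sign-bit (i16) and select-plus-constant (i32/i64) code in the removed C++ arranged.

```cpp
#include <cassert>
#include <cstdint>

// Sketch of the matched pattern for EltSize in {1, 2, 4, 8} bytes:
//   splat(Diff < -(EltSize - 1))  ||  Lane < Diff / EltSize
// where Diff is the pointer difference from the tests and C++'s signed
// division rounds toward zero, matching the sdiv in the test IR.
static bool laneEnabled(int64_t Diff, int64_t EltSize, int64_t Lane) {
  if (Diff < -(EltSize - 1))
    return true;                // the SETCC/SPLAT_VECTOR side of the OR
  return Lane < Diff / EltSize; // the get_active_lane_mask side
}

int main() {
  // i8: a difference of 0 enables no lanes; any negative difference
  // enables them all.
  assert(!laneEnabled(0, 1, 0) && laneEnabled(-1, 1, 15));
  // i32: Diff = -3 gives a zero quotient (rounding toward zero), so no
  // lanes from that side; the whole-vector test only fires from -4 down.
  assert(!laneEnabled(-3, 4, 0) && laneEnabled(-4, 4, 3));
  return 0;
}
```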
https://github.com/llvm/llvm-project/pull/120261
More information about the llvm-commits mailing list