[llvm] 09924cb - [ARM] Rejig some of the MVE gather/scatter lowering pass. NFC
David Green via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 15 07:38:58 PDT 2021
Author: David Green
Date: 2021-06-15T15:38:39+01:00
New Revision: 09924cbab78016d793c15ece5762faff3756c0cf
URL: https://github.com/llvm/llvm-project/commit/09924cbab78016d793c15ece5762faff3756c0cf
DIFF: https://github.com/llvm/llvm-project/commit/09924cbab78016d793c15ece5762faff3756c0cf.diff
LOG: [ARM] Rejig some of the MVE gather/scatter lowering pass. NFC
This adjusts some of how the gather/scatter lowering pass passes around
data and where certain gathers/scatters are created from. It should not
affect code generation on its own, but allows other patches to more
clearly reason about the code.
A number of extra test cases were also added for smaller gathers/
scatters that can be extended, and some of the test comments were
updated.
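For context, the incrementing (QI) path that is now tried first applies to
loops like the following minimal sketch. This IR is illustrative only, not
taken from the patch: a v4i32 gather inside a loop whose vector of offsets
is advanced by a constant splat each iteration, which is the shape
tryCreateIncrementingGatScat looks for.

define void @gather_inc(i32* %base, i32* %dest, i32 %n) {
entry:
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %offs = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %offs.next, %vector.body ]
  ; vector GEP: base plus a per-lane vector of offsets
  %ptrs = getelementptr i32, i32* %base, <4 x i32> %offs
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %dst = getelementptr i32, i32* %dest, i32 %index
  %dst.v = bitcast i32* %dst to <4 x i32>*
  store <4 x i32> %gather, <4 x i32>* %dst.v, align 4
  ; the constant-splat increment of the offsets is what makes the
  ; incrementing (QI) form applicable
  %offs.next = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
  %index.next = add i32 %index, 4
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}

declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)

With this patch, lowerGather/lowerScatter attempt the incrementing form
first, before falling back to the base-plus-offsets and base-only forms,
rather than reaching it from inside tryCreateMaskedGatherOffset.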
Added:
Modified:
llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 4c1d0ba63c4c2..449d8b45bdc0a 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -84,8 +84,8 @@ class MVEGatherScatterLowering : public FunctionPass {
// Check for a getelementptr and deduce base and offsets from it, on success
// returning the base directly and the offsets indirectly using the Offsets
// argument
- Value *checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP,
- IRBuilder<> &Builder);
+ Value *decomposeGEP(Value *&Offsets, FixedVectorType *Ty,
+ GetElementPtrInst *GEP, IRBuilder<> &Builder);
// Compute the scale of this gather/scatter instruction
int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
// If the value is a constant, or derived from constants via additions
@@ -123,8 +123,7 @@ class MVEGatherScatterLowering : public FunctionPass {
// QI gathers and scatters can increment their offsets on their own if
// the increment is a constant value (digit)
- Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *BasePtr,
- Value *Ptr, GetElementPtrInst *GEP,
+ Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *Ptr,
IRBuilder<> &Builder);
// QI gathers/scatters can increment their offsets on their own if the
// increment is a constant value (digit) - this creates a writeback QI
@@ -214,9 +213,10 @@ static bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) {
return true;
}
-Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty,
- GetElementPtrInst *GEP,
- IRBuilder<> &Builder) {
+Value *MVEGatherScatterLowering::decomposeGEP(Value *&Offsets,
+ FixedVectorType *Ty,
+ GetElementPtrInst *GEP,
+ IRBuilder<> &Builder) {
if (!GEP) {
LLVM_DEBUG(dbgs() << "masked gathers/scatters: no getelementpointer "
<< "found\n");
@@ -372,7 +372,10 @@ Value *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
Builder.SetCurrentDebugLocation(I->getDebugLoc());
Instruction *Root = I;
- Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
+
+ Value *Load = tryCreateIncrementingGatScat(I, Ptr, Builder);
+ if (!Load)
+ Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
if (!Load)
Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
if (!Load)
@@ -478,14 +481,9 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
Value *Offsets;
Value *BasePtr =
- checkGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
+ decomposeGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
if (!BasePtr)
return nullptr;
- // Check whether the offset is a constant increment that could be merged into
- // a QI gather
- Value *Load = tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
- if (Load)
- return Load;
int Scale =
computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
@@ -533,7 +531,9 @@ Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
Builder.SetInsertPoint(I);
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
+ Value *Store = tryCreateIncrementingGatScat(I, Ptr, Builder);
+ if (!Store)
+ Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
if (!Store)
Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
if (!Store)
@@ -598,6 +598,7 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
Value *Mask = I->getArgOperand(3);
Type *InputTy = Input->getType();
Type *MemoryTy = InputTy;
+
LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
<< " to base + vector of offsets\n");
// If the input has been truncated, try to integrate that trunc into the
@@ -619,15 +620,10 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
Value *Offsets;
Value *BasePtr =
- checkGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
+ decomposeGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
if (!BasePtr)
return nullptr;
- // Check whether the offset is a constant increment that could be merged into
- // a QI gather
- Value *Store =
- tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
- if (Store)
- return Store;
+
int Scale =
computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
MemoryTy->getScalarSizeInBits());
@@ -652,21 +648,28 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
}
Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
- IntrinsicInst *I, Value *BasePtr, Value *Offsets, GetElementPtrInst *GEP,
- IRBuilder<> &Builder) {
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
FixedVectorType *Ty;
if (I->getIntrinsicID() == Intrinsic::masked_gather)
Ty = cast<FixedVectorType>(I->getType());
else
Ty = cast<FixedVectorType>(I->getArgOperand(0)->getType());
+
// Incrementing gathers only exist for v4i32
- if (Ty->getNumElements() != 4 ||
- Ty->getScalarSizeInBits() != 32)
+ if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
return nullptr;
+ // Incrementing gathers are not beneficial outside of a loop
Loop *L = LI->getLoopFor(I->getParent());
if (L == nullptr)
- // Incrementing gathers are not beneficial outside of a loop
return nullptr;
+
+ // Decompose the GEP into Base and Offsets
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ Value *Offsets;
+ Value *BasePtr = decomposeGEP(Offsets, Ty, GEP, Builder);
+ if (!BasePtr)
+ return nullptr;
+
LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
"wb gather/scatter\n");
@@ -689,6 +692,7 @@ Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
if (Load != nullptr)
return Load;
}
+
LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
"non-wb gather/scatter\n");
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
index 755011ad0d146..973929eabdbe5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -273,6 +273,25 @@ entry:
ret <4 x i32> %ext
}
+define arm_aapcs_vfpcc <4 x i16> @ptr_v4i16(<4 x i16*>* %offptr) {
+; CHECK-LABEL: ptr_v4i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmov r0, r1, d1
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: ldrh r0, [r0]
+; CHECK-NEXT: ldrh r2, [r2]
+; CHECK-NEXT: ldrh r1, [r1]
+; CHECK-NEXT: ldrh r3, [r3]
+; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
+ %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
+ ret <4 x i16> %gather
+}
+
define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v8i16_sext:
; CHECK: @ %bb.0: @ %entry
@@ -369,6 +388,25 @@ entry:
ret <8 x half> %gather
}
+define arm_aapcs_vfpcc <4 x half> @ptr_v4f16(<4 x half*>* %offptr) {
+; CHECK-LABEL: ptr_v4f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmov r0, r1, d2
+; CHECK-NEXT: vldr.16 s8, [r1]
+; CHECK-NEXT: vldr.16 s0, [r0]
+; CHECK-NEXT: vmov r0, r1, d3
+; CHECK-NEXT: vins.f16 s0, s8
+; CHECK-NEXT: vldr.16 s4, [r1]
+; CHECK-NEXT: vldr.16 s1, [r0]
+; CHECK-NEXT: vins.f16 s1, s4
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x half*>, <4 x half*>* %offptr, align 4
+ %gather = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x half> undef)
+ ret <4 x half> %gather
+}
+
; i8
define arm_aapcs_vfpcc <16 x i8> @ptr_i8(<16 x i8*>* %offptr) {
@@ -499,6 +537,40 @@ entry:
ret <8 x i16> %ext
}
+define arm_aapcs_vfpcc <8 x i8> @ptr_v8i8(<8 x i8*>* %offptr) {
+; CHECK-LABEL: ptr_v8i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vmov r1, r2, d0
+; CHECK-NEXT: vmov r3, r12, d1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmov r4, r5, d0
+; CHECK-NEXT: vmov r0, lr, d1
+; CHECK-NEXT: ldrb r1, [r1]
+; CHECK-NEXT: ldrb r6, [r3]
+; CHECK-NEXT: ldrb r2, [r2]
+; CHECK-NEXT: ldrb r4, [r4]
+; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: vmov.16 q0[0], r4
+; CHECK-NEXT: ldrb r0, [r0]
+; CHECK-NEXT: vmov.16 q0[1], r5
+; CHECK-NEXT: ldrb.w r3, [lr]
+; CHECK-NEXT: vmov.16 q0[2], r0
+; CHECK-NEXT: ldrb.w r12, [r12]
+; CHECK-NEXT: vmov.16 q0[3], r3
+; CHECK-NEXT: vmov.16 q0[4], r1
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: vmov.16 q0[6], r6
+; CHECK-NEXT: vmov.16 q0[7], r12
+; CHECK-NEXT: pop {r4, r5, r6, pc}
+entry:
+ %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
+ %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
+ ret <8 x i8> %gather
+}
+
define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v4i8_sext32:
; CHECK: @ %bb.0: @ %entry
@@ -543,6 +615,25 @@ entry:
ret <4 x i32> %ext
}
+define arm_aapcs_vfpcc <4 x i8> @ptr_v4i8(<4 x i8*>* %offptr) {
+; CHECK-LABEL: ptr_v4i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmov r0, r1, d1
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: ldrb r0, [r0]
+; CHECK-NEXT: ldrb r2, [r2]
+; CHECK-NEXT: ldrb r1, [r1]
+; CHECK-NEXT: ldrb r3, [r3]
+; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
+ %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
+ ret <4 x i8> %gather
+}
+
define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_sext32:
; CHECK: @ %bb.0: @ %entry
@@ -623,17 +714,17 @@ define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
-; CHECK-NEXT: .LBB22_1: @ %vector.body.preheader
+; CHECK-NEXT: .LBB26_1: @ %vector.body.preheader
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
-; CHECK-NEXT: .LBB22_2: @ %vector.body
+; CHECK-NEXT: .LBB26_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vptt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q1, [q0]
; CHECK-NEXT: vstrwt.32 q1, [r0], #16
-; CHECK-NEXT: le lr, .LBB22_2
+; CHECK-NEXT: le lr, .LBB26_2
; CHECK-NEXT: @ %bb.3: @ %for.end
; CHECK-NEXT: pop {r7, pc}
entry:
@@ -668,17 +759,17 @@ define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
-; CHECK-NEXT: .LBB23_1: @ %vector.body.preheader
+; CHECK-NEXT: .LBB27_1: @ %vector.body.preheader
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
-; CHECK-NEXT: .LBB23_2: @ %vector.body
+; CHECK-NEXT: .LBB27_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vptt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q1, [q0]
; CHECK-NEXT: vstrwt.32 q1, [r0], #16
-; CHECK-NEXT: le lr, .LBB23_2
+; CHECK-NEXT: le lr, .LBB27_2
; CHECK-NEXT: @ %bb.3: @ %for.end
; CHECK-NEXT: pop {r7, pc}
entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
index 2638bf7760da0..0e80c6241c041 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
@@ -47,7 +47,7 @@ entry:
ret void
}
-; Expand
+; Expand - sext offset
define arm_aapcs_vfpcc void @scaled_v8i16_sext(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: scaled_v8i16_sext:
; CHECK: @ %bb.0: @ %entry
@@ -88,7 +88,7 @@ entry:
ret void
}
-; Expand
+; Expand - sext offset
define arm_aapcs_vfpcc void @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr, <8 x half> %input) {
; CHECK-LABEL: scaled_v8f16_sext:
; CHECK: @ %bb.0: @ %entry
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
index 41508444863bc..248ba3e95de36 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
@@ -65,7 +65,7 @@ entry:
ret void
}
-; Expand
+; Expand - sext offsets
define arm_aapcs_vfpcc void @unscaled_v8i16_sext(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_v8i16_sext:
; CHECK: @ %bb.0: @ %entry
@@ -105,7 +105,7 @@ entry:
ret void
}
-; Expand
+; Expand - sext offsets
define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_sext:
; CHECK: @ %bb.0: @ %entry
@@ -139,7 +139,7 @@ entry:
ret void
}
-; Expand
+; Expand - i32 offsets
define arm_aapcs_vfpcc void @unscaled_v8i16_noext(i8* %base, <8 x i32>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_v8i16_noext:
; CHECK: @ %bb.0: @ %entry
@@ -178,7 +178,7 @@ entry:
ret void
}
-; Expand
+; Expand - i32 offsets
define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_noext:
; CHECK: @ %bb.0: @ %entry
@@ -243,7 +243,7 @@ entry:
ret void
}
-; Expand ?
+; Expand - sext offsets
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
; CHECK: @ %bb.0: @ %entry
@@ -287,7 +287,6 @@ entry:
ret void
}
-; Expand ?
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK: @ %bb.0: @ %entry
@@ -323,7 +322,7 @@ entry:
ret void
}
-; Expand ?
+; Expand - sext offsets
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
; CHECK: @ %bb.0: @ %entry
@@ -360,7 +359,6 @@ entry:
ret void
}
-; Expand ?
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK: @ %bb.0: @ %entry
@@ -391,7 +389,7 @@ entry:
ret void
}
-; Expand ?
+; Expand - sext offsets
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
index 8fdb1a60eba51..f004024a2381b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
@@ -16,7 +16,7 @@ entry:
ret void
}
-; Expand
+; Expanded ?
define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
; CHECK-LABEL: unscaled_v8i8_i8:
; CHECK: @ %bb.0: @ %entry
@@ -79,7 +79,7 @@ entry:
ret void
}
-; Expand
+; Expand - sext offsets
define arm_aapcs_vfpcc void @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_sext:
; CHECK: @ %bb.0: @ %entry
@@ -142,7 +142,7 @@ entry:
ret void
}
-; Expand
+; Expand - sext offsets
define arm_aapcs_vfpcc void @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i16:
; CHECK: @ %bb.0: @ %entry
@@ -205,7 +205,7 @@ entry:
ret void
}
-; Expand
+; Could be manually scaled offsets
define arm_aapcs_vfpcc void @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_scaled:
; CHECK: @ %bb.0: @ %entry
@@ -273,7 +273,7 @@ entry:
ret void
}
-; Expand
+; Expand - large offsets
define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_next:
; CHECK: @ %bb.0: @ %entry
@@ -335,7 +335,6 @@ entry:
ret void
}
-; Expand
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <16 x i8>* %offptr, <16 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK: @ %bb.0: @ %entry
@@ -396,7 +395,6 @@ entry:
ret void
}
-; Expand
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK: @ %bb.0: @ %entry
@@ -441,7 +439,6 @@ entry:
ret void
}
-; Expand
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
index 7847c4d3ac806..b1044f26b94f1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
@@ -251,6 +251,24 @@ entry:
ret void
}
+define arm_aapcs_vfpcc void @ptr_v4i16_dup(i32 %v, <4 x i16*> %offs) {
+; CHECK-LABEL: ptr_v4i16_dup:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r1, r2, d0
+; CHECK-NEXT: vmov r3, r12, d1
+; CHECK-NEXT: strh r0, [r1]
+; CHECK-NEXT: strh r0, [r2]
+; CHECK-NEXT: strh r0, [r3]
+; CHECK-NEXT: strh.w r0, [r12]
+; CHECK-NEXT: bx lr
+entry:
+ %ext = trunc i32 %v to i16
+ %splatinsert = insertelement <4 x i16> poison, i16 %ext, i32 0
+ %splat = shufflevector <4 x i16> %splatinsert, <4 x i16> poison, <4 x i32> zeroinitializer
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %splat, <4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
; Expand
define arm_aapcs_vfpcc void @ptr_v8i16_trunc(<8 x i32> %v, <8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v8i16_trunc:
@@ -314,6 +332,42 @@ entry:
ret void
}
+define arm_aapcs_vfpcc void @ptr_v4f16(<4 x half> %v, <4 x half*>* %offptr) {
+; CHECK-LABEL: ptr_v4f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmovx.f16 s8, s0
+; CHECK-NEXT: vmov r0, r1, d2
+; CHECK-NEXT: vstr.16 s0, [r0]
+; CHECK-NEXT: vstr.16 s8, [r1]
+; CHECK-NEXT: vmov r0, r1, d3
+; CHECK-NEXT: vmovx.f16 s0, s1
+; CHECK-NEXT: vstr.16 s1, [r0]
+; CHECK-NEXT: vstr.16 s0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x half*>, <4 x half*>* %offptr, align 4
+ call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %v, <4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @ptr_v4f16_dup(half %v, <4 x half*> %offs) {
+; CHECK-LABEL: ptr_v4f16_dup:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, r1, d2
+; CHECK-NEXT: vmov r2, r3, d3
+; CHECK-NEXT: vstr.16 s0, [r0]
+; CHECK-NEXT: vstr.16 s0, [r1]
+; CHECK-NEXT: vstr.16 s0, [r2]
+; CHECK-NEXT: vstr.16 s0, [r3]
+; CHECK-NEXT: bx lr
+entry:
+ %splatinsert = insertelement <4 x half> poison, half %v, i32 0
+ %splat = shufflevector <4 x half> %splatinsert, <4 x half> poison, <4 x i32> zeroinitializer
+ call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %splat, <4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
; i8
; Expand.
@@ -473,14 +527,14 @@ define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
-; CHECK-NEXT: .LBB16_1: @ %vector.body
+; CHECK-NEXT: .LBB19_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vptt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
; CHECK-NEXT: vstrwt.32 q1, [q0]
-; CHECK-NEXT: bne .LBB16_1
+; CHECK-NEXT: bne .LBB19_1
; CHECK-NEXT: @ %bb.2: @ %for.end
; CHECK-NEXT: bx lr
entry:
@@ -513,14 +567,14 @@ define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
-; CHECK-NEXT: .LBB17_1: @ %vector.body
+; CHECK-NEXT: .LBB20_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vptt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
; CHECK-NEXT: vstrwt.32 q1, [q0]
-; CHECK-NEXT: bne .LBB17_1
+; CHECK-NEXT: bne .LBB20_1
; CHECK-NEXT: @ %bb.2: @ %for.end
; CHECK-NEXT: bx lr
entry: