[llvm] c24cf97 - [ARM][MVE] Enable extending gathers
Anna Welker via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 16 07:25:16 PST 2020
Author: Anna Welker
Date: 2020-01-16T15:24:54Z
New Revision: c24cf97960827fa4993c399dc3f0be5a5376d9e7
URL: https://github.com/llvm/llvm-project/commit/c24cf97960827fa4993c399dc3f0be5a5376d9e7
DIFF: https://github.com/llvm/llvm-project/commit/c24cf97960827fa4993c399dc3f0be5a5376d9e7.diff
LOG: [ARM][MVE] Enable extending gathers
Enables the masked gather pass to create extending masked gathers.
Differential Revision: https://reviews.llvm.org/D72451
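As a minimal sketch of what this enables (based on the zext_unscaled_i8_i16 test updated below; the exact test body in the tree may be wrapped slightly differently), a masked gather that loads <8 x i8> and whose only use is an extend to <8 x i16> is now lowered to a single extending MVE gather instead of being expanded into scalar loads:

define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
entry:
  ; load 16-bit offsets and widen them for the vector GEP
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  ; the gather loads i8 elements; its single zext user becomes the root to replace
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.zext = zext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %gather.zext
}

declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>)

With this patch the zext is folded into the gather, so the function compiles to vldrh.u16 q1, [r1] followed by vldrb.u16 q0, [r0, q1], as the updated CHECK lines below show.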
Added:
Modified:
llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 9f64af02e698..42a0a790ad5a 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -84,7 +84,7 @@ class MVEGatherScatterLowering : public FunctionPass {
bool lowerGather(IntrinsicInst *I);
// Create a gather from a base + vector of offsets
Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr,
- IRBuilder<> Builder);
+ Instruction *&Root, IRBuilder<> Builder);
// Create a gather from a vector of pointers
Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
IRBuilder<> Builder);
@@ -104,9 +104,9 @@ Pass *llvm::createMVEGatherScatterLoweringPass() {
bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements,
unsigned ElemSize,
unsigned Alignment) {
- // Do only allow non-extending gathers for now
- if (((NumElements == 4 && ElemSize == 32) ||
- (NumElements == 8 && ElemSize == 16) ||
+ if (((NumElements == 4 &&
+ (ElemSize == 32 || ElemSize == 16 || ElemSize == 8)) ||
+ (NumElements == 8 && (ElemSize == 16 || ElemSize == 8)) ||
(NumElements == 16 && ElemSize == 8)) &&
ElemSize / 8 <= Alignment)
return true;
@@ -126,9 +126,6 @@ Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr,
<< " from base + vector of offsets\n");
Value *GEPPtr = GEP->getPointerOperand();
if (GEPPtr->getType()->isVectorTy()) {
- LLVM_DEBUG(dbgs() << "masked gathers: gather from a vector of pointers"
- << " hidden behind a getelementptr currently not"
- << " supported. Expanding.\n");
return nullptr;
}
if (GEP->getNumOperands() != 2) {
@@ -194,7 +191,10 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
IRBuilder<> Builder(I->getContext());
Builder.SetInsertPoint(I);
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Builder);
+
+ Instruction *Root = I;
+
+ Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
if (!Load)
Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
if (!Load)
@@ -206,18 +206,24 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
Load = Builder.CreateSelect(Mask, Load, PassThru);
}
+ Root->replaceAllUsesWith(Load);
+ Root->eraseFromParent();
+ if (Root != I)
+ // If this was an extending gather, we need to get rid of the sext/zext
+ // as well as the gather itself
+ I->eraseFromParent();
LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n");
- I->replaceAllUsesWith(Load);
- I->eraseFromParent();
return true;
}
Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
using namespace PatternMatch;
- LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
+
Type *Ty = I->getType();
- if (Ty->getVectorNumElements() != 4)
+
+ LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
+ if (Ty->getVectorNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
// Can't build an intrinsic for this
return nullptr;
Value *Mask = I->getArgOperand(2);
@@ -233,23 +239,55 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
}
Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
- IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+ IntrinsicInst *I, Value *Ptr, Instruction *&Root, IRBuilder<> Builder) {
using namespace PatternMatch;
- Type *Ty = I->getType();
+
+ Type *OriginalTy = I->getType();
+ Type *ResultTy = OriginalTy;
+
+ unsigned Unsigned = 1;
+ // The size of the gather was already checked in isLegalTypeAndAlignment;
+ // if it was not a full vector width, an appropriate extend should follow.
+ auto *Extend = Root;
+ if (OriginalTy->getPrimitiveSizeInBits() < 128) {
+ // Only transform gathers with exactly one use
+ if (!I->hasOneUse())
+ return nullptr;
+
+ // The correct root to replace is not the CallInst itself, but the
+ // instruction which extends it
+ Extend = cast<Instruction>(*I->users().begin());
+ if (isa<SExtInst>(Extend)) {
+ Unsigned = 0;
+ } else if (!isa<ZExtInst>(Extend)) {
+ LLVM_DEBUG(dbgs() << "masked gathers: extend needed but not provided. "
+ << "Expanding\n");
+ return nullptr;
+ }
+ LLVM_DEBUG(dbgs() << "masked gathers: found an extending gather\n");
+ ResultTy = Extend->getType();
+ // The final size of the gather must be a full vector width
+ if (ResultTy->getPrimitiveSizeInBits() != 128) {
+ LLVM_DEBUG(dbgs() << "masked gathers: extending from the wrong type. "
+ << "Expanding\n");
+ return nullptr;
+ }
+ }
+
Value *Offsets;
- Value *BasePtr = checkGEP(Offsets, Ty, Ptr, Builder);
+ Value *BasePtr = checkGEP(Offsets, ResultTy, Ptr, Builder);
if (!BasePtr)
return nullptr;
unsigned Scale;
int GEPElemSize =
BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits();
- int ResultElemSize = Ty->getScalarSizeInBits();
+ int MemoryElemSize = OriginalTy->getScalarSizeInBits();
// This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a
// 8bit, 16bit or 32bit load scaled by 1
- if (GEPElemSize == 32 && ResultElemSize == 32) {
+ if (GEPElemSize == 32 && MemoryElemSize == 32) {
Scale = 2;
- } else if (GEPElemSize == 16 && ResultElemSize == 16) {
+ } else if (GEPElemSize == 16 && MemoryElemSize == 16) {
Scale = 1;
} else if (GEPElemSize == 8) {
Scale = 0;
@@ -258,20 +296,21 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
<< " create masked gather\n");
return nullptr;
}
+ Root = Extend;
Value *Mask = I->getArgOperand(2);
if (!match(Mask, m_One()))
return Builder.CreateIntrinsic(
Intrinsic::arm_mve_vldr_gather_offset_predicated,
- {Ty, BasePtr->getType(), Offsets->getType(), Mask->getType()},
- {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
- Builder.getInt32(Scale), Builder.getInt32(1), Mask});
+ {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()},
+ {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+ Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask});
else
return Builder.CreateIntrinsic(
Intrinsic::arm_mve_vldr_gather_offset,
- {Ty, BasePtr->getType(), Offsets->getType()},
- {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
- Builder.getInt32(Scale), Builder.getInt32(1)});
+ {ResultTy, BasePtr->getType(), Offsets->getType()},
+ {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+ Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
}
bool MVEGatherScatterLowering::runOnFunction(Function &F) {
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
index 5e4e5a394096..7b908576f899 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
@@ -4,38 +4,9 @@
define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: zext_unscaled_i8_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vldrh.u32 q1, [r1, #8]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: ldrb.w r12, [r2]
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: ldrb.w lr, [r3]
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.16 q0[0], r5
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.16 q0[1], lr
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov.16 q0[6], r3
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vldrb.u16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
entry:
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
%offs.zext = zext <8 x i16> %offs to <8 x i32>
@@ -48,38 +19,9 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @sext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: sext_unscaled_i8_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vldrh.u32 q1, [r1, #8]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: ldrb.w r12, [r2]
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: ldrb.w lr, [r3]
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.16 q0[0], r5
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.16 q0[1], lr
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov.16 q0[6], r3
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vldrb.s16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
entry:
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
%offs.zext = zext <8 x i16> %offs to <8 x i32>
@@ -122,38 +64,9 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @zext_unsigned_unscaled_i8_i8(i8* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: zext_unsigned_unscaled_i8_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: ldrb.w r12, [r2]
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: ldrb.w lr, [r3]
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.16 q0[0], r5
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.16 q0[1], lr
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov.16 q0[6], r3
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vldrb.u16 q1, [r1]
+; CHECK-NEXT: vldrb.u16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
entry:
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
%offs.zext = zext <8 x i8> %offs to <8 x i32>
@@ -166,38 +79,9 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @sext_unsigned_unscaled_i8_i8(i8* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: sext_unsigned_unscaled_i8_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: ldrb.w r12, [r2]
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: ldrb.w lr, [r3]
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.16 q0[0], r5
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.16 q0[1], lr
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov.16 q0[6], r3
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vldrb.u16 q1, [r1]
+; CHECK-NEXT: vldrb.s16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
entry:
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
%offs.zext = zext <8 x i8> %offs to <8 x i32>
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
index 1acba1a95c3d..0cbfb8961eb9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
@@ -4,22 +4,8 @@
define arm_aapcs_vfpcc <4 x i32> @zext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: zext_scaled_i16_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -32,22 +18,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: sext_scaled_i16_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -147,22 +119,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: zext_signed_scaled_i16_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -176,22 +134,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: sext_signed_scaled_i16_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -205,22 +149,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: zext_unsigned_scaled_i16_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -234,22 +164,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: sext_unsigned_scaled_i16_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -321,22 +237,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: zext_signed_scaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -350,22 +252,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: sext_signed_scaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -379,22 +267,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: zext_unsigned_scaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -408,22 +282,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: sext_unsigned_scaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
index c862aa9656cb..ac365ab23f7e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
@@ -4,22 +4,8 @@
define arm_aapcs_vfpcc <4 x i32> @zext_unscaled_i8_i32(i8* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: zext_unscaled_i8_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -32,22 +18,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_unscaled_i8_i32(i8* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: sext_unscaled_i8_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -60,21 +32,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_unscaled_i16_i32(i8* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: zext_unscaled_i16_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -88,21 +47,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_unscaled_i16_i32(i8* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: sext_unscaled_i16_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -204,21 +150,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: zext_signed_unscaled_i16_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -233,21 +166,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: sext_signed_unscaled_i16_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -262,21 +182,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: zext_unsigned_unscaled_i16_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -291,21 +198,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: sext_unsigned_unscaled_i16_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -320,22 +214,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: zext_signed_unscaled_i8_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -349,22 +229,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: sext_signed_unscaled_i8_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -378,22 +244,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: zext_unsigned_unscaled_i8_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -407,22 +259,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: sext_unsigned_unscaled_i8_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -496,21 +334,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: zext_signed_unscaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -525,21 +350,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: sext_signed_unscaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -554,21 +366,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: zext_unsigned_unscaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -583,21 +382,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: sext_unsigned_unscaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -612,22 +398,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: zext_signed_unscaled_i8_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -641,22 +413,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: sext_signed_unscaled_i8_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -670,22 +428,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: zext_unsigned_unscaled_i8_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -699,22 +443,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: sext_unsigned_unscaled_i8_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
; CHECK-NEXT: bx lr
entry:
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
index 75f18ea8f081..fd4fbb945ad4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -842,6 +842,53 @@ entry:
ret <4 x i32> %gather
}
+define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(i8* %base, <8 x i8>* %offptr) {
+; CHECK-LABEL: sext_unsigned_unscaled_i8_i8_toi64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: vldrb.u32 q0, [r1]
+; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
+; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s3
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: vmov r5, s1
+; CHECK-NEXT: vmov r4, s7
+; CHECK-NEXT: ldrb.w r12, [r2]
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: ldrb.w lr, [r3]
+; CHECK-NEXT: vmov r3, s6
+; CHECK-NEXT: ldrb r0, [r0]
+; CHECK-NEXT: ldrb r1, [r1]
+; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: vmov.32 q1[0], r0
+; CHECK-NEXT: ldrb r4, [r4]
+; CHECK-NEXT: vmov.32 q1[1], r1
+; CHECK-NEXT: ldrb r2, [r2]
+; CHECK-NEXT: ldrb r3, [r3]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: vmov.32 q0[1], r5
+; CHECK-NEXT: vmov.32 q1[2], r3
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.32 q1[3], r4
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vmovlb.s8 q1, q1
+; CHECK-NEXT: vmovlb.s8 q0, q0
+; CHECK-NEXT: vmovlb.s16 q1, q1
+; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+ %offs.zext = zext <8 x i8> %offs to <8 x i32>
+ %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
+ %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
+ %gather.sext = sext <8 x i8> %gather to <8 x i32>
+ ret <8 x i32> %gather.sext
+}
+
declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)