[llvm] ff9877c - [ARM][MVE] Enable masked scatter
Anna Welker via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 21 01:46:39 PST 2020
Author: Anna Welker
Date: 2020-01-21T09:46:26Z
New Revision: ff9877ce34b86f8f63a773e0e8e383a7ef2bec95
URL: https://github.com/llvm/llvm-project/commit/ff9877ce34b86f8f63a773e0e8e383a7ef2bec95
DIFF: https://github.com/llvm/llvm-project/commit/ff9877ce34b86f8f63a773e0e8e383a7ef2bec95.diff
LOG: [ARM][MVE] Enable masked scatter
Extends the gather/scatter pass in MVEGatherScatterLowering.cpp to
enable the transformation of masked scatters into calls to MVE's masked
scatter intrinsic.
Differential Revision: https://reviews.llvm.org/D72856
Added:
llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
Modified:
llvm/lib/Target/ARM/ARMTargetTransformInfo.h
llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index b9a0a40fdc05..871fe7f79fe6 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -158,7 +158,9 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
bool isLegalMaskedGather(Type *Ty, MaybeAlign Alignment);
- bool isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment) { return false; }
+ bool isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment) {
+ return isLegalMaskedGather(Ty, Alignment);
+ }
int getMemcpyCost(const Instruction *I);
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 42a0a790ad5a..ffa9335361b7 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -80,6 +80,8 @@ class MVEGatherScatterLowering : public FunctionPass {
// returning the base directly and the offsets indirectly using the Offsets
// argument
Value *checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, IRBuilder<> Builder);
+ // Compute the scale of this gather/scatter instruction
+ int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
bool lowerGather(IntrinsicInst *I);
// Create a gather from a base + vector of offsets
@@ -88,6 +90,14 @@ class MVEGatherScatterLowering : public FunctionPass {
// Create a gather from a vector of pointers
Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
IRBuilder<> Builder);
+
+ bool lowerScatter(IntrinsicInst *I);
+ // Create a scatter to a base + vector of offsets
+ Value *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> Builder);
+ // Create a scatter to a vector of pointers
+ Value *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> Builder);
};
} // end anonymous namespace
@@ -110,8 +120,8 @@ bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements,
(NumElements == 16 && ElemSize == 8)) &&
ElemSize / 8 <= Alignment)
return true;
- LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid "
- << "alignment or vector type \n");
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: instruction does not have "
+ << "valid alignment or vector type \n");
return false;
}
@@ -119,17 +129,18 @@ Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr,
IRBuilder<> Builder) {
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (!GEP) {
- LLVM_DEBUG(dbgs() << "masked gathers: no getelementpointer found\n");
+ LLVM_DEBUG(
+ dbgs() << "masked gathers/scatters: no getelementpointer found\n");
return nullptr;
}
- LLVM_DEBUG(dbgs() << "masked gathers: getelementpointer found. Loading"
- << " from base + vector of offsets\n");
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found."
+ << " Looking at intrinsic for base + vector of offsets\n");
Value *GEPPtr = GEP->getPointerOperand();
if (GEPPtr->getType()->isVectorTy()) {
return nullptr;
}
if (GEP->getNumOperands() != 2) {
- LLVM_DEBUG(dbgs() << "masked gathers: getelementptr with too many"
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many"
<< " operands. Expanding.\n");
return nullptr;
}
@@ -140,16 +151,16 @@ Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr,
Offsets = ZextOffs->getOperand(0);
Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty));
// If the offset we found does not have the type the intrinsic expects,
- // i.e., the same type as the gather itself, we need to convert it (only i
- // types) or fall back to expanding the gather
+ // i.e., the same type as the gather (or scatter input) itself, we need to
+ // convert it (only i types) or fall back to expanding the gather/scatter
if (OffsType != Offsets->getType()) {
if (OffsType->getScalarSizeInBits() >
Offsets->getType()->getScalarSizeInBits()) {
- LLVM_DEBUG(dbgs() << "masked gathers: extending offsets\n");
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: extending offsets\n");
Offsets = Builder.CreateZExt(Offsets, OffsType, "");
} else {
- LLVM_DEBUG(dbgs() << "masked gathers: no correct offset type. Can't"
- << " create masked gather\n");
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type."
+ << " Can't create intrinsic.\n");
return nullptr;
}
}
@@ -163,12 +174,28 @@ void MVEGatherScatterLowering::lookThroughBitcast(Value *&Ptr) {
Type *BCTy = BitCast->getType();
Type *BCSrcTy = BitCast->getOperand(0)->getType();
if (BCTy->getVectorNumElements() == BCSrcTy->getVectorNumElements()) {
- LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n");
+ LLVM_DEBUG(
+ dbgs() << "masked gathers/scatters: looking through bitcast\n");
Ptr = BitCast->getOperand(0);
}
}
}
+int MVEGatherScatterLowering::computeScale(unsigned GEPElemSize,
+ unsigned MemoryElemSize) {
+ // This can be a 32bit load/store scaled by 4, a 16bit load/store scaled by 2,
+ // or an 8bit, 16bit or 32bit load/store scaled by 1
+ if (GEPElemSize == 32 && MemoryElemSize == 32)
+ return 2;
+ else if (GEPElemSize == 16 && MemoryElemSize == 16)
+ return 1;
+ else if (GEPElemSize == 8)
+ return 0;
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: incorrect scale. Can't "
+ << "create intrinsic\n");
+ return -1;
+}
+
bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
using namespace PatternMatch;
LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n");
@@ -193,7 +220,6 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
Builder.SetCurrentDebugLocation(I->getDebugLoc());
Instruction *Root = I;
-
Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
if (!Load)
Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
@@ -219,9 +245,7 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
using namespace PatternMatch;
-
Type *Ty = I->getType();
-
LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
if (Ty->getVectorNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
// Can't build an intrinsic for this
@@ -279,23 +303,11 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
if (!BasePtr)
return nullptr;
- unsigned Scale;
- int GEPElemSize =
- BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits();
- int MemoryElemSize = OriginalTy->getScalarSizeInBits();
- // This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a
- // 8bit, 16bit or 32bit load scaled by 1
- if (GEPElemSize == 32 && MemoryElemSize == 32) {
- Scale = 2;
- } else if (GEPElemSize == 16 && MemoryElemSize == 16) {
- Scale = 1;
- } else if (GEPElemSize == 8) {
- Scale = 0;
- } else {
- LLVM_DEBUG(dbgs() << "masked gathers: incorrect scale for load. Can't"
- << " create masked gather\n");
+ int Scale = computeScale(
+ BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+ OriginalTy->getScalarSizeInBits());
+ if (Scale == -1)
return nullptr;
- }
Root = Extend;
Value *Mask = I->getArgOperand(2);
@@ -313,6 +325,117 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
}
+bool MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
+ using namespace PatternMatch;
+ LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
+
+ // @llvm.masked.scatter.*(data, ptrs, alignment, mask)
+ // Attempt to turn the masked scatter in I into a MVE intrinsic
+ // Potentially optimising the addressing modes as we do so.
+ Value *Input = I->getArgOperand(0);
+ Value *Ptr = I->getArgOperand(1);
+ unsigned Alignment = cast<ConstantInt>(I->getArgOperand(2))->getZExtValue();
+ Type *Ty = Input->getType();
+
+ if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(),
+ Ty->getScalarSizeInBits(), Alignment))
+ return false;
+ lookThroughBitcast(Ptr);
+ assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
+
+ IRBuilder<> Builder(I->getContext());
+ Builder.SetInsertPoint(I);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+
+ Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
+ if (!Store)
+ Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
+ if (!Store)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n");
+ I->replaceAllUsesWith(Store);
+ I->eraseFromParent();
+ return true;
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+ using namespace PatternMatch;
+ Value *Input = I->getArgOperand(0);
+ Value *Mask = I->getArgOperand(3);
+ Type *Ty = Input->getType();
+ // Only QR variants allow truncating
+ if (!(Ty->getVectorNumElements() == 4 && Ty->getScalarSizeInBits() == 32)) {
+ // Can't build an intrinsic for this
+ return nullptr;
+ }
+ // int_arm_mve_vstr_scatter_base(_predicated) addr, offset, data(, mask)
+ LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers\n");
+ if (match(Mask, m_One()))
+ return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base,
+ {Ptr->getType(), Input->getType()},
+ {Ptr, Builder.getInt32(0), Input});
+ else
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vstr_scatter_base_predicated,
+ {Ptr->getType(), Input->getType(), Mask->getType()},
+ {Ptr, Builder.getInt32(0), Input, Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+ using namespace PatternMatch;
+ Value *Input = I->getArgOperand(0);
+ Value *Mask = I->getArgOperand(3);
+ Type *InputTy = Input->getType();
+ Type *MemoryTy = InputTy;
+ LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
+ << " to base + vector of offsets\n");
+ // If the input has been truncated, try to integrate that trunc into the
+ // scatter instruction (we don't care about alignment here)
+ if (TruncInst *Trunc = dyn_cast<TruncInst>(Input)) {
+ Value *PreTrunc = Trunc->getOperand(0);
+ Type *PreTruncTy = PreTrunc->getType();
+ if (PreTruncTy->getPrimitiveSizeInBits() == 128) {
+ Input = PreTrunc;
+ InputTy = PreTruncTy;
+ }
+ }
+ if (InputTy->getPrimitiveSizeInBits() != 128) {
+ LLVM_DEBUG(
+ dbgs() << "masked scatters: cannot create scatters for non-standard"
+ << " input types. Expanding.\n");
+ return nullptr;
+ }
+
+ Value *Offsets;
+ Value *BasePtr = checkGEP(Offsets, InputTy, Ptr, Builder);
+ if (!BasePtr)
+ return nullptr;
+ int Scale = computeScale(
+ BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+ MemoryTy->getScalarSizeInBits());
+ if (Scale == -1)
+ return nullptr;
+
+ if (!match(Mask, m_One()))
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vstr_scatter_offset_predicated,
+ {BasePtr->getType(), Offsets->getType(), Input->getType(),
+ Mask->getType()},
+ {BasePtr, Offsets, Input,
+ Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+ Builder.getInt32(Scale), Mask});
+ else
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vstr_scatter_offset,
+ {BasePtr->getType(), Offsets->getType(), Input->getType()},
+ {BasePtr, Offsets, Input,
+ Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+ Builder.getInt32(Scale)});
+}
+
bool MVEGatherScatterLowering::runOnFunction(Function &F) {
if (!EnableMaskedGatherScatters)
return false;
@@ -322,19 +445,22 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
if (!ST->hasMVEIntegerOps())
return false;
SmallVector<IntrinsicInst *, 4> Gathers;
+ SmallVector<IntrinsicInst *, 4> Scatters;
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
Gathers.push_back(II);
+ else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter)
+ Scatters.push_back(II);
}
}
- if (Gathers.empty())
- return false;
-
+ bool Changed = false;
for (IntrinsicInst *I : Gathers)
- lowerGather(I);
+ Changed |= lowerGather(I);
+ for (IntrinsicInst *I : Scatters)
+ Changed |= lowerScatter(I);
- return true;
+ return Changed;
}
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
index a50bd2cc94a0..c9bb6660c856 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
@@ -2,8 +2,8 @@
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - | FileCheck --check-prefix NOGATSCAT %s
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-mve -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck --check-prefix NOMVE %s
-define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr) {
-; NOGATSCAT-LABEL: unscaled_i32_i32:
+define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32_gather(i8* %base, <4 x i32>* %offptr) {
+; NOGATSCAT-LABEL: unscaled_i32_i32_gather:
; NOGATSCAT: @ %bb.0: @ %entry
; NOGATSCAT-NEXT: vldrw.u32 q0, [r1]
; NOGATSCAT-NEXT: vadd.i32 q0, q0, r0
@@ -21,7 +21,7 @@ define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr
; NOGATSCAT-NEXT: vmov.32 q0[3], r2
; NOGATSCAT-NEXT: bx lr
;
-; NOMVE-LABEL: unscaled_i32_i32:
+; NOMVE-LABEL: unscaled_i32_i32_gather:
; NOMVE: @ %bb.0: @ %entry
; NOMVE-NEXT: .save {r4, lr}
; NOMVE-NEXT: push {r4, lr}
@@ -35,6 +35,7 @@ define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr
; NOMVE-NEXT: pop {r4, pc}
+
entry:
%offs = load <4 x i32>, <4 x i32>* %offptr, align 4
%byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs
@@ -44,3 +45,51 @@ entry:
}
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+
+
+define arm_aapcs_vfpcc void @unscaled_i32_i8_scatter(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
+; NOGATSCAT-LABEL: unscaled_i32_i8_scatter:
+; NOGATSCAT: @ %bb.0: @ %entry
+; NOGATSCAT-NEXT: vldrb.u32 q1, [r1]
+; NOGATSCAT-NEXT: vmov r1, s0
+; NOGATSCAT-NEXT: vadd.i32 q1, q1, r0
+; NOGATSCAT-NEXT: vmov r0, s4
+; NOGATSCAT-NEXT: str r1, [r0]
+; NOGATSCAT-NEXT: vmov r0, s5
+; NOGATSCAT-NEXT: vmov r1, s1
+; NOGATSCAT-NEXT: str r1, [r0]
+; NOGATSCAT-NEXT: vmov r0, s6
+; NOGATSCAT-NEXT: vmov r1, s2
+; NOGATSCAT-NEXT: str r1, [r0]
+; NOGATSCAT-NEXT: vmov r0, s7
+; NOGATSCAT-NEXT: vmov r1, s3
+; NOGATSCAT-NEXT: str r1, [r0]
+; NOGATSCAT-NEXT: bx lr
+;
+; NOMVE-LABEL: unscaled_i32_i8_scatter:
+; NOMVE: @ %bb.0: @ %entry
+; NOMVE-NEXT: .save {r4, lr}
+; NOMVE-NEXT: push {r4, lr}
+; NOMVE-NEXT: ldrb.w r12, [r1]
+; NOMVE-NEXT: ldrb.w lr, [r1, #1]
+; NOMVE-NEXT: ldrb r4, [r1, #2]
+; NOMVE-NEXT: ldrb r1, [r1, #3]
+; NOMVE-NEXT: str.w r2, [r0, r12]
+; NOMVE-NEXT: ldr r2, [sp, #8]
+; NOMVE-NEXT: str.w r3, [r0, lr]
+; NOMVE-NEXT: str r2, [r0, r4]
+; NOMVE-NEXT: ldr r2, [sp, #12]
+; NOMVE-NEXT: str r2, [r0, r1]
+; NOMVE-NEXT: pop {r4, pc}
+
+
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.zext = zext <4 x i8> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
new file mode 100644
index 000000000000..82f357d28a69
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s
+
+; VSTRH.16 Qd, [base, offs, uxtw #1]
+define arm_aapcs_vfpcc void @scaled_v8i16_i16(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: scaled_v8i16_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+ %offs.zext = zext <8 x i16> %offs to <8 x i32>
+ %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
+ call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.16 Qd, [base, offs, uxtw #1]
+define arm_aapcs_vfpcc void @scaled_v8f16_i16(i16* %base, <8 x i16>* %offptr, <8 x half> %input) {
+; CHECK-LABEL: scaled_v8f16_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+ %offs.zext = zext <8 x i16> %offs to <8 x i32>
+ %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
+ %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*>
+ call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.16 Qd, [base, offs, uxtw #1]
+define arm_aapcs_vfpcc void @scaled_v8f16_half(half* %base, <8 x i16>* %offptr, <8 x half> %input) {
+; CHECK-LABEL: scaled_v8f16_half:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+ %offs.zext = zext <8 x i16> %offs to <8 x i32>
+ %ptrs = getelementptr inbounds half, half* %base, <8 x i32> %offs.zext
+ call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @scaled_v8i16_sext(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: scaled_v8i16_sext:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.s32 q2, [r1]
+; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT: vmov.u16 r1, q0[0]
+; CHECK-NEXT: vshl.i32 q2, q2, #1
+; CHECK-NEXT: vshl.i32 q1, q1, #1
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u16 r1, q0[1]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u16 r1, q0[4]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u16 r1, q0[6]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+ %offs.sext = sext <8 x i16> %offs to <8 x i32>
+ %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext
+ call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr, <8 x half> %input) {
+; CHECK-LABEL: scaled_v8f16_sext:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vmovx.f16 s12, s0
+; CHECK-NEXT: vshl.i32 q2, q1, #1
+; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vshl.i32 q1, q1, #1
+; CHECK-NEXT: vstr.16 s0, [r1]
+; CHECK-NEXT: vmov r1, s9
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vstr.16 s12, [r1]
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: vmovx.f16 s0, s3
+; CHECK-NEXT: vstr.16 s1, [r1]
+; CHECK-NEXT: vmov r1, s11
+; CHECK-NEXT: vmovx.f16 s8, s1
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vstr.16 s8, [r1]
+; CHECK-NEXT: vstr.16 s2, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmovx.f16 s8, s2
+; CHECK-NEXT: vstr.16 s8, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vstr.16 s3, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vstr.16 s0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+ %offs.sext = sext <8 x i16> %offs to <8 x i32>
+ %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext
+ %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*>
+ call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.16 Qd, [base, zext(offs), uxtw #1]
+define arm_aapcs_vfpcc void @unsigned_scaled_v8i16_i8(i16* %base, <8 x i8>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: unsigned_scaled_v8i16_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u16 q1, [r1]
+; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+ %offs.zext = zext <8 x i8> %offs to <8 x i32>
+ %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
+ call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.16 Qd, [base, zext(offs), uxtw #1]
+define arm_aapcs_vfpcc void @unsigned_scaled_v8f16_i8(i16* %base, <8 x i8>* %offptr, <8 x half> %input) {
+; CHECK-LABEL: unsigned_scaled_v8f16_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u16 q1, [r1]
+; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+ %offs.zext = zext <8 x i8> %offs to <8 x i32>
+ %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
+ %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*>
+ call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @scaled_v8i16_i16_passthru_icmp0(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp0:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vpt.s16 gt, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+ %offs.zext = zext <8 x i16> %offs to <8 x i32>
+ %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
+ %mask = icmp sgt <8 x i16> %offs, zeroinitializer
+ call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
new file mode 100644
index 000000000000..156bf9826952
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
@@ -0,0 +1,460 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s
+
+; VSTRB.16 Qd, [base, offs]
+define arm_aapcs_vfpcc void @ext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: ext_unscaled_i8_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vstrb.16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+ %offs.zext = zext <8 x i16> %offs to <8 x i32>
+ %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
+ %t = trunc <8 x i16> %input to <8 x i8>
+ call void @llvm.masked.scatter.v8i8(<8 x i8> %t, <8 x i8*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRB.16 Qd, [base, offs]
+define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u16 q1, [r1]
+; CHECK-NEXT: vstrb.16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+ %offs.zext = zext <8 x i8> %offs to <8 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
+ %input.trunc = trunc <8 x i16> %input to <8 x i8>
+ call void @llvm.masked.scatter.v8i8(<8 x i8> %input.trunc, <8 x i8*> %byte_ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.16 Qd, [base, offs]
+define arm_aapcs_vfpcc void @unscaled_i16_i16(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: unscaled_i16_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vstrh.16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+ %offs.zext = zext <8 x i16> %offs to <8 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
+ %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
+ call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.16 Qd, [base, offs]
+define arm_aapcs_vfpcc void @unscaled_v8f16_i16(i8* %base, <8 x i16>* %offptr, <8 x half> %input) {
+; CHECK-LABEL: unscaled_v8f16_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vstrh.16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+ %offs.zext = zext <8 x i16> %offs to <8 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
+ %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
+ call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @unscaled_v8i16_sext(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: unscaled_v8i16_sext:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.s32 q2, [r1]
+; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT: vmov.u16 r1, q0[0]
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u16 r1, q0[1]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u16 r1, q0[4]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u16 r1, q0[6]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+ %offs.sext = sext <8 x i16> %offs to <8 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
+ %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
+ call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr, <8 x half> %input) {
+; CHECK-LABEL: unscaled_v8f16_sext:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.s32 q2, [r1]
+; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT: vmovx.f16 s12, s0
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vstr.16 s0, [r1]
+; CHECK-NEXT: vmov r1, s9
+; CHECK-NEXT: vstr.16 s12, [r1]
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: vstr.16 s1, [r1]
+; CHECK-NEXT: vmov r1, s11
+; CHECK-NEXT: vmovx.f16 s8, s1
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vstr.16 s8, [r1]
+; CHECK-NEXT: vstr.16 s2, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmovx.f16 s8, s2
+; CHECK-NEXT: vstr.16 s8, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vstr.16 s3, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmovx.f16 s0, s3
+; CHECK-NEXT: vstr.16 s0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+ %offs.sext = sext <8 x i16> %offs to <8 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
+ %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
+ call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @unscaled_v8i16_noext(i8* %base, <8 x i32>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: unscaled_v8i16_noext:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
+; CHECK-NEXT: vmov.u16 r1, q0[0]
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u16 r1, q0[1]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u16 r1, q0[4]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u16 r1, q0[6]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i32>, <8 x i32>* %offptr, align 4
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs
+ %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
+ call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr, <8 x half> %input) {
+; CHECK-LABEL: unscaled_v8f16_noext:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
+; CHECK-NEXT: vmovx.f16 s12, s0
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vstr.16 s0, [r1]
+; CHECK-NEXT: vmov r1, s9
+; CHECK-NEXT: vstr.16 s12, [r1]
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: vstr.16 s1, [r1]
+; CHECK-NEXT: vmov r1, s11
+; CHECK-NEXT: vmovx.f16 s8, s1
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vstr.16 s8, [r1]
+; CHECK-NEXT: vstr.16 s2, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmovx.f16 s8, s2
+; CHECK-NEXT: vstr.16 s8, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vstr.16 s3, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmovx.f16 s0, s3
+; CHECK-NEXT: vstr.16 s0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i32>, <8 x i32>* %offptr, align 4
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs
+ %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
+ call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.16 Qd, [base, zext(offs)]
+define arm_aapcs_vfpcc void @unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: unsigned_unscaled_i16_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u16 q1, [r1]
+; CHECK-NEXT: vstrh.16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+ %offs.zext = zext <8 x i8> %offs to <8 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
+ %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
+ call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.16 Qd, [base, zext(offs)]
+define arm_aapcs_vfpcc void @unsigned_unscaled_f16_i8(i8* %base, <8 x i8>* %offptr, <8 x half> %input) {
+; CHECK-LABEL: unsigned_unscaled_f16_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u16 q1, [r1]
+; CHECK-NEXT: vstrh.16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+ %offs.zext = zext <8 x i8> %offs to <8 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
+ %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
+ call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand ?
+define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
+; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vldrb.s32 q5, [r1]
+; CHECK-NEXT: vldrb.s32 q4, [r1, #4]
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vadd.i32 q5, q5, r0
+; CHECK-NEXT: vadd.i32 q4, q4, r0
+; CHECK-NEXT: vmov r0, s20
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s21
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s22
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s23
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s17
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: vmov r1, s12
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s19
+; CHECK-NEXT: vmov r1, s14
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+ %offs.sext = sext <8 x i8> %offs to <8 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
+ %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
+ %input.trunc = trunc <8 x i64> %input to <8 x i16>
+ call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand ?
+define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
+; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov.16 q4[0], r3
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: vmov.16 q4[1], r3
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmov.16 q4[2], r3
+; CHECK-NEXT: vmov r3, s6
+; CHECK-NEXT: vmov.16 q4[3], r3
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov.16 q4[4], r3
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: vmov.16 q4[5], r3
+; CHECK-NEXT: vmov r3, s12
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: vmov.16 q4[6], r3
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: vmov.16 q4[7], r2
+; CHECK-NEXT: vstrh.16 q4, [r0, q0]
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+ %offs.zext = zext <8 x i8> %offs to <8 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
+ %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
+ %input.trunc = trunc <8 x i64> %input to <8 x i16>
+ call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand ?
+define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
+; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.s32 q3, [r1]
+; CHECK-NEXT: vldrb.s32 q2, [r1, #4]
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vadd.i32 q3, q3, r0
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s13
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s14
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s15
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov r1, s7
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+ %offs.sext = sext <8 x i8> %offs to <8 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
+ %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
+ %input.trunc = trunc <8 x i32> %input to <8 x i16>
+ call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand ?
+define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
+; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov.16 q2[0], r3
+; CHECK-NEXT: vmov r3, s1
+; CHECK-NEXT: vmov.16 q2[1], r3
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: vmov.16 q2[2], r3
+; CHECK-NEXT: vmov r3, s3
+; CHECK-NEXT: vmov.16 q2[3], r3
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmov.16 q2[4], r3
+; CHECK-NEXT: vmov r3, s5
+; CHECK-NEXT: vmov.16 q2[5], r3
+; CHECK-NEXT: vmov r3, s6
+; CHECK-NEXT: vmov r2, s7
+; CHECK-NEXT: vmov.16 q2[6], r3
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: vmov.16 q2[7], r2
+; CHECK-NEXT: vstrh.16 q2, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+ %offs.zext = zext <8 x i8> %offs to <8 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
+ %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
+ %input.trunc = trunc <8 x i32> %input to <8 x i16>
+ call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand ?
+define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.s32 q2, [r1]
+; CHECK-NEXT: vldrb.s32 q1, [r1, #4]
+; CHECK-NEXT: vmov.u16 r1, q0[0]
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u16 r1, q0[1]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u16 r1, q0[4]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u16 r1, q0[6]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+ %offs.sext = sext <8 x i8> %offs to <8 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
+ %input.trunc = trunc <8 x i16> %input to <8 x i8>
+ call void @llvm.masked.scatter.v8i8(<8 x i8> %input.trunc, <8 x i8*> %byte_ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+declare void @llvm.masked.scatter.v8i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v8i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v8f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
new file mode 100644
index 000000000000..8dc00bcbd76d
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck %s
+
+; VSTRH.32 Qd, [base, offs, uxtw #1]
+define arm_aapcs_vfpcc void @ext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_scaled_i16_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
+ %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
+ %t = trunc <4 x i32> %input to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs, uxtw #2]
+define arm_aapcs_vfpcc void @scaled_i32_i32(i32* %base, <4 x i32>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: scaled_i32_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
+ %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs, uxtw #2]
+define arm_aapcs_vfpcc void @scaled_f32_i32(i32* %base, <4 x i32>* %offptr, <4 x float> %input) {
+; CHECK-LABEL: scaled_f32_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
+ %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
+ %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
+ call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.zext, uxtw #2]
+define arm_aapcs_vfpcc void @unsigned_scaled_b_i32_i16(i32* %base, <4 x i16>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: unsigned_scaled_b_i32_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.zext = zext <4 x i16> %offs to <4 x i32>
+ %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.sext, uxtw #2]
+define arm_aapcs_vfpcc void @signed_scaled_i32_i16(i32* %base, <4 x i16>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: signed_scaled_i32_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.sext = sext <4 x i16> %offs to <4 x i32>
+ %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.zext, uxtw #2]
+define arm_aapcs_vfpcc void @a_unsigned_scaled_f32_i16(i32* %base, <4 x i16>* %offptr, <4 x float> %input) {
+; CHECK-LABEL: a_unsigned_scaled_f32_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.zext = zext <4 x i16> %offs to <4 x i32>
+ %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
+ %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
+ call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.sext, uxtw #2]
+define arm_aapcs_vfpcc void @b_signed_scaled_f32_i16(i32* %base, <4 x i16>* %offptr, <4 x float> %input) {
+; CHECK-LABEL: b_signed_scaled_f32_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.sext = sext <4 x i16> %offs to <4 x i32>
+ %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
+ %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
+ call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.32 Qd, [base, offs.sext, uxtw #1]
+define arm_aapcs_vfpcc void @ext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_signed_scaled_i16_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.sext = sext <4 x i16> %offs to <4 x i32>
+ %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
+ %t = trunc <4 x i32> %input to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.32 Qd, [base, offs.zext, uxtw #1]
+define arm_aapcs_vfpcc void @ext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_unsigned_scaled_i16_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.zext = zext <4 x i16> %offs to <4 x i32>
+ %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
+ %t = trunc <4 x i32> %input to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.zext, uxtw #2]
+define arm_aapcs_vfpcc void @unsigned_scaled_b_i32_i8(i32* %base, <4 x i8>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: unsigned_scaled_b_i32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.zext = zext <4 x i8> %offs to <4 x i32>
+ %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.sext, uxtw #2]
+define arm_aapcs_vfpcc void @signed_scaled_i32_i8(i32* %base, <4 x i8>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: signed_scaled_i32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.sext = sext <4 x i8> %offs to <4 x i32>
+ %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.zext, uxtw #2]
+define arm_aapcs_vfpcc void @a_unsigned_scaled_f32_i8(i32* %base, <4 x i8>* %offptr, <4 x float> %input) {
+; CHECK-LABEL: a_unsigned_scaled_f32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.zext = zext <4 x i8> %offs to <4 x i32>
+ %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
+ %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
+ call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.sext, uxtw #2]
+define arm_aapcs_vfpcc void @b_signed_scaled_f32_i8(i32* %base, <4 x i8>* %offptr, <4 x float> %input) {
+; CHECK-LABEL: b_signed_scaled_f32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.sext = sext <4 x i8> %offs to <4 x i32>
+ %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
+ %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
+ call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.32 Qd, [base, offs.sext, uxtw #1]
+define arm_aapcs_vfpcc void @ext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_signed_scaled_i16_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.sext = sext <4 x i8> %offs to <4 x i32>
+ %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
+ %t = trunc <4 x i32> %input to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.32 Qd, [base, offs.zext, uxtw #1]
+define arm_aapcs_vfpcc void @ext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_unsigned_scaled_i16_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.zext = zext <4 x i8> %offs to <4 x i32>
+ %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
+ %t = trunc <4 x i32> %input to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
new file mode 100644
index 000000000000..48c7b547e1b4
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
@@ -0,0 +1,419 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck %s
+
+; VSTRB.32 Qd, [base, offs]
+define arm_aapcs_vfpcc void @ext_unscaled_i8_i32(i8* %base, <4 x i32>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_unscaled_i8_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vstrb.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
+ %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs
+ %t = trunc <4 x i32> %input to <4 x i8>
+ call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.32 Qd, [base, offs]
+define arm_aapcs_vfpcc void @ext_unscaled_i16_i32(i8* %base, <4 x i32>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_unscaled_i16_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vstrh.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*>
+ %t = trunc <4 x i32> %input to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs]
+define arm_aapcs_vfpcc void @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: unscaled_i32_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs]
+define arm_aapcs_vfpcc void @unscaled_f32_i32(i8* %base, <4 x i32>* %offptr, <4 x float> %input) {
+; CHECK-LABEL: unscaled_f32_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*>
+ call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.zext]
+define arm_aapcs_vfpcc void @unsigned_unscaled_b_i32_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: unsigned_unscaled_b_i32_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.zext = zext <4 x i16> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.sext]
+define arm_aapcs_vfpcc void @signed_unscaled_i32_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: signed_unscaled_i32_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.sext = sext <4 x i16> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.zext]
+define arm_aapcs_vfpcc void @a_unsigned_unscaled_f32_i16(i8* %base, <4 x i16>* %offptr, <4 x float> %input) {
+; CHECK-LABEL: a_unsigned_unscaled_f32_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.zext = zext <4 x i16> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*>
+ call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.sext]
+define arm_aapcs_vfpcc void @b_signed_unscaled_f32_i16(i8* %base, <4 x i16>* %offptr, <4 x float> %input) {
+; CHECK-LABEL: b_signed_unscaled_f32_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.sext = sext <4 x i16> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*>
+ call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.32 Qd, [base, offs.sext]
+define arm_aapcs_vfpcc void @ext_signed_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_signed_unscaled_i16_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vstrh.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.sext = sext <4 x i16> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*>
+ %t = trunc <4 x i32> %input to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRH.32 Qd, [base, offs.zext]
+define arm_aapcs_vfpcc void @ext_unsigned_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_unsigned_unscaled_i16_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vstrh.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.zext = zext <4 x i16> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*>
+ %t = trunc <4 x i32> %input to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRB.32 Qd, [base, offs.sext]
+define arm_aapcs_vfpcc void @ext_signed_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_signed_unscaled_i8_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vstrb.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.sext = sext <4 x i16> %offs to <4 x i32>
+ %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
+ %t = trunc <4 x i32> %input to <4 x i8>
+ call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRB.32 Qd, [base, offs.zext]
+define arm_aapcs_vfpcc void @ext_unsigned_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_unsigned_unscaled_i8_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vstrb.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
+ %offs.zext = zext <4 x i16> %offs to <4 x i32>
+ %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
+ %t = trunc <4 x i32> %input to <4 x i8>
+ call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.zext]
+define arm_aapcs_vfpcc void @unsigned_unscaled_b_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: unsigned_unscaled_b_i32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.zext = zext <4 x i8> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.sext]
+define arm_aapcs_vfpcc void @signed_unscaled_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: signed_unscaled_i32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.sext = sext <4 x i8> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.zext]
+define arm_aapcs_vfpcc void @a_unsigned_unscaled_f32_i8(i8* %base, <4 x i8>* %offptr, <4 x float> %input) {
+; CHECK-LABEL: a_unsigned_unscaled_f32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.zext = zext <4 x i8> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*>
+ call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [base, offs.sext]
+define arm_aapcs_vfpcc void @b_signed_unscaled_f32_i8(i8* %base, <4 x i8>* %offptr, <4 x float> %input) {
+; CHECK-LABEL: b_signed_unscaled_f32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.sext = sext <4 x i8> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*>
+ call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRB.32 Qd, [base, offs.sext]
+define arm_aapcs_vfpcc void @ext_signed_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_signed_unscaled_i8_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vstrb.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.sext = sext <4 x i8> %offs to <4 x i32>
+ %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
+ %t = trunc <4 x i32> %input to <4 x i8>
+ call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRB.32 Qd, [base, offs.zext]
+define arm_aapcs_vfpcc void @ext_unsigned_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_unsigned_unscaled_i8_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vstrb.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.zext = zext <4 x i8> %offs to <4 x i32>
+ %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
+ %t = trunc <4 x i32> %input to <4 x i8>
+ call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) {
+; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s1, s2
+; CHECK-NEXT: vldrb.s32 q2, [r1]
+; CHECK-NEXT: vmov.f32 s2, s4
+; CHECK-NEXT: vmov.f32 s3, s6
+; CHECK-NEXT: vstrw.32 q0, [r0, q2]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.sext = sext <4 x i8> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
+ %input.trunc = trunc <4 x i64> %input to <4 x i32>
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input.trunc, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) {
+; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s1, s2
+; CHECK-NEXT: vldrb.u32 q2, [r1]
+; CHECK-NEXT: vmov.f32 s2, s4
+; CHECK-NEXT: vmov.f32 s3, s6
+; CHECK-NEXT: vstrw.32 q0, [r0, q2]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.zext = zext <4 x i8> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
+ %input.trunc = trunc <4 x i64> %input to <4 x i32>
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input.trunc, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vstrh.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.sext = sext <4 x i8> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*>
+ %input.trunc = trunc <4 x i32> %input to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %input.trunc, <4 x i16*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vstrh.32 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.zext = zext <4 x i8> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
+ %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*>
+ %input.trunc = trunc <4 x i32> %input to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %input.trunc, <4 x i16*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
+; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.sext = sext <4 x i8> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext
+ %input.trunc = trunc <4 x i16> %input to <4 x i8>
+ call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %input.trunc, <4 x i8*> %byte_ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) {
+; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
+ %offs.zext = zext <4 x i8> %offs to <4 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
+ %input.trunc = trunc <4 x i16> %input to <4 x i8>
+ call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %input.trunc, <4 x i8*> %byte_ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
new file mode 100644
index 000000000000..878765c0a862
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
@@ -0,0 +1,540 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s
+
+; VSTRB.8
+define arm_aapcs_vfpcc void @unscaled_v16i8_i8(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
+; CHECK-LABEL: unscaled_v16i8_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q1, [r1]
+; CHECK-NEXT: vstrb.8 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
+ %offs.zext = zext <16 x i8> %offs to <16 x i32>
+ %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
+; CHECK-LABEL: unscaled_v8i8_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q2, [r1]
+; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
+; CHECK-NEXT: vmov.u16 r1, q0[0]
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u16 r1, q0[1]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u16 r1, q0[4]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u16 r1, q0[6]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+ %offs.zext = zext <8 x i8> %offs to <8 x i32>
+ %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
+ call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %input, <8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr, <2 x i8> %input) {
+; CHECK-LABEL: unscaled_v2i8_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrb r2, [r1]
+; CHECK-NEXT: vmov.i32 q1, #0xff
+; CHECK-NEXT: ldrb r1, [r1, #1]
+; CHECK-NEXT: vmov.32 q2[0], r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov.32 q2[2], r1
+; CHECK-NEXT: vand q1, q2, q1
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: strb r2, [r0, r1]
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: strb r2, [r0, r1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <2 x i8>, <2 x i8>* %offptr, align 1
+ %offs.zext = zext <2 x i8> %offs to <2 x i32>
+ %ptrs = getelementptr inbounds i8, i8* %base, <2 x i32> %offs.zext
+ call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %input, <2 x i8*> %ptrs, i32 1, <2 x i1> <i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
+; CHECK-LABEL: unscaled_v16i8_sext:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrb.s32 q4, [r1]
+; CHECK-NEXT: vldrb.s32 q1, [r1, #12]
+; CHECK-NEXT: vldrb.s32 q2, [r1, #8]
+; CHECK-NEXT: vldrb.s32 q3, [r1, #4]
+; CHECK-NEXT: vadd.i32 q4, q4, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q3, q3, r0
+; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: vmov.u8 r1, q0[0]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s17
+; CHECK-NEXT: vmov.u8 r1, q0[1]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: vmov.u8 r1, q0[2]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s19
+; CHECK-NEXT: vmov.u8 r1, q0[3]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s13
+; CHECK-NEXT: vmov.u8 r1, q0[5]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s14
+; CHECK-NEXT: vmov.u8 r1, q0[6]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s15
+; CHECK-NEXT: vmov.u8 r1, q0[7]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov.u8 r1, q0[8]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u8 r1, q0[9]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u8 r1, q0[10]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u8 r1, q0[11]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u8 r1, q0[12]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u8 r1, q0[13]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u8 r1, q0[14]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
+ %offs.sext = sext <16 x i8> %offs to <16 x i32>
+ %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr, <16 x i8> %input) {
+; CHECK-LABEL: unscaled_v16i8_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrh.s32 q4, [r1]
+; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
+; CHECK-NEXT: vldrh.s32 q2, [r1, #16]
+; CHECK-NEXT: vldrh.s32 q3, [r1, #8]
+; CHECK-NEXT: vadd.i32 q4, q4, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q3, q3, r0
+; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: vmov.u8 r1, q0[0]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s17
+; CHECK-NEXT: vmov.u8 r1, q0[1]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: vmov.u8 r1, q0[2]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s19
+; CHECK-NEXT: vmov.u8 r1, q0[3]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s13
+; CHECK-NEXT: vmov.u8 r1, q0[5]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s14
+; CHECK-NEXT: vmov.u8 r1, q0[6]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s15
+; CHECK-NEXT: vmov.u8 r1, q0[7]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov.u8 r1, q0[8]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u8 r1, q0[9]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u8 r1, q0[10]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u8 r1, q0[11]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u8 r1, q0[12]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u8 r1, q0[13]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u8 r1, q0[14]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <16 x i16>, <16 x i16>* %offptr, align 2
+ %offs.sext = sext <16 x i16> %offs to <16 x i32>
+ %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr, <16 x i8> %input) {
+; CHECK-LABEL: unscaled_v16i8_scaled:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrb.u32 q4, [r1]
+; CHECK-NEXT: vldrb.u32 q1, [r1, #12]
+; CHECK-NEXT: vldrb.u32 q2, [r1, #8]
+; CHECK-NEXT: vldrb.u32 q3, [r1, #4]
+; CHECK-NEXT: vshl.i32 q4, q4, #2
+; CHECK-NEXT: vshl.i32 q1, q1, #2
+; CHECK-NEXT: vshl.i32 q2, q2, #2
+; CHECK-NEXT: vshl.i32 q3, q3, #2
+; CHECK-NEXT: vadd.i32 q4, q4, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q3, q3, r0
+; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: vmov.u8 r1, q0[0]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s17
+; CHECK-NEXT: vmov.u8 r1, q0[1]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: vmov.u8 r1, q0[2]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s19
+; CHECK-NEXT: vmov.u8 r1, q0[3]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s13
+; CHECK-NEXT: vmov.u8 r1, q0[5]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s14
+; CHECK-NEXT: vmov.u8 r1, q0[6]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s15
+; CHECK-NEXT: vmov.u8 r1, q0[7]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov.u8 r1, q0[8]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u8 r1, q0[9]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u8 r1, q0[10]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u8 r1, q0[11]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u8 r1, q0[12]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u8 r1, q0[13]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u8 r1, q0[14]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <16 x i8>, <16 x i8>* %offptr, align 4
+ %offs.zext = zext <16 x i8> %offs to <16 x i32>
+ %ptrs32 = getelementptr inbounds i32, i32* %base, <16 x i32> %offs.zext
+ %ptrs = bitcast <16 x i32*> %ptrs32 to <16 x i8*>
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr, <16 x i8> %input) {
+; CHECK-LABEL: unscaled_v16i8_i8_next:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrw.u32 q4, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
+; CHECK-NEXT: vldrw.u32 q2, [r1, #32]
+; CHECK-NEXT: vldrw.u32 q3, [r1, #16]
+; CHECK-NEXT: vadd.i32 q4, q4, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q3, q3, r0
+; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: vmov.u8 r1, q0[0]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s17
+; CHECK-NEXT: vmov.u8 r1, q0[1]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: vmov.u8 r1, q0[2]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s19
+; CHECK-NEXT: vmov.u8 r1, q0[3]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s13
+; CHECK-NEXT: vmov.u8 r1, q0[5]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s14
+; CHECK-NEXT: vmov.u8 r1, q0[6]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s15
+; CHECK-NEXT: vmov.u8 r1, q0[7]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov.u8 r1, q0[8]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u8 r1, q0[9]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u8 r1, q0[10]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u8 r1, q0[11]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u8 r1, q0[12]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u8 r1, q0[13]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u8 r1, q0[14]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <16 x i32>, <16 x i32>* %offptr, align 4
+ %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRB.8 Qd, [base, offs.zext] (input truncated lane by lane first)
+define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <16 x i8>* %offptr, <16 x i64> %input) {
+; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vmov r4, s0
+; CHECK-NEXT: add r3, sp, #40
+; CHECK-NEXT: vmov.8 q5[0], r4
+; CHECK-NEXT: vmov r4, s2
+; CHECK-NEXT: vmov.8 q5[1], r4
+; CHECK-NEXT: vmov r4, s4
+; CHECK-NEXT: vmov.8 q5[2], r4
+; CHECK-NEXT: vmov r4, s6
+; CHECK-NEXT: vmov.8 q5[3], r4
+; CHECK-NEXT: vmov r4, s8
+; CHECK-NEXT: vmov.8 q5[4], r4
+; CHECK-NEXT: vmov r4, s10
+; CHECK-NEXT: vldrw.u32 q0, [r3]
+; CHECK-NEXT: vmov.8 q5[5], r4
+; CHECK-NEXT: vmov r4, s12
+; CHECK-NEXT: add.w lr, sp, #56
+; CHECK-NEXT: vmov.8 q5[6], r4
+; CHECK-NEXT: vmov r4, s14
+; CHECK-NEXT: vmov.8 q5[7], r4
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov.8 q5[8], r3
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: vldrw.u32 q0, [lr]
+; CHECK-NEXT: vmov.8 q5[9], r3
+; CHECK-NEXT: add.w r12, sp, #72
+; CHECK-NEXT: add r2, sp, #88
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vldrw.u32 q4, [r2]
+; CHECK-NEXT: vmov.8 q5[10], r3
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: vldrw.u32 q0, [r12]
+; CHECK-NEXT: vmov.8 q5[11], r3
+; CHECK-NEXT: vmov r2, s18
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov.8 q5[12], r3
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: vmov.8 q5[13], r3
+; CHECK-NEXT: vmov r3, s16
+; CHECK-NEXT: vmov.8 q5[14], r3
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: vmov.8 q5[15], r2
+; CHECK-NEXT: vstrb.8 q5, [r0, q0]
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: pop {r4, pc}
+entry:
+ %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
+ %offs.zext = zext <16 x i8> %offs to <16 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
+ %input.trunc = trunc <16 x i64> %input to <16 x i8>
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRB.8 Qd, [base, offs.zext] (input truncated lane by lane first)
+define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
+; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov.8 q4[0], r3
+; CHECK-NEXT: vmov r3, s1
+; CHECK-NEXT: vmov.8 q4[1], r3
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: vmov.8 q4[2], r3
+; CHECK-NEXT: vmov r3, s3
+; CHECK-NEXT: vmov.8 q4[3], r3
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmov.8 q4[4], r3
+; CHECK-NEXT: vmov r3, s5
+; CHECK-NEXT: vmov.8 q4[5], r3
+; CHECK-NEXT: vmov r3, s6
+; CHECK-NEXT: vmov.8 q4[6], r3
+; CHECK-NEXT: vmov r3, s7
+; CHECK-NEXT: vmov.8 q4[7], r3
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov.8 q4[8], r3
+; CHECK-NEXT: vmov r3, s9
+; CHECK-NEXT: vmov.8 q4[9], r3
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: vmov.8 q4[10], r3
+; CHECK-NEXT: vmov r3, s11
+; CHECK-NEXT: vmov.8 q4[11], r3
+; CHECK-NEXT: vmov r3, s12
+; CHECK-NEXT: vmov.8 q4[12], r3
+; CHECK-NEXT: vmov r3, s13
+; CHECK-NEXT: vmov.8 q4[13], r3
+; CHECK-NEXT: vmov r3, s14
+; CHECK-NEXT: vmov r2, s15
+; CHECK-NEXT: vmov.8 q4[14], r3
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: vmov.8 q4[15], r2
+; CHECK-NEXT: vstrb.8 q4, [r0, q0]
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
+ %offs.zext = zext <16 x i8> %offs to <16 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
+ %input.trunc = trunc <16 x i32> %input to <16 x i8>
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; VSTRB.8 Qd, [base, offs.zext] (input truncated lane by lane first)
+define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) {
+; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: vmov.u16 r2, q1[7]
+; CHECK-NEXT: vmov.8 q2[0], r3
+; CHECK-NEXT: vmov.u16 r3, q0[1]
+; CHECK-NEXT: vmov.8 q2[1], r3
+; CHECK-NEXT: vmov.u16 r3, q0[2]
+; CHECK-NEXT: vmov.8 q2[2], r3
+; CHECK-NEXT: vmov.u16 r3, q0[3]
+; CHECK-NEXT: vmov.8 q2[3], r3
+; CHECK-NEXT: vmov.u16 r3, q0[4]
+; CHECK-NEXT: vmov.8 q2[4], r3
+; CHECK-NEXT: vmov.u16 r3, q0[5]
+; CHECK-NEXT: vmov.8 q2[5], r3
+; CHECK-NEXT: vmov.u16 r3, q0[6]
+; CHECK-NEXT: vmov.8 q2[6], r3
+; CHECK-NEXT: vmov.u16 r3, q0[7]
+; CHECK-NEXT: vmov.8 q2[7], r3
+; CHECK-NEXT: vmov.u16 r3, q1[0]
+; CHECK-NEXT: vmov.8 q2[8], r3
+; CHECK-NEXT: vmov.u16 r3, q1[1]
+; CHECK-NEXT: vmov.8 q2[9], r3
+; CHECK-NEXT: vmov.u16 r3, q1[2]
+; CHECK-NEXT: vmov.8 q2[10], r3
+; CHECK-NEXT: vmov.u16 r3, q1[3]
+; CHECK-NEXT: vmov.8 q2[11], r3
+; CHECK-NEXT: vmov.u16 r3, q1[4]
+; CHECK-NEXT: vmov.8 q2[12], r3
+; CHECK-NEXT: vmov.u16 r3, q1[5]
+; CHECK-NEXT: vmov.8 q2[13], r3
+; CHECK-NEXT: vmov.u16 r3, q1[6]
+; CHECK-NEXT: vmov.8 q2[14], r3
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: vmov.8 q2[15], r2
+; CHECK-NEXT: vstrb.8 q2, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
+ %offs.zext = zext <16 x i8> %offs to <16 x i32>
+ %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
+ %input.trunc = trunc <16 x i16> %input to <16 x i8>
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+
+declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
new file mode 100644
index 000000000000..a317bcd0745a
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
@@ -0,0 +1,633 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedldst -enable-arm-maskedgatscat %s -o - | FileCheck %s
+
+; i32
+
+; Expand
+define arm_aapcs_vfpcc void @ptr_v2i32(<2 x i32> %v, <2 x i32*>* %offptr) {
+; CHECK-LABEL: ptr_v2i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: ldrd r1, r0, [r0]
+; CHECK-NEXT: str r2, [r1]
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <2 x i32*>, <2 x i32*>* %offptr, align 4
+ call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %v, <2 x i32*> %offs, i32 4, <2 x i1> <i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [offs, 0]
+define arm_aapcs_vfpcc void @ptr_v4i32(<4 x i32> %v, <4 x i32*>* %offptr) {
+; CHECK-LABEL: ptr_v4i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i32*>, <4 x i32*>* %offptr, align 4
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v, <4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @ptr_v8i32(<8 x i32> %v, <8 x i32*>* %offptr) {
+; CHECK-LABEL: ptr_v8i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s13
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s14
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s15
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov r1, s7
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4
+ call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %v, <8 x i32*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @ptr_v16i32(<16 x i32> %v, <16 x i32*>* %offptr) {
+; CHECK-LABEL: ptr_v16i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vldrw.u32 q7, [r0]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
+; CHECK-NEXT: vmov r0, s28
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s29
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s30
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s31
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s24
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s25
+; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s26
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s27
+; CHECK-NEXT: vmov r1, s7
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s20
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s21
+; CHECK-NEXT: vmov r1, s9
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s22
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s23
+; CHECK-NEXT: vmov r1, s11
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: vmov r1, s12
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s17
+; CHECK-NEXT: vmov r1, s13
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: vmov r1, s14
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vmov r0, s19
+; CHECK-NEXT: vmov r1, s15
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4
+ call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %v, <16 x i32*> %offs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; f32
+
+; Expand
+define arm_aapcs_vfpcc void @ptr_v2f32(<2 x float> %v, <2 x float*>* %offptr) {
+; CHECK-LABEL: ptr_v2f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r0, [r0]
+; CHECK-NEXT: vstr s0, [r1]
+; CHECK-NEXT: vstr s1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <2 x float*>, <2 x float*>* %offptr, align 4
+ call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %v, <2 x float*> %offs, i32 4, <2 x i1> <i1 true, i1 true>)
+ ret void
+}
+
+; VSTRW.32 Qd, [offs, 0]
+define arm_aapcs_vfpcc void @ptr_v4f32(<4 x float> %v, <4 x float*>* %offptr) {
+; CHECK-LABEL: ptr_v4f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [q1]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x float*>, <4 x float*>* %offptr, align 4
+ call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %v, <4 x float*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @ptr_v8f32(<8 x float> %v, <8 x float*>* %offptr) {
+; CHECK-LABEL: ptr_v8f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vmov r12, s11
+; CHECK-NEXT: vmov lr, s10
+; CHECK-NEXT: vmov r3, s9
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vmov r5, s8
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r4, s9
+; CHECK-NEXT: vstr s0, [r5]
+; CHECK-NEXT: vstr s1, [r4]
+; CHECK-NEXT: vstr s2, [r2]
+; CHECK-NEXT: vstr s3, [r0]
+; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: vstr s5, [r3]
+; CHECK-NEXT: vstr s6, [lr]
+; CHECK-NEXT: vstr s7, [r12]
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+entry:
+ %offs = load <8 x float*>, <8 x float*>* %offptr, align 4
+ call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %v, <8 x float*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; i16
+
+; Expand.
+define arm_aapcs_vfpcc void @ptr_i16(<8 x i16> %v, <8 x i16*>* %offptr) {
+; CHECK-LABEL: ptr_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vmov.u16 r1, q0[0]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u16 r1, q0[1]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u16 r1, q0[4]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u16 r1, q0[6]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
+ call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v, <8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @ptr_v2i16_trunc(<2 x i32> %v, <2 x i16*>* %offptr) {
+; CHECK-LABEL: ptr_v2i16_trunc:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: ldrd r1, r0, [r0]
+; CHECK-NEXT: strh r2, [r1]
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <2 x i16*>, <2 x i16*>* %offptr, align 4
+ %ext = trunc <2 x i32> %v to <2 x i16>
+ call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %ext, <2 x i16*> %offs, i32 2, <2 x i1> <i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @ptr_v4i16_trunc(<4 x i32> %v, <4 x i16*>* %offptr) {
+; CHECK-LABEL: ptr_v4i16_trunc:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
+ %ext = trunc <4 x i32> %v to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %ext, <4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @ptr_v8i16_trunc(<8 x i32> %v, <8 x i16*>* %offptr) {
+; CHECK-LABEL: ptr_v8i16_trunc:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s13
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s14
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s15
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov r1, s7
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
+ %ext = trunc <8 x i32> %v to <8 x i16>
+ call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %ext, <8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; f16
+
+; Expand.
+define arm_aapcs_vfpcc void @ptr_f16(<8 x half> %v, <8 x half*>* %offptr) {
+; CHECK-LABEL: ptr_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vmovx.f16 s12, s0
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vstr.16 s0, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vstr.16 s12, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vstr.16 s1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmovx.f16 s8, s1
+; CHECK-NEXT: vmovx.f16 s0, s3
+; CHECK-NEXT: vstr.16 s8, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vstr.16 s2, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmovx.f16 s8, s2
+; CHECK-NEXT: vstr.16 s8, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vstr.16 s3, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vstr.16 s0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x half*>, <8 x half*>* %offptr, align 4
+ call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %v, <8 x half*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; i8
+
+; Expand.
+define arm_aapcs_vfpcc void @ptr_i8(<16 x i8> %v, <16 x i8*>* %offptr) {
+; CHECK-LABEL: ptr_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrw.u32 q4, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: vmov.u8 r1, q0[0]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s17
+; CHECK-NEXT: vmov.u8 r1, q0[1]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: vmov.u8 r1, q0[2]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s19
+; CHECK-NEXT: vmov.u8 r1, q0[3]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s13
+; CHECK-NEXT: vmov.u8 r1, q0[5]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s14
+; CHECK-NEXT: vmov.u8 r1, q0[6]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s15
+; CHECK-NEXT: vmov.u8 r1, q0[7]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov.u8 r1, q0[8]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u8 r1, q0[9]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u8 r1, q0[10]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u8 r1, q0[11]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u8 r1, q0[12]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u8 r1, q0[13]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u8 r1, q0[14]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <16 x i8*>, <16 x i8*>* %offptr, align 4
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v, <16 x i8*> %offs, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @ptr_v8i8_trunc16(<8 x i16> %v, <8 x i8*>* %offptr) {
+; CHECK-LABEL: ptr_v8i8_trunc16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vmov.u16 r1, q0[0]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u16 r1, q0[1]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u16 r1, q0[4]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u16 r1, q0[6]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
+ %ext = trunc <8 x i16> %v to <8 x i8>
+ call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %ext, <8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @ptr_v4i8_trunc32(<4 x i32> %v, <4 x i8*>* %offptr) {
+; CHECK-LABEL: ptr_v4i8_trunc32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
+ %ext = trunc <4 x i32> %v to <4 x i8>
+ call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %ext, <4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Expand
+define arm_aapcs_vfpcc void @ptr_v8i8_trunc32(<8 x i32> %v, <8 x i8*>* %offptr) {
+; CHECK-LABEL: ptr_v8i8_trunc32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s13
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s14
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s15
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov r1, s7
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
+ %ext = trunc <8 x i32> %v to <8 x i8>
+ call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %ext, <8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; loops
+
+define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
+; CHECK-LABEL: foo_ptr_p_int32_t:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bic r3, r2, #15
+; CHECK-NEXT: cmp r3, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: .LBB16_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vptt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
+; CHECK-NEXT: vstrwt.32 q1, [q0]
+; CHECK-NEXT: bne .LBB16_1
+; CHECK-NEXT: @ %bb.2: @ %for.end
+; CHECK-NEXT: bx lr
+entry:
+ %and = and i32 %n, -16
+ %cmp11 = icmp sgt i32 %and, 0
+ br i1 %cmp11, label %vector.body, label %for.end
+
+vector.body: ; preds = %entry, %vector.body
+ %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds i32*, i32** %src, i32 %index
+ %1 = bitcast i32** %0 to <4 x i32*>*
+ %wide.load = load <4 x i32*>, <4 x i32*>* %1, align 4
+ %2 = icmp ne <4 x i32*> %wide.load, zeroinitializer
+ %3 = getelementptr inbounds i32, i32* %dest, i32 %index
+ %4 = bitcast i32* %3 to <4 x i32>*
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.v4p0i32(<4 x i32>* %4, i32 4, <4 x i1> %2, <4 x i32> undef)
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %wide.masked.load, <4 x i32*> %wide.load, i32 4, <4 x i1> %2)
+ %index.next = add i32 %index, 4
+ %5 = icmp eq i32 %index.next, %n
+ br i1 %5, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body, %entry
+ ret void
+}
+
+define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
+; CHECK-LABEL: foo_ptr_p_float:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bic r3, r2, #15
+; CHECK-NEXT: cmp r3, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: .LBB17_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vptt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
+; CHECK-NEXT: vstrwt.32 q1, [q0]
+; CHECK-NEXT: bne .LBB17_1
+; CHECK-NEXT: @ %bb.2: @ %for.end
+; CHECK-NEXT: bx lr
+entry:
+ %and = and i32 %n, -16
+ %cmp11 = icmp sgt i32 %and, 0
+ br i1 %cmp11, label %vector.body, label %for.end
+
+vector.body: ; preds = %entry, %vector.body
+ %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds float*, float** %src, i32 %index
+ %1 = bitcast float** %0 to <4 x float*>*
+ %wide.load = load <4 x float*>, <4 x float*>* %1, align 4
+ %2 = icmp ne <4 x float*> %wide.load, zeroinitializer
+ %3 = getelementptr inbounds float, float* %dest, i32 %index
+ %4 = bitcast float* %3 to <4 x i32>*
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.v4p0i32(<4 x i32>* %4, i32 4, <4 x i1> %2, <4 x i32> undef)
+ %5 = bitcast <4 x float*> %wide.load to <4 x i32*>
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %wide.masked.load, <4 x i32*> %5, i32 4, <4 x i1> %2)
+ %index.next = add i32 %index, 4
+ %6 = icmp eq i32 %index.next, %n
+ br i1 %6, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body, %entry
+ ret void
+}
+
+; VSTRW.32 Qd, [P, 4]
+define arm_aapcs_vfpcc void @qi4(<4 x i32> %v, <4 x i32*> %p) {
+; CHECK-LABEL: qi4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.i32 q2, #0x10
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vstrw.32 q0, [q1]
+; CHECK-NEXT: bx lr
+entry:
+ %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v, <4 x i32*> %g, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+declare void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16>, <2 x i16*>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32>, <2 x i32*>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float>, <2 x float*>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float>, <8 x float*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>, <16 x i32*>, i32, <16 x i1>)
+declare <4 x i32> @llvm.masked.load.v4i32.v4p0i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
More information about the llvm-commits
mailing list