[llvm] [LoopVectorize][AArch64] Add limited support for scalable vectorisation of i1 types (PR #95920)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 18 06:23:33 PDT 2024


llvmbot wrote:


@llvm/pr-subscribers-backend-aarch64

@llvm/pr-subscribers-llvm-analysis

Author: David Sherwood (david-arm)

<details>
<summary>Changes</summary>

Previously, isElementTypeLegalForScalableVector returned false for i1 types, which also prevented vectorisation of loops with i1 reductions. This is overkill: we only need to disable vectorisation of loads and stores of i1 types. I've added i1 as a legal element type, but changed the cost model to return an invalid cost for loads and stores of scalable i1 vectors.
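
For context on why an invalid cost is sufficient here (rather than marking the type illegal): InstructionCost arithmetic propagates invalidity, so a single invalid instruction cost makes the whole vectorisation plan invalid for that VF and the plan is discarded, while other i1 operations (such as the reduction in the new test) keep their finite costs. Below is a minimal sketch of that mechanism; the helper names (memoryOpCost, planIsSelectable) are hypothetical and this is not the actual LoopVectorize code:

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

// Stand-in for the guard this patch adds to the AArch64 TTI memory-op
// cost hooks: i1 loads/stores get an invalid cost.
InstructionCost memoryOpCost(bool ElementTypeIsI1) {
  if (ElementTypeIsI1)
    return InstructionCost::getInvalid();
  return 1; // Some finite, legalisation-based cost.
}

// Stand-in for the vectoriser's plan costing: summing costs propagates
// invalidity, so one invalid instruction cost invalidates the whole plan.
bool planIsSelectable(ArrayRef<InstructionCost> InstCosts) {
  InstructionCost Total = 0;
  for (const InstructionCost &C : InstCosts)
    Total += C; // InstructionCost arithmetic saturates to invalid.
  return Total.isValid();
}
```

This is why the new scalable-reductions.ll test below vectorises with vscale x 8 while the i1 load, store, gather and scatter tests all report invalid costs.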

---

Patch is 22.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95920.diff


9 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+14-6) 
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h (+1-1) 
- (modified) llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll (+5) 
- (modified) llvm/test/Analysis/CostModel/AArch64/sve-gather.ll (+5) 
- (modified) llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll (+4) 
- (modified) llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll (+5) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll (+32) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll (-27) 
- (added) llvm/test/Transforms/LoopVectorize/AArch64/sve-invalid-costs.ll (+27) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9f5756fc7e401..ddb231869df6f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3187,11 +3187,16 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();
 
+  // Return an invalid cost for element types that we are unable to lower.
+  auto *VT = cast<VectorType>(Src);
+  if (VT->getElementType()->isIntegerTy(1))
+    return InstructionCost::getInvalid();
+
   // The code-generator is currently not able to handle scalable vectors
   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
   // it. This change will be removed when code-generation for these types is
   // sufficiently reliable.
-  if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
+  if (VT->getElementCount() == ElementCount::getScalable(1))
     return InstructionCost::getInvalid();
 
   return LT.first;
@@ -3212,16 +3217,17 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();
 
+  // Return an invalid cost for element types that we are unable to lower.
   if (!LT.second.isVector() ||
-      !isElementTypeLegalForScalableVector(VT->getElementType()))
+      !isElementTypeLegalForScalableVector(VT->getElementType()) ||
+      VT->getElementType()->isIntegerTy(1))
     return InstructionCost::getInvalid();
 
   // The code-generator is currently not able to handle scalable vectors
   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
   // it. This change will be removed when code-generation for these types is
   // sufficiently reliable.
-  if (cast<VectorType>(DataTy)->getElementCount() ==
-      ElementCount::getScalable(1))
+  if (VT->getElementCount() == ElementCount::getScalable(1))
     return InstructionCost::getInvalid();
 
   ElementCount LegalVF = LT.second.getVectorElementCount();
@@ -3259,8 +3265,10 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
   // it. This change will be removed when code-generation for these types is
   // sufficiently reliable.
+  // We also cannot handle loads or stores involving scalable vectors of i1.
   if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
-    if (VTy->getElementCount() == ElementCount::getScalable(1))
+    if (VTy->getElementCount() == ElementCount::getScalable(1) ||
+        VTy->getElementType()->isIntegerTy(1))
       return InstructionCost::getInvalid();
 
   // TODO: consider latency as well for TCK_SizeAndLatency.
@@ -4234,4 +4242,4 @@ bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                     C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
 
   return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
-}
\ No newline at end of file
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index feec1a4289c3a..8f6242a5a73a8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -248,7 +248,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
       return true;
 
-    if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
+    if (Ty->isIntegerTy(1) || Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
         Ty->isIntegerTy(32) || Ty->isIntegerTy(64))
       return true;
 
diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
index af41ed92319cf..219fbbaf072be 100644
--- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
@@ -75,6 +75,7 @@ define void @scalable() {
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0(ptr undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i1 = call <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0(ptr undef, i32 16, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 entry:
@@ -103,6 +104,9 @@ entry:
   %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
   %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0(ptr undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
 
+  ; Types that are legal, but for which we have no masked load/store lowering
+  %nxv4i1 = call <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0(ptr undef, i32 16, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef)
+
   ret void
 }
 
@@ -265,6 +269,7 @@ declare <32 x half> @llvm.masked.load.v32f16.p0(ptr, i32, <32 x i1>, <32 x half>
 declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
 declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
 declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
+declare <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i1>)
 
 
 declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll
index c05339d89d35c..3d835779aa882 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll
@@ -12,23 +12,27 @@ define void @masked_gathers(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nx
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-VSCALE-2-LABEL: 'masked_gathers'
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-VSCALE-1-LABEL: 'masked_gathers'
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
   %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
   %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
+  %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer)
   ret void
 }
 
@@ -130,6 +134,7 @@ attributes #3 = { "target-features"="+sve" vscale_range(2, 2) }
 declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
 declare <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)
 declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64(<vscale x 1 x ptr>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)
+declare <vscale x 4 x i1> @llvm.masked.gather.nxv4i1(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i1>)
 declare <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x double>)
 declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x ptr>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
 declare <vscale x 8 x float> @llvm.masked.gather.nxv8f32(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x float>)
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll b/llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll
index 3996eb40a2b21..eead19acfcb3d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll
@@ -9,12 +9,14 @@ define void @scalable_loads() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res.nxv16i8 = load <vscale x 16 x i8>, ptr undef, align 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res.nxv32i8 = load <vscale x 32 x i8>, ptr undef, align 32
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv1i64 = load <vscale x 1 x i64>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %res.nxv4i1 = load <vscale x 4 x i1>, ptr undef, align 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %res.nxv8i8 = load <vscale x 8 x i8>, ptr undef
   %res.nxv16i8 = load <vscale x 16 x i8>, ptr undef
   %res.nxv32i8 = load <vscale x 32 x i8>, ptr undef
   %res.nxv1i64 = load <vscale x 1 x i64>, ptr undef
+  %res.nxv4i1 = load <vscale x 4 x i1>, ptr undef
   ret void
 }
 
@@ -24,12 +26,14 @@ define void @scalable_stores() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 16 x i8> undef, ptr undef, align 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 32 x i8> undef, ptr undef, align 32
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: store <vscale x 1 x i64> undef, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: store <vscale x 4 x i1> undef, ptr undef, align 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   store <vscale x 8 x i8> undef, ptr undef
   store <vscale x 16 x i8> undef, ptr undef
   store <vscale x 32 x i8> undef, ptr undef
   store <vscale x 1 x i64> undef, ptr undef
+  store <vscale x 4 x i1> undef, ptr undef
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll b/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll
index c2309ebbc274a..2a94d63278888 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll
@@ -12,23 +12,27 @@ define void @masked_scatters(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %n
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-VSCALE-2-LABEL: 'masked_scatters'
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-VSCALE-1-LABEL: 'masked_scatters'
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
   call void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
   call void @llvm.masked.scatter.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
+  call void @llvm.masked.scatter.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
   ret void
 }
 
@@ -112,6 +116,7 @@ attributes #2 = { "target-features"="+sve" }
 declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
 declare void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x ptr>, i32, <vscale x 8 x i1>)
 declare void @llvm.masked.scatter.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x ptr>, i32, <vscale x 1 x i1>)
+declare void @llvm.masked.scatter.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
 declare void @llvm.masked.scatter.nxv4f64(<vscale x 4 x double>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
 declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x ptr>, i32, <vscale x 2 x i1>)
 declare void @llvm.masked.scatter.nxv8f32(<vscale x 8 x float>, <vscale x 8 x ptr>, i32, <vscale x 8 x i1>)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index b6691ed6ecfa7..fc7f81fe12807 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -350,6 +350,38 @@ for.cond.cleanup:
   ret void
 }
 
+
+; ADD (with reduction of i1)
+
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define i1 @add_trunc_i32_i1(ptr nocapture %src, i64 %N) {
+; CHECK-LABEL: @add_trunc_i32_i1
+; CHECK: vector.body:
+; CHECK: %[[PHI1:.*]] = phi <vscale x 8 x i1> [ zeroinitializer, %{{.*}} ], [ %20, %vector.body ]
+; CHECK: %[[PHI2:.*]] = phi <vscale x 8 x i1> [ zeroinitializer, %{{.*}} ], [ %21, %vector.body ]
+; CHECK: %[[TRUNC1:.*]] = trunc <vscale x 8 x i32> %{{.*}} to <vscale x 8 x i1>
+; CHECK: %[[TRUNC2:.*]] = trunc <vscale x 8 x i32> %{{.*}} to <vscale x 8 x i1>
+; CHECK: %{{.*}} = xor <vscale x 8 x i1> %[[PHI1]], %[[TRUNC1]]
+; CHECK: %{{.*}} = xor <vscale x 8 x i1> %[[PHI2]], %[[TRUNC2]]
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %red = phi i1 [ 0, %entry ], [ %red.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %iv
+  %load32 = load i32, ptr %arrayidx, align 4
+  %trunc = trunc i32 %load32 to i1
+  %red.next = xor i1 %red, %trunc
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret i1 %red.next
+}
+
+
 ; Reduction cannot be vectorized
 
 ; MUL
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
index 342b37710b653..375bbb74d3d48 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
@@ -80,33 +80,6 @@ for.end:
   ret void
 }
 
-; CHECK-REMARKS: Scalable vectorization is not supported for all element types found in this loop
-define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) {
-; CHECK-LABEL: @uniform_store_i1
-; CHECK: vector.body
-; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <64 x ptr> {{.*}}, i64 1
-; CHECK: %[[ICMP:.*]] = icmp eq <64 x ptr> %[[GEP]], %[[SPLAT:.*]]
-; CHECK: %[[EXTRACT1:.*]] = extractelement ...
[truncated]

``````````
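
As a quick way to probe the new behaviour from C++, the cost hook can be queried directly. getMemoryOpCost, ScalableVectorType::get and TCK_RecipThroughput are real LLVM APIs; the wrapper function itself is a hypothetical sketch:

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical helper: with this patch, the returned cost is invalid for
// a store of <vscale x 4 x i1> on AArch64+SVE, even though i1 is now a
// legal element type for scalable vectors.
InstructionCost scalableI1StoreCost(const TargetTransformInfo &TTI,
                                    LLVMContext &Ctx) {
  auto *NxV4I1 =
      ScalableVectorType::get(Type::getInt1Ty(Ctx), /*MinNumElts=*/4);
  return TTI.getMemoryOpCost(Instruction::Store, NxV4I1, Align(1),
                             /*AddressSpace=*/0,
                             TargetTransformInfo::TCK_RecipThroughput);
}
```

Calling this with and without the patch should match the sve-ldst.ll expectations above: an invalid cost afterwards, where the type was previously rejected outright as illegal.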

</details>


https://github.com/llvm/llvm-project/pull/95920