[llvm] [AArch64] Allow SVE code generation for fixed-width vectors (PR #67122)

Igor Kirillov via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 13 06:18:18 PDT 2023


https://github.com/igogo-x86 updated https://github.com/llvm/llvm-project/pull/67122

>From 83b84386d546a6e4542b61d7ddb282cd0540a15b Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Fri, 22 Sep 2023 11:45:55 +0000
Subject: [PATCH 1/3] [AArch64] Allow SVE code generation for fixed-width
 vectors

This patch implicitly decreases the cost of masked loads and stores on
fixed-width vectors when their width equals 128 bits. It also allows
generating SVE code with predicate masks that mimic Neon.
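
As a rough illustration (this snippet is not part of the patch; it mirrors
the tests added in the second commit below), this is the kind of 128-bit
fixed-width masked load that can now be selected as an SVE predicated load
when +sve is available, instead of being scalarized:

  target triple = "aarch64-unknown-linux-gnu"

  ; 128-bit fixed-width masked load that previously required
  ; -aarch64-sve-vector-bits-min / useSVEForFixedLengthVectors to use SVE.
  define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) {
    %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
    ret <4 x float> %load
  }

  declare <4 x float> @llvm.masked.load.v4f32(ptr, i32, <4 x i1>, <4 x float>)

With this change, llc -mattr=+sve expands the <4 x i1> mask into an SVE
predicate (ptrue/cmpne) and emits ld1w { z0.s }, p0/z, [x0], as checked by
the new sve-fixed-length-masked-128bit-loads.ll test.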
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   4 +-
 .../AArch64/AArch64TargetTransformInfo.cpp    |   6 +-
 .../AArch64/AArch64TargetTransformInfo.h      |   3 +-
 .../Analysis/CostModel/AArch64/masked_ldst.ll |  14 +-
 .../AArch64/sve-fixed-length-masked-loads.ll  |   2 +-
 .../AArch64/sve-fixed-length-masked-stores.ll |   2 +-
 .../LoopVectorize/AArch64/masked-op-cost.ll   |   7 +-
 .../AArch64/expand-masked-load.ll             | 147 +++++++++++-------
 .../AArch64/expand-masked-store.ll            |  55 +++++--
 9 files changed, 150 insertions(+), 90 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 64d00dafd835b11..0ed5979a6592d82 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5619,9 +5619,7 @@ SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
   assert(LoadNode && "Expected custom lowering of a masked load node");
   EVT VT = Op->getValueType(0);
 
-  if (useSVEForFixedLengthVectorVT(
-          VT,
-          /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
+  if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
     return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
 
   SDValue PassThru = LoadNode->getPassThru();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d8a0e68d7123759..a73e220522e2abc 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2949,7 +2949,11 @@ InstructionCost
 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                       Align Alignment, unsigned AddressSpace,
                                       TTI::TargetCostKind CostKind) {
-  if (useNeonVector(Src))
+  if (!ST->hasSVE())
+    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+                                        CostKind);
+  if (isa<FixedVectorType>(Src) && !ST->useSVEForFixedLengthVectors() &&
+      Src->getPrimitiveSizeInBits() != 128)
     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                         CostKind);
   auto LT = getTypeLegalizationCost(Src);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6baade412c77d2..0501c4228523215 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -254,7 +254,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
       return false;
 
     // For fixed vectors, avoid scalarization if using SVE for them.
-    if (isa<FixedVectorType>(DataType) && !ST->useSVEForFixedLengthVectors())
+    if (isa<FixedVectorType>(DataType) && !ST->useSVEForFixedLengthVectors() &&
+        DataType->getPrimitiveSizeInBits() != 128)
       return false; // Fall back to scalarization of masked operations.
 
     return isElementTypeLegalForScalableVector(DataType->getScalarType());
diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
index 652d36c01a77e71..b9bc1d488375870 100644
--- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
@@ -8,19 +8,19 @@ define void @fixed() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index a63b90856a66d82..5dfce78af18b8e6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -50,7 +50,7 @@ define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
   ret <2 x float> %load
 }
 
-define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
+define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
 ; CHECK-LABEL: masked_load_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl4
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
index 384b2cc6269328a..db0a5b2ba942532 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -48,7 +48,7 @@ define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
   ret void
 }
 
-define void @masked_store_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
+define void @masked_store_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
 ; CHECK-LABEL: masked_store_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
index 37ac570aa06c6ab..5acbc38650522d3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -6,13 +6,12 @@ target triple = "aarch64-unknown-linux-gnu"
 
 ; CHECK-COST: Checking a loop in 'fixed_width'
 ; CHECK-COST: Found an estimated cost of 12 for VF 2 For instruction:   store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Found an estimated cost of 24 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Selecting VF: 1.
+; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Selecting VF: 4.
 
-; We should decide this loop is not worth vectorising using fixed width vectors
 define void @fixed_width(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
 ; CHECK-LABEL: @fixed_width(
-; CHECK-NOT: vector.body
+; CHECK: vector.body
 entry:
   %cmp6 = icmp sgt i64 %n, 0
   br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll
index de89cce9fa9028c..fe14f2ff17f8387 100644
--- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu | FileCheck -check-prefixes=CHECK,CHECK-LE %s
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve  | FileCheck -check-prefixes=CHECK,CHECK-LE %s
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64_be-linux-gnu -data-layout="E-m:o-i64:64-i128:128-n32:64-S128" | FileCheck -check-prefixes=CHECK,CHECK-BE %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu | FileCheck -check-prefixes=CHECK-LE-COMMON,CHECK-LE %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve  | FileCheck -check-prefixes=CHECK-LE-COMMON,CHECK-LE-SVE %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64_be-linux-gnu -data-layout="E-m:o-i64:64-i128:128-n32:64-S128" | FileCheck -check-prefixes=CHECK-BE %s
 
 define <2 x i64> @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %passthru) {
 ; CHECK-LE-LABEL: @scalarize_v2i64(
@@ -28,6 +28,10 @@ define <2 x i64> @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %passthru) {
 ; CHECK-LE-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
 ; CHECK-LE-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
 ;
+; CHECK-LE-SVE-LABEL: @scalarize_v2i64(
+; CHECK-LE-SVE-NEXT:    [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 128, <2 x i1> [[MASK:%.*]], <2 x i64> [[PASSTHRU:%.*]])
+; CHECK-LE-SVE-NEXT:    ret <2 x i64> [[RET]]
+;
 ; CHECK-BE-LABEL: @scalarize_v2i64(
 ; CHECK-BE-NEXT:    [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
 ; CHECK-BE-NEXT:    [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], -2
@@ -57,28 +61,53 @@ define <2 x i64> @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %passthru) {
 }
 
 define <2 x i64> @scalarize_v2i64_ones_mask(ptr %p, <2 x i64> %passthru) {
-; CHECK-LABEL: @scalarize_v2i64_ones_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
-; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+; CHECK-LE-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-LE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
+; CHECK-LE-NEXT:    ret <2 x i64> [[TMP1]]
+;
+; CHECK-LE-SVE-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-LE-SVE-NEXT:    [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> [[PASSTHRU:%.*]])
+; CHECK-LE-SVE-NEXT:    ret <2 x i64> [[RET]]
+;
+; CHECK-BE-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-BE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
+; CHECK-BE-NEXT:    ret <2 x i64> [[TMP1]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %p, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %passthru)
   ret <2 x i64> %ret
 }
 
 define <2 x i64> @scalarize_v2i64_zero_mask(ptr %p, <2 x i64> %passthru) {
-; CHECK-LABEL: @scalarize_v2i64_zero_mask(
-; CHECK-NEXT:    ret <2 x i64> [[PASSTHRU:%.*]]
+; CHECK-LE-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-LE-NEXT:    ret <2 x i64> [[PASSTHRU:%.*]]
+;
+; CHECK-LE-SVE-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-LE-SVE-NEXT:    [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 8, <2 x i1> zeroinitializer, <2 x i64> [[PASSTHRU:%.*]])
+; CHECK-LE-SVE-NEXT:    ret <2 x i64> [[RET]]
+;
+; CHECK-BE-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-BE-NEXT:    ret <2 x i64> [[PASSTHRU:%.*]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %p, i32 8, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
   ret <2 x i64> %ret
 }
 
 define <2 x i64> @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %passthru) {
-; CHECK-LABEL: @scalarize_v2i64_const_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 1
-; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+; CHECK-LE-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-LE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
+; CHECK-LE-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+; CHECK-LE-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 1
+; CHECK-LE-NEXT:    ret <2 x i64> [[TMP3]]
+;
+; CHECK-LE-SVE-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-LE-SVE-NEXT:    [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> [[PASSTHRU:%.*]])
+; CHECK-LE-SVE-NEXT:    ret <2 x i64> [[RET]]
+;
+; CHECK-BE-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-BE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
+; CHECK-BE-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+; CHECK-BE-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 1
+; CHECK-BE-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %p, i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
   ret <2 x i64> %ret
@@ -86,29 +115,29 @@ define <2 x i64> @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %passthru) {
 
 ; This use a byte sized but non power of 2 element size. This used to crash due to bad alignment calculation.
 define <2 x i24> @scalarize_v2i24(ptr %p, <2 x i1> %mask, <2 x i24> %passthru) {
-; CHECK-LE-LABEL: @scalarize_v2i24(
-; CHECK-LE-NEXT:    [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
-; CHECK-LE-NEXT:    [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
-; CHECK-LE-NEXT:    [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
-; CHECK-LE-NEXT:    br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
-; CHECK-LE:       cond.load:
-; CHECK-LE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i24, ptr [[P:%.*]], i32 0
-; CHECK-LE-NEXT:    [[TMP4:%.*]] = load i24, ptr [[TMP3]], align 1
-; CHECK-LE-NEXT:    [[TMP5:%.*]] = insertelement <2 x i24> [[PASSTHRU:%.*]], i24 [[TMP4]], i64 0
-; CHECK-LE-NEXT:    br label [[ELSE]]
-; CHECK-LE:       else:
-; CHECK-LE-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i24> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
-; CHECK-LE-NEXT:    [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
-; CHECK-LE-NEXT:    [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
-; CHECK-LE-NEXT:    br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
-; CHECK-LE:       cond.load1:
-; CHECK-LE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i24, ptr [[P]], i32 1
-; CHECK-LE-NEXT:    [[TMP9:%.*]] = load i24, ptr [[TMP8]], align 1
-; CHECK-LE-NEXT:    [[TMP10:%.*]] = insertelement <2 x i24> [[RES_PHI_ELSE]], i24 [[TMP9]], i64 1
-; CHECK-LE-NEXT:    br label [[ELSE2]]
-; CHECK-LE:       else2:
-; CHECK-LE-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i24> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
-; CHECK-LE-NEXT:    ret <2 x i24> [[RES_PHI_ELSE3]]
+; CHECK-LE-COMMON-LABEL: @scalarize_v2i24(
+; CHECK-LE-COMMON-NEXT:    [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
+; CHECK-LE-COMMON-NEXT:    [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
+; CHECK-LE-COMMON-NEXT:    [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
+; CHECK-LE-COMMON-NEXT:    br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK-LE-COMMON:       cond.load:
+; CHECK-LE-COMMON-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i24, ptr [[P:%.*]], i32 0
+; CHECK-LE-COMMON-NEXT:    [[TMP4:%.*]] = load i24, ptr [[TMP3]], align 1
+; CHECK-LE-COMMON-NEXT:    [[TMP5:%.*]] = insertelement <2 x i24> [[PASSTHRU:%.*]], i24 [[TMP4]], i64 0
+; CHECK-LE-COMMON-NEXT:    br label [[ELSE]]
+; CHECK-LE-COMMON:       else:
+; CHECK-LE-COMMON-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i24> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-LE-COMMON-NEXT:    [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
+; CHECK-LE-COMMON-NEXT:    [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-LE-COMMON-NEXT:    br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK-LE-COMMON:       cond.load1:
+; CHECK-LE-COMMON-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i24, ptr [[P]], i32 1
+; CHECK-LE-COMMON-NEXT:    [[TMP9:%.*]] = load i24, ptr [[TMP8]], align 1
+; CHECK-LE-COMMON-NEXT:    [[TMP10:%.*]] = insertelement <2 x i24> [[RES_PHI_ELSE]], i24 [[TMP9]], i64 1
+; CHECK-LE-COMMON-NEXT:    br label [[ELSE2]]
+; CHECK-LE-COMMON:       else2:
+; CHECK-LE-COMMON-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i24> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-LE-COMMON-NEXT:    ret <2 x i24> [[RES_PHI_ELSE3]]
 ;
 ; CHECK-BE-LABEL: @scalarize_v2i24(
 ; CHECK-BE-NEXT:    [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
@@ -140,29 +169,29 @@ define <2 x i24> @scalarize_v2i24(ptr %p, <2 x i1> %mask, <2 x i24> %passthru) {
 
 ; This use a byte sized but non power of 2 element size. This used to crash due to bad alignment calculation.
 define <2 x i48> @scalarize_v2i48(ptr %p, <2 x i1> %mask, <2 x i48> %passthru) {
-; CHECK-LE-LABEL: @scalarize_v2i48(
-; CHECK-LE-NEXT:    [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
-; CHECK-LE-NEXT:    [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
-; CHECK-LE-NEXT:    [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
-; CHECK-LE-NEXT:    br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
-; CHECK-LE:       cond.load:
-; CHECK-LE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i48, ptr [[P:%.*]], i32 0
-; CHECK-LE-NEXT:    [[TMP4:%.*]] = load i48, ptr [[TMP3]], align 2
-; CHECK-LE-NEXT:    [[TMP5:%.*]] = insertelement <2 x i48> [[PASSTHRU:%.*]], i48 [[TMP4]], i64 0
-; CHECK-LE-NEXT:    br label [[ELSE]]
-; CHECK-LE:       else:
-; CHECK-LE-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i48> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
-; CHECK-LE-NEXT:    [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
-; CHECK-LE-NEXT:    [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
-; CHECK-LE-NEXT:    br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
-; CHECK-LE:       cond.load1:
-; CHECK-LE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i48, ptr [[P]], i32 1
-; CHECK-LE-NEXT:    [[TMP9:%.*]] = load i48, ptr [[TMP8]], align 2
-; CHECK-LE-NEXT:    [[TMP10:%.*]] = insertelement <2 x i48> [[RES_PHI_ELSE]], i48 [[TMP9]], i64 1
-; CHECK-LE-NEXT:    br label [[ELSE2]]
-; CHECK-LE:       else2:
-; CHECK-LE-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i48> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
-; CHECK-LE-NEXT:    ret <2 x i48> [[RES_PHI_ELSE3]]
+; CHECK-LE-COMMON-LABEL: @scalarize_v2i48(
+; CHECK-LE-COMMON-NEXT:    [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
+; CHECK-LE-COMMON-NEXT:    [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
+; CHECK-LE-COMMON-NEXT:    [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
+; CHECK-LE-COMMON-NEXT:    br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK-LE-COMMON:       cond.load:
+; CHECK-LE-COMMON-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i48, ptr [[P:%.*]], i32 0
+; CHECK-LE-COMMON-NEXT:    [[TMP4:%.*]] = load i48, ptr [[TMP3]], align 2
+; CHECK-LE-COMMON-NEXT:    [[TMP5:%.*]] = insertelement <2 x i48> [[PASSTHRU:%.*]], i48 [[TMP4]], i64 0
+; CHECK-LE-COMMON-NEXT:    br label [[ELSE]]
+; CHECK-LE-COMMON:       else:
+; CHECK-LE-COMMON-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i48> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-LE-COMMON-NEXT:    [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
+; CHECK-LE-COMMON-NEXT:    [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-LE-COMMON-NEXT:    br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK-LE-COMMON:       cond.load1:
+; CHECK-LE-COMMON-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i48, ptr [[P]], i32 1
+; CHECK-LE-COMMON-NEXT:    [[TMP9:%.*]] = load i48, ptr [[TMP8]], align 2
+; CHECK-LE-COMMON-NEXT:    [[TMP10:%.*]] = insertelement <2 x i48> [[RES_PHI_ELSE]], i48 [[TMP9]], i64 1
+; CHECK-LE-COMMON-NEXT:    br label [[ELSE2]]
+; CHECK-LE-COMMON:       else2:
+; CHECK-LE-COMMON-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i48> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-LE-COMMON-NEXT:    ret <2 x i48> [[RES_PHI_ELSE3]]
 ;
 ; CHECK-BE-LABEL: @scalarize_v2i48(
 ; CHECK-BE-NEXT:    [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-store.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-store.ll
index 56c0d6e70980315..2f73227f056707c 100644
--- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-store.ll
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-store.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu | FileCheck -check-prefixes=CHECK,CHECK-LE %s
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck -check-prefixes=CHECK,CHECK-LE %s
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64_be-linux-gnu -data-layout="E-m:o-i64:64-i128:128-n32:64-S128" | FileCheck -check-prefixes=CHECK,CHECK-BE %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu | FileCheck -check-prefixes=CHECK-LE %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck -check-prefixes=CHECK-SVE-LE %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64_be-linux-gnu -data-layout="E-m:o-i64:64-i128:128-n32:64-S128" | FileCheck -check-prefixes=CHECK-BE %s
 
 define void @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %data) {
 ; CHECK-LE-LABEL: @scalarize_v2i64(
@@ -26,6 +26,10 @@ define void @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %data) {
 ; CHECK-LE:       else2:
 ; CHECK-LE-NEXT:    ret void
 ;
+; CHECK-SVE-LE-LABEL: @scalarize_v2i64(
+; CHECK-SVE-LE-NEXT:    call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr [[P:%.*]], i32 128, <2 x i1> [[MASK:%.*]])
+; CHECK-SVE-LE-NEXT:    ret void
+;
 ; CHECK-BE-LABEL: @scalarize_v2i64(
 ; CHECK-BE-NEXT:    [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
 ; CHECK-BE-NEXT:    [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], -2
@@ -53,28 +57,53 @@ define void @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %data) {
 }
 
 define void @scalarize_v2i64_ones_mask(ptr %p, <2 x i64> %data) {
-; CHECK-LABEL: @scalarize_v2i64_ones_mask(
-; CHECK-NEXT:    store <2 x i64> [[DATA:%.*]], ptr [[P:%.*]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-LE-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-LE-NEXT:    store <2 x i64> [[DATA:%.*]], ptr [[P:%.*]], align 8
+; CHECK-LE-NEXT:    ret void
+;
+; CHECK-SVE-LE-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-SVE-LE-NEXT:    call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr [[P:%.*]], i32 8, <2 x i1> <i1 true, i1 true>)
+; CHECK-SVE-LE-NEXT:    ret void
+;
+; CHECK-BE-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-BE-NEXT:    store <2 x i64> [[DATA:%.*]], ptr [[P:%.*]], align 8
+; CHECK-BE-NEXT:    ret void
 ;
   call void @llvm.masked.store.v2i64.p0(<2 x i64> %data, ptr %p, i32 8, <2 x i1> <i1 true, i1 true>)
   ret void
 }
 
 define void @scalarize_v2i64_zero_mask(ptr %p, <2 x i64> %data) {
-; CHECK-LABEL: @scalarize_v2i64_zero_mask(
-; CHECK-NEXT:    ret void
+; CHECK-LE-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-LE-NEXT:    ret void
+;
+; CHECK-SVE-LE-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-SVE-LE-NEXT:    call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr [[P:%.*]], i32 8, <2 x i1> zeroinitializer)
+; CHECK-SVE-LE-NEXT:    ret void
+;
+; CHECK-BE-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-BE-NEXT:    ret void
 ;
   call void @llvm.masked.store.v2i64.p0(<2 x i64> %data, ptr %p, i32 8, <2 x i1> <i1 false, i1 false>)
   ret void
 }
 
 define void @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %data) {
-; CHECK-LABEL: @scalarize_v2i64_const_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
-; CHECK-NEXT:    store i64 [[TMP1]], ptr [[TMP2]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-LE-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-LE-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 1
+; CHECK-LE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
+; CHECK-LE-NEXT:    store i64 [[TMP1]], ptr [[TMP2]], align 8
+; CHECK-LE-NEXT:    ret void
+;
+; CHECK-SVE-LE-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-SVE-LE-NEXT:    call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr [[P:%.*]], i32 8, <2 x i1> <i1 false, i1 true>)
+; CHECK-SVE-LE-NEXT:    ret void
+;
+; CHECK-BE-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-BE-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 1
+; CHECK-BE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
+; CHECK-BE-NEXT:    store i64 [[TMP1]], ptr [[TMP2]], align 8
+; CHECK-BE-NEXT:    ret void
 ;
   call void @llvm.masked.store.v2i64.p0(<2 x i64> %data, ptr %p, i32 8, <2 x i1> <i1 false, i1 true>)
   ret void

>From 3ba39241b6290499d7e90757ac5268b9c8cc24d3 Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Wed, 27 Sep 2023 10:53:09 +0000
Subject: [PATCH 2/3] Add tests for masked 128-bit loads and stores

---
 .../sve-fixed-length-masked-128bit-loads.ll   | 73 +++++++++++++++++++
 .../sve-fixed-length-masked-128bit-stores.ll  | 73 +++++++++++++++++++
 2 files changed, 146 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll
new file mode 100644
index 000000000000000..1868e5c922a53a1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; Masked Load
+;
+
+define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
+; CHECK-LABEL: masked_load_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    shl v0.16b, v0.16b, #7
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer)
+  ret <16 x i8> %load
+}
+
+define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) {
+; CHECK-LABEL: masked_load_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %load = call <8 x half> @llvm.masked.load.v8f16(ptr %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer)
+  ret <8 x half> %load
+}
+
+define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) {
+; CHECK-LABEL: masked_load_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    shl v0.4s, v0.4s, #31
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %load
+}
+
+define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) {
+; CHECK-LABEL: masked_load_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    shl v0.2d, v0.2d, #63
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer)
+  ret <2 x double> %load
+}
+
+declare <16 x i8> @llvm.masked.load.v16i8(ptr, i32, <16 x i1>, <16 x i8>)
+declare <8 x half> @llvm.masked.load.v8f16(ptr, i32, <8 x i1>, <8 x half>)
+declare <4 x float> @llvm.masked.load.v4f32(ptr, i32, <4 x i1>, <4 x float>)
+declare <2 x double> @llvm.masked.load.v2f64(ptr, i32, <2 x i1>, <2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll
new file mode 100644
index 000000000000000..bdd6ce0647016cf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; Masked Store
+;
+
+define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
+; CHECK-LABEL: masked_store_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    shl v0.16b, v0.16b, #7
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    st1b { z1.b }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
+  ret void
+}
+
+define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) {
+; CHECK-LABEL: masked_store_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
+  ret void
+}
+
+define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
+; CHECK-LABEL: masked_store_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    shl v0.4s, v0.4s, #31
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
+  ret void
+}
+
+define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) {
+; CHECK-LABEL: masked_store_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    shl v0.2d, v0.2d, #63
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
+  ret void
+}
+
+declare void @llvm.masked.store.v16i8(<16 x i8>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8f16(<8 x half>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v4f32(<4 x float>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f64(<2 x double>, ptr, i32, <2 x i1>)

>From 8c5ab59e40dd86f1d6c89e05c52d97507230769d Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Fri, 13 Oct 2023 13:01:55 +0000
Subject: [PATCH 3/3] Revert changes to the cost model as they lead to a
 regression in the x264 benchmark

---
 .../Target/AArch64/AArch64TargetTransformInfo.cpp  |  6 +-----
 .../test/Analysis/CostModel/AArch64/masked_ldst.ll | 14 +++++++-------
 .../LoopVectorize/AArch64/masked-op-cost.ll        |  7 ++++---
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a73e220522e2abc..d8a0e68d7123759 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2949,11 +2949,7 @@ InstructionCost
 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                       Align Alignment, unsigned AddressSpace,
                                       TTI::TargetCostKind CostKind) {
-  if (!ST->hasSVE())
-    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
-                                        CostKind);
-  if (isa<FixedVectorType>(Src) && !ST->useSVEForFixedLengthVectors() &&
-      Src->getPrimitiveSizeInBits() != 128)
+  if (useNeonVector(Src))
     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                         CostKind);
   auto LT = getTypeLegalizationCost(Src);
diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
index b9bc1d488375870..652d36c01a77e71 100644
--- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
@@ -8,19 +8,19 @@ define void @fixed() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
index 5acbc38650522d3..37ac570aa06c6ab 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -6,12 +6,13 @@ target triple = "aarch64-unknown-linux-gnu"
 
 ; CHECK-COST: Checking a loop in 'fixed_width'
 ; CHECK-COST: Found an estimated cost of 12 for VF 2 For instruction:   store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Selecting VF: 4.
+; CHECK-COST: Found an estimated cost of 24 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Selecting VF: 1.
 
+; We should decide this loop is not worth vectorising using fixed width vectors
 define void @fixed_width(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
 ; CHECK-LABEL: @fixed_width(
-; CHECK: vector.body
+; CHECK-NOT: vector.body
 entry:
   %cmp6 = icmp sgt i64 %n, 0
   br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup


