[llvm] [AArch64] Allow SVE code generation for fixed-width vectors (PR #67122)
Igor Kirillov via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 27 04:01:44 PDT 2023
https://github.com/igogo-x86 updated https://github.com/llvm/llvm-project/pull/67122
From 0c0abe559bfab88c951397f9346a2ed41dd429e8 Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Fri, 22 Sep 2023 11:45:55 +0000
Subject: [PATCH 1/2] [AArch64] Allow SVE code generation for fixed-width
vectors
This patch decreases the cost of masked loads and stores on fixed-width
vectors whose width is exactly 128 bits, by letting them take the SVE
cost-model path instead of falling back to scalarization. It also enables
SVE code generation for such vectors, using SVE predicates that mimic the
corresponding Neon masks.
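For illustration only (not part of the patch; the function name is
hypothetical), a minimal IR sketch of the kind of operation affected,
compiled with -mattr=+sve as in the new tests below:

define <4 x i32> @masked_load_v4i32(ptr %src, <4 x i1> %mask) {
  ; Before: scalarized, with an estimated cost of 24.
  ; After: estimated cost of 1, lowered to an SVE ld1w under a predicate
  ; built from the fixed-width mask (ptrue + cmpne).
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %src, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
  ret <4 x i32> %load
}

declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)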
---
.../Target/AArch64/AArch64ISelLowering.cpp | 4 +-
.../AArch64/AArch64TargetTransformInfo.cpp | 6 +-
.../AArch64/AArch64TargetTransformInfo.h | 3 +-
.../Analysis/CostModel/AArch64/masked_ldst.ll | 14 +-
.../AArch64/sve-fixed-length-masked-loads.ll | 2 +-
.../AArch64/sve-fixed-length-masked-stores.ll | 2 +-
.../LoopVectorize/AArch64/masked-op-cost.ll | 7 +-
.../AArch64/expand-masked-load.ll | 147 +++++++++++-------
.../AArch64/expand-masked-store.ll | 55 +++++--
9 files changed, 150 insertions(+), 90 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3de6bd1ec94a82a..075fbbc3063f948 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5613,9 +5613,7 @@ SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(LoadNode && "Expected custom lowering of a masked load node");
EVT VT = Op->getValueType(0);
- if (useSVEForFixedLengthVectorVT(
- VT,
- /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
+ if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
SDValue PassThru = LoadNode->getPassThru();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e6209ca12a48c31..a6118a49737c494 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2925,7 +2925,11 @@ InstructionCost
AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind) {
- if (useNeonVector(Src))
+ if (!ST->hasSVE())
+ return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ if (isa<FixedVectorType>(Src) && !ST->useSVEForFixedLengthVectors() &&
+ Src->getPrimitiveSizeInBits() != 128)
return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
CostKind);
auto LT = getTypeLegalizationCost(Src);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6baade412c77d2..0501c4228523215 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -254,7 +254,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return false;
// For fixed vectors, avoid scalarization if using SVE for them.
- if (isa<FixedVectorType>(DataType) && !ST->useSVEForFixedLengthVectors())
+ if (isa<FixedVectorType>(DataType) && !ST->useSVEForFixedLengthVectors() &&
+ DataType->getPrimitiveSizeInBits() != 128)
return false; // Fall back to scalarization of masked operations.
return isElementTypeLegalForScalableVector(DataType->getScalarType());
diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
index ee41c69baf2c8b0..f9c81884a57280a 100644
--- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
@@ -8,19 +8,19 @@ define void @fixed() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index a63b90856a66d82..5dfce78af18b8e6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -50,7 +50,7 @@ define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
ret <2 x float> %load
}
-define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
+define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
; CHECK-LABEL: masked_load_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
index 384b2cc6269328a..db0a5b2ba942532 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -48,7 +48,7 @@ define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
ret void
}
-define void @masked_store_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
+define void @masked_store_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
; CHECK-LABEL: masked_store_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
index 37ac570aa06c6ab..5acbc38650522d3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -6,13 +6,12 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK-COST: Checking a loop in 'fixed_width'
; CHECK-COST: Found an estimated cost of 12 for VF 2 For instruction: store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Found an estimated cost of 24 for VF 4 For instruction: store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Selecting VF: 1.
+; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Selecting VF: 4.
-; We should decide this loop is not worth vectorising using fixed width vectors
define void @fixed_width(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
; CHECK-LABEL: @fixed_width(
-; CHECK-NOT: vector.body
+; CHECK: vector.body
entry:
%cmp6 = icmp sgt i64 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll
index de89cce9fa9028c..fe14f2ff17f8387 100644
--- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu | FileCheck -check-prefixes=CHECK,CHECK-LE %s
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck -check-prefixes=CHECK,CHECK-LE %s
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64_be-linux-gnu -data-layout="E-m:o-i64:64-i128:128-n32:64-S128" | FileCheck -check-prefixes=CHECK,CHECK-BE %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu | FileCheck -check-prefixes=CHECK-LE-COMMON,CHECK-LE %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck -check-prefixes=CHECK-LE-COMMON,CHECK-LE-SVE %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64_be-linux-gnu -data-layout="E-m:o-i64:64-i128:128-n32:64-S128" | FileCheck -check-prefixes=CHECK-BE %s
define <2 x i64> @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %passthru) {
; CHECK-LE-LABEL: @scalarize_v2i64(
@@ -28,6 +28,10 @@ define <2 x i64> @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %passthru) {
; CHECK-LE-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-LE-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
;
+; CHECK-LE-SVE-LABEL: @scalarize_v2i64(
+; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 128, <2 x i1> [[MASK:%.*]], <2 x i64> [[PASSTHRU:%.*]])
+; CHECK-LE-SVE-NEXT: ret <2 x i64> [[RET]]
+;
; CHECK-BE-LABEL: @scalarize_v2i64(
; CHECK-BE-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-BE-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], -2
@@ -57,28 +61,53 @@ define <2 x i64> @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %passthru) {
}
define <2 x i64> @scalarize_v2i64_ones_mask(ptr %p, <2 x i64> %passthru) {
-; CHECK-LABEL: @scalarize_v2i64_ones_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
-; CHECK-NEXT: ret <2 x i64> [[TMP1]]
+; CHECK-LE-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-LE-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
+; CHECK-LE-NEXT: ret <2 x i64> [[TMP1]]
+;
+; CHECK-LE-SVE-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> [[PASSTHRU:%.*]])
+; CHECK-LE-SVE-NEXT: ret <2 x i64> [[RET]]
+;
+; CHECK-BE-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-BE-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
+; CHECK-BE-NEXT: ret <2 x i64> [[TMP1]]
;
%ret = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %p, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %passthru)
ret <2 x i64> %ret
}
define <2 x i64> @scalarize_v2i64_zero_mask(ptr %p, <2 x i64> %passthru) {
-; CHECK-LABEL: @scalarize_v2i64_zero_mask(
-; CHECK-NEXT: ret <2 x i64> [[PASSTHRU:%.*]]
+; CHECK-LE-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-LE-NEXT: ret <2 x i64> [[PASSTHRU:%.*]]
+;
+; CHECK-LE-SVE-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 8, <2 x i1> zeroinitializer, <2 x i64> [[PASSTHRU:%.*]])
+; CHECK-LE-SVE-NEXT: ret <2 x i64> [[RET]]
+;
+; CHECK-BE-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-BE-NEXT: ret <2 x i64> [[PASSTHRU:%.*]]
;
%ret = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %p, i32 8, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
ret <2 x i64> %ret
}
define <2 x i64> @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %passthru) {
-; CHECK-LABEL: @scalarize_v2i64_const_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 1
-; CHECK-NEXT: ret <2 x i64> [[TMP3]]
+; CHECK-LE-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-LE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
+; CHECK-LE-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+; CHECK-LE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 1
+; CHECK-LE-NEXT: ret <2 x i64> [[TMP3]]
+;
+; CHECK-LE-SVE-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> [[PASSTHRU:%.*]])
+; CHECK-LE-SVE-NEXT: ret <2 x i64> [[RET]]
+;
+; CHECK-BE-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-BE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
+; CHECK-BE-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+; CHECK-BE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 1
+; CHECK-BE-NEXT: ret <2 x i64> [[TMP3]]
;
%ret = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %p, i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
ret <2 x i64> %ret
@@ -86,29 +115,29 @@ define <2 x i64> @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %passthru) {
; This use a byte sized but non power of 2 element size. This used to crash due to bad alignment calculation.
define <2 x i24> @scalarize_v2i24(ptr %p, <2 x i1> %mask, <2 x i24> %passthru) {
-; CHECK-LE-LABEL: @scalarize_v2i24(
-; CHECK-LE-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
-; CHECK-LE-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
-; CHECK-LE-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
-; CHECK-LE-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
-; CHECK-LE: cond.load:
-; CHECK-LE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i24, ptr [[P:%.*]], i32 0
-; CHECK-LE-NEXT: [[TMP4:%.*]] = load i24, ptr [[TMP3]], align 1
-; CHECK-LE-NEXT: [[TMP5:%.*]] = insertelement <2 x i24> [[PASSTHRU:%.*]], i24 [[TMP4]], i64 0
-; CHECK-LE-NEXT: br label [[ELSE]]
-; CHECK-LE: else:
-; CHECK-LE-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i24> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
-; CHECK-LE-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
-; CHECK-LE-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
-; CHECK-LE-NEXT: br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
-; CHECK-LE: cond.load1:
-; CHECK-LE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i24, ptr [[P]], i32 1
-; CHECK-LE-NEXT: [[TMP9:%.*]] = load i24, ptr [[TMP8]], align 1
-; CHECK-LE-NEXT: [[TMP10:%.*]] = insertelement <2 x i24> [[RES_PHI_ELSE]], i24 [[TMP9]], i64 1
-; CHECK-LE-NEXT: br label [[ELSE2]]
-; CHECK-LE: else2:
-; CHECK-LE-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i24> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
-; CHECK-LE-NEXT: ret <2 x i24> [[RES_PHI_ELSE3]]
+; CHECK-LE-COMMON-LABEL: @scalarize_v2i24(
+; CHECK-LE-COMMON-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
+; CHECK-LE-COMMON-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
+; CHECK-LE-COMMON-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
+; CHECK-LE-COMMON-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK-LE-COMMON: cond.load:
+; CHECK-LE-COMMON-NEXT: [[TMP3:%.*]] = getelementptr inbounds i24, ptr [[P:%.*]], i32 0
+; CHECK-LE-COMMON-NEXT: [[TMP4:%.*]] = load i24, ptr [[TMP3]], align 1
+; CHECK-LE-COMMON-NEXT: [[TMP5:%.*]] = insertelement <2 x i24> [[PASSTHRU:%.*]], i24 [[TMP4]], i64 0
+; CHECK-LE-COMMON-NEXT: br label [[ELSE]]
+; CHECK-LE-COMMON: else:
+; CHECK-LE-COMMON-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i24> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-LE-COMMON-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
+; CHECK-LE-COMMON-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-LE-COMMON-NEXT: br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK-LE-COMMON: cond.load1:
+; CHECK-LE-COMMON-NEXT: [[TMP8:%.*]] = getelementptr inbounds i24, ptr [[P]], i32 1
+; CHECK-LE-COMMON-NEXT: [[TMP9:%.*]] = load i24, ptr [[TMP8]], align 1
+; CHECK-LE-COMMON-NEXT: [[TMP10:%.*]] = insertelement <2 x i24> [[RES_PHI_ELSE]], i24 [[TMP9]], i64 1
+; CHECK-LE-COMMON-NEXT: br label [[ELSE2]]
+; CHECK-LE-COMMON: else2:
+; CHECK-LE-COMMON-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i24> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-LE-COMMON-NEXT: ret <2 x i24> [[RES_PHI_ELSE3]]
;
; CHECK-BE-LABEL: @scalarize_v2i24(
; CHECK-BE-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
@@ -140,29 +169,29 @@ define <2 x i24> @scalarize_v2i24(ptr %p, <2 x i1> %mask, <2 x i24> %passthru) {
; This use a byte sized but non power of 2 element size. This used to crash due to bad alignment calculation.
define <2 x i48> @scalarize_v2i48(ptr %p, <2 x i1> %mask, <2 x i48> %passthru) {
-; CHECK-LE-LABEL: @scalarize_v2i48(
-; CHECK-LE-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
-; CHECK-LE-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
-; CHECK-LE-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
-; CHECK-LE-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
-; CHECK-LE: cond.load:
-; CHECK-LE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i48, ptr [[P:%.*]], i32 0
-; CHECK-LE-NEXT: [[TMP4:%.*]] = load i48, ptr [[TMP3]], align 2
-; CHECK-LE-NEXT: [[TMP5:%.*]] = insertelement <2 x i48> [[PASSTHRU:%.*]], i48 [[TMP4]], i64 0
-; CHECK-LE-NEXT: br label [[ELSE]]
-; CHECK-LE: else:
-; CHECK-LE-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i48> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
-; CHECK-LE-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
-; CHECK-LE-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
-; CHECK-LE-NEXT: br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
-; CHECK-LE: cond.load1:
-; CHECK-LE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i48, ptr [[P]], i32 1
-; CHECK-LE-NEXT: [[TMP9:%.*]] = load i48, ptr [[TMP8]], align 2
-; CHECK-LE-NEXT: [[TMP10:%.*]] = insertelement <2 x i48> [[RES_PHI_ELSE]], i48 [[TMP9]], i64 1
-; CHECK-LE-NEXT: br label [[ELSE2]]
-; CHECK-LE: else2:
-; CHECK-LE-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i48> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
-; CHECK-LE-NEXT: ret <2 x i48> [[RES_PHI_ELSE3]]
+; CHECK-LE-COMMON-LABEL: @scalarize_v2i48(
+; CHECK-LE-COMMON-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
+; CHECK-LE-COMMON-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
+; CHECK-LE-COMMON-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
+; CHECK-LE-COMMON-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK-LE-COMMON: cond.load:
+; CHECK-LE-COMMON-NEXT: [[TMP3:%.*]] = getelementptr inbounds i48, ptr [[P:%.*]], i32 0
+; CHECK-LE-COMMON-NEXT: [[TMP4:%.*]] = load i48, ptr [[TMP3]], align 2
+; CHECK-LE-COMMON-NEXT: [[TMP5:%.*]] = insertelement <2 x i48> [[PASSTHRU:%.*]], i48 [[TMP4]], i64 0
+; CHECK-LE-COMMON-NEXT: br label [[ELSE]]
+; CHECK-LE-COMMON: else:
+; CHECK-LE-COMMON-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i48> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-LE-COMMON-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
+; CHECK-LE-COMMON-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-LE-COMMON-NEXT: br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK-LE-COMMON: cond.load1:
+; CHECK-LE-COMMON-NEXT: [[TMP8:%.*]] = getelementptr inbounds i48, ptr [[P]], i32 1
+; CHECK-LE-COMMON-NEXT: [[TMP9:%.*]] = load i48, ptr [[TMP8]], align 2
+; CHECK-LE-COMMON-NEXT: [[TMP10:%.*]] = insertelement <2 x i48> [[RES_PHI_ELSE]], i48 [[TMP9]], i64 1
+; CHECK-LE-COMMON-NEXT: br label [[ELSE2]]
+; CHECK-LE-COMMON: else2:
+; CHECK-LE-COMMON-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i48> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-LE-COMMON-NEXT: ret <2 x i48> [[RES_PHI_ELSE3]]
;
; CHECK-BE-LABEL: @scalarize_v2i48(
; CHECK-BE-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-store.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-store.ll
index 56c0d6e70980315..2f73227f056707c 100644
--- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-store.ll
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-store.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu | FileCheck -check-prefixes=CHECK,CHECK-LE %s
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck -check-prefixes=CHECK,CHECK-LE %s
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64_be-linux-gnu -data-layout="E-m:o-i64:64-i128:128-n32:64-S128" | FileCheck -check-prefixes=CHECK,CHECK-BE %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu | FileCheck -check-prefixes=CHECK-LE %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck -check-prefixes=CHECK-SVE-LE %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64_be-linux-gnu -data-layout="E-m:o-i64:64-i128:128-n32:64-S128" | FileCheck -check-prefixes=CHECK-BE %s
define void @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %data) {
; CHECK-LE-LABEL: @scalarize_v2i64(
@@ -26,6 +26,10 @@ define void @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %data) {
; CHECK-LE: else2:
; CHECK-LE-NEXT: ret void
;
+; CHECK-SVE-LE-LABEL: @scalarize_v2i64(
+; CHECK-SVE-LE-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr [[P:%.*]], i32 128, <2 x i1> [[MASK:%.*]])
+; CHECK-SVE-LE-NEXT: ret void
+;
; CHECK-BE-LABEL: @scalarize_v2i64(
; CHECK-BE-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-BE-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], -2
@@ -53,28 +57,53 @@ define void @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %data) {
}
define void @scalarize_v2i64_ones_mask(ptr %p, <2 x i64> %data) {
-; CHECK-LABEL: @scalarize_v2i64_ones_mask(
-; CHECK-NEXT: store <2 x i64> [[DATA:%.*]], ptr [[P:%.*]], align 8
-; CHECK-NEXT: ret void
+; CHECK-LE-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-LE-NEXT: store <2 x i64> [[DATA:%.*]], ptr [[P:%.*]], align 8
+; CHECK-LE-NEXT: ret void
+;
+; CHECK-SVE-LE-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-SVE-LE-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr [[P:%.*]], i32 8, <2 x i1> <i1 true, i1 true>)
+; CHECK-SVE-LE-NEXT: ret void
+;
+; CHECK-BE-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-BE-NEXT: store <2 x i64> [[DATA:%.*]], ptr [[P:%.*]], align 8
+; CHECK-BE-NEXT: ret void
;
call void @llvm.masked.store.v2i64.p0(<2 x i64> %data, ptr %p, i32 8, <2 x i1> <i1 true, i1 true>)
ret void
}
define void @scalarize_v2i64_zero_mask(ptr %p, <2 x i64> %data) {
-; CHECK-LABEL: @scalarize_v2i64_zero_mask(
-; CHECK-NEXT: ret void
+; CHECK-LE-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-LE-NEXT: ret void
+;
+; CHECK-SVE-LE-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-SVE-LE-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr [[P:%.*]], i32 8, <2 x i1> zeroinitializer)
+; CHECK-SVE-LE-NEXT: ret void
+;
+; CHECK-BE-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-BE-NEXT: ret void
;
call void @llvm.masked.store.v2i64.p0(<2 x i64> %data, ptr %p, i32 8, <2 x i1> <i1 false, i1 false>)
ret void
}
define void @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %data) {
-; CHECK-LABEL: @scalarize_v2i64_const_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
-; CHECK-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8
-; CHECK-NEXT: ret void
+; CHECK-LE-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-LE-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 1
+; CHECK-LE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
+; CHECK-LE-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8
+; CHECK-LE-NEXT: ret void
+;
+; CHECK-SVE-LE-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-SVE-LE-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr [[P:%.*]], i32 8, <2 x i1> <i1 false, i1 true>)
+; CHECK-SVE-LE-NEXT: ret void
+;
+; CHECK-BE-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-BE-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 1
+; CHECK-BE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
+; CHECK-BE-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8
+; CHECK-BE-NEXT: ret void
;
call void @llvm.masked.store.v2i64.p0(<2 x i64> %data, ptr %p, i32 8, <2 x i1> <i1 false, i1 true>)
ret void
From 3e85634b6abb1f05e4877113b6c8334865250e30 Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Wed, 27 Sep 2023 10:53:09 +0000
Subject: [PATCH 2/2] Add tests for masked 128-bit loads and stores
---
.../sve-fixed-length-masked-128bit-loads.ll | 73 +++++++++++++++++++
.../sve-fixed-length-masked-128bit-stores.ll | 73 +++++++++++++++++++
2 files changed, 146 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll
new file mode 100644
index 000000000000000..1868e5c922a53a1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; Masked Load
+;
+
+define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
+; CHECK-LABEL: masked_load_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer)
+ ret <16 x i8> %load
+}
+
+define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) {
+; CHECK-LABEL: masked_load_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: shl v0.8h, v0.8h, #15
+; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %load = call <8 x half> @llvm.masked.load.v8f16(ptr %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer)
+ ret <8 x half> %load
+}
+
+define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) {
+; CHECK-LABEL: masked_load_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
+ ret <4 x float> %load
+}
+
+define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) {
+; CHECK-LABEL: masked_load_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: shl v0.2d, v0.2d, #63
+; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer)
+ ret <2 x double> %load
+}
+
+declare <16 x i8> @llvm.masked.load.v16i8(ptr, i32, <16 x i1>, <16 x i8>)
+declare <8 x half> @llvm.masked.load.v8f16(ptr, i32, <8 x i1>, <8 x half>)
+declare <4 x float> @llvm.masked.load.v4f32(ptr, i32, <4 x i1>, <4 x float>)
+declare <2 x double> @llvm.masked.load.v2f64(ptr, i32, <2 x i1>, <2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll
new file mode 100644
index 000000000000000..bdd6ce0647016cf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; Masked Store
+;
+
+define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
+; CHECK-LABEL: masked_store_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: st1b { z1.b }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
+ ret void
+}
+
+define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) {
+; CHECK-LABEL: masked_store_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: shl v0.8h, v0.8h, #15
+; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: st1h { z1.h }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
+ ret void
+}
+
+define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
+; CHECK-LABEL: masked_store_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: st1w { z1.s }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
+ ret void
+}
+
+define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) {
+; CHECK-LABEL: masked_store_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: shl v0.2d, v0.2d, #63
+; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: st1d { z1.d }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.store.v16i8(<16 x i8>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8f16(<8 x half>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v4f32(<4 x float>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f64(<2 x double>, ptr, i32, <2 x i1>)