[llvm] [AMDGPU][GlobalIsel] Introduce isRegType to check for legal types, instead of checking bit width. (PR #68189)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 9 13:10:38 PST 2024
https://github.com/sstipanovic updated https://github.com/llvm/llvm-project/pull/68189
>From f2c42115ee62140f22ea34683b885aa901ae3cfa Mon Sep 17 00:00:00 2001
From: Stefan Stipanovic <stefan.stipanovic at syrmia.com>
Date: Fri, 9 Feb 2024 22:10:19 +0100
Subject: [PATCH] [AMDGPU][GlobalISel] Introduce isRegisterClassType to check
for legal types, instead of checking bit width.
Make v13s32, v14s32, v15s32 and v7s64 illegal for bitcast first.
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 170 +++++++++++-------
.../AMDGPU/GlobalISel/bitcast_38_i16.ll | 85 +++++++++
.../AMDGPU/GlobalISel/extractelement.ll | 126 +++++++++++++
.../GlobalISel/legalize-build-vector.mir | 123 -------------
4 files changed, 317 insertions(+), 187 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 17ffb7ec988f0a..d957cda281f2b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -239,6 +239,7 @@ static bool isRegisterVectorType(LLT Ty) {
EltSize == 128 || EltSize == 256;
}
+// TODO: replace all uses of isRegisterType with isRegisterClassType
static bool isRegisterType(LLT Ty) {
if (!isRegisterSize(Ty.getSizeInBits()))
return false;
@@ -258,6 +259,8 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) {
}
// RegisterType that doesn't have a corresponding RegClass.
+// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
+// should be removed.
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
LLT Ty = Query.Types[TypeIdx];
@@ -276,6 +279,95 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
};
}
+static const LLT S1 = LLT::scalar(1);
+static const LLT S8 = LLT::scalar(8);
+static const LLT S16 = LLT::scalar(16);
+static const LLT S32 = LLT::scalar(32);
+static const LLT S64 = LLT::scalar(64);
+static const LLT S96 = LLT::scalar(96);
+static const LLT S128 = LLT::scalar(128);
+static const LLT S160 = LLT::scalar(160);
+static const LLT S224 = LLT::scalar(224);
+static const LLT S256 = LLT::scalar(256);
+static const LLT S512 = LLT::scalar(512);
+static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
+
+static const LLT V2S8 = LLT::fixed_vector(2, 8);
+static const LLT V2S16 = LLT::fixed_vector(2, 16);
+static const LLT V4S16 = LLT::fixed_vector(4, 16);
+static const LLT V6S16 = LLT::fixed_vector(6, 16);
+static const LLT V8S16 = LLT::fixed_vector(8, 16);
+static const LLT V10S16 = LLT::fixed_vector(10, 16);
+static const LLT V12S16 = LLT::fixed_vector(12, 16);
+static const LLT V16S16 = LLT::fixed_vector(16, 16);
+
+static const LLT V2S32 = LLT::fixed_vector(2, 32);
+static const LLT V3S32 = LLT::fixed_vector(3, 32);
+static const LLT V4S32 = LLT::fixed_vector(4, 32);
+static const LLT V5S32 = LLT::fixed_vector(5, 32);
+static const LLT V6S32 = LLT::fixed_vector(6, 32);
+static const LLT V7S32 = LLT::fixed_vector(7, 32);
+static const LLT V8S32 = LLT::fixed_vector(8, 32);
+static const LLT V9S32 = LLT::fixed_vector(9, 32);
+static const LLT V10S32 = LLT::fixed_vector(10, 32);
+static const LLT V11S32 = LLT::fixed_vector(11, 32);
+static const LLT V12S32 = LLT::fixed_vector(12, 32);
+static const LLT V16S32 = LLT::fixed_vector(16, 32);
+static const LLT V32S32 = LLT::fixed_vector(32, 32);
+
+static const LLT V2S64 = LLT::fixed_vector(2, 64);
+static const LLT V3S64 = LLT::fixed_vector(3, 64);
+static const LLT V4S64 = LLT::fixed_vector(4, 64);
+static const LLT V5S64 = LLT::fixed_vector(5, 64);
+static const LLT V6S64 = LLT::fixed_vector(6, 64);
+static const LLT V7S64 = LLT::fixed_vector(7, 64);
+static const LLT V8S64 = LLT::fixed_vector(8, 64);
+static const LLT V16S64 = LLT::fixed_vector(16, 64);
+
+static const LLT V2S128 = LLT::fixed_vector(2, 128);
+static const LLT V4S128 = LLT::fixed_vector(4, 128);
+
+static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
+ S160, S224, S256, S512};
+
+static std::initializer_list<LLT> AllS16Vectors{
+ V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
+
+static std::initializer_list<LLT> AllS32Vectors = {
+ V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
+ V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
+
+static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
+ V6S64, V7S64, V8S64, V16S64};
+
+static bool typeInSet(LLT Ty, std::initializer_list<LLT> TypesInit) {
+ SmallVector<LLT, 4> Types = TypesInit;
+ return llvm::is_contained(Types, Ty);
+}
+
+static LLT GetAddrSpacePtr(unsigned AS, const GCNTargetMachine &TM) {
+ return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
+}
+
+// Checks whether a type is in the list of legal register types.
+static bool isRegisterClassType(LLT Ty) {
+ if (Ty.isVector() && Ty.getElementType().isPointer())
+ Ty = LLT::fixed_vector(Ty.getNumElements(),
+ LLT::scalar(Ty.getScalarSizeInBits()));
+ else if (Ty.isPointer())
+ Ty = LLT::scalar(Ty.getScalarSizeInBits());
+
+ return typeInSet(Ty, AllS32Vectors) || typeInSet(Ty, AllS64Vectors) ||
+ typeInSet(Ty, AllScalarTypes) || typeInSet(Ty, AllS16Vectors) ||
+ Ty.isPointer();
+}
+
+static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
+ return [TypeIdx](const LegalityQuery &Query) {
+ return isRegisterClassType(Query.Types[TypeIdx]);
+ };
+}
+
// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
@@ -574,67 +666,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
: ST(ST_) {
using namespace TargetOpcode;
- auto GetAddrSpacePtr = [&TM](unsigned AS) {
- return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
- };
-
- const LLT S1 = LLT::scalar(1);
- const LLT S8 = LLT::scalar(8);
- const LLT S16 = LLT::scalar(16);
- const LLT S32 = LLT::scalar(32);
- const LLT S64 = LLT::scalar(64);
- const LLT S128 = LLT::scalar(128);
- const LLT S256 = LLT::scalar(256);
- const LLT S512 = LLT::scalar(512);
- const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
-
- const LLT V2S8 = LLT::fixed_vector(2, 8);
- const LLT V2S16 = LLT::fixed_vector(2, 16);
- const LLT V4S16 = LLT::fixed_vector(4, 16);
-
- const LLT V2S32 = LLT::fixed_vector(2, 32);
- const LLT V3S32 = LLT::fixed_vector(3, 32);
- const LLT V4S32 = LLT::fixed_vector(4, 32);
- const LLT V5S32 = LLT::fixed_vector(5, 32);
- const LLT V6S32 = LLT::fixed_vector(6, 32);
- const LLT V7S32 = LLT::fixed_vector(7, 32);
- const LLT V8S32 = LLT::fixed_vector(8, 32);
- const LLT V9S32 = LLT::fixed_vector(9, 32);
- const LLT V10S32 = LLT::fixed_vector(10, 32);
- const LLT V11S32 = LLT::fixed_vector(11, 32);
- const LLT V12S32 = LLT::fixed_vector(12, 32);
- const LLT V13S32 = LLT::fixed_vector(13, 32);
- const LLT V14S32 = LLT::fixed_vector(14, 32);
- const LLT V15S32 = LLT::fixed_vector(15, 32);
- const LLT V16S32 = LLT::fixed_vector(16, 32);
- const LLT V32S32 = LLT::fixed_vector(32, 32);
-
- const LLT V2S64 = LLT::fixed_vector(2, 64);
- const LLT V3S64 = LLT::fixed_vector(3, 64);
- const LLT V4S64 = LLT::fixed_vector(4, 64);
- const LLT V5S64 = LLT::fixed_vector(5, 64);
- const LLT V6S64 = LLT::fixed_vector(6, 64);
- const LLT V7S64 = LLT::fixed_vector(7, 64);
- const LLT V8S64 = LLT::fixed_vector(8, 64);
- const LLT V16S64 = LLT::fixed_vector(16, 64);
-
- std::initializer_list<LLT> AllS32Vectors =
- {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
- V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
- std::initializer_list<LLT> AllS64Vectors =
- {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
-
- const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
- const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
- const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
- const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
- const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
- const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
- const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
- const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
- const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
+ const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS, TM);
+ const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS, TM);
+ const LLT Constant32Ptr =
+ GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT, TM);
+ const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS, TM);
+ const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS, TM);
+ const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS, TM);
+ const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS, TM);
+ const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER, TM);
+ const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE, TM);
const LLT BufferStridedPtr =
- GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
+ GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER, TM);
const LLT CodePtr = FlatPtr;
@@ -836,10 +879,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
getActionDefinitionsBuilder(G_BITCAST)
- // Don't worry about the size constraint.
- .legalIf(all(isRegisterType(0), isRegisterType(1)))
- .lower();
-
+ // Don't worry about the size constraint.
+ .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
+ .lower();
getActionDefinitionsBuilder(G_CONSTANT)
.legalFor({S1, S32, S64, S16, GlobalPtr,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
new file mode 100644
index 00000000000000..5bea13af1649a2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+define void @main(<19 x i32> %arg) {
+; GCN-LABEL: main:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: s_mov_b32 s12, s4
+; GCN-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_mov_b32 s13, s4
+; GCN-NEXT: v_mov_b32_e32 v4, s12
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: s_mov_b32 s6, s4
+; GCN-NEXT: s_mov_b32 s7, s4
+; GCN-NEXT: s_mov_b32 s8, s4
+; GCN-NEXT: s_mov_b32 s9, s4
+; GCN-NEXT: s_mov_b32 s10, s4
+; GCN-NEXT: s_mov_b32 s11, s4
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, v1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: v_mov_b32_e32 v5, s13
+; GCN-NEXT: image_store v[0:3], v[4:5], s[4:11] unorm
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: main:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_mov_b32 s10, s4
+; GFX10-NEXT: s_mov_b32 s11, s4
+; GFX10-NEXT: v_mov_b32_e32 v4, s10
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, s11
+; GFX10-NEXT: s_mov_b32 s5, s4
+; GFX10-NEXT: s_mov_b32 s6, s4
+; GFX10-NEXT: s_mov_b32 s7, s4
+; GFX10-NEXT: s_mov_b32 s8, s4
+; GFX10-NEXT: s_mov_b32 s9, s4
+; GFX10-NEXT: image_store v[0:3], v[4:5], s[4:11] dim:SQ_RSRC_IMG_2D unorm
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: main:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_mov_b32 s6, s0
+; GFX11-NEXT: s_mov_b32 s7, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s6
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_mov_b32_e32 v5, s7
+; GFX11-NEXT: s_mov_b32 s1, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: s_mov_b32 s2, s0
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s0
+; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %i = bitcast <19 x i32> %arg to <38 x i16>
+ %i1 = extractelement <38 x i16> %i, i64 0
+ %i2 = icmp eq i16 %i1, 0
+ %i3 = zext i1 %i2 to i32
+ %i4 = bitcast i32 %i3 to float
+ %i5 = insertelement <4 x float> zeroinitializer, float %i4, i64 0
+ call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %i5, i32 0, i32 0, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0)
+ ret void
+}
+declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10PLUS: {{.*}}
+; GPRIDX: {{.*}}
+; MOVREL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index ac153183be642a..1e1c90d142a1f3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -2626,6 +2626,132 @@ entry:
ret double %ext
}
+define amdgpu_ps double @dyn_extract_v7f64_s_v_bitcast(<14 x float> inreg %userData, i32 %sel) {
+; GCN-LABEL: dyn_extract_v7f64_s_v_bitcast:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NEXT: v_mov_b32_e32 v4, s5
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: v_mov_b32_e32 v6, s7
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v7, s8
+; GCN-NEXT: v_mov_b32_e32 v8, s9
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v9, s10
+; GCN-NEXT: v_mov_b32_e32 v10, s11
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT: v_mov_b32_e32 v11, s12
+; GCN-NEXT: v_mov_b32_e32 v12, s13
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT: v_mov_b32_e32 v13, s14
+; GCN-NEXT: v_mov_b32_e32 v14, s15
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT: ; kill: def $vgpr15 killed $sgpr2 killed $exec
+; GCN-NEXT: ; kill: def $vgpr16 killed $sgpr3 killed $exec
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v1
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_extract_v7f64_s_v_bitcast:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: v_mov_b32_e32 v2, s5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: s_mov_b32 s0, s14
+; GFX10-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s15, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_extract_v7f64_s_v_bitcast:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: s_mov_b32 s0, s14
+; GFX11-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s15, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+entry:
+ %bc = bitcast <14 x float> %userData to <7 x double>
+ %ext = extractelement <7 x double> %bc, i32 %sel
+ ret double %ext
+}
+
+define amdgpu_ps i64 @dyn_extract_v7i64_s_v_bitcast(<14 x i32> inreg %userData, i32 %sel) {
+; GCN-LABEL: dyn_extract_v7i64_s_v_bitcast:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, s10
+; GCN-NEXT: s_mov_b32 s1, s11
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_extract_v7i64_s_v_bitcast:
+; GFX10PLUS: ; %bb.0: ; %entry
+; GFX10PLUS-NEXT: s_mov_b32 s0, s10
+; GFX10PLUS-NEXT: s_mov_b32 s1, s11
+; GFX10PLUS-NEXT: ; return to shader part epilog
+entry:
+ %.bc = bitcast <14 x i32> %userData to <7 x i64>
+ %ext = extractelement <7 x i64> %.bc, i32 4
+ ret i64 %ext
+}
+
define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel) {
; GCN-LABEL: dyn_extract_v7f64_s_v:
; GCN: ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir
index 10766b0f79d818..25652b69afa929 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir
@@ -299,129 +299,6 @@ body: |
S_NOP 0, implicit %12
...
---
-name: legal_v13s32
-body: |
- bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12
- ; CHECK-LABEL: name: legal_v13s32
- ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<13 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32)
- ; CHECK-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<13 x s32>)
- %0:_(s32) = COPY $vgpr0
- %1:_(s32) = COPY $vgpr1
- %2:_(s32) = COPY $vgpr2
- %3:_(s32) = COPY $vgpr3
- %4:_(s32) = COPY $vgpr4
- %5:_(s32) = COPY $vgpr5
- %6:_(s32) = COPY $vgpr6
- %7:_(s32) = COPY $vgpr7
- %8:_(s32) = COPY $vgpr8
- %9:_(s32) = COPY $vgpr9
- %10:_(s32) = COPY $vgpr10
- %11:_(s32) = COPY $vgpr11
- %12:_(s32) = COPY $vgpr12
- %13:_(<13 x s32>) = G_BUILD_VECTOR %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12
- S_NOP 0, implicit %13
-...
----
-name: legal_v14s32
-body: |
- bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13
- ; CHECK-LABEL: name: legal_v14s32
- ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<14 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32)
- ; CHECK-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<14 x s32>)
- %0:_(s32) = COPY $vgpr0
- %1:_(s32) = COPY $vgpr1
- %2:_(s32) = COPY $vgpr2
- %3:_(s32) = COPY $vgpr3
- %4:_(s32) = COPY $vgpr4
- %5:_(s32) = COPY $vgpr5
- %6:_(s32) = COPY $vgpr6
- %7:_(s32) = COPY $vgpr7
- %8:_(s32) = COPY $vgpr8
- %9:_(s32) = COPY $vgpr9
- %10:_(s32) = COPY $vgpr10
- %11:_(s32) = COPY $vgpr11
- %12:_(s32) = COPY $vgpr12
- %13:_(s32) = COPY $vgpr13
- %14:_(<14 x s32>) = G_BUILD_VECTOR %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13
- S_NOP 0, implicit %14
-...
----
-name: legal_v15s32
-body: |
- bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
- ; CHECK-LABEL: name: legal_v15s32
- ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<15 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32)
- ; CHECK-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<15 x s32>)
- %0:_(s32) = COPY $vgpr0
- %1:_(s32) = COPY $vgpr1
- %2:_(s32) = COPY $vgpr2
- %3:_(s32) = COPY $vgpr3
- %4:_(s32) = COPY $vgpr4
- %5:_(s32) = COPY $vgpr5
- %6:_(s32) = COPY $vgpr6
- %7:_(s32) = COPY $vgpr7
- %8:_(s32) = COPY $vgpr8
- %9:_(s32) = COPY $vgpr9
- %10:_(s32) = COPY $vgpr10
- %11:_(s32) = COPY $vgpr11
- %12:_(s32) = COPY $vgpr12
- %13:_(s32) = COPY $vgpr13
- %14:_(s32) = COPY $vgpr14
- %15:_(<15 x s32>) = G_BUILD_VECTOR %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14
- S_NOP 0, implicit %15
-...
----
name: legal_v16s32
body: |
bb.0:
More information about the llvm-commits mailing list