[llvm] r365972 - [AMDGPU] use v32f32 for 3 mfma intrinsics

Fri Jul 12 15:42:01 PDT 2019

Author: rampitec
Date: Fri Jul 12 15:42:01 2019
New Revision: 365972

URL: http://llvm.org/viewvc/llvm-project?rev=365972&view=rev
Log:
[AMDGPU] use v32f32 for 3 mfma intrinsics

These should really use v32f32, but were defined as v32i32
due to the lack of the v32f32 type.

Differential Revision: https://reviews.llvm.org/D64667

Modified:
    llvm/trunk/include/llvm/IR/Intrinsics.td
    llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
    llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td
    llvm/trunk/test/CodeGen/AMDGPU/agpr-register-count.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
    llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll
    llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll

Modified: llvm/trunk/include/llvm/IR/Intrinsics.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/Intrinsics.td?rev=365972&r1=365971&r2=365972&view=diff
==============================================================================

--- llvm/trunk/include/llvm/IR/Intrinsics.td (original)
+++ llvm/trunk/include/llvm/IR/Intrinsics.td Fri Jul 12 15:42:01 2019
@@ -261,6 +261,7 @@ def llvm_v2f32_ty      : LLVMType<v2f32>
 def llvm_v4f32_ty      : LLVMType<v4f32>;    //  4 x float
 def llvm_v8f32_ty      : LLVMType<v8f32>;    //  8 x float
 def llvm_v16f32_ty     : LLVMType<v16f32>;   // 16 x float
+def llvm_v32f32_ty     : LLVMType<v32f32>;   // 32 x float
 def llvm_v1f64_ty      : LLVMType<v1f64>;    //  1 x double
 def llvm_v2f64_ty      : LLVMType<v2f64>;    //  2 x double
 def llvm_v4f64_ty      : LLVMType<v4f64>;    //  4 x double

Modified: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td?rev=365972&r1=365971&r2=365972&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td Fri Jul 12 15:42:01 2019
@@ -1663,8 +1663,8 @@ def int_amdgcn_buffer_atomic_fadd    : A
 def int_amdgcn_global_atomic_fadd    : AMDGPUGlobalAtomicNoRtn;
 
 // llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
-def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32i32_ty],
-  [llvm_float_ty, llvm_float_ty, llvm_v32i32_ty,
+def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32f32_ty],
+  [llvm_float_ty, llvm_float_ty, llvm_v32f32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
 
 def int_amdgcn_mfma_f32_16x16x1f32 : Intrinsic<[llvm_v16f32_ty],
@@ -1683,8 +1683,8 @@ def int_amdgcn_mfma_f32_16x16x4f32 : Int
   [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
 
-def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32i32_ty],
-  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32i32_ty,
+def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32f32_ty],
+  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32f32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
 
 def int_amdgcn_mfma_f32_16x16x4f16 : Intrinsic<[llvm_v16f32_ty],
@@ -1723,8 +1723,8 @@ def int_amdgcn_mfma_i32_16x16x16i8 : Int
   [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
 
-def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32i32_ty],
-  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32i32_ty,
+def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32f32_ty],
+  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32f32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
 
 def int_amdgcn_mfma_f32_16x16x2bf16 : Intrinsic<[llvm_v16f32_ty],

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=365972&r1=365971&r2=365972&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Fri Jul 12 15:42:01 2019
@@ -165,6 +165,9 @@ AMDGPUTargetLowering::AMDGPUTargetLoweri
   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
 
+  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
+
   setOperationAction(ISD::LOAD, MVT::i64, Promote);
   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
 
@@ -256,6 +259,9 @@ AMDGPUTargetLowering::AMDGPUTargetLoweri
   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
 
+  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
+
   setOperationAction(ISD::STORE, MVT::i64, Promote);
   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
 
@@ -355,7 +361,10 @@ AMDGPUTargetLowering::AMDGPUTargetLoweri
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
 
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=365972&r1=365971&r2=365972&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Fri Jul 12 15:42:01 2019
@@ -153,6 +153,7 @@ SITargetLowering::SITargetLowering(const
 
   if (Subtarget->hasMAIInsts()) {
     addRegisterClass(MVT::v32i32, &AMDGPU::AReg_1024RegClass);
+    addRegisterClass(MVT::v32f32, &AMDGPU::AReg_1024RegClass);
   }
 
   computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -263,8 +264,9 @@ SITargetLowering::SITargetLowering(const
 
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
-  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
-        MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
+  for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+                  MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+                  MVT::v32i32, MVT::v32f32 }) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td?rev=365972&r1=365971&r2=365972&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td Fri Jul 12 15:42:01 2019
@@ -2178,14 +2178,13 @@ def VOP_I32_V2I16_V2I16_I32 : VOPProfile
 
 def VOP_V4F32_F32_F32_V4F32       : VOPProfile <[v4f32,  f32,   f32,   v4f32]>;
 def VOP_V16F32_F32_F32_V16F32     : VOPProfile <[v16f32, f32,   f32,   v16f32]>;
-// TODO: define v32f32
-def VOP_V32F32_F32_F32_V32F32     : VOPProfile <[v32i32, f32,   f32,   v32i32]>;
+def VOP_V32F32_F32_F32_V32F32     : VOPProfile <[v32f32, f32,   f32,   v32f32]>;
 def VOP_V4F32_V4F16_V4F16_V4F32   : VOPProfile <[v4f32,  v4f16, v4f16, v4f32]>;
 def VOP_V16F32_V4F16_V4F16_V16F32 : VOPProfile <[v16f32, v4f16, v4f16, v16f32]>;
-def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32i32, v4f16, v4f16, v32i32]>;
+def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32f32, v4f16, v4f16, v32f32]>;
 def VOP_V4F32_V2I16_V2I16_V4F32   : VOPProfile <[v4f32,  v2i16, v2i16, v4f32]>;
 def VOP_V16F32_V2I16_V2I16_V16F32 : VOPProfile <[v16f32, v2i16, v2i16, v16f32]>;
-def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32i32, v2i16, v2i16, v32i32]>;
+def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32f32, v2i16, v2i16, v32f32]>;
 def VOP_V4I32_I32_I32_V4I32       : VOPProfile <[v4i32,  i32,   i32,   v4i32]>;
 def VOP_V16I32_I32_I32_V16I32     : VOPProfile <[v16i32, i32,   i32,   v16i32]>;
 def VOP_V32I32_I32_I32_V32I32     : VOPProfile <[v32i32, i32,   i32,   v32i32]>;

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=365972&r1=365971&r2=365972&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Fri Jul 12 15:42:01 2019
@@ -942,6 +942,14 @@ foreach Index = 0-31 in {
   def Insert_Element_v32i32_#Index : Insert_Element <
     i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
   >;
+
+  def Extract_Element_v32f32_#Index : Extract_Element <
+    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Insert_Element_v32f32_#Index : Insert_Element <
+    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
 }
 
 // FIXME: Why do only some of these type combinations for SReg and
@@ -1034,6 +1042,10 @@ def : BitConvert <v8f32, v8i32, VReg_256
 def : BitConvert <v16i32, v16f32, VReg_512>;
 def : BitConvert <v16f32, v16i32, VReg_512>;
 
+// 1024-bit bitcast
+def : BitConvert <v32i32, v32f32, VReg_1024>;
+def : BitConvert <v32f32, v32i32, VReg_1024>;
+
 /********** =================== **********/
 /********** Src & Dst modifiers **********/
 /********** =================== **********/

Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td?rev=365972&r1=365971&r2=365972&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td Fri Jul 12 15:42:01 2019
@@ -757,11 +757,11 @@ def VRegOrLds_32 : RegisterClass<"AMDGPU
   let isAllocatable = 0;
 }
 
-def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add SGPR_1024Regs)> {
+def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add SGPR_1024Regs)> {
   let AllocationPriority = 19;
 }
 
-def SReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32,
+def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
   (add SGPR_1024)> {
   let CopyCost = 16;
   let AllocationPriority = 19;
@@ -812,7 +812,7 @@ def VReg_512 : RegisterClass<"AMDGPU", [
   let AllocationPriority = 7;
 }
 
-def VReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add VGPR_1024)> {
+def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add VGPR_1024)> {
   let Size = 1024;
   let CopyCost = 32;
   let AllocationPriority = 8;
@@ -840,7 +840,7 @@ def AReg_512 : RegisterClass<"AMDGPU", [
 }
 
 // TODO: add v32f32 value type
-def AReg_1024 : RegisterClass<"AMDGPU", [v32i32], 32, (add AGPR_1024)> {
+def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, (add AGPR_1024)> {
   let Size = 1024;
   let CopyCost = 65;
   let AllocationPriority = 8;

Modified: llvm/trunk/test/CodeGen/AMDGPU/agpr-register-count.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/agpr-register-count.ll?rev=365972&r1=365971&r2=365972&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/agpr-register-count.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/agpr-register-count.ll Fri Jul 12 15:42:01 2019
@@ -1,15 +1,15 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 
 ; GCN-LABEL: {{^}}test_32_agprs:
 ; GCN: v_mfma_f32_32x32x1f32 a[0:31], {{v[0-9]+}}, {{v[0-9]+}}, 0
 ; GCN-NOT: v28
 ; GCN: NumVgprs: 32
 ; GCN: VGPRBlocks: 7
-define amdgpu_kernel void @test_32_agprs(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_32_agprs(<32 x float> addrspace(1)* %arg) {
 bb:
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll?rev=365972&r1=365971&r2=365972&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll Fri Jul 12 15:42:01 2019
@@ -1,11 +1,11 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32)
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
@@ -15,7 +15,7 @@ declare <16 x i32> @llvm.amdgcn.mfma.i32
 declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
 declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
 declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
@@ -100,11 +100,11 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
 
@@ -326,14 +326,14 @@ bb:
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x i32> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
+define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
   %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
   %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
   %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
 
@@ -794,13 +794,13 @@ bb:
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> addrspace(1)* %arg) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
   %a = bitcast i32 1 to <2 x i16>
   %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
 
@@ -957,12 +957,12 @@ bb:
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc:
 ; GCN:      v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
 ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x float> addrspace(1)* %arg) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 0, i32 0, i32 0)
-  %mai.2 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %mai.1, i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.2, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
+  store <32 x float> %mai.2, <32 x float> addrspace(1)* %arg
   ret void
 }
 
@@ -1112,10 +1112,10 @@ bb:
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x float> addrspace(1)* %arg) {
 bb:
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
 
@@ -1184,7 +1184,7 @@ bb:
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm:
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1
+; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
@@ -1256,10 +1256,10 @@ bb:
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x float> addrspace(1)* %arg) {
 bb:
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> <i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
   ret void
 }
 
@@ -1350,12 +1350,12 @@ bb:
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
 ; GCN-DAG: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x float> addrspace(1)* %arg) {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x i32>, <32 x i32> addrspace(1)* %arg, i32 %tid
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %gep
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %gep
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll?rev=365972&r1=365971&r2=365972&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll Fri Jul 12 15:42:01 2019
@@ -84,23 +84,23 @@ define amdgpu_kernel void @max_10_vgprs_
 ; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
 ; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
 ; A2V:        ScratchSize: 0
-define amdgpu_kernel void @max_32regs_mfma32(i32 addrspace(1)* %arg) #3 {
+define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
 bb:
   %v = call i32 asm sideeffect "", "=a"()
   br label %use
 
 use:
-  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 2>, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 2.0>, i32 0, i32 0, i32 0)
   call void asm sideeffect "", "a"(i32 %v)
-  %elt1 = extractelement <32 x i32> %mai.1, i32 0
-  store i32 %elt1, i32 addrspace(1)* %arg
+  %elt1 = extractelement <32 x float> %mai.1, i32 0
+  store float %elt1, float addrspace(1)* %arg
   ret void
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
-declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 
 attributes #0 = { nounwind "amdgpu-num-vgpr"="24" }
 attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }

Modified: llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll?rev=365972&r1=365971&r2=365972&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll Fri Jul 12 15:42:01 2019
@@ -233,19 +233,23 @@ define amdgpu_kernel void @max_256_vgprs
   ret void
 }
 
+; FIXME: adding an AReg_1024 register class for v32f32 and v32i32
+;        produces unnecessary copies and we still have some amount
+;        of conventional spilling.
+
 ; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb:
 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GFX908-NOT: SCRATCH_RSRC
+; GFX908-FIXME-NOT: SCRATCH_RSRC
 ; GFX908-DAG: v_accvgpr_write_b32 a0, v
 ; GFX900:     buffer_store_dword v
 ; GFX900:     buffer_load_dword v
-; GFX908-NOT: buffer_
+; GFX908-FIXME-NOT: buffer_
 ; GFX908-DAG  v_accvgpr_read_b32
 
 ; GCN:    NumVgprs: 256
 ; GFX900: ScratchSize: 580
-; GFX908: ScratchSize: 0
+; GFX908-FIXME: ScratchSize: 0
 ; GCN:    VGPRBlocks: 63
 ; GCN:    NumVGPRsForWavesPerEU: 256
 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {