[llvm] 98f4872 - [AMDGPU] Add 224-bit vector types and link 192-bit types to MVTs
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 23 20:41:45 PDT 2021
Author: Carl Ritson
Date: 2021-06-24T12:41:22+09:00
New Revision: 98f48723f2ca286d6f12bb0c4dc7830a55e85637
URL: https://github.com/llvm/llvm-project/commit/98f48723f2ca286d6f12bb0c4dc7830a55e85637
DIFF: https://github.com/llvm/llvm-project/commit/98f48723f2ca286d6f12bb0c4dc7830a55e85637.diff
LOG: [AMDGPU] Add 224-bit vector types and link 192-bit types to MVTs
Add SReg_224, VReg_224, AReg_224, etc.
Link the 224-bit register classes to v7i32/v7f32.
Link the existing 192-bit register classes to the newly added v3i64/v3f64/v6i32/v6f32 MVTs.
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D104622
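As a quick illustration of what the new mappings enable (an editor's sketch, not part of the commit; the kernel and its name are hypothetical), a <7 x i32> value can now be carried in a single 224-bit register tuple (v7i32 <-> SReg_224/VReg_224), and its memory accesses can be split more tightly (e.g. dwordx4 + dwordx3, as the updated tests below show for similar cases) instead of relying on widening to 256-bit types:

define amdgpu_kernel void @copy_v7i32(<7 x i32> addrspace(1)* %in,
                                      <7 x i32> addrspace(1)* %out) {
  ; With v7i32 linked to the 224-bit register classes, this value can
  ; live in one SGPR/VGPR tuple rather than being widened to v8i32.
  %v = load <7 x i32>, <7 x i32> addrspace(1)* %in
  store <7 x i32> %v, <7 x i32> addrspace(1)* %out
  ret void
}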
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/lib/Target/AMDGPU/SIRegisterInfo.td
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
llvm/test/CodeGen/AMDGPU/code-object-v3.ll
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
llvm/test/CodeGen/AMDGPU/function-returns.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
llvm/test/CodeGen/AMDGPU/load-global-f64.ll
llvm/test/CodeGen/AMDGPU/load-global-i64.ll
llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
llvm/test/CodeGen/AMDGPU/sdiv64.ll
llvm/test/CodeGen/AMDGPU/srem64.ll
llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 8f69e20cc0513..3a36c1d123ec0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -965,6 +965,16 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
IsSGPR = false;
IsAGPR = true;
Width = 6;
+ } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 7;
+ } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 7;
+ } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 7;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
"trap handler registers should not be used");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index cadcf40f759bb..6f9ff03dcb87b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -78,6 +78,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
+ setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
+
+ setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
+
setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
@@ -99,9 +105,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
+
setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
+ setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
+
setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
@@ -173,12 +185,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
@@ -198,6 +212,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v5f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
+ setOperationAction(ISD::STORE, MVT::v6f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
+
+ setOperationAction(ISD::STORE, MVT::v7f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
+
setOperationAction(ISD::STORE, MVT::v8f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
@@ -219,6 +239,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v3i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
+
+ setOperationAction(ISD::STORE, MVT::v3f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
+
setOperationAction(ISD::STORE, MVT::v4i64, Promote);
AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
@@ -261,6 +287,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
+ setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
+ setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
+
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
@@ -325,6 +356,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
@@ -337,6 +372,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
@@ -345,6 +384,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
@@ -414,8 +455,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
static const MVT::SimpleValueType VectorIntTypes[] = {
- MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
- };
+ MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
for (MVT VT : VectorIntTypes) {
// Expand the following operations for the current type by default.
@@ -456,8 +496,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
}
static const MVT::SimpleValueType FloatVectorTypes[] = {
- MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
- };
+ MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
for (MVT VT : FloatVectorTypes) {
setOperationAction(ISD::FABS, VT, Expand);
@@ -507,6 +546,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
+ setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
+
+ setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
+
// There are no libcalls of any kind.
for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 6c70b53b23c11..50999a4802b39 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -7,16 +7,16 @@
//===----------------------------------------------------------------------===//
def SGPRRegBank : RegisterBank<"SGPR",
- [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024]
+ [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_512, SReg_1024]
>;
def VGPRRegBank : RegisterBank<"VGPR",
- [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_256, VReg_512, VReg_1024]
+ [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_512, VReg_1024]
>;
// It is helpful to distinguish conditions from ordinary SGPRs.
def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
def AGPRRegBank : RegisterBank <"AGPR",
- [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_256, AReg_512, AReg_1024]
+ [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_512, AReg_1024]
>;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 4c8a62f2587de..42ffed3b982ea 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2192,6 +2192,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 4: return AMDGPU::VReg_128RegClassID;
case 5: return AMDGPU::VReg_160RegClassID;
case 6: return AMDGPU::VReg_192RegClassID;
+ case 7: return AMDGPU::VReg_224RegClassID;
case 8: return AMDGPU::VReg_256RegClassID;
case 16: return AMDGPU::VReg_512RegClassID;
case 32: return AMDGPU::VReg_1024RegClassID;
@@ -2214,6 +2215,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 4: return AMDGPU::SGPR_128RegClassID;
case 5: return AMDGPU::SGPR_160RegClassID;
case 6: return AMDGPU::SGPR_192RegClassID;
+ case 7: return AMDGPU::SGPR_224RegClassID;
case 8: return AMDGPU::SGPR_256RegClassID;
case 16: return AMDGPU::SGPR_512RegClassID;
}
@@ -2226,6 +2228,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 4: return AMDGPU::AReg_128RegClassID;
case 5: return AMDGPU::AReg_160RegClassID;
case 6: return AMDGPU::AReg_192RegClassID;
+ case 7: return AMDGPU::AReg_224RegClassID;
case 8: return AMDGPU::AReg_256RegClassID;
case 16: return AMDGPU::AReg_512RegClassID;
case 32: return AMDGPU::AReg_1024RegClassID;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 9275db67b1cf6..dbce4b2e872c0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -463,6 +463,7 @@ SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
Enc |= 512;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 13e289c0a55d1..39319e31ba445 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -102,6 +102,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
+ addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
+ addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
+
+ addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
+ addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
+
+ addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
+ addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
+
addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
@@ -145,6 +154,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v6i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v7i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
setOperationAction(ISD::LOAD, MVT::i1, Custom);
@@ -154,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v3i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::v5i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v6i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v7i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -176,6 +189,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
@@ -203,8 +218,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand);
@@ -245,6 +268,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// with > 4 elements.
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+ MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
@@ -290,6 +314,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
}
+ for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
+ }
+
for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
@@ -365,9 +403,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
- // Deal with vec5 vector operations when widened to vec8.
+ // Deal with vec5/6/7 vector operations when widened to vec8.
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
@@ -11697,6 +11739,8 @@ static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
return AMDGPU::VReg_160_Align2RegClassID;
case AMDGPU::VReg_192RegClassID:
return AMDGPU::VReg_192_Align2RegClassID;
+ case AMDGPU::VReg_224RegClassID:
+ return AMDGPU::VReg_224_Align2RegClassID;
case AMDGPU::VReg_256RegClassID:
return AMDGPU::VReg_256_Align2RegClassID;
case AMDGPU::VReg_512RegClassID:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index a264ef0ab076e..358abec1e06c2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1340,6 +1340,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S160_SAVE;
case 24:
return AMDGPU::SI_SPILL_S192_SAVE;
+ case 28:
+ return AMDGPU::SI_SPILL_S224_SAVE;
case 32:
return AMDGPU::SI_SPILL_S256_SAVE;
case 64:
@@ -1365,6 +1367,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V160_SAVE;
case 24:
return AMDGPU::SI_SPILL_V192_SAVE;
+ case 28:
+ return AMDGPU::SI_SPILL_V224_SAVE;
case 32:
return AMDGPU::SI_SPILL_V256_SAVE;
case 64:
@@ -1473,6 +1477,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_S192_RESTORE;
+ case 28:
+ return AMDGPU::SI_SPILL_S224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_S256_RESTORE;
case 64:
@@ -1498,6 +1504,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_V192_RESTORE;
+ case 28:
+ return AMDGPU::SI_SPILL_V224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_V256_RESTORE;
case 64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9778a6f79f399..ad6d69468ec91 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -675,6 +675,7 @@ defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
+defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
@@ -718,6 +719,7 @@ defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
+defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
@@ -728,6 +730,7 @@ defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
+defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
@@ -1052,6 +1055,38 @@ foreach Index = 0-4 in {
>;
}
+foreach Index = 0-5 in {
+ def Extract_Element_v6i32_#Index : Extract_Element <
+ i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v6i32_#Index : Insert_Element <
+ i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v6f32_#Index : Extract_Element <
+ f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v6f32_#Index : Insert_Element <
+ f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
+foreach Index = 0-6 in {
+ def Extract_Element_v7i32_#Index : Extract_Element <
+ i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v7i32_#Index : Insert_Element <
+ i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v7f32_#Index : Extract_Element <
+ f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v7f32_#Index : Insert_Element <
+ f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
foreach Index = 0-7 in {
def Extract_Element_v8i32_#Index : Extract_Element <
i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -1202,8 +1237,32 @@ def : BitConvert <v4f32, v2i64, VReg_128>;
def : BitConvert <v2i64, v4f32, VReg_128>;
// 160-bit bitcast
-def : BitConvert <v5i32, v5f32, SGPR_160>;
-def : BitConvert <v5f32, v5i32, SGPR_160>;
+def : BitConvert <v5i32, v5f32, SReg_160>;
+def : BitConvert <v5f32, v5i32, SReg_160>;
+def : BitConvert <v5i32, v5f32, VReg_160>;
+def : BitConvert <v5f32, v5i32, VReg_160>;
+
+// 192-bit bitcast
+def : BitConvert <v6i32, v6f32, SReg_192>;
+def : BitConvert <v6f32, v6i32, SReg_192>;
+def : BitConvert <v6i32, v6f32, VReg_192>;
+def : BitConvert <v6f32, v6i32, VReg_192>;
+def : BitConvert <v3i64, v3f64, VReg_192>;
+def : BitConvert <v3f64, v3i64, VReg_192>;
+def : BitConvert <v3i64, v6i32, VReg_192>;
+def : BitConvert <v3i64, v6f32, VReg_192>;
+def : BitConvert <v3f64, v6i32, VReg_192>;
+def : BitConvert <v3f64, v6f32, VReg_192>;
+def : BitConvert <v6i32, v3i64, VReg_192>;
+def : BitConvert <v6f32, v3i64, VReg_192>;
+def : BitConvert <v6i32, v3f64, VReg_192>;
+def : BitConvert <v6f32, v3f64, VReg_192>;
+
+// 224-bit bitcast
+def : BitConvert <v7i32, v7f32, SReg_224>;
+def : BitConvert <v7f32, v7i32, SReg_224>;
+def : BitConvert <v7i32, v7f32, VReg_224>;
+def : BitConvert <v7f32, v7i32, VReg_224>;
// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index cf0a543e79516..d44c8c48a2468 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1874,6 +1874,8 @@ getAnyVGPRClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::VReg_160RegClass;
if (BitWidth <= 192)
return &AMDGPU::VReg_192RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::VReg_224RegClass;
if (BitWidth <= 256)
return &AMDGPU::VReg_256RegClass;
if (BitWidth <= 512)
@@ -1896,6 +1898,8 @@ getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::VReg_160_Align2RegClass;
if (BitWidth <= 192)
return &AMDGPU::VReg_192_Align2RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::VReg_224_Align2RegClass;
if (BitWidth <= 256)
return &AMDGPU::VReg_256_Align2RegClass;
if (BitWidth <= 512)
@@ -2036,6 +2040,11 @@ SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
&AMDGPU::SReg_192RegClass,
&AMDGPU::AReg_192_Align2RegClass,
&AMDGPU::AReg_192RegClass,
+ &AMDGPU::VReg_224_Align2RegClass,
+ &AMDGPU::VReg_224RegClass,
+ &AMDGPU::SReg_224RegClass,
+ &AMDGPU::AReg_224_Align2RegClass,
+ &AMDGPU::AReg_224RegClass,
&AMDGPU::VReg_256_Align2RegClass,
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 095fceae609cf..21a2e2d860fdf 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -58,6 +58,7 @@ class getSubRegs<int size> {
list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4];
list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5];
+ list<SubRegIndex> ret7 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6];
list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
sub4, sub5, sub6, sub7,
@@ -77,9 +78,10 @@ class getSubRegs<int size> {
!if(!eq(size, 4), ret4,
!if(!eq(size, 5), ret5,
!if(!eq(size, 6), ret6,
- !if(!eq(size, 8), ret8,
- !if(!eq(size, 16), ret16,
- ret32)))))));
+ !if(!eq(size, 7), ret7,
+ !if(!eq(size, 8), ret8,
+ !if(!eq(size, 16), ret16,
+ ret32))))))));
}
// Generates list of sequential register tuple names.
@@ -350,9 +352,12 @@ def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">;
// SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs.
def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">;
-// SGPR 192-bit registers
+// SGPR 192-bit registers. No operations use these, but for symmetry with 192-bit VGPRs.
def SGPR_192Regs : SIRegisterTuples<getSubRegs<6>.ret, SGPR_32, 105, 4, 6, "s">;
+// SGPR 224-bit registers. No operations use these, but for symmetry with 224-bit VGPRs.
+def SGPR_224Regs : SIRegisterTuples<getSubRegs<7>.ret, SGPR_32, 105, 4, 7, "s">;
+
// SGPR 256-bit registers
def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;
@@ -508,6 +513,9 @@ def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">;
// VGPR 192-bit registers
def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">;
+// VGPR 224-bit registers
+def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 255, 1, 7, "v">;
+
// VGPR 256-bit registers
def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
@@ -547,6 +555,9 @@ def AGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, AGPR_32, 255, 1, 5, "a">;
// AGPR 192-bit registers
def AGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, AGPR_32, 255, 1, 6, "a">;
+// AGPR 224-bit registers
+def AGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, AGPR_32, 255, 1, 7, "a">;
+
// AGPR 256-bit registers
def AGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, AGPR_32, 255, 1, 8, "a">;
@@ -725,20 +736,41 @@ def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
(add SGPR_160)> {
// FIXME: Should be isAllocatable = 0, but that causes all TableGen-generated
// subclasses of SGPR_160 to be marked unallocatable too.
+ // This occurs because SGPR_160 and SReg_160 classes are equivalent in size
+ // meaning their enumeration order is dependent on alphanumeric ordering of
+ // their names. The superclass for inheritance is the last one in topological
+ // order (i.e. enumeration order), hence SReg_160 is selected.
+ // Potential workarounds involve renaming SGPR_160, adding another class
+ // which is ordered last and hence used for inheritance, or adding more
+ // registers to SReg_160 to cause it to be moved earlier in the superclass
+ // list.
+ let CopyCost = 3;
+}
+
+// There are no 6-component scalar instructions, but this is needed
+// for symmetry with VGPRs.
+def SGPR_192 : RegisterClass<"AMDGPU", [v6i32, v6f32, v3i64, v3f64], 32, (add SGPR_192Regs)> {
+ let AllocationPriority = 17;
}
-def SGPR_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192Regs)> {
- let Size = 192;
- let AllocationPriority = 17;
+def SReg_192 : RegisterClass<"AMDGPU", [v6i32, v6f32, v3i64, v3f64], 32, (add SGPR_192)> {
+ let isAllocatable = 0;
+ let CopyCost = 3;
}
-def SReg_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192)> {
- let Size = 192;
+// There are no 7-component scalar instructions, but this is needed
+// for symmetry with VGPRs.
+def SGPR_224 : RegisterClass<"AMDGPU", [v7i32, v7f32], 32, (add SGPR_224Regs)> {
+ let AllocationPriority = 18;
+}
+
+def SReg_224 : RegisterClass<"AMDGPU", [v7i32, v7f32], 32, (add SGPR_224)> {
let isAllocatable = 0;
+ let CopyCost = 4;
}
def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add SGPR_256Regs)> {
- let AllocationPriority = 18;
+ let AllocationPriority = 19;
}
def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add TTMP_256Regs)> {
@@ -754,7 +786,7 @@ def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32,
def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
(add SGPR_512Regs)> {
- let AllocationPriority = 19;
+ let AllocationPriority = 20;
}
def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
@@ -776,7 +808,7 @@ def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 3
def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
(add SGPR_1024Regs)> {
- let AllocationPriority = 20;
+ let AllocationPriority = 21;
}
def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
@@ -812,7 +844,8 @@ defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>;
defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
-defm VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>;
+defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
+defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
@@ -832,7 +865,8 @@ defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
-defm AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>;
+defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
+defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c7d6a849fc491..8cd748eaec6b4 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1650,6 +1650,13 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::VReg_192_Align2RegClassID:
case AMDGPU::AReg_192_Align2RegClassID:
return 192;
+ case AMDGPU::SGPR_224RegClassID:
+ case AMDGPU::SReg_224RegClassID:
+ case AMDGPU::VReg_224RegClassID:
+ case AMDGPU::AReg_224RegClassID:
+ case AMDGPU::VReg_224_Align2RegClassID:
+ case AMDGPU::AReg_224_Align2RegClassID:
+ return 224;
case AMDGPU::SGPR_256RegClassID:
case AMDGPU::SReg_256RegClassID:
case AMDGPU::VReg_256RegClassID:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 921937ffe1ed3..5aae1526cdbc5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -3501,13 +3501,13 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; GPRIDX-LABEL: dyn_insertelement_v7f32_s_v_s:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v14, s7
+; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s5, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s6
; GPRIDX-NEXT: v_mov_b32_e32 v7, s0
; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0
; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
@@ -3526,7 +3526,6 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; GPRIDX-NEXT: v_mov_b32_e32 v12, s5
; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 5
; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v12, v0, vcc
-; GPRIDX-NEXT: v_mov_b32_e32 v13, s6
; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 6
; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v13, v0, vcc
; GPRIDX-NEXT: v_mov_b32_e32 v0, v7
@@ -3535,13 +3534,13 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_s:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: v_mov_b32_e32 v14, s7
+; MOVREL-NEXT: s_mov_b32 s1, s3
+; MOVREL-NEXT: s_mov_b32 s3, s5
+; MOVREL-NEXT: s_mov_b32 s5, s7
+; MOVREL-NEXT: v_mov_b32_e32 v13, s6
; MOVREL-NEXT: v_mov_b32_e32 v7, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 0
; MOVREL-NEXT: v_mov_b32_e32 v8, s1
@@ -3551,7 +3550,6 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 1
; MOVREL-NEXT: v_mov_b32_e32 v12, s5
-; MOVREL-NEXT: v_mov_b32_e32 v13, s6
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 2
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc_lo
@@ -3574,13 +3572,13 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; GPRIDX-LABEL: dyn_insertelement_v7f32_s_v_v:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v15, s7
+; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s5, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s6
; GPRIDX-NEXT: v_mov_b32_e32 v8, s0
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
@@ -3600,7 +3598,6 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1
; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1
-; GPRIDX-NEXT: v_mov_b32_e32 v14, s6
; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc
; GPRIDX-NEXT: v_mov_b32_e32 v0, v8
; GPRIDX-NEXT: v_mov_b32_e32 v1, v7
@@ -3609,13 +3606,13 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_v:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: v_mov_b32_e32 v15, s7
+; MOVREL-NEXT: s_mov_b32 s1, s3
+; MOVREL-NEXT: s_mov_b32 s3, s5
+; MOVREL-NEXT: s_mov_b32 s5, s7
+; MOVREL-NEXT: v_mov_b32_e32 v14, s6
; MOVREL-NEXT: v_mov_b32_e32 v8, s0
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; MOVREL-NEXT: v_mov_b32_e32 v9, s1
@@ -3625,7 +3622,6 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; MOVREL-NEXT: v_mov_b32_e32 v13, s5
-; MOVREL-NEXT: v_mov_b32_e32 v14, s6
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index a6083d200c7bc..7d0dd5f0ee0bf 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -15,7 +15,7 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -33,7 +33,7 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index e0b30adc06270..af389deee2f6e 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1081,32 +1081,31 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v12, v[4:5]
-; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v10, vcc, 1, v0
-; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v10, v[4:5]
+; VI-NEXT: flat_load_ubyte v11, v[6:7]
; VI-NEXT: flat_load_ubyte v8, v[8:9]
-; VI-NEXT: flat_load_ubyte v9, v[10:11]
+; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v6, v[6:7]
-; VI-NEXT: flat_load_ubyte v7, v[4:5]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8
; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v7
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v11
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
@@ -1127,25 +1126,23 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2
; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
; GFX10-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4
-; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:6
-; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6
+; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1
; GFX10-NEXT: global_load_ubyte v7, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, v1
-; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v6
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v4
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
+; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
+; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
-; GFX10-NEXT: global_store_dword v8, v6, s[0:1] offset:24
-; GFX10-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16
+; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 41c3f02611418..22402597a192f 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -287,7 +287,7 @@ define <2 x i64> @v2i64_func_void() #0 {
; GCN-LABEL: {{^}}v3i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN-DAG: buffer_load_dwordx2 v[4:5], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <3 x i64> @v3i64_func_void() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
index 396b8c713ac66..bd16b9e0585d3 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -15,7 +15,7 @@
; CHECK: .max_flat_workgroup_size: 1024
; CHECK: .name: test
; CHECK: .private_segment_fixed_size: 0
-; CHECK: .sgpr_count: 8
+; CHECK: .sgpr_count: 6
; CHECK: .symbol: test.kd
; CHECK: .vgpr_count: {{3|6}}
; WAVE64: .wavefront_size: 64
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 03330e75e4165..e10f96072e254 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -16,7 +16,7 @@
; CHECK: PrivateSegmentFixedSize: 0
; CHECK: KernargSegmentAlign: 8
; CHECK: WavefrontSize: 64
-; CHECK: NumSGPRs: 8
+; CHECK: NumSGPRs: 6
; CHECK: NumVGPRs: {{3|6}}
; CHECK: MaxFlatWorkGroupSize: 1024
define amdgpu_kernel void @test(
@@ -39,7 +39,7 @@ entry:
; CHECK: PrivateSegmentFixedSize: 0
; CHECK: KernargSegmentAlign: 8
; CHECK: WavefrontSize: 64
-; CHECK: NumSGPRs: 8
+; CHECK: NumSGPRs: 6
; CHECK: NumVGPRs: {{3|6}}
; CHECK: MaxFlatWorkGroupSize: 256
define amdgpu_kernel void @test_max_flat_workgroup_size(
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index fbb3ebcad3148..558923f5cc050 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1506,26 +1506,27 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
; SI-LABEL: dynamic_insertelement_v3i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
-; SI-NEXT: s_load_dword s6, s[4:5], 0x10
+; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xc
+; SI-NEXT: s_load_dword s12, s[4:5], 0x10
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2
-; SI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s11
-; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
+; SI-NEXT: v_mov_b32_e32 v4, s7
+; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1
; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s9
-; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
+; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
+; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2
+; SI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v4, s6
+; SI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -1533,26 +1534,27 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
; VI-LABEL: dynamic_insertelement_v3i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
-; VI-NEXT: s_load_dword s6, s[4:5], 0x40
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x30
+; VI-NEXT: s_load_dword s12, s[4:5], 0x40
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s13
-; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2
-; VI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s12
-; VI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s11
-; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1
; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2
+; VI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index d93f05a5b439b..3601886edc46f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1039,10 +1039,10 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
; GFX9-LABEL: s_insertelement_v2i16_dynamic:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s2, s4, 4
@@ -1057,7 +1057,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
; VI-LABEL: s_insertelement_v2i16_dynamic:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[4:5], 0x0
@@ -1076,7 +1076,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
; CI-LABEL: s_insertelement_v2i16_dynamic:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[4:5], 0x0
@@ -1169,10 +1169,10 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
@@ -1187,7 +1187,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
@@ -1214,7 +1214,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
index a0a78e96b920a..fa5e1b6c34540 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
@@ -1,19 +1,19 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s
; Make sure the expected regmask is generated for sub/superregisters.
-; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
+; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
define void @csr() #0 {
call void asm sideeffect "", "~{v0},~{v44},~{v45}"() #0
ret void
}
-; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
+; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
define void @subregs_for_super() #0 {
call void asm sideeffect "", "~{v0},~{v1}"() #0
ret void
}
-; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
+; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
define void @clobbered_reg_with_sub() #0 {
call void asm sideeffect "", "~{v[0:1]}"() #0
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 4bdbdd3872d98..be0ffa0f65ee1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -25,7 +25,8 @@ entry:
}
; FUNC-LABEL: {{^}}constant_load_v3i64:
-; GCN: s_load_dwordx8 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; GCN-DAG: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; GCN-DAG: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x{{[0-9]+}}{{$}}
; EG-DAG: VTX_READ_128
; EG-DAG: VTX_READ_128
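For context on the new check pair above: with 192-bit values now selectable directly, a <3 x i64> constant load can be emitted as an s_load_dwordx4 plus an s_load_dwordx2 rather than one over-wide s_load_dwordx8. A minimal sketch of the IR pattern these checks exercise, mirroring the test's label (the exact body in the tree may differ):

define amdgpu_kernel void @constant_load_v3i64(<3 x i64> addrspace(1)* %out,
                                               <3 x i64> addrspace(4)* %in) {
entry:
  ; 192-bit load: now legal as v3i64, so it splits into a 128-bit
  ; plus a 64-bit scalar load instead of rounding up to 256 bits.
  %ld = load <3 x i64>, <3 x i64> addrspace(4)* %in
  store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
  ret void
}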
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll
index 7ef5335a6e180..27d644ce011e6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -25,10 +25,10 @@ entry:
}
; FUNC-LABEL: {{^}}global_load_v3f64:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: flat_load_dwordx4
-; GCN-HSA: flat_load_dwordx4
+; GCN-NOHSA-DAG: buffer_load_dwordx4
+; GCN-NOHSA-DAG: buffer_load_dwordx2
+; GCN-HSA-DAG: flat_load_dwordx4
+; GCN-HSA-DAG: flat_load_dwordx2
define amdgpu_kernel void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
entry:
%ld = load <3 x double>, <3 x double> addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
index 79613882051b7..868dd29dd51c2 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -32,11 +32,11 @@ entry:
}
; FUNC-LABEL: {{^}}global_load_v3i64:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA-DAG: buffer_load_dwordx4
+; GCN-NOHSA-DAG: buffer_load_dwordx2
-; GCN-HSA: flat_load_dwordx4
-; GCN-HSA: flat_load_dwordx4
+; GCN-HSA-DAG: flat_load_dwordx4
+; GCN-HSA-DAG: flat_load_dwordx2
; EG: VTX_READ_128
; EG: VTX_READ_128
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
index 2de1423e5eea5..ee2f633f57dba 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
@@ -66,38 +66,38 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg
; CHECK-NEXT: s_mov_b32 s6, s4
; CHECK-NEXT: s_mov_b32 s5, s3
; CHECK-NEXT: s_mov_b32 s4, s2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, 16, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v1
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, 8, v1
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 12, v1
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, 8, v1
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v1
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v1
; CHECK-NEXT: v_mov_b32_e32 v9, s0
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, 16, v2
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v2
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, 8, v2
+; CHECK-NEXT: v_add_i32_e32 v12, vcc, 12, v2
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_read_b32 v3, v1
-; CHECK-NEXT: ds_read_b32 v5, v4
-; CHECK-NEXT: ds_read_b32 v4, v7
-; CHECK-NEXT: ds_read_b32 v1, v8
+; CHECK-NEXT: ds_read_b32 v4, v0
+; CHECK-NEXT: ds_read_b32 v5, v5
; CHECK-NEXT: ds_read_b32 v6, v6
-; CHECK-NEXT: ds_read_b32 v0, v0
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v2
+; CHECK-NEXT: ds_read_b32 v0, v7
+; CHECK-NEXT: ds_read_b32 v1, v8
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v2
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v2
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
; CHECK-NEXT: s_waitcnt expcnt(0)
; CHECK-NEXT: ds_read_b32 v0, v2
-; CHECK-NEXT: ds_read_b32 v2, v12
-; CHECK-NEXT: ds_read_b32 v1, v7
+; CHECK-NEXT: ds_read_b32 v1, v10
+; CHECK-NEXT: ds_read_b32 v2, v11
+; CHECK-NEXT: ds_read_b32 v3, v12
+; CHECK-NEXT: ds_read_b32 v4, v7
; CHECK-NEXT: ds_read_b32 v5, v8
-; CHECK-NEXT: ds_read_b32 v3, v11
-; CHECK-NEXT: ds_read_b32 v4, v10
; CHECK-NEXT: s_waitcnt lgkmcnt(5)
; CHECK-NEXT: exp mrt0 off, off, off, off
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
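A brief note on the revised wait counts in this hunk: s_waitcnt lgkmcnt(N) blocks until at most N LDS/GDS/constant/message operations remain outstanding, and ds_read results return in issue order, which is what lets the compiler count them. In the new schedule, six ds_read_b32 ops are in flight when the first tbuffer_store_format_xyzw is reached:

  ds_read #1..#4 -> v3, v4, v5, v6   (needed by the xyzw store)
  ds_read #5..#6 -> v0, v1           (needed later by the xy store)
  s_waitcnt lgkmcnt(2)               ; only #5 and #6 may still be pending

The old schedule produced the last component of v[3:6] on the fifth read, so only one read could remain outstanding, hence the previous lgkmcnt(1).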
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index f630ab9a3039c..607ca5727eb0d 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -499,7 +499,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_sdiv24_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -529,7 +529,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_sdiv24_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@@ -671,7 +671,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_sdiv31_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -701,7 +701,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_sdiv31_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@@ -738,7 +738,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_sdiv23_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -768,7 +768,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_sdiv23_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@@ -805,7 +805,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_sdiv25_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -835,7 +835,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_sdiv25_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
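For readers tracing these sdiv23/24/25/31 check updates: the divisor %y of these (i64 addrspace(1)*, i64, i64) kernels sits at dword offset 0xd in the kernarg segment, so the full 64-bit value is now fetched with one s_load_dwordx2 at 0xd (high half landing in s1) rather than a lone s_load_dword of the high dword at 0xe. A hypothetical reduced source for the 24-bit case, assuming the shift-based narrowing these tests typically use:

define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
  ; Keep only the top 24 bits of each operand, sign-extended; the
  ; divide then depends only on the high dwords of %x and %y.
  %x.shr = ashr i64 %x, 40
  %y.shr = ashr i64 %y, 40
  %result = sdiv i64 %x.shr, %y.shr
  store i64 %result, i64 addrspace(1)* %out
  ret void
}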
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index a513d41caeb79..74177c4394317 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -480,7 +480,7 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_srem23_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -512,7 +512,7 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_srem23_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@@ -551,7 +551,7 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_srem24_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -583,7 +583,7 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_srem24_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@@ -676,7 +676,7 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_srem25_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -708,7 +708,7 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_srem25_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@@ -747,7 +747,7 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-LABEL: s_test_srem31_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -779,7 +779,7 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64
; GCN-IR-LABEL: s_test_srem31_64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 3500090e8455e..ac6f11edc12ce 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1297,37 +1297,37 @@ define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readon
; GFX9-LABEL: fma_shuffle:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9]
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fma_shuffle:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9]
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
-; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
entry:
%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()