[llvm] 08da7ac - [AMDGPU] Fix broken sign-extended subword buffer load combine (#77470)
Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 10 02:50:16 PST 2024
Author: Jay Foad
Date: 2024-01-10T10:50:13Z
New Revision: 08da7ac80c165dbae0cb71257b3cdcd8a1006a76
URL: https://github.com/llvm/llvm-project/commit/08da7ac80c165dbae0cb71257b3cdcd8a1006a76
DIFF: https://github.com/llvm/llvm-project/commit/08da7ac80c165dbae0cb71257b3cdcd8a1006a76.diff
LOG: [AMDGPU] Fix broken sign-extended subword buffer load combine (#77470)
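The old combine folded a G_SEXT_INREG into a preceding G_AMDGPU_BUFFER_LOAD_UBYTE
or G_AMDGPU_BUFFER_LOAD_USHORT without checking the sign-extension width, so a
narrower extension (for example sign-extending only the low 4 bits of a byte
load) was wrongly replaced by a signed subword load. The match now records the
load and the replacement opcode in the match data and only fires when the width
equals the load width (8 or 16). As a standalone plain-C++ illustration, not
LLVM code and not part of this change, of why the widths have to agree:

  #include <cstdint>
  #include <cstdio>

  // Sign-extend the low 'Width' bits of 'Value' to 32 bits.
  static int32_t signExtendFrom(uint32_t Value, unsigned Width) {
    uint32_t SignBit = 1u << (Width - 1);
    return int32_t((Value & ((1u << Width) - 1)) ^ SignBit) - int32_t(SignBit);
  }

  int main() {
    uint32_t Loaded = 0x78; // byte returned by an unsigned subword load
    // Folding into a signed byte load computes a sign extension from 8 bits,
    // but the IR in the new tests asks for one from 4 bits; the results
    // differ, so the fold is only valid when the widths agree.
    printf("sext from 8 bits: %d\n", signExtendFrom(Loaded, 8)); // 120
    printf("sext from 4 bits: %d\n", signExtendFrom(Loaded, 4)); // -8
    return 0;
  }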
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUCombine.td
llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 0c77fe72595880..b9411e2052120d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -111,7 +111,7 @@ def smulu64 : GICombineRule<
[{ return matchCombine_s_mul_u64(*${smul}, ${matchinfo}); }]),
(apply [{ applyCombine_s_mul_u64(*${smul}, ${matchinfo}); }])>;
-def sign_exension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">;
+def sign_exension_in_reg_matchdata : GIDefMatchData<"std::pair<MachineInstr *, unsigned>">;
def sign_extension_in_reg : GICombineRule<
(defs root:$sign_inreg, sign_exension_in_reg_matchdata:$matchinfo),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 21bfab52c6c4b5..bb1d6cb72e8071 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -99,10 +99,10 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
// Combine unsigned buffer load and signed extension instructions to generate
// signed buffer load instructions.
- bool matchCombineSignExtendInReg(MachineInstr &MI,
- MachineInstr *&MatchInfo) const;
- void applyCombineSignExtendInReg(MachineInstr &MI,
- MachineInstr *&MatchInfo) const;
+ bool matchCombineSignExtendInReg(
+ MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
+ void applyCombineSignExtendInReg(
+ MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
// Find the s_mul_u64 instructions where the higher bits are either
// zero-extended or sign-extended.
@@ -395,34 +395,36 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
// Identify buffer_load_{u8, u16}.
bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
- MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
- Register Op0Reg = MI.getOperand(1).getReg();
- SubwordBufferLoad = MRI.getVRegDef(Op0Reg);
-
- if (!MRI.hasOneNonDBGUse(Op0Reg))
+ MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
+ Register LoadReg = MI.getOperand(1).getReg();
+ if (!MRI.hasOneNonDBGUse(LoadReg))
return false;
// Check if the first operand of the sign extension is a subword buffer load
// instruction.
- return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE ||
- SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+ MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);
+ int64_t Width = MI.getOperand(2).getImm();
+ switch (LoadMI->getOpcode()) {
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+ MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
+ return Width == 8;
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+ MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
+ return Width == 16;
+ }
+ return false;
}
// Combine buffer_load_{u8, u16} and the sign extension instruction to generate
// buffer_load_{i8, i16}.
void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
- MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
- // Modify the opcode and the destination of buffer_load_{u8, u16}:
- // Replace the opcode.
- unsigned Opc =
- SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE
- ? AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE
- : AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
- SubwordBufferLoad->setDesc(TII.get(Opc));
- // Update the destination register of SubwordBufferLoad with the destination
- // register of the sign extension.
+ MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
+ auto [LoadMI, NewOpcode] = MatchData;
+ LoadMI->setDesc(TII.get(NewOpcode));
+ // Update the destination register of the load with the destination register
+ // of the sign extension.
Register SignExtendInsnDst = MI.getOperand(0).getReg();
- SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst);
+ LoadMI->getOperand(0).setReg(SignExtendInsnDst);
// Remove the sign extension.
MI.eraseFromParent();
}
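A minimal sketch of the shape of the new match predicate, using simplified
standalone types rather than the real MachineInstr/MachineRegisterInfo API:
the extension width has to equal the load width, and the signed opcode is
recorded in the match data for the apply step.

  #include <cstdint>
  #include <utility>

  // Stand-ins for the generic AMDGPU buffer-load opcodes involved here.
  enum Opcode { LOAD_UBYTE, LOAD_USHORT, LOAD_SBYTE, LOAD_SSHORT };
  struct LoadInsn { Opcode Opc; };

  // Returns true only when the G_SEXT_INREG width covers the whole loaded
  // value; otherwise the separate sign extension has to be kept.
  static bool matchSignExtendInReg(LoadInsn &Load, int64_t Width,
                                   std::pair<LoadInsn *, Opcode> &MatchData) {
    switch (Load.Opc) {
    case LOAD_UBYTE:
      MatchData = {&Load, LOAD_SBYTE};
      return Width == 8;
    case LOAD_USHORT:
      MatchData = {&Load, LOAD_SSHORT};
      return Width == 16;
    default:
      return false;
    }
  }

  int main() {
    LoadInsn L{LOAD_UBYTE};
    std::pair<LoadInsn *, Opcode> MD;
    bool Full = matchSignExtendInReg(L, 8, MD);    // true: widths agree
    bool Partial = matchSignExtendInReg(L, 4, MD); // false: 4-bit sext of a byte load
    return (Full && !Partial) ? 0 : 1;
  }

Carrying the replacement opcode in the match data (hence the matchdata type
change to std::pair in AMDGPUCombine.td) keeps the apply step trivial: it no
longer re-derives the signed opcode from the load it is rewriting.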
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
index 81c0f7557e6417..94ce8aac8a4c68 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
@@ -500,6 +500,49 @@ define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr
ret float %cast
}
+define amdgpu_ps float @struct_buffer_load_i8_sext_wrong_width(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+ ; GFX8-LABEL: name: struct_buffer_load_i8_sext_wrong_width
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX8-NEXT: [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX8-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_BOTHEN]], 0, 4, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+ ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: struct_buffer_load_i8_sext_wrong_width
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX12-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]], 0, 4, implicit $exec
+ ; GFX12-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ %trunc = trunc i8 %val to i4
+ %ext = sext i4 %trunc to i32
+ %cast = bitcast i32 %ext to float
+ ret float %cast
+}
+
define amdgpu_ps float @struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX8-LABEL: name: struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
; GFX8: bb.1 (%ir-block.0):
@@ -580,6 +623,49 @@ define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgp
ret float %cast
}
+define amdgpu_ps float @struct_buffer_load_i16_sext_wrong_width(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+ ; GFX8-LABEL: name: struct_buffer_load_i16_sext_wrong_width
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX8-NEXT: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX8-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_BOTHEN]], 0, 8, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+ ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: struct_buffer_load_i16_sext_wrong_width
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX12-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]], 0, 8, implicit $exec
+ ; GFX12-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ %trunc = trunc i16 %val to i8
+ %ext = sext i8 %trunc to i32
+ %cast = bitcast i32 %ext to float
+ ret float %cast
+}
+
; Natural mapping
define amdgpu_ps half @struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX8-LABEL: name: struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset