[llvm] [AMDGPU][GlobalISel] Make `selectVOP3PMadMixModsImpl` same as the SelectionDAG counterpart (PR #110168)

Thu Sep 26 13:45:30 PDT 2024

https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/110168

The current `selectVOP3PMadMixModsImpl` can produce `V_MAD_FIX_F32` instruction
that violates constant bus restriction, while its `SelectionDAG` counterpart
doesn't. The culprit is in the copy stripping while the SelectionDAG version
only has a bitcast stripping. This PR simply aligns the two version.

>From 4e5ee8a5c59fdee13570064601e70a3b7f80bacb Mon Sep 17 00:00:00 2001
From: Shilei Tian <shilei.tian at amd.com>
Date: Thu, 26 Sep 2024 16:45:11 -0400
Subject: [PATCH] [AMDGPU][GlobalISel] Make `selectVOP3PMadMixModsImpl` same as
 the SelectionDAG counterpart

The current `selectVOP3PMadMixModsImpl` can produce `V_MAD_FIX_F32` instruction
that violates constant bus restriction, while its `SelectionDAG` counterpart
doesn't. The culprit is in the copy stripping while the SelectionDAG version
only has a bitcast stripping. This PR simply aligns the two version.
---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  17 +--
 .../GlobalISel/combine-fma-add-ext-mul.ll     |  20 +++-
 .../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll     |  10 +-
 .../madmix-constant-bus-violation.mir         | 110 ++++++++++++++++++
 4 files changed, 135 insertions(+), 22 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index febf0711c7d574..c3df962d448f94 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5312,26 +5312,20 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
     // Only change Src if src modifier could be gained. In such cases new Src
     // could be sgpr but this does not violate constant bus restriction for
     // instruction that is being selected.
-    // Note: Src is not changed when there is only a simple sgpr to vgpr copy
-    // since this could violate constant bus restriction.
-    Register PeekSrc = stripCopy(Src, *MRI);
+    Src = stripBitCast(Src, *MRI);
 
     const auto CheckAbsNeg = [&]() {
       // Be careful about folding modifiers if we already have an abs. fneg is
       // applied last, so we don't want to apply an earlier fneg.
       if ((Mods & SISrcMods::ABS) == 0) {
         unsigned ModsTmp;
-        std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
+        std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
 
-        if ((ModsTmp & SISrcMods::NEG) != 0) {
+        if ((ModsTmp & SISrcMods::NEG) != 0)
           Mods ^= SISrcMods::NEG;
-          Src = PeekSrc;
-        }
 
-        if ((ModsTmp & SISrcMods::ABS) != 0) {
+        if ((ModsTmp & SISrcMods::ABS) != 0)
           Mods |= SISrcMods::ABS;
-          Src = PeekSrc;
-        }
       }
     };
 
@@ -5344,8 +5338,7 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
 
     Mods |= SISrcMods::OP_SEL_1;
 
-    if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
-      Src = PeekSrc;
+    if (isExtractHiElt(*MRI, Src, Src)) {
       Mods |= SISrcMods::OP_SEL_0;
       CheckAbsNeg();
     }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
index 4ebe1c499a1769..4d603f7487754a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
@@ -73,10 +73,14 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
 ; GFX10-FAST-DENORM-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX10-FAST-DENORM-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX10-FAST-DENORM-NEXT:    v_mov_b32_e32 v4, s10
+; GFX10-FAST-DENORM-NEXT:    s_lshr_b32 s11, s0, 16
+; GFX10-FAST-DENORM-NEXT:    s_lshr_b32 s12, s1, 16
+; GFX10-FAST-DENORM-NEXT:    s_lshr_b32 s13, s3, 16
+; GFX10-FAST-DENORM-NEXT:    s_lshr_b32 s14, s4, 16
 ; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v1, s11, s13, v1 op_sel_hi:[1,1,0]
 ; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v3, s12, s14, v3 op_sel_hi:[1,1,0]
 ; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
 ; GFX10-FAST-DENORM-NEXT:    ; return to shader part epilog
 .entry:
@@ -117,12 +121,18 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
 ; GFX10-FAST-DENORM-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX10-FAST-DENORM-NEXT:    v_mov_b32_e32 v4, s10
 ; GFX10-FAST-DENORM-NEXT:    v_mov_b32_e32 v5, s11
+; GFX10-FAST-DENORM-NEXT:    s_lshr_b32 s12, s0, 16
+; GFX10-FAST-DENORM-NEXT:    s_lshr_b32 s13, s1, 16
+; GFX10-FAST-DENORM-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX10-FAST-DENORM-NEXT:    s_lshr_b32 s14, s3, 16
+; GFX10-FAST-DENORM-NEXT:    s_lshr_b32 s15, s4, 16
+; GFX10-FAST-DENORM-NEXT:    s_lshr_b32 s16, s5, 16
 ; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v1, s12, s14, v1 op_sel_hi:[1,1,0]
 ; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v3, s13, s15, v3 op_sel_hi:[1,1,0]
 ; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v5, s2, s5, v5 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT:    v_fma_mix_f32 v5, s6, s16, v5 op_sel_hi:[1,1,0]
 ; GFX10-FAST-DENORM-NEXT:    ; return to shader part epilog
 .entry:
     %a = fmul fast <6 x half> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 89cd18ad9be70b..1a98285230b2cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -2555,9 +2555,9 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, v2, s0
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, s2, v2
+; GFX9-FLUSH-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, s3
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-FLUSH-NEXT:    ; return to shader part epilog
@@ -2571,7 +2571,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
 ; GFX10-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-NEXT:    v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v0, s1, s0
 ; GFX10-NEXT:    v_div_fixup_f16 v1, v1, s2, s3
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
@@ -2588,7 +2588,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT:    v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v0, s1, s0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v1, s2, s3
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir
new file mode 100644
index 00000000000000..ba9cfb8e1d68ed
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir
@@ -0,0 +1,110 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select,machineverifier -o - %s | FileCheck -check-prefixes=GFX9 %s
+
+---
+name: s_fdiv_v2f16
+legalized: true
+regBankSelected: true
+machineFunctionInfo:
+  mode:
+    fp32-output-denormals: false
+    fp32-input-denormals: false
+body: |
+  bb.0:
+    ; GFX9-LABEL: name: s_fdiv_v2f16
+    ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+    ; GFX9-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], [[S_MOV_B32_]], implicit-def dead $scc
+    ; GFX9-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def dead $scc
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+    ; GFX9-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+    ; GFX9-NEXT: [[V_CVT_F32_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[V_CVT_F32_F16_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_CVT_F32_F16_e64_]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY3]], 0, [[V_MUL_F32_e64_]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[V_MAD_MIX_F32_]], 0, [[V_RCP_F32_e64_]], 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MAD_MIX_F32_1:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY3]], 0, [[V_MAC_F32_e64_]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_MAD_MIX_F32_1]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+    ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_MUL_F32_e64_1]], [[COPY4]], implicit $exec
+    ; GFX9-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_AND_B32_e64_]], 0, [[V_MAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_ADD_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+    ; GFX9-NEXT: [[V_DIV_FIXUP_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_DIV_FIXUP_F16_gfx9_e64 0, [[V_CVT_F16_F32_e64_]], 0, [[COPY5]], 0, [[COPY6]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]]
+    ; GFX9-NEXT: [[V_CVT_F32_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]]
+    ; GFX9-NEXT: [[V_CVT_F32_F16_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY8]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_RCP_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[V_CVT_F32_F16_e64_3]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_CVT_F32_F16_e64_2]], 0, [[V_RCP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MAD_MIX_F32_2:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY8]], 0, [[V_MUL_F32_e64_2]], 8, [[COPY7]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MAC_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[V_MAD_MIX_F32_2]], 0, [[V_RCP_F32_e64_1]], 0, [[V_MUL_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MAD_MIX_F32_3:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY8]], 0, [[V_MAC_F32_e64_1]], 8, [[COPY7]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_MAD_MIX_F32_3]], 0, [[V_RCP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+    ; GFX9-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_MUL_F32_e64_3]], [[COPY9]], implicit $exec
+    ; GFX9-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_AND_B32_e64_1]], 0, [[V_MAC_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_CVT_F16_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_ADD_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]]
+    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]]
+    ; GFX9-NEXT: [[V_DIV_FIXUP_F16_gfx9_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_DIV_FIXUP_F16_gfx9_e64 0, [[V_CVT_F16_F32_e64_1]], 0, [[COPY10]], 0, [[COPY11]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, [[V_DIV_FIXUP_F16_gfx9_e64_]], 0, [[V_DIV_FIXUP_F16_gfx9_e64_1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_PACK_B32_F16_e64_]], implicit $exec
+    ; GFX9-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+    ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+    %0:sgpr(s32) = COPY $sgpr0
+    %1:sgpr(s32) = COPY $sgpr1
+    %2:sgpr(s16) = G_TRUNC %0:sgpr(s32)
+    %3:sgpr(s32) = G_CONSTANT i32 16
+    %4:sgpr(s32) = G_LSHR %0:sgpr, %3:sgpr(s32)
+    %5:sgpr(s16) = G_TRUNC %4:sgpr(s32)
+    %6:sgpr(s16) = G_TRUNC %1:sgpr(s32)
+    %7:sgpr(s32) = G_LSHR %1:sgpr, %3:sgpr(s32)
+    %8:sgpr(s16) = G_TRUNC %7:sgpr(s32)
+    %9:vgpr(s16) = COPY %2:sgpr(s16)
+    %10:vgpr(s32) = G_FPEXT %9:vgpr(s16)
+    %11:vgpr(s16) = COPY %6:sgpr(s16)
+    %12:vgpr(s32) = G_FPEXT %11:vgpr(s16)
+    %13:vgpr(s32) = G_FNEG %12:vgpr
+    %14:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %12:vgpr(s32)
+    %15:vgpr(s32) = G_FMUL %10:vgpr, %14:vgpr
+    %16:vgpr(s32) = G_FMAD %13:vgpr, %15:vgpr, %10:vgpr
+    %17:vgpr(s32) = G_FMAD %16:vgpr, %14:vgpr, %15:vgpr
+    %18:vgpr(s32) = G_FMAD %13:vgpr, %17:vgpr, %10:vgpr
+    %19:vgpr(s32) = G_FMUL %18:vgpr, %14:vgpr
+    %20:sgpr(s32) = G_CONSTANT i32 -8388608
+    %21:vgpr(s32) = COPY %20:sgpr(s32)
+    %22:vgpr(s32) = G_AND %19:vgpr, %21:vgpr
+    %23:vgpr(s32) = G_FADD %22:vgpr, %17:vgpr
+    %24:vgpr(s16) = G_FPTRUNC %23:vgpr(s32)
+    %25:vgpr(s16) = COPY %6:sgpr(s16)
+    %26:vgpr(s16) = COPY %2:sgpr(s16)
+    %27:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), %24:vgpr(s16), %25:vgpr(s16), %26:vgpr(s16)
+    %28:vgpr(s16) = COPY %5:sgpr(s16)
+    %29:vgpr(s32) = G_FPEXT %28:vgpr(s16)
+    %30:vgpr(s16) = COPY %8:sgpr(s16)
+    %31:vgpr(s32) = G_FPEXT %30:vgpr(s16)
+    %32:vgpr(s32) = G_FNEG %31:vgpr
+    %33:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %31:vgpr(s32)
+    %34:vgpr(s32) = G_FMUL %29:vgpr, %33:vgpr
+    %35:vgpr(s32) = G_FMAD %32:vgpr, %34:vgpr, %29:vgpr
+    %36:vgpr(s32) = G_FMAD %35:vgpr, %33:vgpr, %34:vgpr
+    %37:vgpr(s32) = G_FMAD %32:vgpr, %36:vgpr, %29:vgpr
+    %38:vgpr(s32) = G_FMUL %37:vgpr, %33:vgpr
+    %39:vgpr(s32) = COPY %20:sgpr(s32)
+    %40:vgpr(s32) = G_AND %38:vgpr, %39:vgpr
+    %41:vgpr(s32) = G_FADD %40:vgpr, %36:vgpr
+    %42:vgpr(s16) = G_FPTRUNC %41:vgpr(s32)
+    %43:vgpr(s16) = COPY %8:sgpr(s16)
+    %44:vgpr(s16) = COPY %5:sgpr(s16)
+    %45:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), %42:vgpr(s16), %43:vgpr(s16), %44:vgpr(s16)
+    %46:vgpr(<2 x s16>) = G_BUILD_VECTOR %27:vgpr(s16), %45:vgpr(s16)
+    %47:vgpr(s32) = G_BITCAST %46:vgpr(<2 x s16>)
+    %48:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %47:vgpr(s32)
+    $sgpr0 = COPY %48:sgpr(s32)
+    SI_RETURN_TO_EPILOG implicit $sgpr0
+...