[llvm] [AMDGPU][GlobalISel] Align `selectVOP3PMadMixModsImpl` with the `SelectionDAG` counterpart (PR #110168)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 30 11:18:09 PDT 2024
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/110168
>From 0edac2531a62be4fa8eff6de41dfd03c687d4d4c Mon Sep 17 00:00:00 2001
From: Shilei Tian <shilei.tian at amd.com>
Date: Thu, 26 Sep 2024 16:45:11 -0400
Subject: [PATCH] [AMDGPU][GlobalISel] Make `selectVOP3PMadMixModsImpl` the same
as the SelectionDAG counterpart
The current `selectVOP3PMadMixModsImpl` can produce `V_MAD_MIX_F32` instruction
that violates constant bus restriction, while its `SelectionDAG` counterpart
doesn't. The culprit is the copy stripping, whereas the SelectionDAG version
only strips bitcasts. This PR simply aligns the two versions.
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 17 +--
.../GlobalISel/combine-fma-add-ext-mul.ll | 20 ++-
.../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 10 +-
.../fmamix-constant-bus-violation.mir | 42 ++++++
.../madmix-constant-bus-violation.mir | 42 ++++++
.../madmix-constant-bus-violation.s | 141 ++++++++++++++++++
6 files changed, 250 insertions(+), 22 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.s
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index febf0711c7d574..c3df962d448f94 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5312,26 +5312,20 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
// Only change Src if src modifier could be gained. In such cases new Src
// could be sgpr but this does not violate constant bus restriction for
// instruction that is being selected.
- // Note: Src is not changed when there is only a simple sgpr to vgpr copy
- // since this could violate constant bus restriction.
- Register PeekSrc = stripCopy(Src, *MRI);
+ Src = stripBitCast(Src, *MRI);
const auto CheckAbsNeg = [&]() {
// Be careful about folding modifiers if we already have an abs. fneg is
// applied last, so we don't want to apply an earlier fneg.
if ((Mods & SISrcMods::ABS) == 0) {
unsigned ModsTmp;
- std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
+ std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
- if ((ModsTmp & SISrcMods::NEG) != 0) {
+ if ((ModsTmp & SISrcMods::NEG) != 0)
Mods ^= SISrcMods::NEG;
- Src = PeekSrc;
- }
- if ((ModsTmp & SISrcMods::ABS) != 0) {
+ if ((ModsTmp & SISrcMods::ABS) != 0)
Mods |= SISrcMods::ABS;
- Src = PeekSrc;
- }
}
};
@@ -5344,8 +5338,7 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
Mods |= SISrcMods::OP_SEL_1;
- if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
- Src = PeekSrc;
+ if (isExtractHiElt(*MRI, Src, Src)) {
Mods |= SISrcMods::OP_SEL_0;
CheckAbsNeg();
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
index 4ebe1c499a1769..4d603f7487754a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
@@ -73,10 +73,14 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s8
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s11, s0, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s1, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s3, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s4, 16
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s11, s13, v1 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s12, s14, v3 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog
.entry:
@@ -117,12 +121,18 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10
; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v5, s11
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s0, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s1, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s6, s2, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s3, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s15, s4, 16
+; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s16, s5, 16
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s12, s14, v1 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s13, s15, v3 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0]
-; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s2, s5, v5 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s6, s16, v5 op_sel_hi:[1,1,0]
; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul fast <6 x half> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 89cd18ad9be70b..1a98285230b2cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -2555,9 +2555,9 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, s2, v2
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, s3
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-FLUSH-NEXT: ; return to shader part epilog
@@ -2571,7 +2571,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX10-NEXT: v_rcp_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_f32_e32 v1, v1
; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -2588,7 +2588,7 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir
new file mode 100644
index 00000000000000..c3c873a7cf5d79
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.mir
@@ -0,0 +1,42 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -run-pass=instruction-select,machineverifier -o - %s | FileCheck -check-prefixes=GFX9 %s
+
+---
+name: foo
+legalized: true
+regBankSelected: true
+machineFunctionInfo:
+ mode:
+ fp32-output-denormals: false
+ fp32-input-denormals: false
+body: |
+ bb.0:
+ ; GFX9-LABEL: name: foo
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX9-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX9-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]]
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]]
+ ; GFX9-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIX_F32 9, [[COPY3]], 8, [[COPY3]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_FMA_MIX_F32_]], implicit $exec
+ ; GFX9-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_CONSTANT i32 16
+ %3:sgpr(s32) = G_LSHR %0:sgpr, %2:sgpr(s32)
+ %4:sgpr(s16) = G_TRUNC %3:sgpr(s32)
+ %5:sgpr(s32) = G_LSHR %1:sgpr, %2:sgpr(s32)
+ %6:sgpr(s16) = G_TRUNC %5:sgpr(s32)
+ %7:vgpr(s16) = COPY %4:sgpr(s16)
+ %8:vgpr(s32) = G_FPEXT %7:vgpr(s16)
+ %9:vgpr(s16) = COPY %6:sgpr(s16)
+ %10:vgpr(s32) = G_FPEXT %9:vgpr(s16)
+ %11:vgpr(s32) = G_FNEG %10:vgpr
+ %13:vgpr(s32) = G_FMA %11:vgpr, %10:vgpr, %8:vgpr
+ %14:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %13:vgpr(s32)
+ $sgpr0 = COPY %14:sgpr(s32)
+ SI_RETURN_TO_EPILOG implicit $sgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir
new file mode 100644
index 00000000000000..ad234402f2bc1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir
@@ -0,0 +1,42 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select,machineverifier -o - %s | FileCheck -check-prefixes=GFX9 %s
+
+---
+name: foo
+legalized: true
+regBankSelected: true
+machineFunctionInfo:
+ mode:
+ fp32-output-denormals: false
+ fp32-input-denormals: false
+body: |
+ bb.0:
+ ; GFX9-LABEL: name: foo
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX9-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX9-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_]]
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LSHR_B32_1]]
+ ; GFX9-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 9, [[COPY3]], 8, [[COPY3]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_MAD_MIX_F32_]], implicit $exec
+ ; GFX9-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_CONSTANT i32 16
+ %3:sgpr(s32) = G_LSHR %0:sgpr, %2:sgpr(s32)
+ %4:sgpr(s16) = G_TRUNC %3:sgpr(s32)
+ %5:sgpr(s32) = G_LSHR %1:sgpr, %2:sgpr(s32)
+ %6:sgpr(s16) = G_TRUNC %5:sgpr(s32)
+ %7:vgpr(s16) = COPY %4:sgpr(s16)
+ %8:vgpr(s32) = G_FPEXT %7:vgpr(s16)
+ %9:vgpr(s16) = COPY %6:sgpr(s16)
+ %10:vgpr(s32) = G_FPEXT %9:vgpr(s16)
+ %11:vgpr(s32) = G_FNEG %10:vgpr
+ %13:vgpr(s32) = G_FMAD %11:vgpr, %10:vgpr, %8:vgpr
+ %14:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %13:vgpr(s32)
+ $sgpr0 = COPY %14:sgpr(s32)
+ SI_RETURN_TO_EPILOG implicit $sgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.s b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.s
new file mode 100644
index 00000000000000..476a6faec328c9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.s
@@ -0,0 +1,141 @@
+--- |
+ ; ModuleID = '/home/shiltian/Documents/vscode/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir'
+ source_filename = "/home/shiltian/Documents/vscode/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.mir"
+ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+ target triple = "amdgcn"
+
+ define void @foo() #0 {
+ entry:
+ unreachable
+ }
+
+ attributes #0 = { "target-cpu"="gfx900" }
+
+...
+---
+name: foo
+alignment: 1
+exposesReturnsTwice: false
+legalized: true
+regBankSelected: true
+selected: true
+failedISel: false
+tracksRegLiveness: false
+hasWinCFI: false
+noPhis: true
+isSSA: true
+noVRegs: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: sreg_32, preferred-register: '' }
+ - { id: 1, class: sreg_32, preferred-register: '' }
+ - { id: 2, class: sreg_32, preferred-register: '' }
+ - { id: 3, class: sreg_32, preferred-register: '' }
+ - { id: 4, class: sreg_32, preferred-register: '' }
+ - { id: 5, class: sreg_32, preferred-register: '' }
+ - { id: 6, class: sreg_32, preferred-register: '' }
+ - { id: 7, class: vgpr_32, preferred-register: '' }
+ - { id: 8, class: vgpr, preferred-register: '' }
+ - { id: 9, class: vgpr_32, preferred-register: '' }
+ - { id: 10, class: vgpr, preferred-register: '' }
+ - { id: 11, class: vgpr, preferred-register: '' }
+ - { id: 12, class: vgpr_32, preferred-register: '' }
+ - { id: 13, class: sreg_32, preferred-register: '' }
+liveins: []
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 0
+ maxKernArgAlign: 1
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: false
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ dispatchPtr: { reg: '$sgpr4_sgpr5' }
+ queuePtr: { reg: '$sgpr6_sgpr7' }
+ dispatchID: { reg: '$sgpr10_sgpr11' }
+ workGroupIDX: { reg: '$sgpr12' }
+ workGroupIDY: { reg: '$sgpr13' }
+ workGroupIDZ: { reg: '$sgpr14' }
+ LDSKernelId: { reg: '$sgpr15' }
+ implicitArgPtr: { reg: '$sgpr8_sgpr9' }
+ workItemIDX: { reg: '$vgpr31', mask: 1023 }
+ workItemIDY: { reg: '$vgpr31', mask: 1047552 }
+ workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
+ psInputAddr: 0
+ psInputEnable: 0
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: false
+ fp32-output-denormals: false
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 8
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+body: |
+ bb.0:
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = COPY $sgpr1
+ %2:sreg_32 = S_MOV_B32 16
+ %3:sreg_32 = S_LSHR_B32 %0, %2, implicit-def dead $scc
+ %5:sreg_32 = S_LSHR_B32 %1, %2, implicit-def dead $scc
+ %7:vgpr_32 = COPY %3
+ %9:vgpr_32 = COPY %5
+ %12:vgpr_32 = V_MAD_MIX_F32 9, %9, 8, %9, 8, %7, 0, 0, 0, implicit $mode, implicit $exec
+ %13:sreg_32 = V_READFIRSTLANE_B32 %12, implicit $exec
+ $sgpr0 = COPY %13
+ SI_RETURN_TO_EPILOG implicit $sgpr0
+
+...
More information about the llvm-commits
mailing list