[llvm-branch-commits] [llvm] [AMDGPU] Change SGPR layout to striped caller/callee saved (PR #127353)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Feb 15 15:12:48 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Shilei Tian (shiltian)
<details>
<summary>Changes</summary>
This PR updates the SGPR layout to a striped caller/callee-saved design, similar
to the VGPR layout. The stripe width is set to 8.
Fixes #113782.
---
Patch is 2.57 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127353.diff
60 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td (+5-1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll (+145-145)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+90-245)
- (modified) llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll (+203-201)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll (+73-140)
- (modified) llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/call-args-inreg.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+1256-1256)
- (modified) llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll (+20-14)
- (modified) llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll (+788-1549)
- (modified) llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir (+4-6)
- (modified) llvm/test/CodeGen/AMDGPU/ds_read2.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir (+26-27)
- (modified) llvm/test/CodeGen/AMDGPU/function-args-inreg.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/function-resource-usage.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll (+66-2)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+80-208)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+1834-1834)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+1554-1554)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+1554-1554)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+1834-1834)
- (modified) llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir (+64-62)
- (modified) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll (+55-91)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-call.ll (+492-748)
- (modified) llvm/test/CodeGen/AMDGPU/issue48473.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll (+6-39)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll (+18-63)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll (+6-39)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll (+18-63)
- (modified) llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll (+160-160)
- (modified) llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll (+68-774)
- (modified) llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll (+416-1095)
- (modified) llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir (+28-58)
- (modified) llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir (+17-39)
- (modified) llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir (+9-21)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+223-223)
- (modified) llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir (+120-86)
- (modified) llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/select.f16.ll (+2-13)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll (+672-1568)
- (modified) llvm/test/CodeGen/AMDGPU/sibling-call.ll (+120-120)
- (modified) llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir (+38-17)
- (modified) llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir (+11-27)
- (modified) llvm/test/CodeGen/AMDGPU/spill-sgpr-used-for-exec-copy.mir (+3-8)
- (modified) llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll (+132-264)
- (modified) llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir (+107-93)
- (modified) llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/stack-realign.ll (+7-13)
- (modified) llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll (+189-144)
- (modified) llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll (+106-106)
- (modified) llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir (+25-51)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll (+112-240)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir (+1-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 80969fce3d77f..e3861a7d06c3d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -91,7 +91,11 @@ def CSR_AMDGPU_AGPRs : CalleeSavedRegs<
>;
def CSR_AMDGPU_SGPRs : CalleeSavedRegs<
- (sequence "SGPR%u", 30, 105)
+ (add (sequence "SGPR%u", 30, 37),
+ (sequence "SGPR%u", 46, 53),
+ (sequence "SGPR%u", 62, 69),
+ (sequence "SGPR%u", 78, 85),
+ (sequence "SGPR%u", 94, 105))
>;
def CSR_AMDGPU_SI_Gfx_SGPRs : CalleeSavedRegs<
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index ab2363860af9d..905d0deacab35 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -125,35 +125,35 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-NEXT: v_writelane_b32 v43, s35, 3
; CHECK-NEXT: v_writelane_b32 v43, s36, 4
; CHECK-NEXT: v_writelane_b32 v43, s37, 5
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s46, 6
+; CHECK-NEXT: v_writelane_b32 v43, s47, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
-; CHECK-NEXT: v_writelane_b32 v43, s40, 8
-; CHECK-NEXT: v_writelane_b32 v43, s41, 9
-; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT: v_writelane_b32 v43, s48, 8
+; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s42, 10
+; CHECK-NEXT: v_writelane_b32 v43, s50, 10
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s43, 11
+; CHECK-NEXT: v_writelane_b32 v43, s51, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v1
-; CHECK-NEXT: v_writelane_b32 v43, s44, 12
+; CHECK-NEXT: v_writelane_b32 v43, s52, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
-; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT: v_writelane_b32 v43, s45, 13
+; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
+; CHECK-NEXT: v_writelane_b32 v43, s53, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v2
-; CHECK-NEXT: s_mov_b32 s42, s15
-; CHECK-NEXT: s_mov_b32 s43, s14
-; CHECK-NEXT: s_mov_b32 s44, s13
-; CHECK-NEXT: s_mov_b32 s45, s12
+; CHECK-NEXT: s_mov_b32 s50, s15
+; CHECK-NEXT: s_mov_b32 s51, s14
+; CHECK-NEXT: s_mov_b32 s52, s13
+; CHECK-NEXT: s_mov_b32 s53, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_mov_b64 s[46:47], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41
@@ -161,15 +161,15 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[46:47]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT: s_mov_b32 s12, s45
-; CHECK-NEXT: s_mov_b32 s13, s44
-; CHECK-NEXT: s_mov_b32 s14, s43
-; CHECK-NEXT: s_mov_b32 s15, s42
+; CHECK-NEXT: s_mov_b32 s12, s53
+; CHECK-NEXT: s_mov_b32 s13, s52
+; CHECK-NEXT: s_mov_b32 s14, s51
+; CHECK-NEXT: s_mov_b32 s15, s50
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -179,14 +179,14 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT: v_readlane_b32 s45, v43, 13
-; CHECK-NEXT: v_readlane_b32 s44, v43, 12
-; CHECK-NEXT: v_readlane_b32 s43, v43, 11
-; CHECK-NEXT: v_readlane_b32 s42, v43, 10
-; CHECK-NEXT: v_readlane_b32 s41, v43, 9
-; CHECK-NEXT: v_readlane_b32 s40, v43, 8
-; CHECK-NEXT: v_readlane_b32 s39, v43, 7
-; CHECK-NEXT: v_readlane_b32 s38, v43, 6
+; CHECK-NEXT: v_readlane_b32 s53, v43, 13
+; CHECK-NEXT: v_readlane_b32 s52, v43, 12
+; CHECK-NEXT: v_readlane_b32 s51, v43, 11
+; CHECK-NEXT: v_readlane_b32 s50, v43, 10
+; CHECK-NEXT: v_readlane_b32 s49, v43, 9
+; CHECK-NEXT: v_readlane_b32 s48, v43, 8
+; CHECK-NEXT: v_readlane_b32 s47, v43, 7
+; CHECK-NEXT: v_readlane_b32 s46, v43, 6
; CHECK-NEXT: v_readlane_b32 s37, v43, 5
; CHECK-NEXT: v_readlane_b32 s36, v43, 4
; CHECK-NEXT: v_readlane_b32 s35, v43, 3
@@ -266,34 +266,34 @@ define double @test_powr_fast_f64(double %x, double %y) {
; CHECK-NEXT: v_writelane_b32 v43, s35, 3
; CHECK-NEXT: v_writelane_b32 v43, s36, 4
; CHECK-NEXT: v_writelane_b32 v43, s37, 5
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s46, 6
+; CHECK-NEXT: v_writelane_b32 v43, s47, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
-; CHECK-NEXT: v_writelane_b32 v43, s40, 8
-; CHECK-NEXT: v_writelane_b32 v43, s41, 9
-; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT: v_writelane_b32 v43, s48, 8
+; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s42, 10
-; CHECK-NEXT: v_writelane_b32 v43, s43, 11
-; CHECK-NEXT: v_writelane_b32 v43, s44, 12
-; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: v_writelane_b32 v43, s50, 10
+; CHECK-NEXT: v_writelane_b32 v43, s51, 11
+; CHECK-NEXT: v_writelane_b32 v43, s52, 12
+; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s45, 13
+; CHECK-NEXT: v_writelane_b32 v43, s53, 13
; CHECK-NEXT: v_mov_b32_e32 v42, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v3
; CHECK-NEXT: v_mov_b32_e32 v40, v2
-; CHECK-NEXT: s_mov_b32 s42, s15
-; CHECK-NEXT: s_mov_b32 s43, s14
-; CHECK-NEXT: s_mov_b32 s44, s13
-; CHECK-NEXT: s_mov_b32 s45, s12
+; CHECK-NEXT: s_mov_b32 s50, s15
+; CHECK-NEXT: s_mov_b32 s51, s14
+; CHECK-NEXT: s_mov_b32 s52, s13
+; CHECK-NEXT: s_mov_b32 s53, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_mov_b64 s[46:47], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mul_f64 v[0:1], v[40:41], v[0:1]
@@ -301,28 +301,28 @@ define double @test_powr_fast_f64(double %x, double %y) {
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[46:47]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT: s_mov_b32 s12, s45
-; CHECK-NEXT: s_mov_b32 s13, s44
-; CHECK-NEXT: s_mov_b32 s14, s43
-; CHECK-NEXT: s_mov_b32 s15, s42
+; CHECK-NEXT: s_mov_b32 s12, s53
+; CHECK-NEXT: s_mov_b32 s13, s52
+; CHECK-NEXT: s_mov_b32 s14, s51
+; CHECK-NEXT: s_mov_b32 s15, s50
; CHECK-NEXT: v_mov_b32_e32 v31, v42
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT: v_readlane_b32 s45, v43, 13
-; CHECK-NEXT: v_readlane_b32 s44, v43, 12
-; CHECK-NEXT: v_readlane_b32 s43, v43, 11
-; CHECK-NEXT: v_readlane_b32 s42, v43, 10
-; CHECK-NEXT: v_readlane_b32 s41, v43, 9
-; CHECK-NEXT: v_readlane_b32 s40, v43, 8
-; CHECK-NEXT: v_readlane_b32 s39, v43, 7
-; CHECK-NEXT: v_readlane_b32 s38, v43, 6
+; CHECK-NEXT: v_readlane_b32 s53, v43, 13
+; CHECK-NEXT: v_readlane_b32 s52, v43, 12
+; CHECK-NEXT: v_readlane_b32 s51, v43, 11
+; CHECK-NEXT: v_readlane_b32 s50, v43, 10
+; CHECK-NEXT: v_readlane_b32 s49, v43, 9
+; CHECK-NEXT: v_readlane_b32 s48, v43, 8
+; CHECK-NEXT: v_readlane_b32 s47, v43, 7
+; CHECK-NEXT: v_readlane_b32 s46, v43, 6
; CHECK-NEXT: v_readlane_b32 s37, v43, 5
; CHECK-NEXT: v_readlane_b32 s36, v43, 4
; CHECK-NEXT: v_readlane_b32 s35, v43, 3
@@ -409,35 +409,35 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
; CHECK-NEXT: v_writelane_b32 v43, s35, 3
; CHECK-NEXT: v_writelane_b32 v43, s36, 4
; CHECK-NEXT: v_writelane_b32 v43, s37, 5
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s46, 6
+; CHECK-NEXT: v_writelane_b32 v43, s47, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
-; CHECK-NEXT: v_writelane_b32 v43, s40, 8
-; CHECK-NEXT: v_writelane_b32 v43, s41, 9
-; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT: v_writelane_b32 v43, s48, 8
+; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s42, 10
+; CHECK-NEXT: v_writelane_b32 v43, s50, 10
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s43, 11
+; CHECK-NEXT: v_writelane_b32 v43, s51, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v1
-; CHECK-NEXT: v_writelane_b32 v43, s44, 12
+; CHECK-NEXT: v_writelane_b32 v43, s52, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
-; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT: v_writelane_b32 v43, s45, 13
+; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
+; CHECK-NEXT: v_writelane_b32 v43, s53, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v2
-; CHECK-NEXT: s_mov_b32 s42, s15
-; CHECK-NEXT: s_mov_b32 s43, s14
-; CHECK-NEXT: s_mov_b32 s44, s13
-; CHECK-NEXT: s_mov_b32 s45, s12
+; CHECK-NEXT: s_mov_b32 s50, s15
+; CHECK-NEXT: s_mov_b32 s51, s14
+; CHECK-NEXT: s_mov_b32 s52, s13
+; CHECK-NEXT: s_mov_b32 s53, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_mov_b64 s[46:47], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41
@@ -445,15 +445,15 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[46:47]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT: s_mov_b32 s12, s45
-; CHECK-NEXT: s_mov_b32 s13, s44
-; CHECK-NEXT: s_mov_b32 s14, s43
-; CHECK-NEXT: s_mov_b32 s15, s42
+; CHECK-NEXT: s_mov_b32 s12, s53
+; CHECK-NEXT: s_mov_b32 s13, s52
+; CHECK-NEXT: s_mov_b32 s14, s51
+; CHECK-NEXT: s_mov_b32 s15, s50
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -463,14 +463,14 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT: v_readlane_b32 s45, v43, 13
-; CHECK-NEXT: v_readlane_b32 s44, v43, 12
-; CHECK-NEXT: v_readlane_b32 s43, v43, 11
-; CHECK-NEXT: v_readlane_b32 s42, v43, 10
-; CHECK-NEXT: v_readlane_b32 s41, v43, 9
-; CHECK-NEXT: v_readlane_b32 s40, v43, 8
-; CHECK-NEXT: v_readlane_b32 s39, v43, 7
-; CHECK-NEXT: v_readlane_b32 s38, v43, 6
+; CHECK-NEXT: v_readlane_b32 s53, v43, 13
+; CHECK-NEXT: v_readlane_b32 s52, v43, 12
+; CHECK-NEXT: v_readlane_b32 s51, v43, 11
+; CHECK-NEXT: v_readlane_b32 s50, v43, 10
+; CHECK-NEXT: v_readlane_b32 s49, v43, 9
+; CHECK-NEXT: v_readlane_b32 s48, v43, 8
+; CHECK-NEXT: v_readlane_b32 s47, v43, 7
+; CHECK-NEXT: v_readlane_b32 s46, v43, 6
; CHECK-NEXT: v_readlane_b32 s37, v43, 5
; CHECK-NEXT: v_readlane_b32 s36, v43, 4
; CHECK-NEXT: v_readlane_b32 s35, v43, 3
@@ -552,32 +552,32 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
; CHECK-NEXT: v_writelane_b32 v42, s35, 3
; CHECK-NEXT: v_writelane_b32 v42, s36, 4
; CHECK-NEXT: v_writelane_b32 v42, s37, 5
-; CHECK-NEXT: v_writelane_b32 v42, s38, 6
-; CHECK-NEXT: v_writelane_b32 v42, s39, 7
+; CHECK-NEXT: v_writelane_b32 v42, s46, 6
+; CHECK-NEXT: v_writelane_b32 v42, s47, 7
; CHECK-NEXT: s_addk_i32 s32, 0x400
-; CHECK-NEXT: v_writelane_b32 v42, s40, 8
-; CHECK-NEXT: v_writelane_b32 v42, s41, 9
-; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT: v_writelane_b32 v42, s48, 8
+; CHECK-NEXT: v_writelane_b32 v42, s49, 9
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v42, s42, 10
-; CHECK-NEXT: v_writelane_b32 v42, s43, 11
-; CHECK-NEXT: v_writelane_b32 v42, s44, 12
+; CHECK-NEXT: v_writelane_b32 v42, s50, 10
+; CHECK-NEXT: v_writelane_b32 v42, s51, 11
+; CHECK-NEXT: v_writelane_b32 v42, s52, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v42, s45, 13
+; CHECK-NEXT: v_writelane_b32 v42, s53, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
-; CHECK-NEXT: s_mov_b32 s42, s15
-; CHECK-NEXT: s_mov_b32 s43, s14
-; CHECK-NEXT: s_mov_b32 s44, s13
-; CHECK-NEXT: s_mov_b32 s45, s12
+; CHECK-NEXT: s_mov_b32 s50, s15
+; CHECK-NEXT: s_mov_b32 s51, s14
+; CHECK-NEXT: s_mov_b32 s52, s13
+; CHECK-NEXT: s_mov_b32 s53, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_mov_b64 s[46:47], s[6:7]
; CHECK-NEXT: v_lshlrev_b32_e32 v41, 1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -586,28 +586,28 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[46:47]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT: s_mov_b32 s12, s45
-; CHECK-NEXT: s_mov_b32 s13, s44
-; CHECK-NEXT: s_mov_b32 s14, s43
-; CHECK-NEXT: s_mov_b32 s15, s42
+; CHECK-NEXT: s_mov_b32 s12, s53
+; CHECK-NEXT: s_mov_b32 s13, s52
+; CHECK-NEXT: s_mov_b32 s14, s51
+; CHECK-NEXT: s_mov_b32 s15, s50
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT: v_readlane_b32 s45, v42, 13
-; CHECK-NEXT: v_readlane_b32 s44, v42, 12
-; CHECK-NEXT: v_readlane_b32 s43, v42, 11
-; CHECK-NEXT: v_readlane_b32 s42, v42, 10
-; CHECK-NEXT: v_readlane_b32 s41, v42, 9
-; CHECK-NEXT: v_readlane_b32 s40, v42, 8
-; CHECK-NEXT: v_readlane_b32 s39, v42, 7
-; CHECK-NEXT: v_readlane_b32 s38, v42, 6
+; CHECK-NEXT: v_readlane_b32 s53, v42, 13
+; CHECK-NEXT: v_readlane_b32 s52, v42, 12
+; CHECK-NEXT: v_readlane_b32 s51, v42, 11
+; CHECK-NEXT: v_readlane_b32 s50, v42, 10
+; CHECK-NEXT: v_readlane_b32 s49, v42, 9
+; CHECK-NEXT: v_readlane_b32 s48, v42, 8
+; CHECK-NEXT: v_readlane_b32 s47, v42, 7
+; CHECK-NEXT: v_readlane_b32 s46, v42, 6
; CHECK-NEXT: v_readlane_b32 s37, v42, 5
; CHECK-NEXT: v_readlane_b32 s36, v42, 4
; CHECK-NEXT: v_readlane_b32 s35, v42, 3
@@ -694,34 +694,34 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
; CHECK-NEXT: v_writelane_b32 v43, s35, 3
; CHECK-NEXT: v_writelane_b32 v43, s36, 4
; CHECK-NEXT: v_writelane_b32 v43, s37, 5
-; CHECK-NEXT: v_writelane_b32 v43, s38, 6
-; CHECK-NEXT: v_writelane_b32 v43, s39, 7
+; CHECK-NEXT: v_writelane_b32 v43, s46, 6
+; CHECK-NEXT: v_writelane_b32 v43, s47, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
-; CHECK-NEXT: v_writelane_b32 v43, s40, 8
-; CHECK-NEXT: v_writelane_b32 v43, s41, 9
-; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
+; CHECK-NEXT: v_writelane_b32 v43, s48, 8
+; CHECK-NEXT: v_writelane_b32 v43, s49, 9
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v43, s42, 10
+; CHECK-NEXT: v_writelane_b32 v43, s50, 10
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: v_writelane_b32 v43, s43, 11
+; CHECK-NEXT: v_writelane_b32 v43, s51, 11
; CHECK-NEXT: v_mov_b32_e32 v41, v1
-; CHECK-NEXT: ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/127353
More information about the llvm-branch-commits
mailing list