[llvm] [AMDGPU] Set total VGPRs to 1536 for gfx12 (PR #96272)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 20 21:19:10 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Mariusz Sikora (mariusz-sikora-at-amd)
<details>
<summary>Changes</summary>
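- Add Feature1_5xVGPRs to FeatureISAVersion12 so that the total number of VGPRs for gfx12 is set to 1536, and update the affected GFX12 CodeGen tests accordingly.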
---
Patch is 126.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96272.diff
7 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll (+48-50)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll (+48-50)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+655-664)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+49-49)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+211-219)
- (modified) llvm/test/CodeGen/AMDGPU/occupancy-levels.ll (+2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index cb5ceb9959325..4b055a43b40d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1572,7 +1572,8 @@ def FeatureISAVersion12 : FeatureSet<
FeatureVGPRSingleUseHintInsts,
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
- FeatureMaxHardClauseLength32]>;
+ FeatureMaxHardClauseLength32,
+ Feature1_5xVGPRs]>;
def FeatureISAVersion12_Generic: FeatureSet<
!listconcat(FeatureISAVersion12.Features,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index 78fb23182f800..e9acbec33f2f3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -2868,74 +2868,72 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_clause 0x1b
+; GFX12-NEXT: s_clause 0x1f
+; GFX12-NEXT: scratch_load_b32 v31, off, s32
; GFX12-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX12-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX12-NEXT: scratch_load_b32 v35, off, s32 offset:16
; GFX12-NEXT: scratch_load_b32 v34, off, s32 offset:12
-; GFX12-NEXT: scratch_load_b32 v31, off, s32
-; GFX12-NEXT: scratch_load_b32 v37, off, s32 offset:120
-; GFX12-NEXT: scratch_load_b32 v39, off, s32 offset:104
-; GFX12-NEXT: scratch_load_b32 v49, off, s32 offset:24
-; GFX12-NEXT: scratch_load_b32 v48, off, s32 offset:20
-; GFX12-NEXT: scratch_load_b32 v51, off, s32 offset:32
-; GFX12-NEXT: scratch_load_b32 v50, off, s32 offset:28
-; GFX12-NEXT: scratch_load_b32 v53, off, s32 offset:40
-; GFX12-NEXT: scratch_load_b32 v52, off, s32 offset:36
-; GFX12-NEXT: scratch_load_b32 v55, off, s32 offset:48
-; GFX12-NEXT: scratch_load_b32 v54, off, s32 offset:44
-; GFX12-NEXT: scratch_load_b32 v65, off, s32 offset:56
-; GFX12-NEXT: scratch_load_b32 v64, off, s32 offset:52
-; GFX12-NEXT: scratch_load_b32 v67, off, s32 offset:64
-; GFX12-NEXT: scratch_load_b32 v66, off, s32 offset:60
-; GFX12-NEXT: scratch_load_b32 v69, off, s32 offset:72
-; GFX12-NEXT: scratch_load_b32 v68, off, s32 offset:68
-; GFX12-NEXT: scratch_load_b32 v71, off, s32 offset:80
-; GFX12-NEXT: scratch_load_b32 v70, off, s32 offset:76
-; GFX12-NEXT: scratch_load_b32 v81, off, s32 offset:88
-; GFX12-NEXT: scratch_load_b32 v80, off, s32 offset:84
-; GFX12-NEXT: scratch_load_b32 v83, off, s32 offset:96
-; GFX12-NEXT: scratch_load_b32 v82, off, s32 offset:92
-; GFX12-NEXT: scratch_load_b32 v38, off, s32 offset:100
-; GFX12-NEXT: s_wait_loadcnt 0x1a
+; GFX12-NEXT: scratch_load_b32 v37, off, s32 offset:24
+; GFX12-NEXT: scratch_load_b32 v36, off, s32 offset:20
+; GFX12-NEXT: scratch_load_b32 v39, off, s32 offset:32
+; GFX12-NEXT: scratch_load_b32 v38, off, s32 offset:28
+; GFX12-NEXT: scratch_load_b32 v49, off, s32 offset:40
+; GFX12-NEXT: scratch_load_b32 v48, off, s32 offset:36
+; GFX12-NEXT: scratch_load_b32 v51, off, s32 offset:48
+; GFX12-NEXT: scratch_load_b32 v50, off, s32 offset:44
+; GFX12-NEXT: scratch_load_b32 v53, off, s32 offset:56
+; GFX12-NEXT: scratch_load_b32 v52, off, s32 offset:52
+; GFX12-NEXT: scratch_load_b32 v55, off, s32 offset:64
+; GFX12-NEXT: scratch_load_b32 v54, off, s32 offset:60
+; GFX12-NEXT: scratch_load_b32 v65, off, s32 offset:72
+; GFX12-NEXT: scratch_load_b32 v64, off, s32 offset:68
+; GFX12-NEXT: scratch_load_b32 v67, off, s32 offset:80
+; GFX12-NEXT: scratch_load_b32 v66, off, s32 offset:76
+; GFX12-NEXT: scratch_load_b32 v69, off, s32 offset:88
+; GFX12-NEXT: scratch_load_b32 v68, off, s32 offset:84
+; GFX12-NEXT: scratch_load_b32 v71, off, s32 offset:96
+; GFX12-NEXT: scratch_load_b32 v70, off, s32 offset:92
+; GFX12-NEXT: scratch_load_b32 v81, off, s32 offset:104
+; GFX12-NEXT: scratch_load_b32 v80, off, s32 offset:100
+; GFX12-NEXT: scratch_load_b32 v83, off, s32 offset:112
+; GFX12-NEXT: scratch_load_b32 v82, off, s32 offset:108
+; GFX12-NEXT: scratch_load_b32 v85, off, s32 offset:120
+; GFX12-NEXT: scratch_load_b32 v84, off, s32 offset:116
+; GFX12-NEXT: scratch_load_b32 v87, off, s32 offset:128
+; GFX12-NEXT: scratch_load_b32 v86, off, s32 offset:124
+; GFX12-NEXT: s_wait_loadcnt 0x1e
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[32:33]
-; GFX12-NEXT: s_clause 0x2
-; GFX12-NEXT: scratch_load_b32 v33, off, s32 offset:112
-; GFX12-NEXT: scratch_load_b32 v32, off, s32 offset:108
-; GFX12-NEXT: scratch_load_b32 v36, off, s32 offset:116
-; GFX12-NEXT: s_wait_loadcnt 0x1b
+; GFX12-NEXT: s_wait_loadcnt 0x1c
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[34:35]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b32 v35, off, s32 offset:128
-; GFX12-NEXT: scratch_load_b32 v34, off, s32 offset:124
+; GFX12-NEXT: s_wait_loadcnt 0x1a
+; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[36:37]
; GFX12-NEXT: s_wait_loadcnt 0x18
-; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[48:49]
+; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[38:39]
; GFX12-NEXT: s_wait_loadcnt 0x16
-; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[50:51]
+; GFX12-NEXT: v_maximum_f64 v[8:9], v[8:9], v[48:49]
; GFX12-NEXT: s_wait_loadcnt 0x14
-; GFX12-NEXT: v_maximum_f64 v[8:9], v[8:9], v[52:53]
+; GFX12-NEXT: v_maximum_f64 v[10:11], v[10:11], v[50:51]
; GFX12-NEXT: s_wait_loadcnt 0x12
-; GFX12-NEXT: v_maximum_f64 v[10:11], v[10:11], v[54:55]
+; GFX12-NEXT: v_maximum_f64 v[12:13], v[12:13], v[52:53]
; GFX12-NEXT: s_wait_loadcnt 0x10
-; GFX12-NEXT: v_maximum_f64 v[12:13], v[12:13], v[64:65]
+; GFX12-NEXT: v_maximum_f64 v[14:15], v[14:15], v[54:55]
; GFX12-NEXT: s_wait_loadcnt 0xe
-; GFX12-NEXT: v_maximum_f64 v[14:15], v[14:15], v[66:67]
+; GFX12-NEXT: v_maximum_f64 v[16:17], v[16:17], v[64:65]
; GFX12-NEXT: s_wait_loadcnt 0xc
-; GFX12-NEXT: v_maximum_f64 v[16:17], v[16:17], v[68:69]
+; GFX12-NEXT: v_maximum_f64 v[18:19], v[18:19], v[66:67]
; GFX12-NEXT: s_wait_loadcnt 0xa
-; GFX12-NEXT: v_maximum_f64 v[18:19], v[18:19], v[70:71]
+; GFX12-NEXT: v_maximum_f64 v[20:21], v[20:21], v[68:69]
; GFX12-NEXT: s_wait_loadcnt 0x8
-; GFX12-NEXT: v_maximum_f64 v[20:21], v[20:21], v[80:81]
+; GFX12-NEXT: v_maximum_f64 v[22:23], v[22:23], v[70:71]
; GFX12-NEXT: s_wait_loadcnt 0x6
-; GFX12-NEXT: v_maximum_f64 v[22:23], v[22:23], v[82:83]
-; GFX12-NEXT: s_wait_loadcnt 0x5
-; GFX12-NEXT: v_maximum_f64 v[24:25], v[24:25], v[38:39]
-; GFX12-NEXT: s_wait_loadcnt 0x3
-; GFX12-NEXT: v_maximum_f64 v[26:27], v[26:27], v[32:33]
+; GFX12-NEXT: v_maximum_f64 v[24:25], v[24:25], v[80:81]
+; GFX12-NEXT: s_wait_loadcnt 0x4
+; GFX12-NEXT: v_maximum_f64 v[26:27], v[26:27], v[82:83]
; GFX12-NEXT: s_wait_loadcnt 0x2
-; GFX12-NEXT: v_maximum_f64 v[28:29], v[28:29], v[36:37]
+; GFX12-NEXT: v_maximum_f64 v[28:29], v[28:29], v[84:85]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_maximum_f64 v[30:31], v[30:31], v[34:35]
+; GFX12-NEXT: v_maximum_f64 v[30:31], v[30:31], v[86:87]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x double> @llvm.maximum.v16f64(<16 x double> %src0, <16 x double> %src1)
ret <16 x double> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index 37fe2e958e62b..d8462ec220244 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -2868,74 +2868,72 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_clause 0x1b
+; GFX12-NEXT: s_clause 0x1f
+; GFX12-NEXT: scratch_load_b32 v31, off, s32
; GFX12-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX12-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX12-NEXT: scratch_load_b32 v35, off, s32 offset:16
; GFX12-NEXT: scratch_load_b32 v34, off, s32 offset:12
-; GFX12-NEXT: scratch_load_b32 v31, off, s32
-; GFX12-NEXT: scratch_load_b32 v37, off, s32 offset:120
-; GFX12-NEXT: scratch_load_b32 v39, off, s32 offset:104
-; GFX12-NEXT: scratch_load_b32 v49, off, s32 offset:24
-; GFX12-NEXT: scratch_load_b32 v48, off, s32 offset:20
-; GFX12-NEXT: scratch_load_b32 v51, off, s32 offset:32
-; GFX12-NEXT: scratch_load_b32 v50, off, s32 offset:28
-; GFX12-NEXT: scratch_load_b32 v53, off, s32 offset:40
-; GFX12-NEXT: scratch_load_b32 v52, off, s32 offset:36
-; GFX12-NEXT: scratch_load_b32 v55, off, s32 offset:48
-; GFX12-NEXT: scratch_load_b32 v54, off, s32 offset:44
-; GFX12-NEXT: scratch_load_b32 v65, off, s32 offset:56
-; GFX12-NEXT: scratch_load_b32 v64, off, s32 offset:52
-; GFX12-NEXT: scratch_load_b32 v67, off, s32 offset:64
-; GFX12-NEXT: scratch_load_b32 v66, off, s32 offset:60
-; GFX12-NEXT: scratch_load_b32 v69, off, s32 offset:72
-; GFX12-NEXT: scratch_load_b32 v68, off, s32 offset:68
-; GFX12-NEXT: scratch_load_b32 v71, off, s32 offset:80
-; GFX12-NEXT: scratch_load_b32 v70, off, s32 offset:76
-; GFX12-NEXT: scratch_load_b32 v81, off, s32 offset:88
-; GFX12-NEXT: scratch_load_b32 v80, off, s32 offset:84
-; GFX12-NEXT: scratch_load_b32 v83, off, s32 offset:96
-; GFX12-NEXT: scratch_load_b32 v82, off, s32 offset:92
-; GFX12-NEXT: scratch_load_b32 v38, off, s32 offset:100
-; GFX12-NEXT: s_wait_loadcnt 0x1a
+; GFX12-NEXT: scratch_load_b32 v37, off, s32 offset:24
+; GFX12-NEXT: scratch_load_b32 v36, off, s32 offset:20
+; GFX12-NEXT: scratch_load_b32 v39, off, s32 offset:32
+; GFX12-NEXT: scratch_load_b32 v38, off, s32 offset:28
+; GFX12-NEXT: scratch_load_b32 v49, off, s32 offset:40
+; GFX12-NEXT: scratch_load_b32 v48, off, s32 offset:36
+; GFX12-NEXT: scratch_load_b32 v51, off, s32 offset:48
+; GFX12-NEXT: scratch_load_b32 v50, off, s32 offset:44
+; GFX12-NEXT: scratch_load_b32 v53, off, s32 offset:56
+; GFX12-NEXT: scratch_load_b32 v52, off, s32 offset:52
+; GFX12-NEXT: scratch_load_b32 v55, off, s32 offset:64
+; GFX12-NEXT: scratch_load_b32 v54, off, s32 offset:60
+; GFX12-NEXT: scratch_load_b32 v65, off, s32 offset:72
+; GFX12-NEXT: scratch_load_b32 v64, off, s32 offset:68
+; GFX12-NEXT: scratch_load_b32 v67, off, s32 offset:80
+; GFX12-NEXT: scratch_load_b32 v66, off, s32 offset:76
+; GFX12-NEXT: scratch_load_b32 v69, off, s32 offset:88
+; GFX12-NEXT: scratch_load_b32 v68, off, s32 offset:84
+; GFX12-NEXT: scratch_load_b32 v71, off, s32 offset:96
+; GFX12-NEXT: scratch_load_b32 v70, off, s32 offset:92
+; GFX12-NEXT: scratch_load_b32 v81, off, s32 offset:104
+; GFX12-NEXT: scratch_load_b32 v80, off, s32 offset:100
+; GFX12-NEXT: scratch_load_b32 v83, off, s32 offset:112
+; GFX12-NEXT: scratch_load_b32 v82, off, s32 offset:108
+; GFX12-NEXT: scratch_load_b32 v85, off, s32 offset:120
+; GFX12-NEXT: scratch_load_b32 v84, off, s32 offset:116
+; GFX12-NEXT: scratch_load_b32 v87, off, s32 offset:128
+; GFX12-NEXT: scratch_load_b32 v86, off, s32 offset:124
+; GFX12-NEXT: s_wait_loadcnt 0x1e
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[32:33]
-; GFX12-NEXT: s_clause 0x2
-; GFX12-NEXT: scratch_load_b32 v33, off, s32 offset:112
-; GFX12-NEXT: scratch_load_b32 v32, off, s32 offset:108
-; GFX12-NEXT: scratch_load_b32 v36, off, s32 offset:116
-; GFX12-NEXT: s_wait_loadcnt 0x1b
+; GFX12-NEXT: s_wait_loadcnt 0x1c
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[34:35]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b32 v35, off, s32 offset:128
-; GFX12-NEXT: scratch_load_b32 v34, off, s32 offset:124
+; GFX12-NEXT: s_wait_loadcnt 0x1a
+; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[36:37]
; GFX12-NEXT: s_wait_loadcnt 0x18
-; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[48:49]
+; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[38:39]
; GFX12-NEXT: s_wait_loadcnt 0x16
-; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[50:51]
+; GFX12-NEXT: v_minimum_f64 v[8:9], v[8:9], v[48:49]
; GFX12-NEXT: s_wait_loadcnt 0x14
-; GFX12-NEXT: v_minimum_f64 v[8:9], v[8:9], v[52:53]
+; GFX12-NEXT: v_minimum_f64 v[10:11], v[10:11], v[50:51]
; GFX12-NEXT: s_wait_loadcnt 0x12
-; GFX12-NEXT: v_minimum_f64 v[10:11], v[10:11], v[54:55]
+; GFX12-NEXT: v_minimum_f64 v[12:13], v[12:13], v[52:53]
; GFX12-NEXT: s_wait_loadcnt 0x10
-; GFX12-NEXT: v_minimum_f64 v[12:13], v[12:13], v[64:65]
+; GFX12-NEXT: v_minimum_f64 v[14:15], v[14:15], v[54:55]
; GFX12-NEXT: s_wait_loadcnt 0xe
-; GFX12-NEXT: v_minimum_f64 v[14:15], v[14:15], v[66:67]
+; GFX12-NEXT: v_minimum_f64 v[16:17], v[16:17], v[64:65]
; GFX12-NEXT: s_wait_loadcnt 0xc
-; GFX12-NEXT: v_minimum_f64 v[16:17], v[16:17], v[68:69]
+; GFX12-NEXT: v_minimum_f64 v[18:19], v[18:19], v[66:67]
; GFX12-NEXT: s_wait_loadcnt 0xa
-; GFX12-NEXT: v_minimum_f64 v[18:19], v[18:19], v[70:71]
+; GFX12-NEXT: v_minimum_f64 v[20:21], v[20:21], v[68:69]
; GFX12-NEXT: s_wait_loadcnt 0x8
-; GFX12-NEXT: v_minimum_f64 v[20:21], v[20:21], v[80:81]
+; GFX12-NEXT: v_minimum_f64 v[22:23], v[22:23], v[70:71]
; GFX12-NEXT: s_wait_loadcnt 0x6
-; GFX12-NEXT: v_minimum_f64 v[22:23], v[22:23], v[82:83]
-; GFX12-NEXT: s_wait_loadcnt 0x5
-; GFX12-NEXT: v_minimum_f64 v[24:25], v[24:25], v[38:39]
-; GFX12-NEXT: s_wait_loadcnt 0x3
-; GFX12-NEXT: v_minimum_f64 v[26:27], v[26:27], v[32:33]
+; GFX12-NEXT: v_minimum_f64 v[24:25], v[24:25], v[80:81]
+; GFX12-NEXT: s_wait_loadcnt 0x4
+; GFX12-NEXT: v_minimum_f64 v[26:27], v[26:27], v[82:83]
; GFX12-NEXT: s_wait_loadcnt 0x2
-; GFX12-NEXT: v_minimum_f64 v[28:29], v[28:29], v[36:37]
+; GFX12-NEXT: v_minimum_f64 v[28:29], v[28:29], v[84:85]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_minimum_f64 v[30:31], v[30:31], v[34:35]
+; GFX12-NEXT: v_minimum_f64 v[30:31], v[30:31], v[86:87]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x double> @llvm.minimum.v16f64(<16 x double> %src0, <16 x double> %src1)
ret <16 x double> %op
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 502cd14284e15..04fba9ef6d86d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -3448,163 +3448,160 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v2, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v1, 13, s2
; GFX12-NEXT: s_lshr_b32 s4, s3, 24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: v_lshrrev_b16 v10, 13, s3
-; GFX12-NEXT: v_lshrrev_b16 v3, 9, s2
-; GFX12-NEXT: v_and_b32_e32 v45, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 1, s4
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 11, s3
-; GFX12-NEXT: v_lshrrev_b16 v14, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 5, s4
+; GFX12-NEXT: v_lshrrev_b16 v2, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 13, s3
+; GFX12-NEXT: v_and_b32_e32 v44, 1, v1
+; GFX12-NEXT: v_lshrrev_b16 v1, 1, s4
; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: s_and_b32 s6, s3, 1
-; GFX12-NEXT: s_bfe_u32 s14, s3, 0x10012
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v55, s14 :: v_dual_and_b32 v36, 1, v10
-; GFX12-NEXT: v_and_b32_e32 v10, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 3, s4
-; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v11, 9, s3
-; GFX12-NEXT: v_lshrrev_b16 v13, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v15, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 3, s3
-; GFX12-NEXT: v_and_b32_e32 v43, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5
-; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10014
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v49, s19 :: v_dual_and_b32 v42, 1, v3
-; GFX12-NEXT: v_lshrrev_b16 v3, 5, s5
-; GFX12-NEXT: s_bfe_u32 s13, s3, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v29, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3
-; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v25, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3
-; GFX12-NEXT: v_dual_mov_b32 v56, s13 :: v_dual_and_b32 v27, 1, v12
-; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3
-; GFX12-NEXT: v_and_b32_e32 v12, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 1, s5
+; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_and_b32 v41, 1, v2
+; GFX12-NEXT: v_lshrrev_b16 v4, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v6, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v7, 3, s2
+; GFX12-NEXT: v_lshrrev_b16 v10, 9, s3
+; GFX12-NEXT: v_lshrrev_b16 v11, 11, s3
+; GFX12-NEXT: v_lshrrev_b16 v12, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v13, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v14, 1, s3
+; GFX12-NEXT: v_lshrrev_b16 v17, 5, s4
+; GFX12-NEXT: v_lshrrev_b16 v2, 5, s5
; GFX12-NEXT: s_and_b32 s7, s2, 1
-; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10011
+; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10010
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v52, s18 :: v_dual_and_b32 v35, 1, v9
+; GFX12-NEXT: v_and_b32_e32 v9, 1, v1
+; GFX12-NEXT: v_lshrrev_b16 v1, 3, s4
+; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10017
+; GFX12-NEXT: v_dual_mov_b32 v51, s19 :: v_dual_and_b32 v42, 1, v3
+; GFX12-NEXT: v_lshrrev_b16 v3, 3, s5
+; GFX12-NEXT: v_lshrrev_b16 v15, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v28, 12, s3
+; GFX12-NEXT: v_lshrrev_b16 v29, 14, s3
+; GFX12-NEXT: v_lshrrev_b16 v30, 15, s3
+; GFX12-NEXT: v_lshrrev_b16 v25, 10, s3
+; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3
+; GFX12-NEXT: v_lshrrev_b16 v21, 6, s3
+; GFX12-NEXT: v_and_b32_e32 v27, 1, v12
+; GFX12-NEXT: s_and_b32 s6, s3, 1
+; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012
+; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011
+; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v59, s12 :: v_dual_and_b32 v22, 1, v13
+; GFX12-NEXT: v_dual_mov_b32 v62, s9 :: v_dual_and_b32 v13, 1, v17
+; GFX12-NEXT: v_lshrrev_b16 v17, 6, s5
+; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10016
+; GFX12-NEXT: v_dual_mov_b32 v58, s13 :: v_dual_and_b32 v23, 1, v14
+; GFX12-NEXT: s_bfe_u32 s14, s2, 0x10015
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v57, s14 :: v_dual_and_b32 v26, 1, v11
+; GFX12-NEXT: v_and_b32_e32 v11, 1, v1
+; GFX12-NEXT: v_lshrrev_b16 v1, 1, s5
+; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10013
+; GFX12-NEXT: v_dual_mov_b32 v55, s15 :: v_dual_and_b32 v34, 1, v7
+; GFX12-NEXT: v_lshrrev_b16 v7, 7, s5
+; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10012
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v54, s16 :: v_dual_and_b32 v31, 1, v10
+; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10011
+; GFX12-NEXT: v_dual_mov_b32 v53, s17 :: v_dual_and_b32 v38, 1, v5
+; GFX12-NEXT: s_bfe_u32 s20, s3, 0x10016
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v54, s15 :: v_dual_and_b32 v35, 1, v8
-; GFX12-NEXT: v_lshrrev_b16 v8, 7, s5
-; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10010
-; GFX12-NEXT: v_dual_mov_b32 v53, s16 :: v_dual_and_b32 v40, 1, v7
-; GFX12-NEXT: v_lshrrev_b16 v7, 2, s5
-; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10017
-; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10016
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-N...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/96272
More information about the llvm-commits mailing list