[llvm] AMDGPU/GlobalISel: Insert m0 initialization before sextload/zextload (PR #111720)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 9 10:23:00 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
Fixes missing m0 initialization for pre-gfx9 targets with local extending
loads.
---
Patch is 24.98 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/111720.diff
7 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir (+14-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir (+14-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll (+31-31)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll (+53-54)
- (modified) llvm/test/CodeGen/AMDGPU/med3-knownbits.ll (+10-22)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 34a89a907e6487..5be0a049cc5827 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3532,6 +3532,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return true;
return selectImpl(I, *CoverageInfo);
case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_ZEXTLOAD:
+ case TargetOpcode::G_SEXTLOAD:
case TargetOpcode::G_STORE:
case TargetOpcode::G_ATOMIC_CMPXCHG:
case TargetOpcode::G_ATOMICRMW_XCHG:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir
index 6bac125c0309ba..37958480d28a5f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir
@@ -19,14 +19,18 @@ body: |
; GFX6: liveins: $vgpr0
; GFX6-NEXT: {{ $}}
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX6-NEXT: $m0 = S_MOV_B32 -1
; GFX6-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
+ ;
; GFX7-LABEL: name: sextload_local_s32_from_s8_align1
; GFX7: liveins: $vgpr0
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX7-NEXT: $m0 = S_MOV_B32 -1
; GFX7-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
+ ;
; GFX9-LABEL: name: sextload_local_s32_from_s8_align1
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -53,14 +57,18 @@ body: |
; GFX6: liveins: $vgpr0
; GFX6-NEXT: {{ $}}
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX6-NEXT: $m0 = S_MOV_B32 -1
; GFX6-NEXT: [[DS_READ_I16_:%[0-9]+]]:vgpr_32 = DS_READ_I16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_I16_]]
+ ;
; GFX7-LABEL: name: sextload_local_s32_from_s16_align2
; GFX7: liveins: $vgpr0
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX7-NEXT: $m0 = S_MOV_B32 -1
; GFX7-NEXT: [[DS_READ_I16_:%[0-9]+]]:vgpr_32 = DS_READ_I16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_I16_]]
+ ;
; GFX9-LABEL: name: sextload_local_s32_from_s16_align2
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -105,15 +113,19 @@ body: |
; GFX6-NEXT: {{ $}}
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
- ; GFX6-NEXT: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
- ; GFX6-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 %2, 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
+ ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX6-NEXT: $m0 = S_MOV_B32 -1
+ ; GFX6-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
+ ;
; GFX7-LABEL: name: sextload_local_s32_from_s8_align1_offset4095
; GFX7: liveins: $vgpr0
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX7-NEXT: $m0 = S_MOV_B32 -1
; GFX7-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[COPY]], 4095, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
+ ;
; GFX9-LABEL: name: sextload_local_s32_from_s8_align1_offset4095
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir
index 63e5d061f8c372..29671c13e173f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir
@@ -19,14 +19,18 @@ body: |
; GFX6: liveins: $vgpr0
; GFX6-NEXT: {{ $}}
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX6-NEXT: $m0 = S_MOV_B32 -1
; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
+ ;
; GFX7-LABEL: name: zextload_local_s32_from_s8_align1
; GFX7: liveins: $vgpr0
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX7-NEXT: $m0 = S_MOV_B32 -1
; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
+ ;
; GFX9-LABEL: name: zextload_local_s32_from_s8_align1
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -53,14 +57,18 @@ body: |
; GFX6: liveins: $vgpr0
; GFX6-NEXT: {{ $}}
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX6-NEXT: $m0 = S_MOV_B32 -1
; GFX6-NEXT: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U16_]]
+ ;
; GFX7-LABEL: name: zextload_local_s32_from_s16_align2
; GFX7: liveins: $vgpr0
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX7-NEXT: $m0 = S_MOV_B32 -1
; GFX7-NEXT: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U16_]]
+ ;
; GFX9-LABEL: name: zextload_local_s32_from_s16_align2
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -105,15 +113,19 @@ body: |
; GFX6-NEXT: {{ $}}
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
- ; GFX6-NEXT: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
- ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
+ ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX6-NEXT: $m0 = S_MOV_B32 -1
+ ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
+ ;
; GFX7-LABEL: name: zextload_local_s32_from_s8_align1_offset4095
; GFX7: liveins: $vgpr0
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX7-NEXT: $m0 = S_MOV_B32 -1
; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 4095, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
+ ;
; GFX9-LABEL: name: zextload_local_s32_from_s8_align1_offset4095
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index fef672570352c3..21f1af1feb4a06 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -90,54 +90,53 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
; GFX7-LABEL: load_lds_v4i32_align1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT: ds_read_u8 v2, v0
-; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: ds_read_u8 v2, v0 offset:3
+; GFX7-NEXT: ds_read_u8 v1, v0
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT: ds_read_u8 v9, v0 offset:8
-; GFX7-NEXT: ds_read_u8 v10, v0 offset:9
-; GFX7-NEXT: ds_read_u8 v11, v0 offset:10
-; GFX7-NEXT: s_waitcnt lgkmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: s_waitcnt lgkmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(5)
+; GFX7-NEXT: s_waitcnt lgkmcnt(2)
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
-; GFX7-NEXT: s_waitcnt lgkmcnt(3)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: ds_read_u8 v3, v0 offset:11
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:12
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:13
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:14
-; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:8
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:9
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:10
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:11
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT: ds_read_u8 v8, v0 offset:13
+; GFX7-NEXT: ds_read_u8 v9, v0 offset:14
+; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v11
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v9
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v9
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v7
; GFX7-NEXT: v_or_b32_e32 v0, v0, v5
; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v0, v4
@@ -270,8 +269,8 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
; GFX7-LABEL: load_lds_v4i32_align2:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: ds_read_u16 v1, v0
; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_u16 v1, v0
; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
; GFX7-NEXT: ds_read_u16 v4, v0 offset:6
@@ -281,11 +280,12 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
; GFX7-NEXT: ds_read_u16 v8, v0 offset:14
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT: s_waitcnt lgkmcnt(2)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v8
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
index 225f2165977b3c..67a089b5cd17dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -81,42 +81,42 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
; GFX7-LABEL: load_lds_v3i32_align1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT: ds_read_u8 v2, v0
-; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: ds_read_u8 v2, v0 offset:3
-; GFX7-NEXT: ds_read_u8 v4, v0 offset:4
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:5
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:6
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:7
-; GFX7-NEXT: ds_read_u8 v8, v0 offset:8
-; GFX7-NEXT: ds_read_u8 v9, v0 offset:9
-; GFX7-NEXT: ds_read_u8 v10, v0 offset:10
-; GFX7-NEXT: s_waitcnt lgkmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: ds_read_u8 v1, v0
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
+; GFX7-NEXT: s_waitcnt lgkmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: ds_read_u8 v0, v0 offset:11
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-NEXT: v_or_b32_e32 v3, v2, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(6)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v5
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT: s_waitcnt lgkmcnt(2)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT: s_waitcnt lgkmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:8
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:9
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:10
+; GFX7-NEXT: ds_read_u8 v0, v0 offset:11
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8
; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v10
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
; GFX7-NEXT: v_or_b32_e32 v0, v0, v4
; GFX7-NEXT: v_or_b32_e32 v2, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, v3
@@ -223,8 +223,8 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
; GFX7-LABEL: load_lds_v3i32_align2:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: ds_read_u16 v1, v0
; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_u16 v1, v0
; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
; GFX7-NEXT: ds_read_u16 v4, v0 offset:6
@@ -235,9 +235,9 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index 06bf71f5e122cc..c595c939e8d139 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -19,54 +19,53 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
; GFX7-LABEL: load_lds_v4i32_align1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT: ds_read_u8 v2, v0
-; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(1)
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: ds_read_u8 v2, v0 offset:3
+; GFX7-NEXT: ds_read_u8 v1, v0
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT: ds_read_u8 v9, v0 offset:8
-; GFX7-NEXT: ds_read_u8 v10, v0 offset:9
-; GFX7-NEXT: ds_read_u8 v11, v0 offset:10
-; GFX7-NEXT: s_waitcnt lgkmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: s_waitcnt lgkmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT: s_waitcnt lgkmcnt(5)
+; GFX7-NEXT: s_waitcnt lgkmcnt(2)
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
-; GFX7-NEXT: s_waitcnt lgkmcnt(3)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: ds_read_u8 v3, v0 offset:11
-; GFX7-NEXT: ds_read_u8 v5, v0 offset:12
-; GFX7-NEXT: ds_read_u8 v6, v0 offset:13
-; GFX7-NEXT: ds_read_u8 v7, v0 offset:14
-; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: ds_read_u8 v2, v0 offset:8
+; GFX7-NEXT: ds_read_u8 v3, v0 offset:9
+; GFX7-NEXT: ds_read_u8 v5, v0 offset:10
+; GFX7-NEXT: ds_read_u8 v6, v0 offset:11
+; GFX7-NEXT: ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT: ds_read_u8 v8, v0 offset:13
+; GFX7-NEXT: ds_read_u8 v9, v0 offset:14
+; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v11
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v9
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v9
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v7
; GFX7-NEXT: v_or_b32_e32 v0, v0, v5
; GFX...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/111720
More information about the llvm-commits
mailing list