[llvm] AMDGPU/GlobalISel: Insert m0 initialization before sextload/zextload (PR #111720)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 9 10:21:53 PDT 2024


https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/111720

Fixes the missing m0 initialization for pre-gfx9 targets with local
extending loads (G_SEXTLOAD/G_ZEXTLOAD).

>From 8a4ebaa3b264cfdbb7e93d98a06a71c2e440dafb Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 9 Oct 2024 21:00:28 +0400
Subject: [PATCH] AMDGPU/GlobalISel: Insert m0 initialization before
 sextload/zextload

Fixes the missing m0 initialization for pre-gfx9 targets with local
extending loads (G_SEXTLOAD/G_ZEXTLOAD).
---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |   2 +
 .../GlobalISel/inst-select-sextload-local.mir |  16 ++-
 .../GlobalISel/inst-select-zextload-local.mir |  16 ++-
 .../AMDGPU/GlobalISel/load-local.128.ll       |  62 +++++-----
 .../AMDGPU/GlobalISel/load-local.96.ll        |  56 ++++-----
 .../AMDGPU/GlobalISel/load-unaligned.ll       | 107 +++++++++---------
 llvm/test/CodeGen/AMDGPU/med3-knownbits.ll    |  32 ++----
 7 files changed, 152 insertions(+), 139 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 34a89a907e6487..5be0a049cc5827 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3532,6 +3532,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
       return true;
     return selectImpl(I, *CoverageInfo);
   case TargetOpcode::G_LOAD:
+  case TargetOpcode::G_ZEXTLOAD:
+  case TargetOpcode::G_SEXTLOAD:
   case TargetOpcode::G_STORE:
   case TargetOpcode::G_ATOMIC_CMPXCHG:
   case TargetOpcode::G_ATOMICRMW_XCHG:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir
index 6bac125c0309ba..37958480d28a5f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir
@@ -19,14 +19,18 @@ body: |
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
     ; GFX6-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
     ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
+    ;
     ; GFX7-LABEL: name: sextload_local_s32_from_s8_align1
     ; GFX7: liveins: $vgpr0
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
     ; GFX7-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
     ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
+    ;
     ; GFX9-LABEL: name: sextload_local_s32_from_s8_align1
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -53,14 +57,18 @@ body: |
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
     ; GFX6-NEXT: [[DS_READ_I16_:%[0-9]+]]:vgpr_32 = DS_READ_I16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
     ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_I16_]]
+    ;
     ; GFX7-LABEL: name: sextload_local_s32_from_s16_align2
     ; GFX7: liveins: $vgpr0
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
     ; GFX7-NEXT: [[DS_READ_I16_:%[0-9]+]]:vgpr_32 = DS_READ_I16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
     ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_I16_]]
+    ;
     ; GFX9-LABEL: name: sextload_local_s32_from_s16_align2
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -105,15 +113,19 @@ body: |
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX6-NEXT: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 %2, 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
+    ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
+    ; GFX6-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
     ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
+    ;
     ; GFX7-LABEL: name: sextload_local_s32_from_s8_align1_offset4095
     ; GFX7: liveins: $vgpr0
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
     ; GFX7-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[COPY]], 4095, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
     ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
+    ;
     ; GFX9-LABEL: name: sextload_local_s32_from_s8_align1_offset4095
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir
index 63e5d061f8c372..29671c13e173f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir
@@ -19,14 +19,18 @@ body: |
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
     ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
     ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
+    ;
     ; GFX7-LABEL: name: zextload_local_s32_from_s8_align1
     ; GFX7: liveins: $vgpr0
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
     ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
     ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
+    ;
     ; GFX9-LABEL: name: zextload_local_s32_from_s8_align1
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -53,14 +57,18 @@ body: |
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
     ; GFX6-NEXT: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
     ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U16_]]
+    ;
     ; GFX7-LABEL: name: zextload_local_s32_from_s16_align2
     ; GFX7: liveins: $vgpr0
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
     ; GFX7-NEXT: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
     ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U16_]]
+    ;
     ; GFX9-LABEL: name: zextload_local_s32_from_s16_align2
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -105,15 +113,19 @@ body: |
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX6-NEXT: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
+    ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
+    ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
     ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
+    ;
     ; GFX7-LABEL: name: zextload_local_s32_from_s8_align1_offset4095
     ; GFX7: liveins: $vgpr0
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
     ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 4095, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
     ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
+    ;
     ; GFX9-LABEL: name: zextload_local_s32_from_s8_align1_offset4095
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index fef672570352c3..21f1af1feb4a06 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -90,54 +90,53 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-LABEL: load_lds_v4i32_align1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v2, v0
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v1, v0
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
 ; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:8
-; GFX7-NEXT:    ds_read_u8 v10, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v11, v0 offset:10
-; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:11
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:12
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:13
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:14
-; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:10
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:11
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:13
+; GFX7-NEXT:    ds_read_u8 v9, v0 offset:14
+; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
@@ -270,8 +269,8 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-LABEL: load_lds_v4i32_align2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX7-NEXT:    ds_read_u16 v4, v0 offset:6
@@ -281,11 +280,12 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
index 225f2165977b3c..67a089b5cd17dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -81,42 +81,42 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-LABEL: load_lds_v3i32_align1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v2, v0
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
-; GFX7-NEXT:    ds_read_u8 v4, v0 offset:4
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:5
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:6
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:7
-; GFX7-NEXT:    ds_read_u8 v8, v0 offset:8
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
-; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    ds_read_u8 v1, v0
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:8
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:10
+; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
@@ -223,8 +223,8 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-LABEL: load_lds_v3i32_align2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX7-NEXT:    ds_read_u16 v4, v0 offset:6
@@ -235,9 +235,9 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index 06bf71f5e122cc..c595c939e8d139 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -19,54 +19,53 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-LABEL: load_lds_v4i32_align1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v2, v0
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v1, v0
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
 ; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:8
-; GFX7-NEXT:    ds_read_u8 v10, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v11, v0 offset:10
-; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:11
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:12
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:13
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:14
-; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:10
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:11
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:12
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:13
+; GFX7-NEXT:    ds_read_u8 v9, v0 offset:14
+; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
@@ -102,42 +101,42 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-LABEL: load_lds_v3i32_align1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
-; GFX7-NEXT:    ds_read_u8 v2, v0
-; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
-; GFX7-NEXT:    ds_read_u8 v4, v0 offset:4
-; GFX7-NEXT:    ds_read_u8 v5, v0 offset:5
-; GFX7-NEXT:    ds_read_u8 v6, v0 offset:6
-; GFX7-NEXT:    ds_read_u8 v7, v0 offset:7
-; GFX7-NEXT:    ds_read_u8 v8, v0 offset:8
-; GFX7-NEXT:    ds_read_u8 v9, v0 offset:9
-; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
-; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    ds_read_u8 v1, v0
+; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
+; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
+; GFX7-NEXT:    ds_read_u8 v4, v0 offset:3
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
+; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
+; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v2, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    ds_read_u8 v5, v0 offset:8
+; GFX7-NEXT:    ds_read_u8 v6, v0 offset:9
+; GFX7-NEXT:    ds_read_u8 v7, v0 offset:10
+; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll b/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll
index fe0ab81bffe310..01e1e56aea8882 100644
--- a/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll
@@ -11,26 +11,16 @@ declare i32 @llvm.umax.i32(i32, i32)
 ; 0 sign bit only after umed3 is formed. The DS instruction offset can
 ; only be folded on SI with a positive base address.
 define i32 @v_known_bits_umed3(i8 %a) {
-; SI-SDAG-LABEL: v_known_bits_umed3:
-; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x80
-; SI-SDAG-NEXT:    v_med3_u32 v0, v0, 32, v1
-; SI-SDAG-NEXT:    s_mov_b32 m0, -1
-; SI-SDAG-NEXT:    ds_read_u8 v0, v0 offset:128
-; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_known_bits_umed3:
-; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x80
-; SI-GISEL-NEXT:    v_med3_u32 v0, v0, 32, v1
-; SI-GISEL-NEXT:    ds_read_u8 v0, v0 offset:128
-; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: v_known_bits_umed3:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0x80
+; SI-NEXT:    v_med3_u32 v0, v0, 32, v1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_u8 v0, v0 offset:128
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_setpc_b64 s[30:31]
   %ext.a = zext i8 %a to i32
   %max.a = call i32 @llvm.umax.i32(i32 %ext.a, i32 32)
   %umed3 = call i32 @llvm.umin.i32(i32 %max.a, i32 128)
@@ -120,5 +110,3 @@ define i32 @v_known_signbits_smed3(i16 %a, i16 %b) {
   %mul = sdiv i32 %smed3.a, %smed3.b
   ret i32 %mul
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SI: {{.*}}



More information about the llvm-commits mailing list