[llvm] AMDGPU/GlobalISel: Regbanklegalize for G_CONCAT_VECTORS (PR #171471)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 9 08:42:25 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Petar Avramovic (petar-avramovic)
Changes:
RegBankLegalize handles G_CONCAT_VECTORS via the trivial mapping helper, which assigns the same register bank, vgpr or sgpr, to all operands.
This uncovers multiple codegen and regbank-combiner regressions related to looking through sgpr-to-vgpr copies.
Skip regbankselect-concat-vector.mir since agprs are not yet supported.
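
As a rough illustration of what the trivial mapping means here, the hypothetical MIR below (not taken from the patch or its tests; names and types are assumed) shows G_CONCAT_VECTORS with its result and all operands kept on a single bank, either all vgpr or all sgpr:

```
# Hypothetical MIR sketch: with the trivial mapping, G_CONCAT_VECTORS keeps
# its result and all operands on one bank.
---
name: concat_vgpr
body: |
  bb.0:
    liveins: $vgpr0, $vgpr1
    %0:vgpr(<2 x s16>) = COPY $vgpr0
    %1:vgpr(<2 x s16>) = COPY $vgpr1
    ; all-vgpr: result stays on vgpr
    %2:vgpr(<4 x s16>) = G_CONCAT_VECTORS %0(<2 x s16>), %1(<2 x s16>)
...
---
name: concat_sgpr
body: |
  bb.0:
    liveins: $sgpr0, $sgpr1
    %0:sgpr(<2 x s16>) = COPY $sgpr0
    %1:sgpr(<2 x s16>) = COPY $sgpr1
    ; all-sgpr: result stays on sgpr
    %2:sgpr(<4 x s16>) = G_CONCAT_VECTORS %0(<2 x s16>), %1(<2 x s16>)
...
```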
---
Patch is 90.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/171471.diff
10 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll (+86-55)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll (+49-25)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll (+49-25)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll (+43-61)
- (modified) llvm/test/CodeGen/AMDGPU/freeze.ll (+80-41)
- (modified) llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll (+70-56)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll (+70-56)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll (+70-56)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index 839120da89711..2b782684c348a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -443,7 +443,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
// Opcodes that support pretty much all combinations of reg banks and LLTs
// (except S1). There is no point in writing rules for them.
if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
- Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) {
+ Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_CONCAT_VECTORS ||
+ Opc == AMDGPU::G_BITCAST) {
RBLHelper.applyMappingTrivial(*MI);
continue;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
index b67080bd4798d..b754bf0071da8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
@@ -1,6 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; FIXME: codegen regression, related to:
+; - looking through s16 sgpr to vgpr copy
+; on G_BUILD_VECTOR with G_IMPLICIT_DEF input
define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
; GFX8-LABEL: add_v3i16:
@@ -40,25 +44,30 @@ define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v6, v[0:1], off
-; GFX9-NEXT: global_load_ushort v7, v[2:3], off
-; GFX9-NEXT: global_load_ushort v8, v[2:3], off offset:4
-; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:4
+; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
+; GFX9-NEXT: global_load_ushort v8, v[2:3], off
+; GFX9-NEXT: global_load_ushort v9, v[2:3], off offset:4
; GFX9-NEXT: global_load_ushort v10, v[0:1], off offset:2
; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v7
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v8
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_pk_add_u16 v2, v9, v8
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v9
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshl_or_b32 v0, v10, 16, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1
-; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
+; GFX9-NEXT: v_lshl_or_b32 v2, v11, 16, v2
+; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1
+; GFX9-NEXT: v_lshl_or_b32 v3, s4, 16, v3
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
-; GFX9-NEXT: global_store_short v[4:5], v2, off offset:4
+; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%a = load <3 x i16>, ptr addrspace(1) %ptra, align 4
@@ -206,10 +215,10 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v6, v[0:1], off
; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
-; GFX9-NEXT: global_load_ushort v8, v[2:3], off
-; GFX9-NEXT: global_load_ushort v9, v[2:3], off offset:4
-; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:8
-; GFX9-NEXT: global_load_ushort v11, v[0:1], off offset:8
+; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
+; GFX9-NEXT: global_load_ushort v9, v[2:3], off
+; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:4
+; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:8
; GFX9-NEXT: global_load_ushort v12, v[0:1], off offset:2
; GFX9-NEXT: global_load_ushort v13, v[0:1], off offset:6
; GFX9-NEXT: global_load_ushort v14, v[2:3], off offset:2
@@ -222,23 +231,28 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v8
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v9
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v10
; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_pk_add_u16 v6, v11, v10
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v11
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshl_or_b32 v0, v12, 16, v0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshl_or_b32 v1, v13, 16, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshl_or_b32 v2, v14, 16, v2
+; GFX9-NEXT: v_lshl_or_b32 v3, v14, 16, v3
+; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_or_b32 v3, v15, 16, v3
-; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
-; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
+; GFX9-NEXT: v_lshl_or_b32 v6, v15, 16, v6
+; GFX9-NEXT: v_lshl_or_b32 v7, s4, 16, v7
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v6
+; GFX9-NEXT: v_pk_add_u16 v2, v2, v7
; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:6
-; GFX9-NEXT: global_store_short v[4:5], v6, off offset:8
+; GFX9-NEXT: global_store_short v[4:5], v2, off offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%a = load <5 x i16>, ptr addrspace(1) %ptra, align 4
@@ -421,11 +435,11 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9-NEXT: global_load_ushort v6, v[0:1], off
; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
-; GFX9-NEXT: global_load_ushort v9, v[2:3], off
-; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:4
-; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:8
-; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:12
-; GFX9-NEXT: global_load_ushort v13, v[0:1], off offset:12
+; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:12
+; GFX9-NEXT: global_load_ushort v10, v[2:3], off
+; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:4
+; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:8
+; GFX9-NEXT: global_load_ushort v13, v[2:3], off offset:12
; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:2
; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:6
; GFX9-NEXT: global_load_ushort v16, v[0:1], off offset:10
@@ -444,8 +458,10 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v10
; GFX9-NEXT: s_waitcnt vmcnt(8)
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v11
+; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v12
; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_pk_add_u16 v8, v13, v12
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v13
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0
; GFX9-NEXT: s_waitcnt vmcnt(4)
@@ -453,21 +469,24 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3
+; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6
+; GFX9-NEXT: v_lshl_or_b32 v3, s4, 16, v3
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshl_or_b32 v6, v18, 16, v6
+; GFX9-NEXT: v_lshl_or_b32 v7, v18, 16, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_or_b32 v7, v19, 16, v7
-; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
-; GFX9-NEXT: v_pk_add_u16 v1, v1, v6
-; GFX9-NEXT: v_pk_add_u16 v2, v2, v7
+; GFX9-NEXT: v_lshl_or_b32 v8, v19, 16, v8
+; GFX9-NEXT: v_lshl_or_b32 v9, s4, 16, v9
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v6
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v7
+; GFX9-NEXT: v_pk_add_u16 v2, v2, v8
+; GFX9-NEXT: v_pk_add_u16 v3, v3, v9
; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:6
; GFX9-NEXT: global_store_short v[4:5], v2, off offset:8
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v2, off offset:10
-; GFX9-NEXT: global_store_short v[4:5], v8, off offset:12
+; GFX9-NEXT: global_store_short v[4:5], v3, off offset:12
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%a = load <7 x i16>, ptr addrspace(1) %ptra, align 4
@@ -542,17 +561,22 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9-LABEL: add_v9i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
-; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16
; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_pk_add_u16 v0, v10, v6
-; GFX9-NEXT: v_pk_add_u16 v1, v11, v7
-; GFX9-NEXT: v_pk_add_u16 v2, v12, v8
-; GFX9-NEXT: v_pk_add_u16 v3, v13, v9
+; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v6, v14, v15
+; GFX9-NEXT: v_pk_add_u16 v0, v6, v10
+; GFX9-NEXT: v_pk_add_u16 v1, v7, v11
+; GFX9-NEXT: v_pk_add_u16 v2, v8, v12
+; GFX9-NEXT: v_pk_add_u16 v3, v9, v13
+; GFX9-NEXT: v_lshl_or_b32 v6, s4, 16, v14
+; GFX9-NEXT: v_lshl_or_b32 v7, s4, 16, v15
+; GFX9-NEXT: v_pk_add_u16 v6, v6, v7
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: global_store_short v[4:5], v6, off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -716,33 +740,40 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
; GFX9-LABEL: add_v11i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16
-; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
-; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:20
-; GFX9-NEXT: global_load_ushort v17, v[0:1], off offset:20
+; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:20
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:16
+; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:20
+; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:18
; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:18
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_pk_add_u16 v0, v6, v10
-; GFX9-NEXT: v_pk_add_u16 v1, v7, v11
-; GFX9-NEXT: v_pk_add_u16 v2, v8, v12
-; GFX9-NEXT: v_pk_add_u16 v3, v9, v13
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_pk_add_u16 v0, v10, v6
+; GFX9-NEXT: v_pk_add_u16 v1, v11, v7
+; GFX9-NEXT: v_pk_add_u16 v2, v12, v8
+; GFX9-NEXT: v_pk_add_u16 v3, v13, v9
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshl_or_b32 v7, v18, 16, v14
+; GFX9-NEXT: v_lshl_or_b32 v6, v18, 16, v14
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_or_b32 v8, v19, 16, v15
+; GFX9-NEXT: v_lshl_or_b32 v8, v19, 16, v16
+; GFX9-NEXT: v_lshl_or_b32 v7, s4, 16, v15
+; GFX9-NEXT: v_lshl_or_b32 v9, s4, 16, v17
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
-; GFX9-NEXT: v_pk_add_u16 v6, v17, v16
-; GFX9-NEXT: v_pk_add_u16 v0, v7, v8
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_pk_add_u16 v0, v6, v8
+; GFX9-NEXT: v_pk_add_u16 v1, v7, v9
; GFX9-NEXT: global_store_short v[4:5], v0, off offset:16
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:18
-; GFX9-NEXT: global_store_short v[4:5], v6, off offset:20
+; GFX9-NEXT: global_store_short v[4:5], v1, off offset:20
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%a = load <11 x i16>, ptr addrspace(1) %ptra, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index aa38c63dc9dcd..22b63a7de5f89 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+
+; FIXME: regbankcombiner regression, related to:
+; - looking through copy and splitting G_CONSTANT i64 to two i32 constants
+; - s_xor_b32 instead of s_not_b32, missing s16 pattern
define amdgpu_ps i32 @s_andn2_i32(i32 inreg %src0, i32 inreg %src1) {
; GCN-LABEL: s_andn2_i32:
@@ -241,15 +245,19 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
; GCN-LABEL: v_andn2_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_bfi_b32 v0, v2, 0, v0
-; GCN-NEXT: v_bfi_b32 v1, v3, 0, v1
+; GCN-NEXT: v_xor_b32_e32 v2, -1, v2
+; GCN-NEXT: v_xor_b32_e32 v3, -1, v3
+; GCN-NEXT: v_and_b32_e32 v0, v0, v2
+; GCN-NEXT: v_and_b32_e32 v1, v1, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_andn2_i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_bfi_b32 v0, v2, 0, v0
-; GFX10PLUS-NEXT: v_bfi_b32 v1, v3, 0, v1
+; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2
+; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -1, v3
+; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor i64 %src1, -1
%and = and i64 %src0, %not.src1
@@ -259,14 +267,18 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) {
; GCN-LABEL: v_andn2_i64_sv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_bfi_b32 v0, v0, 0, s2
-; GCN-NEXT: v_bfi_b32 v1, v1, 0, s3
+; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
+; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
+; GCN-NEXT: v_and_b32_e32 v0, s2, v0
+; GCN-NEXT: v_and_b32_e32 v1, s3, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i64_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, 0, s2
-; GFX10PLUS-NEXT: v_bfi_b32 v1, v1, 0, s3
+; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0
+; GFX10PLUS-NEXT: v_and_b32_e32 v1, s3, v1
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i64 %src1, -1
%and = and i64 %src0, %not.src1
@@ -278,16 +290,28 @@ define amdgpu_ps <2 x float> @v_andn2_i64_vs(i64 %src0, i64 inreg %src1) {
; GCN-LABEL: v_andn2_i64_vs:
; GCN: ; %bb.0:
; GCN-NEXT: s_not_b64 s[0:1], s[2:3]
-; GCN-NEXT: v_and_b32_e32 v0, s0, v0
-; GCN-NEXT: v_and_b32_e32 v1, s1, v1
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_and_b32_e32 v0, v0, v2
+; GCN-NEXT: v_and_b32_e32 v1, v1, v3
; GCN-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: v_andn2_i64_vs:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_not_b64 s[0:1], s[2:3]
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX10PLUS-NEXT: v_and_b32_e32 v1, s1, v1
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: v_andn2_i64_vs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_not_b64 s[0:1], s[2:3]
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX10-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: v_andn2_i64_vs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_not_b64 s[0:1], s[2:3]
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX11-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX11-NEXT: ; return to shader part epilog
%not.src1 = xor i64 %src1, -1
%and = and i64 %src0, %not.src1
%cast = bitcast i64 %and to <2 x float>
@@ -377,20 +401,20 @@ define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_andn2_i16_multi_use:
; GCN: ; %bb.0:
-; GCN-NEXT: s_not_b32 s1, s3
+; GCN-NEXT: s_xor_b32 s1, s3, -1
; GCN-NEXT: s_andn2_b32 s0, s2, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i16_multi_use:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_andn2_b32 s0, s2, s3
-; GFX10-NEXT: s_not_b32 s1, s3
+; GFX10-NEXT: s_xor_b32 s1, s3, -1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i16_multi_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_not1_b32 s0, s2, s3
-; GFX11-NEXT: s_not_b32 s1, s3
+; GFX11-NEXT: s_xor_b32 s1, s3, -1
; GFX11-NEXT: ; return to shader part epilog
%not.src1 = xor i16 %src1, -1
%and = and i16 %src0, %not.src1
@@ -468,14 +492,14 @@ define amdgpu_ps float @v_andn2_i16_sv(i16 inreg %src0, i16 %src1) {
define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
; GCN-LABEL: v_andn2_i16_vs:
; GCN: ; %bb.0:
-; GCN-NEXT: s_not_b32 s0, s2
+; GCN-NEXT: s_xor_b32 s0, s2, -1
; GCN-NEXT: v_and_b32_e32 v0, s0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i16_vs:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_not_b32 s0, s2
+; GFX10PLUS-NEXT: s_xor_b32 s0, s2, -1
; GFX10PLUS-NEXT: v_an...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/171471
More information about the llvm-commits mailing list