[llvm] bab1a17 - [AMDGPU] Add bfi immediate pattern
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 28 02:17:16 PDT 2020
Author: Jay Foad
Date: 2020-09-28T10:16:51+01:00
New Revision: bab1a17ad7761ae61e5841c2fb905de59cb8c2da
URL: https://github.com/llvm/llvm-project/commit/bab1a17ad7761ae61e5841c2fb905de59cb8c2da
DIFF: https://github.com/llvm/llvm-project/commit/bab1a17ad7761ae61e5841c2fb905de59cb8c2da.diff
LOG: [AMDGPU] Add bfi immediate pattern
Differential Revision: https://reviews.llvm.org/D88246
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 77a9ebe8465a..817fa0bf3ac7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1552,6 +1552,17 @@ def : UMad24Pat<V_MAD_U32_U24, 1>;
// BFI patterns
+def BFIImm32 : PatFrag<
+ (ops node:$x, node:$y, node:$z),
+ (i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))),
+ [{
+ auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1));
+ auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1));
+ return X && NotX &&
+ ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue();
+ }]
+>;
+
// Definition from ISA doc:
// (y & x) | (z & ~x)
def : AMDGPUPat <
@@ -1559,6 +1570,12 @@ def : AMDGPUPat <
(V_BFI_B32 $x, $y, $z)
>;
+// (y & C) | (z & ~C)
+def : AMDGPUPat <
+ (BFIImm32 i32:$x, i32:$y, i32:$z),
+ (V_BFI_B32 $x, $y, $z)
+>;
+
// 64-bit version
def : AMDGPUPat <
(DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 5925163077fd..e3db96e64c5d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1285,11 +1285,11 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: s_mov_b32 s0, 0xffff
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; VI-NEXT: v_or_b32_e32 v0, s0, v0
+; VI-NEXT: v_bfi_b32 v0, s0, v4, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -1305,11 +1305,11 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: s_and_b32 s0, s4, 0xffff
+; CI-NEXT: s_mov_b32 s0, 0xffff
+; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT: v_or_b32_e32 v0, s0, v0
+; CI-NEXT: v_bfi_b32 v0, s0, v4, v0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -1415,11 +1415,11 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: s_mov_b32 s0, 0xffff
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; VI-NEXT: v_or_b32_e32 v1, s0, v1
+; VI-NEXT: v_bfi_b32 v1, s0, v4, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -1435,11 +1435,11 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: s_and_b32 s0, s4, 0xffff
+; CI-NEXT: s_mov_b32 s0, 0xffff
+; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT: v_or_b32_e32 v1, s0, v1
+; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -1545,11 +1545,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: s_mov_b32 s0, 0xffff
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; VI-NEXT: v_or_b32_e32 v1, s0, v1
+; VI-NEXT: v_bfi_b32 v1, s0, v4, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -1565,11 +1565,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: s_and_b32 s0, s4, 0xffff
+; CI-NEXT: s_mov_b32 s0, 0xffff
+; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT: v_or_b32_e32 v1, s0, v1
+; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 3a9fe209a0ca..8cc7ddb63332 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -905,10 +905,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2
+; SI-NEXT: s_mov_b32 s4, 0xffff0000
+; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
@@ -978,10 +977,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out,
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, -7, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, -7, v2
+; SI-NEXT: s_mov_b32 s4, 0xffff0000
+; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
@@ -1052,10 +1050,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2
+; SI-NEXT: s_mov_b32 s4, 0xffff0000
+; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xff850000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
@@ -1127,10 +1124,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out,
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, -7, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, -7, v2
+; SI-NEXT: s_mov_b32 s4, 0xffff
+; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -1406,10 +1402,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2
+; SI-NEXT: s_mov_b32 s4, 0xffff0000
+; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
@@ -1547,10 +1542,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2
+; SI-NEXT: s_mov_b32 s4, 0xffff
+; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -1619,10 +1613,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2
+; SI-NEXT: s_mov_b32 s4, 0xffff0000
+; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
@@ -1760,10 +1753,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %o
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2
+; SI-NEXT: s_mov_b32 s4, 0xffff
+; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -1831,10 +1823,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)*
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffc400, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0xffffc400, v2
+; SI-NEXT: s_mov_b32 s4, 0xffff0000
+; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xc4000000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
@@ -1906,10 +1897,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4400, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4400, v2
+; SI-NEXT: s_mov_b32 s4, 0xffff0000
+; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44000000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
@@ -1981,10 +1971,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)*
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4000, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4000, v2
+; SI-NEXT: s_mov_b32 s4, 0xffff0000
+; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 2.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
@@ -2056,10 +2045,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffc000, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0xffffc000, v2
+; SI-NEXT: s_mov_b32 s4, 0xffff0000
+; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, -2.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
More information about the llvm-commits mailing list