[llvm] d71924f - [AMDGPU] Improve v2i32/v2f32 insertelt patterns
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 5 16:14:00 PDT 2021
Author: Stanislav Mekhanoshin
Date: 2021-08-05T16:13:39-07:00
New Revision: d71924fbfef2efd343388d189309bfb1e38e8021
URL: https://github.com/llvm/llvm-project/commit/d71924fbfef2efd343388d189309bfb1e38e8021
DIFF: https://github.com/llvm/llvm-project/commit/d71924fbfef2efd343388d189309bfb1e38e8021.diff
LOG: [AMDGPU] Improve v2i32/v2f32 insertelt patterns
Using REG_SEQUENCE produces better code than INSERT_SUBREG,
we can omit one move instruction in many cases.
Fixes: SWDEV-298028
Differential Revision: https://reviews.llvm.org/D107602
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index fbf4634bfc94c..7455c0b2fb799 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1017,22 +1017,33 @@ def : GCNPat <
/********** Extraction, Insertion, Building and Casting **********/
/********** ============================================ **********/
-foreach Index = 0-2 in {
- def Extract_Element_v2i32_#Index : Extract_Element <
- i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
+// Special case for 2 element vectors. REQ_SEQUENCE produces better code
+// than an INSERT_SUBREG.
+multiclass Insert_Element_V2<RegisterClass RC, ValueType elem_type, ValueType vec_type> {
+ def : GCNPat <
+ (insertelt vec_type:$vec, elem_type:$elem, 0),
+ (REG_SEQUENCE RC, $elem, sub0, (elem_type (EXTRACT_SUBREG $vec, sub1)), sub1)
+ >;
+
+ def : GCNPat <
+ (insertelt vec_type:$vec, elem_type:$elem, 1),
+ (REG_SEQUENCE RC, (elem_type (EXTRACT_SUBREG $vec, sub0)), sub0, $elem, sub1)
>;
- def Insert_Element_v2i32_#Index : Insert_Element <
+}
+
+foreach Index = 0-1 in {
+ def Extract_Element_v2i32_#Index : Extract_Element <
i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v2f32_#Index : Extract_Element <
f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
>;
- def Insert_Element_v2f32_#Index : Insert_Element <
- f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
- >;
}
+defm : Insert_Element_V2 <SReg_64, i32, v2i32>;
+defm : Insert_Element_V2 <SReg_64, f32, v2f32>;
+
foreach Index = 0-2 in {
def Extract_Element_v3i32_#Index : Extract_Element <
i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 9c8733bacfe95..aaa014eb50c1d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -11,11 +11,10 @@ define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out,
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -24,11 +23,10 @@ define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
@@ -42,12 +40,11 @@ define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out,
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s5, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -55,12 +52,11 @@ define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s5, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
@@ -73,11 +69,10 @@ define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_movk_i32 s4, 0x3e7
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -86,11 +81,10 @@ define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_movk_i32 s4, 0x3e7
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v0, 0x3e7
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
@@ -104,12 +98,11 @@ define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_movk_i32 s5, 0x3e7
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v1, 0x3e7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -117,12 +110,11 @@ define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_movk_i32 s5, 0x3e7
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_mov_b32_e32 v1, 0x3e7
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <2 x i32> %a, i32 999, i32 1
More information about the llvm-commits
mailing list