[llvm] [AMDGPU][True16][CodeGen] Update and/or/xor codegen pattern for i16 (PR #121835)

Mon Jan 6 13:00:07 PST 2025

https://github.com/broxigarchen created https://github.com/llvm/llvm-project/pull/121835

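In the true16 flow, remove the and/or/xor patterns that match i16 operands to 32-bit instructions. The i16 patterns are now guarded by the NotHasTrue16BitInsts and UseFakeTrue16Insts predicates, while the v2i16 patterns are kept without that guard.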

>From 452e86b066b1e2b233be418bdaa57f920d1430d1 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 6 Jan 2025 14:05:03 -0500
Subject: [PATCH] Update and/or/xor codegen pattern for i16

---
 llvm/lib/Target/AMDGPU/VOP2Instructions.td | 24 ++++++++++++++++++----
 llvm/test/CodeGen/AMDGPU/uaddsat.ll        |  8 +++-----
 llvm/test/CodeGen/AMDGPU/usubsat.ll        |  5 ++---
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index ca4a0fa706c301..6bbf19179b7f6c 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1261,23 +1261,39 @@ class ZExt_i16_i1_Pat <SDNode ext> : GCNPat <
                      $src)
 >;
 
-foreach vt = [i16, v2i16] in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
-  (and vt:$src0, vt:$src1),
+  (and i16:$src0, i16:$src1),
   (V_AND_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
 >;
 
 def : GCNPat <
-  (or vt:$src0, vt:$src1),
+  (or i16:$src0, i16:$src1),
   (V_OR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
 >;
 
 def : GCNPat <
-  (xor vt:$src0, vt:$src1),
+  (xor i16:$src0, i16:$src1),
   (V_XOR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
 >;
 }
 
+def : GCNPat <
+  (and v2i16:$src0, v2i16:$src1),
+  (V_AND_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
+>;
+
+def : GCNPat <
+  (or v2i16:$src0, v2i16:$src1),
+  (V_OR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
+>;
+
+def : GCNPat <
+  (xor v2i16:$src0, v2i16:$src1),
+  (V_XOR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
+>;
+
 let Predicates = [Has16BitInsts, isGFX8GFX9] in {
 
 // Undo sub x, c -> add x, -c canonicalization since c is more likely
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index 2775de29368fbb..572793e1c5d711 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -42,12 +42,10 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX11-TRUE16-LABEL: v_uaddsat_i8:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_min_u16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 775602ab80cde0..75866e33da23a8 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -39,9 +39,8 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX11-TRUE16-LABEL: v_usubsat_i8:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, v0.h clamp
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;