[llvm] [NVPTX] Improve lowering of v2i16 logical ops. (PR #67365)

Artem Belevich via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 25 13:08:33 PDT 2023


https://github.com/Artem-B created https://github.com/llvm/llvm-project/pull/67365

Bitwise logical ops can always be done as b32, regardless of the availability
of other v2i16 ops, which would need a newer GPU.

Includes the missing lowering for the two-register-operand variants of these operations, plus additional tests for `and`.

>From bb46cfdc5b48663f1bc692760340762ebc7e783c Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 21 Sep 2023 16:23:25 -0700
Subject: [PATCH 1/2] [NVPTX] Improve lowering of v2i16 logical ops.

Bitwise logical ops can always be done as b32, regardless of the availability
of other v2i16 ops, which would need a newer GPU.
---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   |  7 +-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       | 11 ++++
 llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 64 +++++++++++++++++++
 3 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 22a72ae24b440a5..b24aae4792ce6a6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -642,10 +642,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
 
   // Other arithmetic and logic ops are unsupported.
-  setOperationAction({ISD::AND, ISD::OR, ISD::XOR, ISD::SDIV, ISD::UDIV,
-                      ISD::SRA, ISD::SRL, ISD::MULHS, ISD::MULHU,
-                      ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP,
-                      ISD::UINT_TO_FP},
+  setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
+                      ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
+                      ISD::SINT_TO_FP, ISD::UINT_TO_FP},
                      MVT::v2i16, Expand);
 
   setOperationAction(ISD::ADDC, MVT::i32, Legal);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index ad10d7938ef12e4..8f88b06372b245d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1486,6 +1486,17 @@ defm OR  : BITWISE<"or", or>;
 defm AND : BITWISE<"and", and>;
 defm XOR : BITWISE<"xor", xor>;
 
+// Lower logical ops as bitwise ops on b32.
+// By this point the constants get legalized into a bitcast from i32, so that's
+// what we need to match here.
+def: Pat<(or Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
+         (ORb32ri Int32Regs:$a, imm:$b)>;
+def: Pat<(xor Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
+         (XORb32ri Int32Regs:$a, imm:$b)>;
+def: Pat<(and Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
+         (ANDb32ri Int32Regs:$a, imm:$b)>;
+
+
 def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
                       "not.pred \t$dst, $src;",
                       [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index 59d1d84b191c31b..d14514cb53d3a93 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -235,6 +235,70 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 {
   ret <2 x i16> %r
 }
 
+;; Logical ops are available on all GPUs as regular 32-bit logical ops
+; COMMON-LABEL: test_or(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_or_param_0];
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_or_param_1];
+; COMMON-NEXT: or.b32          [[R:%r[0-9]+]], [[A]], [[B]];
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 {
+  %r = or <2 x i16> %a, %b
+  ret <2 x i16> %r
+}
+
+; Check that we can lower or with immediate arguments.
+; COMMON-LABEL: test_or_imm_0(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_or_imm_0_param_0];
+; COMMON-NEXT: or.b32          [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 {
+  %r = or <2 x i16> <i16 1, i16 2>, %a
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_or_imm_1(
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_or_imm_1_param_0];
+; COMMON-NEXT: or.b32          [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 {
+  %r = or <2 x i16> %a, <i16 1, i16 2>
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_xor(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_xor_param_0];
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_xor_param_1];
+; COMMON-NEXT: xor.b32         [[R:%r[0-9]+]], [[A]], [[B]];
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 {
+  %r = xor <2 x i16> %a, %b
+  ret <2 x i16> %r
+}
+
+; Check that we can lower xor with immediate arguments.
+; COMMON-LABEL: test_xor_imm_0(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_xor_imm_0_param_0];
+; COMMON-NEXT: xor.b32         [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 {
+  %r = xor <2 x i16> <i16 1, i16 2>, %a
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_xor_imm_1(
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_xor_imm_1_param_0];
+; COMMON-NEXT: xor.b32         [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 {
+  %r = xor <2 x i16> %a, <i16 1, i16 2>
+  ret <2 x i16> %r
+}
 
 ; COMMON-LABEL: .func test_ldst_v2i16(
 ; COMMON-DAG:    ld.param.u64    [[A:%rd[0-9]+]], [test_ldst_v2i16_param_0];

>From d2bac0ce2fd54b2d376a5899af1c598ff28f4dff Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Mon, 25 Sep 2023 13:00:21 -0700
Subject: [PATCH 2/2] Added lowering of the 2-register variants of the operations

Previously we only matched i32 arguments, and were lucky that that's what we
were often getting when an argument was loaded from memory. For computed
arguments, we do need to match v2i16.
---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       | 13 ++-
 llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 81 +++++++++++++++++++
 2 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 8f88b06372b245d..28c4cadb303ad4f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1486,9 +1486,16 @@ defm OR  : BITWISE<"or", or>;
 defm AND : BITWISE<"and", and>;
 defm XOR : BITWISE<"xor", xor>;
 
-// Lower logical ops as bitwise ops on b32.
-// By this point the constants get legalized into a bitcast from i32, so that's
-// what we need to match here.
+// Lower logical v2i16 ops as bitwise ops on b32.
+def: Pat<(or (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
+         (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
+def: Pat<(xor (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
+         (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
+def: Pat<(and (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
+         (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
+
+// The constants get legalized into a bitcast from i32, so that's what we need
+// to match here.
 def: Pat<(or Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
          (ORb32ri Int32Regs:$a, imm:$b)>;
 def: Pat<(xor Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index d14514cb53d3a93..5a22bbcf7416c17 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -247,6 +247,23 @@ define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 {
   ret <2 x i16> %r
 }
 
+; Ops that operate on computed arguments go through a different lowering path
+; compared to the ones that operate on loaded data. So we test them separately.
+; COMMON-LABEL: test_or_computed(
+; COMMON:        ld.param.u16    [[A:%rs[0-9+]]], [test_or_computed_param_0];
+; COMMON-DAG:    mov.u16         [[C0:%rs[0-9]+]], 0;
+; COMMON-DAG:    mov.b32         [[R1:%r[0-9]+]], {[[A]], [[C0]]};
+; COMMON-DAG:    mov.u16         [[C5:%rs[0-9]+]], 5;
+; COMMON-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[A]], [[C5]]};
+; COMMON:        or.b32          [[R:%r[0-9]+]], [[R2]], [[R1]];
+; COMMON-NEXT:   st.param.b32    [func_retval0+0], [[R]];
+define <2 x i16> @test_or_computed(i16 %a) {
+  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
+  %r = or <2 x i16> %ins.1, %ins.0
+  ret <2 x i16> %r
+}
+
 ; Check that we can lower or with immediate arguments.
 ; COMMON-LABEL: test_or_imm_0(
 ; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_or_imm_0_param_0];
@@ -279,6 +296,21 @@ define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 {
   ret <2 x i16> %r
 }
 
+; COMMON-LABEL: test_xor_computed(
+; COMMON:        ld.param.u16    [[A:%rs[0-9+]]], [test_xor_computed_param_0];
+; COMMON-DAG:    mov.u16         [[C0:%rs[0-9]+]], 0;
+; COMMON-DAG:    mov.b32         [[R1:%r[0-9]+]], {[[A]], [[C0]]};
+; COMMON-DAG:    mov.u16         [[C5:%rs[0-9]+]], 5;
+; COMMON-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[A]], [[C5]]};
+; COMMON:        xor.b32         [[R:%r[0-9]+]], [[R2]], [[R1]];
+; COMMON-NEXT:   st.param.b32    [func_retval0+0], [[R]];
+define <2 x i16> @test_xor_computed(i16 %a) {
+  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
+  %r = xor <2 x i16> %ins.1, %ins.0
+  ret <2 x i16> %r
+}
+
 ; Check that we can lower xor with immediate arguments.
 ; COMMON-LABEL: test_xor_imm_0(
 ; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_xor_imm_0_param_0];
@@ -300,6 +332,55 @@ define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 {
   ret <2 x i16> %r
 }
 
+; COMMON-LABEL: test_and(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_and_param_0];
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_and_param_1];
+; COMMON-NEXT: and.b32          [[R:%r[0-9]+]], [[A]], [[B]];
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 {
+  %r = and <2 x i16> %a, %b
+  ret <2 x i16> %r
+}
+
+; Ops that operate on computed arguments go through a different lowering path
+; compared to the ones that operate on loaded data. So we test them separately.
+; COMMON-LABEL: test_and_computed(
+; COMMON:        ld.param.u16    [[A:%rs[0-9+]]], [test_and_computed_param_0];
+; COMMON-DAG:    mov.u16         [[C0:%rs[0-9]+]], 0;
+; COMMON-DAG:    mov.b32         [[R1:%r[0-9]+]], {[[A]], [[C0]]};
+; COMMON-DAG:    mov.u16         [[C5:%rs[0-9]+]], 5;
+; COMMON-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[A]], [[C5]]};
+; COMMON:        and.b32          [[R:%r[0-9]+]], [[R2]], [[R1]];
+; COMMON-NEXT:   st.param.b32    [func_retval0+0], [[R]];
+define <2 x i16> @test_and_computed(i16 %a) {
+  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
+  %r = and <2 x i16> %ins.1, %ins.0
+  ret <2 x i16> %r
+}
+
+; Check that we can lower and with immediate arguments.
+; COMMON-LABEL: test_and_imm_0(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_and_imm_0_param_0];
+; COMMON-NEXT: and.b32          [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 {
+  %r = and <2 x i16> <i16 1, i16 2>, %a
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_and_imm_1(
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_and_imm_1_param_0];
+; COMMON-NEXT: and.b32          [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_and_imm_1(<2 x i16> %a) #0 {
+  %r = and <2 x i16> %a, <i16 1, i16 2>
+  ret <2 x i16> %r
+}
+
 ; COMMON-LABEL: .func test_ldst_v2i16(
 ; COMMON-DAG:    ld.param.u64    [[A:%rd[0-9]+]], [test_ldst_v2i16_param_0];
 ; COMMON-DAG:    ld.param.u64    [[B:%rd[0-9]+]], [test_ldst_v2i16_param_1];



More information about the llvm-commits mailing list