[llvm] [NVPTX] Prefer prmt.b32 over bfi.b32 (PR #110766)

via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 1 16:49:40 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-nvptx

Author: Justin Fargnoli (justinfargnoli)

<details>
<summary>Changes</summary>

In [[NVPTX] Improve lowering of v4i8](https://github.com/llvm/llvm-project/commit/cbafb6f2f5c99474164dcc725820cbbeb2e02e14) @<!-- -->Artem-B add the ability to lower ISD::BUILD_VECTOR with bfi PTX instructions. @<!-- -->Artem-B did this because: ([source](https://github.com/llvm/llvm-project/pull/67866#discussion_r1343066911))

Under the hood byte extraction/insertion ends up as BFI/BFE instructions, so we may as well do that in PTX, too. https://godbolt.org/z/Tb3zWbj9b

However, the example that @<!-- -->Artem-B linked was targeting sm_52. On modern architectures, ptxas uses prmt.b32. [Example](https://godbolt.org/z/Ye4W1n84o).
Thus, remove all uses of NVPTXISD::BFI in favor of NVPTXISD::PRMT.

---

Patch is 41.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110766.diff


3 Files Affected:

- (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+17-14) 
- (modified) llvm/test/CodeGen/NVPTX/i8x4-instructions.ll (+191-191) 
- (modified) llvm/test/CodeGen/NVPTX/sext-setcc.ll (+6-6) 


``````````diff
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8812136733fb24..b6fc8eff56d6ea 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2328,20 +2328,23 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
     // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
     // to optimize calculation of constant parts.
     if (VT == MVT::v4i8) {
-      SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
-      SDValue E01 = DAG.getNode(
-          NVPTXISD::BFI, DL, MVT::i32,
-          DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
-          DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
-      SDValue E012 =
-          DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
-                      DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
-                      E01, DAG.getConstant(16, DL, MVT::i32), C8);
-      SDValue E0123 =
-          DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
-                      DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
-                      E012, DAG.getConstant(24, DL, MVT::i32), C8);
-      return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
+      SDValue PRMT__10 = DAG.getNode(
+          NVPTXISD::PRMT, DL, MVT::v4i8,
+          {DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32),
+           DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
+           DAG.getConstant(0x3340, DL, MVT::i32),
+           DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+      SDValue PRMT_210 = DAG.getNode(
+          NVPTXISD::PRMT, DL, MVT::v4i8,
+          {PRMT__10, DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
+           DAG.getConstant(0x3410, DL, MVT::i32),
+           DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+      SDValue PRMT3210 = DAG.getNode(
+          NVPTXISD::PRMT, DL, MVT::v4i8,
+          {PRMT_210, DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
+           DAG.getConstant(0x4210, DL, MVT::i32),
+           DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+      return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210);
     }
     return Op;
   }
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 96a4359d0ec43e..fdc25cf95d06af 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -101,38 +101,38 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-LABEL: test_add(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<13>;
-; CHECK-NEXT:    .reg .b32 %r<19>;
+; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_add_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_add_param_0];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, %rs1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT:    bfe.u32 %r6, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r6, %r2, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, %rs4;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfi.b32 %r9, %r8, %r5, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 13120;
 ; CHECK-NEXT:    bfe.u32 %r10, %r2, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    bfe.u32 %r11, %r1, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs8, %r11;
 ; CHECK-NEXT:    add.s16 %rs9, %rs8, %rs7;
 ; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT:    bfi.b32 %r13, %r12, %r9, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r13, %r9, %r12, 13328;
 ; CHECK-NEXT:    bfe.u32 %r14, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs10, %r14;
 ; CHECK-NEXT:    bfe.u32 %r15, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r15;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, %rs10;
 ; CHECK-NEXT:    cvt.u32.u16 %r16, %rs12;
-; CHECK-NEXT:    bfi.b32 %r17, %r16, %r13, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r17, %r13, %r16, 16912;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r17;
 ; CHECK-NEXT:    ret;
   %r = add <4 x i8> %a, %b
@@ -143,29 +143,29 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 {
 ; CHECK-LABEL: test_add_imm_0(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NEXT:    .reg .b32 %r<14>;
+; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_add_imm_0_param_0];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r2, %r1, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 2;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
-; CHECK-NEXT:    add.s16 %rs4, %rs3, 2;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT:    bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 13120;
 ; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 3;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r6, %r8, 13328;
 ; CHECK-NEXT:    bfe.u32 %r10, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 4;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT:    bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r9, %r11, 16912;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r12;
 ; CHECK-NEXT:    ret;
   %r = add <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
@@ -176,29 +176,29 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 {
 ; CHECK-LABEL: test_add_imm_1(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NEXT:    .reg .b32 %r<14>;
+; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_add_imm_1_param_0];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r2, %r1, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 2;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
-; CHECK-NEXT:    add.s16 %rs4, %rs3, 2;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT:    bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 13120;
 ; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 3;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r6, %r8, 13328;
 ; CHECK-NEXT:    bfe.u32 %r10, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 4;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT:    bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r9, %r11, 16912;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r12;
 ; CHECK-NEXT:    ret;
   %r = add <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
@@ -209,38 +209,38 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-LABEL: test_sub(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<13>;
-; CHECK-NEXT:    .reg .b32 %r<19>;
+; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_sub_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_sub_param_0];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
 ; CHECK-NEXT:    sub.s16 %rs3, %rs2, %rs1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT:    bfe.u32 %r6, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r6, %r2, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    sub.s16 %rs6, %rs5, %rs4;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfi.b32 %r9, %r8, %r5, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 13120;
 ; CHECK-NEXT:    bfe.u32 %r10, %r2, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    bfe.u32 %r11, %r1, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs8, %r11;
 ; CHECK-NEXT:    sub.s16 %rs9, %rs8, %rs7;
 ; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT:    bfi.b32 %r13, %r12, %r9, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r13, %r9, %r12, 13328;
 ; CHECK-NEXT:    bfe.u32 %r14, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs10, %r14;
 ; CHECK-NEXT:    bfe.u32 %r15, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r15;
 ; CHECK-NEXT:    sub.s16 %rs12, %rs11, %rs10;
 ; CHECK-NEXT:    cvt.u32.u16 %r16, %rs12;
-; CHECK-NEXT:    bfi.b32 %r17, %r16, %r13, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r17, %r13, %r16, 16912;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r17;
 ; CHECK-NEXT:    ret;
   %r = sub <4 x i8> %a, %b
@@ -251,7 +251,7 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-LABEL: test_smax(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b32 %r<27>;
+; CHECK-NEXT:    .reg .b32 %r<26>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_smax_param_1];
@@ -262,27 +262,27 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    bfe.s32 %r5, %r2, 16, 8;
 ; CHECK-NEXT:    bfe.s32 %r6, %r1, 16, 8;
 ; CHECK-NEXT:    setp.gt.s32 %p2, %r6, %r5;
-; CHECK-NEXT:    bfe.s32 %r7, %r2, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT:    bfe.s32 %r7, %r2, 0, 8;
+; CHECK-NEXT:    bfe.s32 %r8, %r1, 0, 8;
 ; CHECK-NEXT:    setp.gt.s32 %p3, %r8, %r7;
-; CHECK-NEXT:    bfe.s32 %r9, %r2, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r1, 0, 8;
+; CHECK-NEXT:    bfe.s32 %r9, %r2, 8, 8;
+; CHECK-NEXT:    bfe.s32 %r10, %r1, 8, 8;
 ; CHECK-NEXT:    setp.gt.s32 %p4, %r10, %r9;
 ; CHECK-NEXT:    bfe.u32 %r11, %r1, 24, 8;
 ; CHECK-NEXT:    bfe.u32 %r12, %r1, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r13, %r1, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r15, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r13, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r2, 8, 8;
 ; CHECK-NEXT:    selp.b32 %r16, %r14, %r15, %p4;
-; CHECK-NEXT:    bfe.u32 %r17, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r17, %r2, 0, 8;
 ; CHECK-NEXT:    selp.b32 %r18, %r13, %r17, %p3;
-; CHECK-NEXT:    bfi.b32 %r19, %r18, %r16, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r18, %r16, 13120;
 ; CHECK-NEXT:    bfe.u32 %r20, %r2, 16, 8;
 ; CHECK-NEXT:    selp.b32 %r21, %r12, %r20, %p2;
-; CHECK-NEXT:    bfi.b32 %r22, %r21, %r19, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r22, %r19, %r21, 13328;
 ; CHECK-NEXT:    bfe.u32 %r23, %r2, 24, 8;
 ; CHECK-NEXT:    selp.b32 %r24, %r11, %r23, %p1;
-; CHECK-NEXT:    bfi.b32 %r25, %r24, %r22, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r25, %r22, %r24, 16912;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r25;
 ; CHECK-NEXT:    ret;
   %cmp = icmp sgt <4 x i8> %a, %b
@@ -294,7 +294,7 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-LABEL: test_umax(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b32 %r<19>;
+; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_umax_param_1];
@@ -305,19 +305,19 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
 ; CHECK-NEXT:    bfe.u32 %r6, %r1, 16, 8;
 ; CHECK-NEXT:    setp.hi.u32 %p2, %r6, %r5;
-; CHECK-NEXT:    bfe.u32 %r7, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r8, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r8, %r1, 0, 8;
 ; CHECK-NEXT:    setp.hi.u32 %p3, %r8, %r7;
-; CHECK-NEXT:    bfe.u32 %r9, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r10, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r1, 8, 8;
 ; CHECK-NEXT:    setp.hi.u32 %p4, %r10, %r9;
 ; CHECK-NEXT:    selp.b32 %r11, %r10, %r9, %p4;
 ; CHECK-NEXT:    selp.b32 %r12, %r8, %r7, %p3;
-; CHECK-NEXT:    bfi.b32 %r13, %r12, %r11, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r13, %r12, %r11, 13120;
 ; CHECK-NEXT:    selp.b32 %r14, %r6, %r5, %p2;
-; CHECK-NEXT:    bfi.b32 %r15, %r14, %r13, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r15, %r13, %r14, 13328;
 ; CHECK-NEXT:    selp.b32 %r16, %r4, %r3, %p1;
-; CHECK-NEXT:    bfi.b32 %r17, %r16, %r15, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r17, %r15, %r16, 16912;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r17;
 ; CHECK-NEXT:    ret;
   %cmp = icmp ugt <4 x i8> %a, %b
@@ -329,7 +329,7 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-LABEL: test_smin(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b32 %r<27>;
+; CHECK-NEXT:    .reg .b32 %r<26>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_smin_param_1];
@@ -340,27 +340,27 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    bfe.s32 %r5, %r2, 16, 8;
 ; CHECK-NEXT:    bfe.s32 %r6, %r1, 16, 8;
 ; CHECK-NEXT:    setp.le.s32 %p2, %r6, %r5;
-; CHECK-NEXT:    bfe.s32 %r7, %r2, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT:    bfe.s32 %r7, %r2, 0, 8;
+; CHECK-NEXT:    bfe.s32 %r8, %r1, 0, 8;
 ; CHECK-NEXT:    setp.le.s32 %p3, %r8, %r7;
-; CHECK-NEXT:    bfe.s32 %r9, %r2, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r1, 0, 8;
+; CHECK-NEXT:    bfe.s32 %r9, %r2, 8, 8;
+; CHECK-NEXT:    bfe.s32 %r10, %r1, 8, 8;
 ; CHECK-NEXT:    setp.le.s32 %p4, %r10, %r9;
 ; CHECK-NEXT:    bfe.u32 %r11, %r1, 24, 8;
 ; CHECK-NEXT:    bfe.u32 %r12, %r1, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r13, %r1, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r15, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r13, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r2, 8, 8;
 ; CHECK-NEXT:    selp.b32 %r16, %r14, %r15, %p4;
-; CHECK-NEXT:    bfe.u32 %r17, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r17, %r2, 0, 8;
 ; CHECK-NEXT:    selp.b32 %r18, %r13, %r17, %p3;
-; CHECK-NEXT:    bfi.b32 %r19, %r18, %r16, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r18, %r16, 13120;
 ; CHECK-NEXT:    bfe.u32 %r20, %r2, 16, 8;
 ; CHECK-NEXT:    selp.b32 %r21, %r12, %r20, %p2;
-; CHECK-NEXT:    bfi.b32 %r22, %r21, %r19, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r22, %r19, %r21, 13328;
 ; CHECK-NEXT:    bfe.u32 %r23, %r2, 24, 8;
 ; CHECK-NEXT:    selp.b32 %r24, %r11, %r23, %p1;
-; CHECK-NEXT:    bfi.b32 %r25, %r24, %r22, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r25, %r22, %r24, 16912;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r25;
 ; CHECK-NEXT:    ret;
   %cmp = icmp sle <4 x i8> %a, %b
@@ -372,7 +372,7 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-LABEL: test_umin(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b32 %r<19>;
+; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_umin_param_1];
@@ -383,19 +383,19 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
 ; CHECK-NEXT:    bfe.u32 %r6, %r1, 16, 8;
 ; CHECK-NEXT:    setp.ls.u32 %p2, %r6, %r5;
-; CHECK-NEXT:    bfe.u32 %r7, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r8, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r8, %r1, 0, 8;
 ; CHECK-NEXT:    setp.ls.u32 %p3, %r8, %r7;
-; CHECK-NEXT:    bfe.u32 %r9, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r10, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r1, 8, 8;
 ; CHECK-NEXT:    setp.ls.u32 %p4, %r10, %r9;
 ; CHECK-NEXT:    selp.b32 %r11, %r10, %r9, %p4;
 ; CHECK-NEXT:    selp.b32 %r12, %r8, %r7, %p3;
-; CHECK-NEXT:    bfi.b32 %r13, %r12, %r11, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r13, %r12, %r11, 13120;
 ; CHECK-NEXT:    selp.b32 %r14, %r6, %r5, %p2;
-; CHECK-NEXT:    bfi.b32 %r15, %r14, %r13, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r15, %r13, %r14, 13328;
 ; CHECK-NEXT:    selp.b32 %r16, %r4, %r3, %p1;
-; CHECK-NEXT:    bfi.b32 %r17, %r16, %r15, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r17, %r15, %r16, 16912;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r17;
 ; CHECK-NEXT:    ret;
   %cmp = icmp ule <4 x i8> %a, %b
@@ -407,7 +407,7 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
 ; CHECK-LABEL: test_eq(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b32 %r<24>;
+; CHECK-NEXT:    .reg .b32 %r<23>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r3, [test_eq_param_2];
@@ -419,23 +419,23 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
 ; CHECK-NEXT:    bfe.u32 %r6, %r2, 16, 8;
 ; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
 ; CHECK-NEXT:    setp.eq.u32 %p2, %r7, %r6;
-; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r8, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
 ; CHECK-NEXT:    setp.eq.u32 %p3, %r9, %r8;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r11, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r11, %r1, 8, 8;
 ; CHECK-NEXT:    setp.eq.u32 %p4, %r11, %r10;
-; CHECK-NEXT:    bfe.u32 %r12, %r3, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 8, 8;
 ; CHECK-NEXT:    selp.b32 %r13, %r11, %r12, %p4;
-; CHECK-NEXT:    bfe.u32 %r14, %r3, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r3, 0, 8;
 ; CHECK-NEXT:    selp.b32 %r15, %r9, %r14, %p3;
-; CHECK-NEXT:    bfi.b32 %r16, %r15, %r13, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r15, %r13, 13120;
 ; CHECK-NEXT:    bfe.u32 %r17, %r3, 16, 8;
 ; CHECK-NEXT:    selp.b32 %r18, %r7, %r17, %p2;
-; CHECK-NEXT:    bfi.b32 %r19, %r18, %r16, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r16, %r18, 13328;
 ; CHECK-NEXT:    bfe.u32 %r20, %r3, 24, 8;
 ; CHECK-NEXT:    selp.b32 %r21, %r5, %r20, %p1;
-; CHECK-NEXT:    bfi.b32 %r22, %r21, %r19, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r22, %r19, %r21, 16912;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r22;
 ; CHECK-NEXT:    ret;
   %cmp = icmp eq <4 x i8> %a, %b
@@ -447,7 +447,7 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
 ; CHECK-LABEL: test_ne(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b32 %r<24>;
+; CHECK-NEXT:    .reg .b32 %r<23>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r3, [test_ne_param_2];
@@ -459,23 +459,23 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
 ; CHECK-NEXT:    bfe.u32 %r6, %r2, 16, 8;
 ; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p2, %r7, %r6;
-; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r8, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p3, %r9, %r8;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r11, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r11, %r1, 8, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p4, %r11, %r10;
-; CHECK-NEXT:    bfe.u32 %r12, %r3, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 8, 8;
 ; CHECK-NEXT:    selp.b32 %r13, %r11, %r12, %p4;
-; CHECK-NEXT:    bfe.u32 %r14, %r3, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r3, 0, 8;
 ; CHECK-NEXT:    selp.b32 %r15, %r9, %r14, %p3;
-; CHECK-NEXT:    bfi.b32 %r16, %r15, %r13, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r15, %r13, 13120;
 ; CHECK-NEXT:    bfe.u32 %r17, %r3, 16, 8;
 ; CHECK-NEXT:    selp.b32 %r18, %r7, %r17, %p2;
-; CHECK-NEXT:    bfi.b32 %r19, %r18, %r16, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r16, %r18, 13328;
 ; CHECK-NEXT:    bfe.u32 %...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/110766


More information about the llvm-commits mailing list