[llvm] [NVPTX] Prefer prmt.b32 over bfi.b32 (PR #110766)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 1 16:49:40 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-nvptx
Author: Justin Fargnoli (justinfargnoli)
<details>
<summary>Changes</summary>
In [[NVPTX] Improve lowering of v4i8](https://github.com/llvm/llvm-project/commit/cbafb6f2f5c99474164dcc725820cbbeb2e02e14) @<!-- -->Artem-B added the ability to lower ISD::BUILD_VECTOR with bfi PTX instructions. @<!-- -->Artem-B did this because: ([source](https://github.com/llvm/llvm-project/pull/67866#discussion_r1343066911))
Under the hood byte extraction/insertion ends up as BFI/BFE instructions, so we may as well do that in PTX, too. https://godbolt.org/z/Tb3zWbj9b
However, the example that @<!-- -->Artem-B linked was targeting sm_52. On modern architectures, ptxas uses prmt.b32. [Example](https://godbolt.org/z/Ye4W1n84o).
Thus, remove all uses of NVPTXISD::BFI in favor of NVPTXISD::PRMT.
---
Patch is 41.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110766.diff
3 Files Affected:
- (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+17-14)
- (modified) llvm/test/CodeGen/NVPTX/i8x4-instructions.ll (+191-191)
- (modified) llvm/test/CodeGen/NVPTX/sext-setcc.ll (+6-6)
``````````diff
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8812136733fb24..b6fc8eff56d6ea 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2328,20 +2328,23 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
// to optimize calculation of constant parts.
if (VT == MVT::v4i8) {
- SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
- SDValue E01 = DAG.getNode(
- NVPTXISD::BFI, DL, MVT::i32,
- DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
- DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
- SDValue E012 =
- DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
- DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
- E01, DAG.getConstant(16, DL, MVT::i32), C8);
- SDValue E0123 =
- DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
- DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
- E012, DAG.getConstant(24, DL, MVT::i32), C8);
- return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
+ SDValue PRMT__10 = DAG.getNode(
+ NVPTXISD::PRMT, DL, MVT::v4i8,
+ {DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32),
+ DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
+ DAG.getConstant(0x3340, DL, MVT::i32),
+ DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+ SDValue PRMT_210 = DAG.getNode(
+ NVPTXISD::PRMT, DL, MVT::v4i8,
+ {PRMT__10, DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
+ DAG.getConstant(0x3410, DL, MVT::i32),
+ DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+ SDValue PRMT3210 = DAG.getNode(
+ NVPTXISD::PRMT, DL, MVT::v4i8,
+ {PRMT_210, DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
+ DAG.getConstant(0x4210, DL, MVT::i32),
+ DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+ return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210);
}
return Op;
}
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 96a4359d0ec43e..fdc25cf95d06af 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -101,38 +101,38 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_add(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<13>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r2, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r2, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 13120;
; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
+; CHECK-NEXT: prmt.b32 %r13, %r9, %r12, 13328;
; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10;
; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
-; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8;
+; CHECK-NEXT: prmt.b32 %r17, %r13, %r16, 16912;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
; CHECK-NEXT: ret;
%r = add <4 x i8> %a, %b
@@ -143,29 +143,29 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 {
; CHECK-LABEL: test_add_imm_0(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<9>;
-; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-NEXT: .reg .b32 %r<13>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 2;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
-; CHECK-NEXT: add.s16 %rs4, %rs3, 2;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120;
; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 3;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r6, %r8, 13328;
; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: add.s16 %rs8, %rs7, 4;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r9, %r11, 16912;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12;
; CHECK-NEXT: ret;
%r = add <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
@@ -176,29 +176,29 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 {
; CHECK-LABEL: test_add_imm_1(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<9>;
-; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-NEXT: .reg .b32 %r<13>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 2;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
-; CHECK-NEXT: add.s16 %rs4, %rs3, 2;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120;
; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 3;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r6, %r8, 13328;
; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: add.s16 %rs8, %rs7, 4;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r9, %r11, 16912;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12;
; CHECK-NEXT: ret;
%r = add <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
@@ -209,38 +209,38 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_sub(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<13>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r2, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r2, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 0, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 13120;
; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
+; CHECK-NEXT: prmt.b32 %r13, %r9, %r12, 13328;
; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10;
; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
-; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8;
+; CHECK-NEXT: prmt.b32 %r17, %r13, %r16, 16912;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
; CHECK-NEXT: ret;
%r = sub <4 x i8> %a, %b
@@ -251,7 +251,7 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_smax(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<27>;
+; CHECK-NEXT: .reg .b32 %r<26>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1];
@@ -262,27 +262,27 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8;
; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8;
; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8;
-; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT: bfe.s32 %r7, %r2, 0, 8;
+; CHECK-NEXT: bfe.s32 %r8, %r1, 0, 8;
; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8;
+; CHECK-NEXT: bfe.s32 %r9, %r2, 8, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r1, 8, 8;
; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9;
; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8;
; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8;
-; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8;
-; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r13, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r15, %r2, 8, 8;
; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4;
-; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r17, %r2, 0, 8;
; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3;
-; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 13120;
; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8;
; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2;
-; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8;
+; CHECK-NEXT: prmt.b32 %r22, %r19, %r21, 13328;
; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8;
; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1;
-; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8;
+; CHECK-NEXT: prmt.b32 %r25, %r22, %r24, 16912;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25;
; CHECK-NEXT: ret;
%cmp = icmp sgt <4 x i8> %a, %b
@@ -294,7 +294,7 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_umax(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1];
@@ -305,19 +305,19 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8;
; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r8, %r1, 0, 8;
; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r9, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 8, 8;
; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9;
; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4;
; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3;
-; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 13120;
; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2;
-; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
+; CHECK-NEXT: prmt.b32 %r15, %r13, %r14, 13328;
; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1;
-; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
+; CHECK-NEXT: prmt.b32 %r17, %r15, %r16, 16912;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
; CHECK-NEXT: ret;
%cmp = icmp ugt <4 x i8> %a, %b
@@ -329,7 +329,7 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_smin(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<27>;
+; CHECK-NEXT: .reg .b32 %r<26>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1];
@@ -340,27 +340,27 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8;
; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8;
; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8;
-; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT: bfe.s32 %r7, %r2, 0, 8;
+; CHECK-NEXT: bfe.s32 %r8, %r1, 0, 8;
; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8;
+; CHECK-NEXT: bfe.s32 %r9, %r2, 8, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r1, 8, 8;
; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9;
; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8;
; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8;
-; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8;
-; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r13, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r15, %r2, 8, 8;
; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4;
-; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r17, %r2, 0, 8;
; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3;
-; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 13120;
; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8;
; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2;
-; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8;
+; CHECK-NEXT: prmt.b32 %r22, %r19, %r21, 13328;
; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8;
; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1;
-; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8;
+; CHECK-NEXT: prmt.b32 %r25, %r22, %r24, 16912;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25;
; CHECK-NEXT: ret;
%cmp = icmp sle <4 x i8> %a, %b
@@ -372,7 +372,7 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_umin(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1];
@@ -383,19 +383,19 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8;
; CHECK-NEXT: setp.ls.u32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r8, %r1, 0, 8;
; CHECK-NEXT: setp.ls.u32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r9, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 8, 8;
; CHECK-NEXT: setp.ls.u32 %p4, %r10, %r9;
; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4;
; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3;
-; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 13120;
; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2;
-; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
+; CHECK-NEXT: prmt.b32 %r15, %r13, %r14, 13328;
; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1;
-; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
+; CHECK-NEXT: prmt.b32 %r17, %r15, %r16, 16912;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
; CHECK-NEXT: ret;
%cmp = icmp ule <4 x i8> %a, %b
@@ -407,7 +407,7 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
; CHECK-LABEL: test_eq(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<24>;
+; CHECK-NEXT: .reg .b32 %r<23>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r3, [test_eq_param_2];
@@ -419,23 +419,23 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8;
; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8;
; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10;
-; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8;
+; CHECK-NEXT: bfe.u32 %r12, %r3, 8, 8;
; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4;
-; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r3, 0, 8;
; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3;
-; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 13120;
; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8;
; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2;
-; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r16, %r18, 13328;
; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8;
; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1;
-; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8;
+; CHECK-NEXT: prmt.b32 %r22, %r19, %r21, 16912;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22;
; CHECK-NEXT: ret;
%cmp = icmp eq <4 x i8> %a, %b
@@ -447,7 +447,7 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
; CHECK-LABEL: test_ne(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<24>;
+; CHECK-NEXT: .reg .b32 %r<23>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2];
@@ -459,23 +459,23 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8;
; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8;
; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10;
-; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8;
+; CHECK-NEXT: bfe.u32 %r12, %r3, 8, 8;
; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4;
-; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r3, 0, 8;
; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3;
-; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 13120;
; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8;
; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2;
-; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r16, %r18, 13328;
; CHECK-NEXT: bfe.u32 %...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/110766
More information about the llvm-commits
mailing list