[llvm] Reland "[NVPTX] Prefer prmt.b32 over bfi.b32" (PR #114326)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 30 16:53:24 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-nvptx
Author: Justin Fargnoli (justinfargnoli)
<details>
<summary>Changes</summary>
Fix [failure](https://github.com/llvm/llvm-project/pull/110766#discussion_r1796832635) identified by @<!-- -->akuegel.
---
In [[NVPTX] Improve lowering of v4i8](https://github.com/llvm/llvm-project/commit/cbafb6f2f5c99474164dcc725820cbbeb2e02e14), @<!-- -->Artem-B added the ability to lower ISD::BUILD_VECTOR with bfi PTX instructions. @<!-- -->Artem-B did this because (https://github.com/llvm/llvm-project/pull/67866#discussion_r1343066911):
> Under the hood byte extraction/insertion ends up as BFI/BFE instructions, so we may as well do that in PTX, too. https://godbolt.org/z/Tb3zWbj9b
However, the example that @<!-- -->Artem-B linked was targeting sm_52. On modern architectures, ptxas uses prmt.b32. [Example](https://godbolt.org/z/Ye4W1n84o).
Thus, remove uses of NVPTXISD::BFI in favor of NVPTXISD::PRMT.
---
Patch is 48.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114326.diff
3 Files Affected:
- (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+21-20)
- (modified) llvm/test/CodeGen/NVPTX/i8x4-instructions.ll (+309-305)
- (modified) llvm/test/CodeGen/NVPTX/sext-setcc.ll (+9-9)
``````````diff
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 01abf9591e342f..4b4ad7c8e2c4c4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2318,32 +2318,33 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
EVT VT = Op->getValueType(0);
if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
return Op;
-
SDLoc DL(Op);
if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
isa<ConstantFPSDNode>(Operand);
})) {
+ if (VT != MVT::v4i8)
+ return Op;
// Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
// to optimize calculation of constant parts.
- if (VT == MVT::v4i8) {
- SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
- SDValue E01 = DAG.getNode(
- NVPTXISD::BFI, DL, MVT::i32,
- DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
- DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
- SDValue E012 =
- DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
- DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
- E01, DAG.getConstant(16, DL, MVT::i32), C8);
- SDValue E0123 =
- DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
- DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
- E012, DAG.getConstant(24, DL, MVT::i32), C8);
- return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
- }
- return Op;
+ auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
+ uint64_t SelectionValue) -> SDValue {
+ SDValue L = Left;
+ SDValue R = Right;
+ if (Cast) {
+ L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
+ R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
+ }
+ return DAG.getNode(
+ NVPTXISD::PRMT, DL, MVT::v4i8, {L, R,
+ DAG.getConstant(SelectionValue, DL, MVT::i32),
+ DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+ };
+ auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
+ auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
+ auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
+ return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210);
}
// Get value or the Nth operand as an APInt(32). Undef values treated as 0.
@@ -2374,8 +2375,8 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
} else {
llvm_unreachable("Unsupported type");
}
- SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
- return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
+ SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
+ return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
}
SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 5b5662a1eea766..a16a5b435962df 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -101,38 +101,38 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_add(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<13>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 13120;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
-; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
-; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
+; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs10, %r13;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r14;
; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10;
-; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
-; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs12;
+; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120;
+; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520;
; CHECK-NEXT: st.param.b32 [func_retval0], %r17;
; CHECK-NEXT: ret;
%r = add <4 x i8> %a, %b
@@ -143,29 +143,29 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 {
; CHECK-LABEL: test_add_imm_0(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<9>;
-; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-NEXT: .reg .b32 %r<13>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 4;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
-; CHECK-NEXT: add.s16 %rs4, %rs3, 2;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 3;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
-; CHECK-NEXT: add.s16 %rs6, %rs5, 3;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 2;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT: add.s16 %rs8, %rs7, 4;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520;
; CHECK-NEXT: st.param.b32 [func_retval0], %r12;
; CHECK-NEXT: ret;
%r = add <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
@@ -176,29 +176,29 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 {
; CHECK-LABEL: test_add_imm_1(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<9>;
-; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-NEXT: .reg .b32 %r<13>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 4;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
-; CHECK-NEXT: add.s16 %rs4, %rs3, 2;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 3;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
-; CHECK-NEXT: add.s16 %rs6, %rs5, 3;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 2;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT: add.s16 %rs8, %rs7, 4;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520;
; CHECK-NEXT: st.param.b32 [func_retval0], %r12;
; CHECK-NEXT: ret;
%r = add <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
@@ -209,38 +209,38 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_sub(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<13>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 13120;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
-; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
-; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
+; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs10, %r13;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r14;
; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10;
-; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
-; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs12;
+; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120;
+; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520;
; CHECK-NEXT: st.param.b32 [func_retval0], %r17;
; CHECK-NEXT: ret;
%r = sub <4 x i8> %a, %b
@@ -251,38 +251,38 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_smax(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<27>;
+; CHECK-NEXT: .reg .b32 %r<26>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0];
-; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8;
+; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
+; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3;
-; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8;
-; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8;
+; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8;
+; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8;
; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8;
-; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8;
+; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8;
; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8;
+; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8;
; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9;
-; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8;
-; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8;
-; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8;
-; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8;
; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4;
-; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8;
; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3;
-; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8;
-; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 13120;
+; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8;
; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2;
-; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8;
-; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8;
-; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1;
-; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8;
+; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8;
+; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520;
; CHECK-NEXT: st.param.b32 [func_retval0], %r25;
; CHECK-NEXT: ret;
%cmp = icmp sgt <4 x i8> %a, %b
@@ -294,30 +294,30 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_umax(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
-; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8;
; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8;
+; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8;
; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9;
; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4;
; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3;
-; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 13120;
; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2;
-; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
-; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1;
-; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
+; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1;
+; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120;
+; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520;
; CHECK-NEXT: st.param.b32 [func_retval0], %r17;
; CHECK-NEXT: ret;
%cmp = icmp ugt <4 x i8> %a, %b
@@ -329,38 +329,38 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_smin(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<27>;
+; CHECK-NEXT: .reg .b32 %r<26>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0];
-; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8;
+; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
+; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3;
-; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8;
-; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8;
+; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8;
+; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8;
; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8;
-; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8;
+; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8;
; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8;
+; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8;
; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9;
-; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8;
-; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8;
-; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8;
-; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8;
; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4;
-; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8;
; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3;
-; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8;
-; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 13120;
+; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8;
; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2;
-; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8;
-; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8;
-; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1;
-; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8;
+; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8;
+; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520;
; CHECK-NEXT: st.param.b32 [func_retval0], %r25;
; CHECK-NEXT: ret;
%cmp = icmp sle <4 x i8> %a, %b
@@ -372,30 +372,30 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_umin(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1];
; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
; CHECK-NEXT: setp.ls.u32 %p1, %r4, %r3;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
-; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8;
+; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8;
; CHECK-NEXT: setp.ls.u32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8;
+; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8;
; CHECK-NEXT: setp.ls.u32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8;
+; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
; CHECK-NEXT: setp.ls.u32 %p4, %r10, %r9;
; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4;
; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3;
-; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 13120;
; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2;
-; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
-; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1;
-; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
+; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1;
+; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120;
+; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520;
; CHECK-NEXT: st.pa...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/114326
More information about the llvm-commits
mailing list