[llvm] [NVPTX] Use PRMT more widely, and improve folding around this instruction (PR #148261)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 11 10:15:11 PDT 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/148261
>From 8550a5cc3fee054a6c70f2fb502cf284aa29d895 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Fri, 11 Jul 2025 16:37:21 +0000
Subject: [PATCH] [NVPTX] Use PRMT more widely, and improve folding around this
instruction
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 85 ++--
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 6 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 89 ++--
.../test/CodeGen/NVPTX/LoadStoreVectorizer.ll | 126 ++---
llvm/test/CodeGen/NVPTX/extractelement.ll | 64 +--
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 402 ++++++++--------
llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll | 32 +-
llvm/test/CodeGen/NVPTX/ldg-invariant.ll | 24 +-
llvm/test/CodeGen/NVPTX/load-store-vectors.ll | 448 +++++++++---------
llvm/test/CodeGen/NVPTX/sext-setcc.ll | 28 +-
10 files changed, 651 insertions(+), 653 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bb0aeb493ed48..7a5588e314656 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -57,6 +57,7 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -1070,7 +1071,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::StoreV8)
MAKE_CASE(NVPTXISD::FSHL_CLAMP)
MAKE_CASE(NVPTXISD::FSHR_CLAMP)
- MAKE_CASE(NVPTXISD::BFE)
MAKE_CASE(NVPTXISD::BFI)
MAKE_CASE(NVPTXISD::PRMT)
MAKE_CASE(NVPTXISD::FCOPYSIGN)
@@ -2145,14 +2145,14 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
EVT VectorVT = Vector.getValueType();
if (VectorVT == MVT::v4i8) {
- SDValue BFE =
- DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
- {Vector,
- DAG.getNode(ISD::MUL, DL, MVT::i32,
- DAG.getZExtOrTrunc(Index, DL, MVT::i32),
- DAG.getConstant(8, DL, MVT::i32)),
- DAG.getConstant(8, DL, MVT::i32)});
- return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
+ SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
+ DAG.getZExtOrTrunc(Index, DL, MVT::i32),
+ DAG.getConstant(0x7770, DL, MVT::i32));
+ SDValue PRMT = DAG.getNode(
+ NVPTXISD::PRMT, DL, MVT::i32,
+ {DAG.getBitcast(MVT::i32, Vector), DAG.getConstant(0, DL, MVT::i32),
+ Selector, DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+ return DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
}
// Constant index will be matched by tablegen.
@@ -5206,31 +5206,6 @@ static SDValue PerformANDCombine(SDNode *N,
SDValue AExt;
- // Convert BFE-> truncate i16 -> and 255
- // To just BFE-> truncate i16, as the value already has all the bits in the
- // right places.
- if (Val.getOpcode() == ISD::TRUNCATE) {
- SDValue BFE = Val.getOperand(0);
- if (BFE.getOpcode() != NVPTXISD::BFE)
- return SDValue();
-
- ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
- if (!BFEBits)
- return SDValue();
- uint64_t BFEBitsVal = BFEBits->getZExtValue();
-
- ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
- if (!MaskCnst) {
- // Not an AND with a constant
- return SDValue();
- }
- uint64_t MaskVal = MaskCnst->getZExtValue();
-
- if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
- return SDValue();
- // If we get here, the AND is unnecessary. Just replace it with the trunc
- DCI.CombineTo(N, Val, false);
- }
// Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
if (Val.getOpcode() == ISD::ANY_EXTEND) {
AExt = Val;
@@ -6334,3 +6309,45 @@ MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
return getDataSection();
}
+
+static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
+ const SelectionDAG &DAG, unsigned Depth) {
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+ ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ unsigned Mode = Op.getConstantOperandVal(3);
+
+ if (Mode != NVPTX::PTXPrmtMode::NONE || !Selector)
+ return;
+
+ KnownBits AKnown = DAG.computeKnownBits(A, Depth);
+ KnownBits BKnown = DAG.computeKnownBits(B, Depth);
+
+ // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
+ KnownBits BitField = BKnown.concat(AKnown);
+
+ APInt SelectorVal = Selector->getAPIntValue();
+ for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) {
+ APInt Sel = SelectorVal.extractBits(4, I * 4);
+ unsigned Idx = Sel.getLoBits(3).getZExtValue();
+ unsigned Sign = Sel.getHiBits(1).getZExtValue();
+ KnownBits Byte = BitField.extractBits(8, Idx * 8);
+ if (Sign)
+ Byte = KnownBits::ashr(Byte, 8);
+ Known.insertBits(Byte, I * 8);
+ }
+}
+
+void NVPTXTargetLowering::computeKnownBitsForTargetNode(
+ const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth) const {
+ Known.resetAll();
+
+ switch (Op.getOpcode()) {
+ case NVPTXISD::PRMT:
+ computeKnownBitsForPRMT(Op, Known, DAG, Depth);
+ break;
+ default:
+ break;
+ }
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 2477e1fb61595..bc3548c0272bb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -50,7 +50,6 @@ enum NodeType : unsigned {
MUL_WIDE_UNSIGNED,
SETP_F16X2,
SETP_BF16X2,
- BFE,
BFI,
PRMT,
@@ -272,6 +271,11 @@ class NVPTXTargetLowering : public TargetLowering {
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const override;
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
private:
const NVPTXSubtarget &STI; // cache the subtarget here
mutable unsigned GlobalUniqueCallSite;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index dcdebb81e3c86..6913b68453574 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1359,11 +1359,6 @@ def BREV64 :
// restriction in PTX?
//
// dest and src may be int32 or int64, but start and end are always int32.
-def SDTBFE :
- SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
- SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
-def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;
-
def SDTBFI :
SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
@@ -1374,22 +1369,13 @@ def SDTPRMT :
SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
-multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
+multiclass BFE<string Instr, RegisterClass RC> {
def rrr
- : BasicNVPTXInst<(outs RC:$d),
- (ins RC:$a, B32:$b, B32:$c),
- Instr,
- [(set T:$d, (bfe T:$a, i32:$b, i32:$c))]>;
+ : BasicNVPTXInst<(outs RC:$d), (ins RC:$a, B32:$b, B32:$c), Instr>;
def rri
- : BasicNVPTXInst<(outs RC:$d),
- (ins RC:$a, B32:$b, i32imm:$c),
- Instr,
- [(set T:$d, (bfe T:$a, i32:$b, imm:$c))]>;
+ : BasicNVPTXInst<(outs RC:$d), (ins RC:$a, B32:$b, i32imm:$c), Instr>;
def rii
- : BasicNVPTXInst<(outs RC:$d),
- (ins RC:$a, i32imm:$b, i32imm:$c),
- Instr,
- [(set T:$d, (bfe T:$a, imm:$b, imm:$c))]>;
+ : BasicNVPTXInst<(outs RC:$d), (ins RC:$a, i32imm:$b, i32imm:$c), Instr>;
}
multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
@@ -1434,10 +1420,10 @@ let hasSideEffects = false in {
// the same patterns, so the first one wins. Having unsigned byte extraction
// has the benefit of always having zero in unused bits, which makes some
// optimizations easier (e.g. no need to mask them).
- defm BFE_U32 : BFE<"bfe.u32", i32, B32>;
- defm BFE_S32 : BFE<"bfe.s32", i32, B32>;
- defm BFE_U64 : BFE<"bfe.u64", i64, B64>;
- defm BFE_S64 : BFE<"bfe.s64", i64, B64>;
+ defm BFE_U32 : BFE<"bfe.u32", B32>;
+ defm BFE_S32 : BFE<"bfe.s32", B32>;
+ defm BFE_U64 : BFE<"bfe.u64", B64>;
+ defm BFE_S64 : BFE<"bfe.s64", B64>;
defm BFI_B32 : BFI<"bfi.b32", i32, B32, i32imm>;
defm BFI_B64 : BFI<"bfi.b64", i64, B64, i64imm>;
@@ -1474,19 +1460,26 @@ def : Pat<(fshr i32:$hi, i32:$lo, (shl i32:$amt, (i32 3))),
(PRMT_B32rrr $lo, $hi, $amt, PrmtF4E)>;
+def byte_extract_prmt : ImmLeaf<i32, [{
+ return (Imm == 0x7770) || (Imm == 0x7771) || (Imm == 0x7772) || (Imm == 0x7773);
+}]>;
+
+def to_sign_extend_selector : SDNodeXForm<imm, [{
+ const APInt &V = N->getAPIntValue();
+ const APInt B = V.trunc(4);
+ const APInt BSext = B | 8;
+ const APInt R = BSext.concat(BSext).concat(BSext).concat(B).zext(32);
+ return CurDAG->getTargetConstant(R, SDLoc(N), MVT::i32);
+}]>;
+
+
// byte extraction + signed/unsigned extension to i32.
-def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)),
- (BFE_S32rri $s, $o, 8)>;
-def : Pat<(i32 (sext_inreg (bfe i32:$s, imm:$o, 8), i8)),
- (BFE_S32rii $s, imm:$o, 8)>;
-def : Pat<(i32 (and (bfe i32:$s, i32:$o, 8), 255)),
- (BFE_U32rri $s, $o, 8)>;
-def : Pat<(i32 (and (bfe i32:$s, imm:$o, 8), 255)),
- (BFE_U32rii $s, imm:$o, 8)>;
+def : Pat<(i32 (sext_inreg (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtNONE), i8)),
+ (PRMT_B32rii $s, 0, (to_sign_extend_selector $sel), PrmtNONE)>;
// byte extraction + signed extension to i16
-def : Pat<(i16 (sext_inreg (trunc (bfe i32:$s, imm:$o, 8)), i8)),
- (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
+def : Pat<(i16 (sext_inreg (trunc (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtNONE)), i8)),
+ (CVT_u16_u32 (PRMT_B32rii $s, 0, (to_sign_extend_selector $sel), PrmtNONE), CvtNONE)>;
// Byte extraction via shift/trunc/sext
@@ -1699,25 +1692,33 @@ def cond_not_signed : PatLeaf<(cond), [{
// comparisons of i8 extracted with BFE as i32
// It's faster to do comparison directly on i32 extracted by BFE,
// instead of the long conversion and sign extending.
-def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (bfe B32:$a, B32:$oa, 8))), i8)),
- (i16 (sext_inreg (i16 (trunc (bfe B32:$b, B32:$ob, 8))), i8)),
+def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), i8)),
+ (i16 (sext_inreg (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), i8)),
cond_signed:$cc),
- (SETP_i32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), (cond2cc $cc))>;
+ (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
+ (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
+ (cond2cc $cc))>;
-def: Pat<(setcc (i16 (sext_inreg (trunc (bfe B32:$a, imm:$oa, 8)), i8)),
- (i16 (sext_inreg (trunc (bfe B32:$b, imm:$ob, 8)), i8)),
+def: Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)),
+ (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)),
cond_signed:$cc),
- (SETP_i32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), (cond2cc $cc))>;
+ (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
+ (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
+ (cond2cc $cc))>;
-def: Pat<(setcc (i16 (and (trunc (bfe B32:$a, B32:$oa, 8)), 255)),
- (i16 (and (trunc (bfe B32:$b, B32:$ob, 8)), 255)),
+def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
+ (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
cond_signed:$cc),
- (SETP_i32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), (cond2cc $cc))>;
+ (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
+ (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
+ (cond2cc $cc))>;
-def: Pat<(setcc (i16 (and (trunc (bfe B32:$a, imm:$oa, 8)), 255)),
- (i16 (and (trunc (bfe B32:$b, imm:$ob, 8)), 255)),
+def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
+ (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
cond_not_signed:$cc),
- (SETP_i32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), (cond2cc $cc))>;
+ (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
+ (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
+ (cond2cc $cc))>;
def SDTDeclareArrayParam :
SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 1207c429524ca..23832a9cb5c58 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -178,38 +178,38 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.b64 %rd1, [combine_v16i8_param_0];
; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r7, %r4, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r8, %r4, 0, 0x7770U;
+; ENABLED-NEXT: prmt.b32 %r9, %r3, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r10, %r3, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r11, %r3, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r12, %r3, 0, 0x7770U;
+; ENABLED-NEXT: prmt.b32 %r13, %r2, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r14, %r2, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r15, %r2, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r16, %r2, 0, 0x7770U;
+; ENABLED-NEXT: prmt.b32 %r17, %r1, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r18, %r1, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r20, %r1, 0, 0x7770U;
; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_param_1];
-; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r7, %r1, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r8, %r1, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r9, %r2, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r10, %r2, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r11, %r2, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r12, %r2, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r13, %r3, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r14, %r3, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r15, %r3, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r16, %r3, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r17, %r4, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r18, %r4, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r19, %r4, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r20, %r4, 24, 8;
-; ENABLED-NEXT: add.s32 %r21, %r5, %r6;
-; ENABLED-NEXT: add.s32 %r22, %r21, %r7;
-; ENABLED-NEXT: add.s32 %r23, %r22, %r8;
-; ENABLED-NEXT: add.s32 %r24, %r23, %r9;
-; ENABLED-NEXT: add.s32 %r25, %r24, %r10;
-; ENABLED-NEXT: add.s32 %r26, %r25, %r11;
-; ENABLED-NEXT: add.s32 %r27, %r26, %r12;
-; ENABLED-NEXT: add.s32 %r28, %r27, %r13;
-; ENABLED-NEXT: add.s32 %r29, %r28, %r14;
-; ENABLED-NEXT: add.s32 %r30, %r29, %r15;
-; ENABLED-NEXT: add.s32 %r31, %r30, %r16;
-; ENABLED-NEXT: add.s32 %r32, %r31, %r17;
-; ENABLED-NEXT: add.s32 %r33, %r32, %r18;
-; ENABLED-NEXT: add.s32 %r34, %r33, %r19;
-; ENABLED-NEXT: add.s32 %r35, %r34, %r20;
+; ENABLED-NEXT: add.s32 %r21, %r20, %r19;
+; ENABLED-NEXT: add.s32 %r22, %r21, %r18;
+; ENABLED-NEXT: add.s32 %r23, %r22, %r17;
+; ENABLED-NEXT: add.s32 %r24, %r23, %r16;
+; ENABLED-NEXT: add.s32 %r25, %r24, %r15;
+; ENABLED-NEXT: add.s32 %r26, %r25, %r14;
+; ENABLED-NEXT: add.s32 %r27, %r26, %r13;
+; ENABLED-NEXT: add.s32 %r28, %r27, %r12;
+; ENABLED-NEXT: add.s32 %r29, %r28, %r11;
+; ENABLED-NEXT: add.s32 %r30, %r29, %r10;
+; ENABLED-NEXT: add.s32 %r31, %r30, %r9;
+; ENABLED-NEXT: add.s32 %r32, %r31, %r8;
+; ENABLED-NEXT: add.s32 %r33, %r32, %r7;
+; ENABLED-NEXT: add.s32 %r34, %r33, %r6;
+; ENABLED-NEXT: add.s32 %r35, %r34, %r5;
; ENABLED-NEXT: st.b32 [%rd2], %r35;
; ENABLED-NEXT: ret;
;
@@ -329,39 +329,39 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.b64 %rd1, [combine_v16i8_unaligned_param_0];
; ENABLED-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
+; ENABLED-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r4, %r2, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r6, %r2, 0, 0x7770U;
+; ENABLED-NEXT: prmt.b32 %r7, %r1, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r9, %r1, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r10, %r1, 0, 0x7770U;
; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_unaligned_param_1];
-; ENABLED-NEXT: ld.v2.b32 {%r3, %r4}, [%rd1+8];
-; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r7, %r1, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r8, %r1, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r9, %r2, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r10, %r2, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r11, %r2, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r12, %r2, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r13, %r3, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r14, %r3, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r15, %r3, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r16, %r3, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r17, %r4, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r18, %r4, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r19, %r4, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r20, %r4, 24, 8;
-; ENABLED-NEXT: add.s32 %r21, %r5, %r6;
-; ENABLED-NEXT: add.s32 %r22, %r21, %r7;
-; ENABLED-NEXT: add.s32 %r23, %r22, %r8;
-; ENABLED-NEXT: add.s32 %r24, %r23, %r9;
-; ENABLED-NEXT: add.s32 %r25, %r24, %r10;
-; ENABLED-NEXT: add.s32 %r26, %r25, %r11;
-; ENABLED-NEXT: add.s32 %r27, %r26, %r12;
-; ENABLED-NEXT: add.s32 %r28, %r27, %r13;
-; ENABLED-NEXT: add.s32 %r29, %r28, %r14;
-; ENABLED-NEXT: add.s32 %r30, %r29, %r15;
-; ENABLED-NEXT: add.s32 %r31, %r30, %r16;
-; ENABLED-NEXT: add.s32 %r32, %r31, %r17;
-; ENABLED-NEXT: add.s32 %r33, %r32, %r18;
-; ENABLED-NEXT: add.s32 %r34, %r33, %r19;
-; ENABLED-NEXT: add.s32 %r35, %r34, %r20;
+; ENABLED-NEXT: ld.v2.b32 {%r11, %r12}, [%rd1+8];
+; ENABLED-NEXT: prmt.b32 %r13, %r12, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r14, %r12, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r15, %r12, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r16, %r12, 0, 0x7770U;
+; ENABLED-NEXT: prmt.b32 %r17, %r11, 0, 0x7773U;
+; ENABLED-NEXT: prmt.b32 %r18, %r11, 0, 0x7772U;
+; ENABLED-NEXT: prmt.b32 %r19, %r11, 0, 0x7771U;
+; ENABLED-NEXT: prmt.b32 %r20, %r11, 0, 0x7770U;
+; ENABLED-NEXT: add.s32 %r21, %r10, %r9;
+; ENABLED-NEXT: add.s32 %r22, %r21, %r8;
+; ENABLED-NEXT: add.s32 %r23, %r22, %r7;
+; ENABLED-NEXT: add.s32 %r24, %r23, %r6;
+; ENABLED-NEXT: add.s32 %r25, %r24, %r5;
+; ENABLED-NEXT: add.s32 %r26, %r25, %r4;
+; ENABLED-NEXT: add.s32 %r27, %r26, %r3;
+; ENABLED-NEXT: add.s32 %r28, %r27, %r20;
+; ENABLED-NEXT: add.s32 %r29, %r28, %r19;
+; ENABLED-NEXT: add.s32 %r30, %r29, %r18;
+; ENABLED-NEXT: add.s32 %r31, %r30, %r17;
+; ENABLED-NEXT: add.s32 %r32, %r31, %r16;
+; ENABLED-NEXT: add.s32 %r33, %r32, %r15;
+; ENABLED-NEXT: add.s32 %r34, %r33, %r14;
+; ENABLED-NEXT: add.s32 %r35, %r34, %r13;
; ENABLED-NEXT: st.b32 [%rd2], %r35;
; ENABLED-NEXT: ret;
;
diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll
index e04732ebad66b..80980efbab05b 100644
--- a/llvm/test/CodeGen/NVPTX/extractelement.ll
+++ b/llvm/test/CodeGen/NVPTX/extractelement.ll
@@ -60,14 +60,14 @@ define i16 @test_v4i8(i32 %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_param_0];
-; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs1, %r2;
-; CHECK-NEXT: bfe.s32 %r3, %r1, 8, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs2, %r3;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 16, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs3, %r4;
-; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs4, %r5;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x8880U;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r3;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r5;
; CHECK-NEXT: add.s16 %rs5, %rs1, %rs2;
; CHECK-NEXT: add.s16 %rs6, %rs3, %rs4;
; CHECK-NEXT: add.s16 %rs7, %rs5, %rs6;
@@ -96,10 +96,10 @@ define i32 @test_v4i8_s32(i32 %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_s32_param_0];
-; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
-; CHECK-NEXT: bfe.s32 %r3, %r1, 8, 8;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 16, 8;
-; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x8880U;
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U;
+; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U;
; CHECK-NEXT: add.s32 %r6, %r2, %r3;
; CHECK-NEXT: add.s32 %r7, %r4, %r5;
; CHECK-NEXT: add.s32 %r8, %r6, %r7;
@@ -127,10 +127,10 @@ define i32 @test_v4i8_u32(i32 %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_u32_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
-; CHECK-NEXT: bfe.u32 %r3, %r1, 8, 8;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
-; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7770U;
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7773U;
; CHECK-NEXT: add.s32 %r6, %r2, %r3;
; CHECK-NEXT: add.s32 %r7, %r4, %r5;
; CHECK-NEXT: add.s32 %r8, %r6, %r7;
@@ -161,22 +161,22 @@ define i16 @test_v8i8(i64 %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v8i8_param_0];
-; CHECK-NEXT: bfe.s32 %r3, %r1, 0, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs1, %r3;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs2, %r4;
-; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs3, %r5;
-; CHECK-NEXT: bfe.s32 %r6, %r1, 24, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs4, %r6;
-; CHECK-NEXT: bfe.s32 %r7, %r2, 0, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs5, %r7;
-; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs6, %r8;
-; CHECK-NEXT: bfe.s32 %r9, %r2, 16, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs7, %r9;
-; CHECK-NEXT: bfe.s32 %r10, %r2, 24, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs8, %r10;
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x8880U;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
+; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xaaa2U;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0xbbb3U;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
+; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x8880U;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x9991U;
+; CHECK-NEXT: cvt.u16.u32 %rs6, %r8;
+; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0xaaa2U;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0xbbb3U;
+; CHECK-NEXT: cvt.u16.u32 %rs8, %r10;
; CHECK-NEXT: add.s16 %rs9, %rs1, %rs2;
; CHECK-NEXT: add.s16 %rs10, %rs3, %rs4;
; CHECK-NEXT: add.s16 %rs11, %rs5, %rs6;
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 328da60a1f783..6a76cd15c3219 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -29,7 +29,7 @@ define i8 @test_extract_0(<4 x i8> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_extract_0_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7770U;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%e = extractelement <4 x i8> %a, i32 0
@@ -43,7 +43,7 @@ define i8 @test_extract_1(<4 x i8> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_extract_1_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7771U;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%e = extractelement <4 x i8> %a, i32 1
@@ -57,7 +57,7 @@ define i8 @test_extract_2(<4 x i8> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_extract_2_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7772U;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%e = extractelement <4 x i8> %a, i32 2
@@ -71,7 +71,7 @@ define i8 @test_extract_3(<4 x i8> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_extract_3_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%e = extractelement <4 x i8> %a, i32 3
@@ -88,8 +88,8 @@ define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 {
; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_extract_i_param_0];
; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
-; CHECK-NEXT: shl.b32 %r3, %r2, 3;
-; CHECK-NEXT: bfe.u32 %r4, %r1, %r3, 8;
+; CHECK-NEXT: or.b32 %r3, %r2, 30576;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, %r3;
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
%e = extractelement <4 x i8> %a, i64 %idx
@@ -105,28 +105,28 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r2, [test_add_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_add_param_0];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs10, %r13;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r14;
; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs12;
@@ -146,20 +146,20 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_add_imm_0_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 4;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 3;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 2;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
@@ -179,20 +179,20 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_add_imm_1_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 4;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 3;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 2;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
@@ -213,28 +213,28 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r2, [test_sub_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_sub_param_0];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs10, %r13;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r14;
; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs12;
@@ -250,39 +250,31 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_smax(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<26>;
+; CHECK-NEXT: .reg .b32 %r<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r2, [test_smax_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_smax_param_0];
-; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3;
-; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8;
-; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8;
-; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
+; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9;
-; CHECK-NEXT: bfe.u32 %r11, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r12, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r13, %r2, 16, 8;
-; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8;
-; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8;
-; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p4;
-; CHECK-NEXT: bfe.u32 %r17, %r1, 16, 8;
-; CHECK-NEXT: selp.b32 %r18, %r17, %r13, %p3;
-; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r20, %r1, 8, 8;
-; CHECK-NEXT: selp.b32 %r21, %r20, %r12, %p2;
-; CHECK-NEXT: bfe.u32 %r22, %r1, 0, 8;
-; CHECK-NEXT: selp.b32 %r23, %r22, %r11, %p1;
-; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
-; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r25;
+; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4;
+; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
+; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2;
+; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1;
+; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r17;
; CHECK-NEXT: ret;
%cmp = icmp sgt <4 x i8> %a, %b
%r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -298,17 +290,17 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r2, [test_umax_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_umax_param_0];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
; CHECK-NEXT: setp.gt.u32 %p1, %r4, %r3;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
; CHECK-NEXT: setp.gt.u32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8;
-; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
; CHECK-NEXT: setp.gt.u32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
+; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
; CHECK-NEXT: setp.gt.u32 %p4, %r10, %r9;
; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4;
; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3;
@@ -328,39 +320,31 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-LABEL: test_smin(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<26>;
+; CHECK-NEXT: .reg .b32 %r<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r2, [test_smin_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_smin_param_0];
-; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3;
-; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8;
-; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8;
-; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8;
-; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
+; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9;
-; CHECK-NEXT: bfe.u32 %r11, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r12, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r13, %r2, 16, 8;
-; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8;
-; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8;
-; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p4;
-; CHECK-NEXT: bfe.u32 %r17, %r1, 16, 8;
-; CHECK-NEXT: selp.b32 %r18, %r17, %r13, %p3;
-; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r20, %r1, 8, 8;
-; CHECK-NEXT: selp.b32 %r21, %r20, %r12, %p2;
-; CHECK-NEXT: bfe.u32 %r22, %r1, 0, 8;
-; CHECK-NEXT: selp.b32 %r23, %r22, %r11, %p1;
-; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U;
-; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r25;
+; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4;
+; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U;
+; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2;
+; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1;
+; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r17;
; CHECK-NEXT: ret;
%cmp = icmp sle <4 x i8> %a, %b
%r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -376,17 +360,17 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r2, [test_umin_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_umin_param_0];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
; CHECK-NEXT: setp.le.u32 %p1, %r4, %r3;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
; CHECK-NEXT: setp.le.u32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8;
-; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
; CHECK-NEXT: setp.le.u32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
+; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
; CHECK-NEXT: setp.le.u32 %p4, %r10, %r9;
; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4;
; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3;
@@ -412,26 +396,26 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
; CHECK-NEXT: ld.param.b32 %r3, [test_eq_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [test_eq_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_eq_param_0];
-; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x7770U;
+; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U;
; CHECK-NEXT: setp.eq.b32 %p1, %r5, %r4;
-; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT: setp.eq.b32 %p2, %r7, %r6;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7772U;
; CHECK-NEXT: setp.eq.b32 %p3, %r9, %r8;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8;
-; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7773U;
+; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7773U;
; CHECK-NEXT: setp.eq.b32 %p4, %r11, %r10;
-; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U;
; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4;
-; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r3, 0, 0x7772U;
; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3;
; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r17, %r3, 0, 0x7771U;
; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2;
-; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r3, 0, 0x7770U;
; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1;
; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U;
; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U;
@@ -452,26 +436,26 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
; CHECK-NEXT: ld.param.b32 %r3, [test_ne_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [test_ne_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_ne_param_0];
-; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x7770U;
+; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U;
; CHECK-NEXT: setp.ne.b32 %p1, %r5, %r4;
-; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT: setp.ne.b32 %p2, %r7, %r6;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7772U;
; CHECK-NEXT: setp.ne.b32 %p3, %r9, %r8;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8;
-; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7773U;
+; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7773U;
; CHECK-NEXT: setp.ne.b32 %p4, %r11, %r10;
-; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U;
; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4;
-; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r3, 0, 0x7772U;
; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3;
; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r17, %r3, 0, 0x7771U;
; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2;
-; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r3, 0, 0x7770U;
; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1;
; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U;
; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U;
@@ -491,28 +475,28 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r2, [test_mul_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_mul_param_0];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs10, %r13;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r14;
; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs12;
@@ -750,7 +734,7 @@ define void @test_ldst_v3i8(ptr %a, ptr %b) {
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3i8_param_0];
; CHECK-NEXT: ld.b32 %r1, [%rd1];
; CHECK-NEXT: st.b16 [%rd2], %r1;
-; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7772U;
; CHECK-NEXT: st.b8 [%rd2+2], %r2;
; CHECK-NEXT: ret;
%t1 = load <3 x i8>, ptr %a
@@ -918,30 +902,30 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8>
; CHECK-NEXT: ld.param.b32 %r3, [test_select_cc_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_param_0];
-; CHECK-NEXT: bfe.u32 %r5, %r4, 0, 8;
-; CHECK-NEXT: bfe.u32 %r6, %r3, 0, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7770U;
+; CHECK-NEXT: prmt.b32 %r6, %r3, 0, 0x7770U;
; CHECK-NEXT: setp.ne.b32 %p1, %r6, %r5;
-; CHECK-NEXT: bfe.u32 %r7, %r4, 8, 8;
-; CHECK-NEXT: bfe.u32 %r8, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r4, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r8, %r3, 0, 0x7771U;
; CHECK-NEXT: setp.ne.b32 %p2, %r8, %r7;
-; CHECK-NEXT: bfe.u32 %r9, %r4, 16, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r3, 16, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r10, %r3, 0, 0x7772U;
; CHECK-NEXT: setp.ne.b32 %p3, %r10, %r9;
-; CHECK-NEXT: bfe.u32 %r11, %r4, 24, 8;
-; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT: prmt.b32 %r11, %r4, 0, 0x7773U;
+; CHECK-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U;
; CHECK-NEXT: setp.ne.b32 %p4, %r12, %r11;
-; CHECK-NEXT: bfe.u32 %r13, %r2, 24, 8;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x7773U;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4;
-; CHECK-NEXT: bfe.u32 %r16, %r2, 16, 8;
-; CHECK-NEXT: bfe.u32 %r17, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r2, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U;
; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3;
; CHECK-NEXT: prmt.b32 %r19, %r18, %r15, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r21, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r20, %r2, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r21, %r1, 0, 0x7771U;
; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2;
-; CHECK-NEXT: bfe.u32 %r23, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r24, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r23, %r2, 0, 0x7770U;
+; CHECK-NEXT: prmt.b32 %r24, %r1, 0, 0x7770U;
; CHECK-NEXT: selp.b32 %r25, %r24, %r23, %p1;
; CHECK-NEXT: prmt.b32 %r26, %r25, %r22, 0x3340U;
; CHECK-NEXT: prmt.b32 %r27, %r26, %r19, 0x5410U;
@@ -963,17 +947,17 @@ define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b,
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0];
; CHECK-NEXT: ld.param.b32 %r10, [test_select_cc_i32_i8_param_3];
; CHECK-NEXT: ld.param.b32 %r9, [test_select_cc_i32_i8_param_2];
-; CHECK-NEXT: bfe.u32 %r11, %r10, 0, 8;
-; CHECK-NEXT: bfe.u32 %r12, %r9, 0, 8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, 0, 0x7770U;
+; CHECK-NEXT: prmt.b32 %r12, %r9, 0, 0x7770U;
; CHECK-NEXT: setp.ne.b32 %p1, %r12, %r11;
-; CHECK-NEXT: bfe.u32 %r13, %r10, 8, 8;
-; CHECK-NEXT: bfe.u32 %r14, %r9, 8, 8;
+; CHECK-NEXT: prmt.b32 %r13, %r10, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r14, %r9, 0, 0x7771U;
; CHECK-NEXT: setp.ne.b32 %p2, %r14, %r13;
-; CHECK-NEXT: bfe.u32 %r15, %r10, 16, 8;
-; CHECK-NEXT: bfe.u32 %r16, %r9, 16, 8;
+; CHECK-NEXT: prmt.b32 %r15, %r10, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r16, %r9, 0, 0x7772U;
; CHECK-NEXT: setp.ne.b32 %p3, %r16, %r15;
-; CHECK-NEXT: bfe.u32 %r17, %r10, 24, 8;
-; CHECK-NEXT: bfe.u32 %r18, %r9, 24, 8;
+; CHECK-NEXT: prmt.b32 %r17, %r10, 0, 0x7773U;
+; CHECK-NEXT: prmt.b32 %r18, %r9, 0, 0x7773U;
; CHECK-NEXT: setp.ne.b32 %p4, %r18, %r17;
; CHECK-NEXT: selp.b32 %r19, %r4, %r8, %p4;
; CHECK-NEXT: selp.b32 %r20, %r3, %r7, %p3;
@@ -1002,18 +986,18 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b,
; CHECK-NEXT: setp.ne.b32 %p2, %r4, %r8;
; CHECK-NEXT: setp.ne.b32 %p3, %r5, %r9;
; CHECK-NEXT: setp.ne.b32 %p4, %r6, %r10;
-; CHECK-NEXT: bfe.u32 %r11, %r2, 24, 8;
-; CHECK-NEXT: bfe.u32 %r12, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r11, %r2, 0, 0x7773U;
+; CHECK-NEXT: prmt.b32 %r12, %r1, 0, 0x7773U;
; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4;
-; CHECK-NEXT: bfe.u32 %r14, %r2, 16, 8;
-; CHECK-NEXT: bfe.u32 %r15, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r2, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r15, %r1, 0, 0x7772U;
; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3;
; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r18, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r18, %r2, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2;
-; CHECK-NEXT: bfe.u32 %r21, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r22, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r2, 0, 0x7770U;
+; CHECK-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U;
; CHECK-NEXT: selp.b32 %r23, %r22, %r21, %p1;
; CHECK-NEXT: prmt.b32 %r24, %r23, %r20, 0x3340U;
; CHECK-NEXT: prmt.b32 %r25, %r24, %r17, 0x5410U;
@@ -1071,10 +1055,10 @@ define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
-; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
-; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U;
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r5, %r4, %r3, %r2};
; CHECK-NEXT: ret;
%r = zext <4 x i8> %a to <4 x i32>
@@ -1085,24 +1069,20 @@ define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 {
; CHECK-LABEL: test_zext_2xi64(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<6>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u64.u32 %rd1, %r2;
-; CHECK-NEXT: and.b64 %rd2, %rd1, 255;
-; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8;
-; CHECK-NEXT: cvt.u64.u32 %rd3, %r3;
-; CHECK-NEXT: and.b64 %rd4, %rd3, 255;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
-; CHECK-NEXT: cvt.u64.u32 %rd5, %r4;
-; CHECK-NEXT: and.b64 %rd6, %rd5, 255;
-; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8;
-; CHECK-NEXT: cvt.u64.u32 %rd7, %r5;
-; CHECK-NEXT: and.b64 %rd8, %rd7, 255;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
-; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd4, %rd2};
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7772U;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r3;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7771U;
+; CHECK-NEXT: cvt.u64.u32 %rd3, %r4;
+; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U;
+; CHECK-NEXT: cvt.u64.u32 %rd4, %r5;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd2, %rd1};
; CHECK-NEXT: ret;
%r = zext <4 x i8> %a to <4 x i64>
ret <4 x i64> %r
@@ -1308,29 +1288,29 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) {
; CHECK-NEXT: ld.param.b64 %rd1, [test_srem_v4i8_param_0];
; CHECK-NEXT: ld.b32 %r1, [%rd1];
; CHECK-NEXT: ld.b32 %r2, [%rd2];
-; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs1, %r3;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs2, %r4;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0xbbb3U;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xbbb3U;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT: rem.s16 %rs3, %rs2, %rs1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs4, %r6;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs5, %r7;
+; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0xaaa2U;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0xaaa2U;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: rem.s16 %rs6, %rs5, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U;
-; CHECK-NEXT: bfe.s32 %r10, %r2, 8, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs7, %r10;
-; CHECK-NEXT: bfe.s32 %r11, %r1, 8, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs8, %r11;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x9991U;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x9991U;
+; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
; CHECK-NEXT: rem.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT: bfe.s32 %r13, %r2, 0, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs10, %r13;
-; CHECK-NEXT: bfe.s32 %r14, %r1, 0, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs11, %r14;
+; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x8880U;
+; CHECK-NEXT: cvt.u16.u32 %rs10, %r13;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x8880U;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r14;
; CHECK-NEXT: rem.s16 %rs12, %rs11, %rs10;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs12;
; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U;
@@ -1374,16 +1354,16 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6;
; CHECK-NEXT: cvt.u32.u16 %r2, %rs9;
; CHECK-NEXT: ld.s8 %rs10, [%rd2+2];
-; CHECK-NEXT: bfe.s32 %r3, %r2, 8, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs11, %r3;
-; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs12, %r4;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x9991U;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r3;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U;
+; CHECK-NEXT: cvt.u16.u32 %rs12, %r4;
; CHECK-NEXT: rem.s16 %rs13, %rs12, %rs11;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs13;
-; CHECK-NEXT: bfe.s32 %r6, %r2, 0, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs14, %r6;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 0, 8;
-; CHECK-NEXT: cvt.s8.s32 %rs15, %r7;
+; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x8880U;
+; CHECK-NEXT: cvt.u16.u32 %rs14, %r6;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x8880U;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r7;
; CHECK-NEXT: rem.s16 %rs16, %rs15, %rs14;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs16;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U;
@@ -1419,17 +1399,17 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) {
; CHECK-NEXT: ld.param.b64 %rd1, [test_sext_v4i1_to_v4i8_param_0];
; CHECK-NEXT: ld.b32 %r1, [%rd1];
; CHECK-NEXT: ld.b32 %r2, [%rd2];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U;
; CHECK-NEXT: setp.gt.u32 %p1, %r4, %r3;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8;
-; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
+; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
; CHECK-NEXT: setp.gt.u32 %p2, %r6, %r5;
-; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8;
-; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U;
+; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
; CHECK-NEXT: setp.gt.u32 %p3, %r8, %r7;
-; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U;
+; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U;
; CHECK-NEXT: setp.gt.u32 %p4, %r10, %r9;
; CHECK-NEXT: selp.b32 %r11, -1, 0, %p4;
; CHECK-NEXT: selp.b32 %r12, -1, 0, %p3;
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
index d5078f5f19af6..6e42e0006af3c 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
@@ -17,22 +17,22 @@ define i8 @ld_global_v32i8(ptr addrspace(1) %ptr) {
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v32i8_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
-; SM90-NEXT: bfe.u32 %r5, %r4, 0, 8;
+; SM90-NEXT: prmt.b32 %r5, %r4, 0, 0x7770U;
; SM90-NEXT: cvt.u16.u32 %rs1, %r5;
-; SM90-NEXT: bfe.u32 %r6, %r3, 0, 8;
+; SM90-NEXT: prmt.b32 %r6, %r3, 0, 0x7770U;
; SM90-NEXT: cvt.u16.u32 %rs2, %r6;
-; SM90-NEXT: bfe.u32 %r7, %r2, 0, 8;
+; SM90-NEXT: prmt.b32 %r7, %r2, 0, 0x7770U;
; SM90-NEXT: cvt.u16.u32 %rs3, %r7;
-; SM90-NEXT: bfe.u32 %r8, %r1, 0, 8;
+; SM90-NEXT: prmt.b32 %r8, %r1, 0, 0x7770U;
; SM90-NEXT: cvt.u16.u32 %rs4, %r8;
; SM90-NEXT: ld.global.nc.v4.b32 {%r9, %r10, %r11, %r12}, [%rd1];
-; SM90-NEXT: bfe.u32 %r13, %r12, 0, 8;
+; SM90-NEXT: prmt.b32 %r13, %r12, 0, 0x7770U;
; SM90-NEXT: cvt.u16.u32 %rs5, %r13;
-; SM90-NEXT: bfe.u32 %r14, %r11, 0, 8;
+; SM90-NEXT: prmt.b32 %r14, %r11, 0, 0x7770U;
; SM90-NEXT: cvt.u16.u32 %rs6, %r14;
-; SM90-NEXT: bfe.u32 %r15, %r10, 0, 8;
+; SM90-NEXT: prmt.b32 %r15, %r10, 0, 0x7770U;
; SM90-NEXT: cvt.u16.u32 %rs7, %r15;
-; SM90-NEXT: bfe.u32 %r16, %r9, 0, 8;
+; SM90-NEXT: prmt.b32 %r16, %r9, 0, 0x7770U;
; SM90-NEXT: cvt.u16.u32 %rs8, %r16;
; SM90-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.s16 %rs10, %rs6, %rs5;
@@ -54,21 +54,21 @@ define i8 @ld_global_v32i8(ptr addrspace(1) %ptr) {
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v32i8_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
-; SM100-NEXT: bfe.u32 %r9, %r8, 0, 8;
+; SM100-NEXT: prmt.b32 %r9, %r8, 0, 0x7770U;
; SM100-NEXT: cvt.u16.u32 %rs1, %r9;
-; SM100-NEXT: bfe.u32 %r10, %r7, 0, 8;
+; SM100-NEXT: prmt.b32 %r10, %r7, 0, 0x7770U;
; SM100-NEXT: cvt.u16.u32 %rs2, %r10;
-; SM100-NEXT: bfe.u32 %r11, %r6, 0, 8;
+; SM100-NEXT: prmt.b32 %r11, %r6, 0, 0x7770U;
; SM100-NEXT: cvt.u16.u32 %rs3, %r11;
-; SM100-NEXT: bfe.u32 %r12, %r5, 0, 8;
+; SM100-NEXT: prmt.b32 %r12, %r5, 0, 0x7770U;
; SM100-NEXT: cvt.u16.u32 %rs4, %r12;
-; SM100-NEXT: bfe.u32 %r13, %r4, 0, 8;
+; SM100-NEXT: prmt.b32 %r13, %r4, 0, 0x7770U;
; SM100-NEXT: cvt.u16.u32 %rs5, %r13;
-; SM100-NEXT: bfe.u32 %r14, %r3, 0, 8;
+; SM100-NEXT: prmt.b32 %r14, %r3, 0, 0x7770U;
; SM100-NEXT: cvt.u16.u32 %rs6, %r14;
-; SM100-NEXT: bfe.u32 %r15, %r2, 0, 8;
+; SM100-NEXT: prmt.b32 %r15, %r2, 0, 0x7770U;
; SM100-NEXT: cvt.u16.u32 %rs7, %r15;
-; SM100-NEXT: bfe.u32 %r16, %r1, 0, 8;
+; SM100-NEXT: prmt.b32 %r16, %r1, 0, 0x7770U;
; SM100-NEXT: cvt.u16.u32 %rs8, %r16;
; SM100-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.s16 %rs10, %rs6, %rs5;
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
index 3bd46000661ce..0c19490eb0904 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -134,13 +134,13 @@ define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v8i8_param_0];
; CHECK-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
-; CHECK-NEXT: bfe.u32 %r5, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
-; CHECK-NEXT: bfe.u32 %r6, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
; CHECK-NEXT: add.s16 %rs5, %rs4, %rs3;
; CHECK-NEXT: add.s16 %rs6, %rs2, %rs1;
@@ -169,21 +169,21 @@ define i8 @ld_global_v16i8(ptr addrspace(1) %ptr) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v16i8_param_0];
; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r5, %r4, 16, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
-; CHECK-NEXT: bfe.u32 %r6, %r4, 0, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r6;
-; CHECK-NEXT: bfe.u32 %r7, %r3, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r3, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
-; CHECK-NEXT: bfe.u32 %r8, %r3, 0, 8;
+; CHECK-NEXT: prmt.b32 %r8, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r8;
-; CHECK-NEXT: bfe.u32 %r9, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r9;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs6, %r10;
-; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r11;
-; CHECK-NEXT: bfe.u32 %r12, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r12;
; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: add.s16 %rs10, %rs6, %rs5;
diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
index 3c90323da01d7..7e013390a39db 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
@@ -56,20 +56,20 @@ define void @generic_4xi8(ptr %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xi8_param_0];
; CHECK-NEXT: ld.b32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
@@ -93,39 +93,39 @@ define void @generic_8xi8(ptr %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xi8_param_0];
; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
@@ -149,77 +149,77 @@ define void @generic_16xi8(ptr %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_16xi8_param_0];
; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
@@ -473,20 +473,20 @@ define void @generic_volatile_4xi8(ptr %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_4xi8_param_0];
; CHECK-NEXT: ld.volatile.b32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
@@ -510,39 +510,39 @@ define void @generic_volatile_8xi8(ptr %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi8_param_0];
; CHECK-NEXT: ld.volatile.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
@@ -566,77 +566,77 @@ define void @generic_volatile_16xi8(ptr %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_16xi8_param_0];
; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
@@ -873,20 +873,20 @@ define void @global_4xi8(ptr addrspace(1) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_4xi8_param_0];
; CHECK-NEXT: ld.global.b32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
@@ -910,39 +910,39 @@ define void @global_8xi8(ptr addrspace(1) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_8xi8_param_0];
; CHECK-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
@@ -966,77 +966,77 @@ define void @global_16xi8(ptr addrspace(1) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_16xi8_param_0];
; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
@@ -1271,20 +1271,20 @@ define void @global_volatile_4xi8(ptr addrspace(1) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_4xi8_param_0];
; CHECK-NEXT: ld.volatile.global.b32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
@@ -1308,39 +1308,39 @@ define void @global_volatile_8xi8(ptr addrspace(1) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_8xi8_param_0];
; CHECK-NEXT: ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
@@ -1364,77 +1364,77 @@ define void @global_volatile_16xi8(ptr addrspace(1) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_16xi8_param_0];
; CHECK-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
@@ -1671,20 +1671,20 @@ define void @shared_4xi8(ptr addrspace(3) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_4xi8_param_0];
; CHECK-NEXT: ld.shared.b32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
@@ -1708,39 +1708,39 @@ define void @shared_8xi8(ptr addrspace(3) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xi8_param_0];
; CHECK-NEXT: ld.shared.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
@@ -1764,77 +1764,77 @@ define void @shared_16xi8(ptr addrspace(3) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_16xi8_param_0];
; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
@@ -2069,20 +2069,20 @@ define void @shared_volatile_4xi8(ptr addrspace(3) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_4xi8_param_0];
; CHECK-NEXT: ld.volatile.shared.b32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
@@ -2106,39 +2106,39 @@ define void @shared_volatile_8xi8(ptr addrspace(3) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi8_param_0];
; CHECK-NEXT: ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
@@ -2162,77 +2162,77 @@ define void @shared_volatile_16xi8(ptr addrspace(3) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_16xi8_param_0];
; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
@@ -2469,20 +2469,20 @@ define void @local_4xi8(ptr addrspace(5) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_4xi8_param_0];
; CHECK-NEXT: ld.local.b32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
@@ -2506,39 +2506,39 @@ define void @local_8xi8(ptr addrspace(5) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_8xi8_param_0];
; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
@@ -2562,77 +2562,77 @@ define void @local_16xi8(ptr addrspace(5) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_16xi8_param_0];
; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
@@ -2867,20 +2867,20 @@ define void @local_volatile_4xi8(ptr addrspace(5) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xi8_param_0];
; CHECK-NEXT: ld.local.b32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
@@ -2904,39 +2904,39 @@ define void @local_volatile_8xi8(ptr addrspace(5) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xi8_param_0];
; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
@@ -2960,77 +2960,77 @@ define void @local_volatile_16xi8(ptr addrspace(5) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_16xi8_param_0];
; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll
index f6e6196345fcb..9a67bdfeb067b 100644
--- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll
+++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll
@@ -29,29 +29,25 @@ define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) {
; CHECK-LABEL: sext_setcc_v4i1_to_v4i8(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<13>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0];
; CHECK-NEXT: ld.b32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT: and.b16 %rs2, %rs1, 255;
-; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 0;
-; CHECK-NEXT: bfe.u32 %r3, %r1, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs3, %r3;
-; CHECK-NEXT: and.b16 %rs4, %rs3, 255;
-; CHECK-NEXT: setp.eq.b16 %p2, %rs4, 0;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs5, %r4;
-; CHECK-NEXT: and.b16 %rs6, %rs5, 255;
-; CHECK-NEXT: setp.eq.b16 %p3, %rs6, 0;
-; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r5;
-; CHECK-NEXT: and.b16 %rs8, %rs7, 255;
-; CHECK-NEXT: setp.eq.b16 %p4, %rs8, 0;
+; CHECK-NEXT: setp.eq.b16 %p1, %rs1, 0;
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7771U;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r3;
+; CHECK-NEXT: setp.eq.b16 %p2, %rs2, 0;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: setp.eq.b16 %p3, %rs3, 0;
+; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7773U;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r5;
+; CHECK-NEXT: setp.eq.b16 %p4, %rs4, 0;
; CHECK-NEXT: selp.b32 %r6, -1, 0, %p4;
; CHECK-NEXT: selp.b32 %r7, -1, 0, %p3;
; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 0x3340U;
More information about the llvm-commits
mailing list