[llvm] 86203b6 - [NVPTX] Use PRMT more widely, and improve folding around this instruction (#148261)

via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 13 15:06:56 PDT 2025


Author: Alex MacLean
Date: 2025-07-13T15:06:53-07:00
New Revision: 86203b6b33e49cc1a8ce6d7d69e7df4970d8f7bd

URL: https://github.com/llvm/llvm-project/commit/86203b6b33e49cc1a8ce6d7d69e7df4970d8f7bd
DIFF: https://github.com/llvm/llvm-project/commit/86203b6b33e49cc1a8ce6d7d69e7df4970d8f7bd.diff

LOG: [NVPTX] Use PRMT more widely, and improve folding around this instruction (#148261)

Replace uses of BFE with PRMT when lowering v4i8 vectors. This will
generally lead to equivalent or better SASS and reduces the number of
target-specific operations we need to represent
(https://cuda.godbolt.org/z/M75W6f8xd). Also implement KnownBits tracking
for PRMT, allowing elimination of redundant AND instructions when
lowering various i8 operations.

Added: 
    

Modified: 
    llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
    llvm/lib/Target/NVPTX/NVPTXISelLowering.h
    llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
    llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
    llvm/test/CodeGen/NVPTX/extractelement.ll
    llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
    llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
    llvm/test/CodeGen/NVPTX/ldg-invariant.ll
    llvm/test/CodeGen/NVPTX/load-store-vectors.ll
    llvm/test/CodeGen/NVPTX/sext-setcc.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3d010e04824c5..14f05250ad6b8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -57,6 +57,7 @@
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
 #include "llvm/Support/NVPTXAddrSpace.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
@@ -1087,7 +1088,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(NVPTXISD::StoreV8)
     MAKE_CASE(NVPTXISD::FSHL_CLAMP)
     MAKE_CASE(NVPTXISD::FSHR_CLAMP)
-    MAKE_CASE(NVPTXISD::BFE)
     MAKE_CASE(NVPTXISD::BFI)
     MAKE_CASE(NVPTXISD::PRMT)
     MAKE_CASE(NVPTXISD::FCOPYSIGN)
@@ -2173,14 +2173,14 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   EVT VectorVT = Vector.getValueType();
 
   if (VectorVT == MVT::v4i8) {
-    SDValue BFE =
-        DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
-                    {Vector,
-                     DAG.getNode(ISD::MUL, DL, MVT::i32,
-                                 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
-                                 DAG.getConstant(8, DL, MVT::i32)),
-                     DAG.getConstant(8, DL, MVT::i32)});
-    return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
+    SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
+                                   DAG.getZExtOrTrunc(Index, DL, MVT::i32),
+                                   DAG.getConstant(0x7770, DL, MVT::i32));
+    SDValue PRMT = DAG.getNode(
+        NVPTXISD::PRMT, DL, MVT::i32,
+        {DAG.getBitcast(MVT::i32, Vector), DAG.getConstant(0, DL, MVT::i32),
+         Selector, DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+    return DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
   }
 
   // Constant index will be matched by tablegen.
@@ -5271,31 +5271,6 @@ static SDValue PerformANDCombine(SDNode *N,
 
   SDValue AExt;
 
-  // Convert BFE-> truncate i16 -> and 255
-  // To just BFE-> truncate i16, as the value already has all the bits in the
-  // right places.
-  if (Val.getOpcode() == ISD::TRUNCATE) {
-    SDValue BFE = Val.getOperand(0);
-    if (BFE.getOpcode() != NVPTXISD::BFE)
-      return SDValue();
-
-    ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
-    if (!BFEBits)
-      return SDValue();
-    uint64_t BFEBitsVal = BFEBits->getZExtValue();
-
-    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
-    if (!MaskCnst) {
-      // Not an AND with a constant
-      return SDValue();
-    }
-    uint64_t MaskVal = MaskCnst->getZExtValue();
-
-    if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
-      return SDValue();
-    // If we get here, the AND is unnecessary.  Just replace it with the trunc
-    DCI.CombineTo(N, Val, false);
-  }
   // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
   if (Val.getOpcode() == ISD::ANY_EXTEND) {
     AExt = Val;
@@ -6402,3 +6377,60 @@ MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
   return getDataSection();
 }
+
+/// Compute the known bits of an NVPTXISD::PRMT node \p Op into \p Known.
+///
+/// Nothing is reported (\p Known is left as passed in) unless the selector
+/// operand is a constant and the mode operand is PTXPrmtMode::NONE. Each
+/// 4-bit selector nibble describes one result byte: its low three bits index
+/// a byte of the concatenated value {b, a}, and its top bit asks for that
+/// byte's sign bit to be replicated instead of copying the byte.
+static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
+                                    const SelectionDAG &DAG, unsigned Depth) {
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+  ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+  unsigned Mode = Op.getConstantOperandVal(3);
+
+  if (Mode != NVPTX::PTXPrmtMode::NONE || !Selector)
+    return;
+
+  // The operands sit one level below Op, so recurse with Depth + 1 to keep
+  // the analysis' recursion limit effective across chains of PRMT nodes.
+  KnownBits AKnown = DAG.computeKnownBits(A, Depth + 1);
+  KnownBits BKnown = DAG.computeKnownBits(B, Depth + 1);
+
+  // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
+  KnownBits BitField = BKnown.concat(AKnown);
+
+  APInt SelectorVal = Selector->getAPIntValue();
+  for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) {
+    APInt Sel = SelectorVal.extractBits(4, I * 4);
+    unsigned Idx = Sel.getLoBits(3).getZExtValue();
+    unsigned Sign = Sel.getHiBits(1).getZExtValue();
+    KnownBits Byte = BitField.extractBits(8, Idx * 8);
+    // Sign mode fills the result byte with copies of the selected byte's top
+    // bit, i.e. an arithmetic shift right by 7. The amount must be a constant
+    // KnownBits: a bare integer would be converted to an all-unknown value of
+    // that bit width and throw away most of what is known.
+    if (Sign)
+      Byte = KnownBits::ashr(Byte, KnownBits::makeConstant(APInt(8, 7)));
+    Known.insertBits(Byte, I * 8);
+  }
+}
+
+/// Target hook reporting known bits for NVPTX-specific nodes. \p Known is
+/// reset to "nothing known" first; only NVPTXISD::PRMT is refined further.
+void NVPTXTargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+    const SelectionDAG &DAG, unsigned Depth) const {
+  Known.resetAll();
+
+  switch (Op.getOpcode()) {
+  case NVPTXISD::PRMT:
+    computeKnownBitsForPRMT(Op, Known, DAG, Depth);
+    break;
+  default:
+    break;
+  }
+}
\ No newline at end of file

diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 2477e1fb61595..bc3548c0272bb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -50,7 +50,6 @@ enum NodeType : unsigned {
   MUL_WIDE_UNSIGNED,
   SETP_F16X2,
   SETP_BF16X2,
-  BFE,
   BFI,
   PRMT,
 
@@ -272,6 +271,11 @@ class NVPTXTargetLowering : public TargetLowering {
   unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
                                      EVT ToVT) const override;
 
+  void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+                                     const APInt &DemandedElts,
+                                     const SelectionDAG &DAG,
+                                     unsigned Depth = 0) const override;
+
 private:
   const NVPTXSubtarget &STI; // cache the subtarget here
   mutable unsigned GlobalUniqueCallSite;

diff  --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index db6b411509e93..ecae03e77aa83 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1372,11 +1372,6 @@ def BREV64 :
 // restriction in PTX?
 //
 // dest and src may be int32 or int64, but start and end are always int32.
-def SDTBFE :
-  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
-                       SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
-def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;
-
 def SDTBFI :
   SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, 
                        SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
@@ -1387,22 +1382,13 @@ def SDTPRMT :
                        SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
 def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
 
-multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
+multiclass BFE<string Instr, RegisterClass RC> {
   def rrr
-    : BasicNVPTXInst<(outs RC:$d),
-                (ins RC:$a, B32:$b, B32:$c),
-                Instr,
-                [(set T:$d, (bfe T:$a, i32:$b, i32:$c))]>;
+    : BasicNVPTXInst<(outs RC:$d), (ins RC:$a, B32:$b, B32:$c), Instr>;
   def rri
-    : BasicNVPTXInst<(outs RC:$d),
-                (ins RC:$a, B32:$b, i32imm:$c),
-                Instr,
-                [(set T:$d, (bfe T:$a, i32:$b, imm:$c))]>;
+    : BasicNVPTXInst<(outs RC:$d), (ins RC:$a, B32:$b, i32imm:$c), Instr>;
   def rii
-    : BasicNVPTXInst<(outs RC:$d),
-                (ins RC:$a, i32imm:$b, i32imm:$c),
-                Instr,
-                [(set T:$d, (bfe T:$a, imm:$b, imm:$c))]>;
+    : BasicNVPTXInst<(outs RC:$d), (ins RC:$a, i32imm:$b, i32imm:$c), Instr>;
 }
 
 multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
@@ -1447,10 +1433,10 @@ let hasSideEffects = false in {
   // the same patterns, so the first one wins. Having unsigned byte extraction
   // has the benefit of always having zero in unused bits, which makes some
   // optimizations easier (e.g. no need to mask them).
-  defm BFE_U32 : BFE<"bfe.u32", i32, B32>;
-  defm BFE_S32 : BFE<"bfe.s32", i32, B32>;
-  defm BFE_U64 : BFE<"bfe.u64", i64, B64>;
-  defm BFE_S64 : BFE<"bfe.s64", i64, B64>;
+  defm BFE_U32 : BFE<"bfe.u32", B32>;
+  defm BFE_S32 : BFE<"bfe.s32", B32>;
+  defm BFE_U64 : BFE<"bfe.u64", B64>;
+  defm BFE_S64 : BFE<"bfe.s64", B64>;
 
   defm BFI_B32 : BFI<"bfi.b32", i32, B32, i32imm>;
   defm BFI_B64 : BFI<"bfi.b64", i64, B64, i64imm>;
@@ -1487,19 +1473,26 @@ def : Pat<(fshr i32:$hi, i32:$lo, (shl i32:$amt, (i32 3))),
           (PRMT_B32rrr $lo, $hi, $amt, PrmtF4E)>;
 
 
+def byte_extract_prmt : ImmLeaf<i32, [{
+  return Imm >= 0x7770 && Imm <= 0x7773; // 0x777N: extract byte N (N in 0..3)
+}]>;
+
+def to_sign_extend_selector : SDNodeXForm<imm, [{
+  // Nibble 0 keeps the byte index; nibbles 1-3 become (index | 8), the sign-replicating form.
+  const uint64_t Idx = N->getZExtValue() & 0xF;
+  const uint64_t SignIdx = Idx | 0x8;
+  const uint64_t Sel = (SignIdx << 12) | (SignIdx << 8) | (SignIdx << 4) | Idx;
+  return CurDAG->getTargetConstant(Sel, SDLoc(N), MVT::i32);
+}]>;
+
+
 // byte extraction + signed/unsigned extension to i32.
-def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)),
-          (BFE_S32rri $s, $o, 8)>;
-def : Pat<(i32 (sext_inreg (bfe i32:$s, imm:$o, 8), i8)),
-          (BFE_S32rii $s, imm:$o, 8)>;
-def : Pat<(i32 (and (bfe i32:$s, i32:$o, 8), 255)),
-          (BFE_U32rri $s, $o, 8)>;
-def : Pat<(i32 (and (bfe i32:$s, imm:$o, 8), 255)),
-          (BFE_U32rii $s, imm:$o, 8)>;
+def : Pat<(i32 (sext_inreg (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtNONE), i8)),
+          (PRMT_B32rii $s, 0, (to_sign_extend_selector $sel), PrmtNONE)>;
 
 // byte extraction + signed extension to i16
-def : Pat<(i16 (sext_inreg (trunc (bfe i32:$s, imm:$o, 8)), i8)),
-          (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
+def : Pat<(i16 (sext_inreg (trunc (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtNONE)), i8)),
+          (CVT_u16_u32 (PRMT_B32rii $s, 0, (to_sign_extend_selector $sel), PrmtNONE), CvtNONE)>;
 
 
 // Byte extraction via shift/trunc/sext
@@ -1709,28 +1702,41 @@ def cond_not_signed : PatLeaf<(cond), [{
   return !isSignedIntSetCC(N->get());
 }]>;
 
-// comparisons of i8 extracted with BFE as i32
-// It's faster to do comparison directly on i32 extracted by BFE,
+// comparisons of i8 extracted with PRMT as i32
+// It's faster to do comparison directly on i32 extracted by PRMT,
 // instead of the long conversion and sign extending.
-def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (bfe B32:$a, B32:$oa, 8))), i8)),
-                (i16 (sext_inreg (i16 (trunc (bfe B32:$b, B32:$ob, 8))), i8)),
+// The sext_inreg forms compare sign-extended bytes, so the replacement PRMT
+// must use the sign-replicating selector; the zero-extending 0x777N selector
+// would order negative bytes above positive ones in a signed i32 compare.
+def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), i8)),
+                (i16 (sext_inreg (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), i8)),
                 cond_signed:$cc),
-         (SETP_i32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), (cond2cc $cc))>;
+         (SETP_i32rr (PRMT_B32rii i32:$a, 0, (to_sign_extend_selector $sel_a), PrmtNONE),
+                     (PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE),
+                     (cond2cc $cc))>;
 
-def: Pat<(setcc (i16 (sext_inreg (trunc (bfe B32:$a, imm:$oa, 8)), i8)),
-                (i16 (sext_inreg (trunc (bfe B32:$b, imm:$ob, 8)), i8)),
+def: Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)),
+                (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)),
                 cond_signed:$cc),
-         (SETP_i32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), (cond2cc $cc))>;
+         (SETP_i32rr (PRMT_B32rii i32:$a, 0, (to_sign_extend_selector $sel_a), PrmtNONE),
+                     (PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE),
+                     (cond2cc $cc))>;
 
-def: Pat<(setcc (i16 (and (trunc (bfe B32:$a, B32:$oa, 8)), 255)),
-                (i16 (and (trunc (bfe B32:$b, B32:$ob, 8)), 255)),
+// Without sext_inreg the extracted byte is already zero-extended (the upper
+// selector nibbles pick bytes of the zero operand), so the selector is reused.
+def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
+                (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
                 cond_signed:$cc),
-         (SETP_i32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), (cond2cc $cc))>;
+         (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
+                     (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE),
+                     (cond2cc $cc))>;
 
-def: Pat<(setcc (i16 (and (trunc (bfe B32:$a, imm:$oa, 8)), 255)),
-                (i16 (and (trunc (bfe B32:$b, imm:$ob, 8)), 255)),
+def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))),
+                (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))),
                 cond_not_signed:$cc),
-         (SETP_i32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), (cond2cc $cc))>;
+         (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE),
+                     (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), 
+                     (cond2cc $cc))>;
 
 def SDTDeclareArrayParam :
   SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;

diff  --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 1207c429524ca..23832a9cb5c58 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -178,38 +178,38 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; ENABLED-NEXT:  // %bb.0:
 ; ENABLED-NEXT:    ld.param.b64 %rd1, [combine_v16i8_param_0];
 ; ENABLED-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
+; ENABLED-NEXT:    prmt.b32 %r6, %r4, 0, 0x7772U;
+; ENABLED-NEXT:    prmt.b32 %r7, %r4, 0, 0x7771U;
+; ENABLED-NEXT:    prmt.b32 %r8, %r4, 0, 0x7770U;
+; ENABLED-NEXT:    prmt.b32 %r9, %r3, 0, 0x7773U;
+; ENABLED-NEXT:    prmt.b32 %r10, %r3, 0, 0x7772U;
+; ENABLED-NEXT:    prmt.b32 %r11, %r3, 0, 0x7771U;
+; ENABLED-NEXT:    prmt.b32 %r12, %r3, 0, 0x7770U;
+; ENABLED-NEXT:    prmt.b32 %r13, %r2, 0, 0x7773U;
+; ENABLED-NEXT:    prmt.b32 %r14, %r2, 0, 0x7772U;
+; ENABLED-NEXT:    prmt.b32 %r15, %r2, 0, 0x7771U;
+; ENABLED-NEXT:    prmt.b32 %r16, %r2, 0, 0x7770U;
+; ENABLED-NEXT:    prmt.b32 %r17, %r1, 0, 0x7773U;
+; ENABLED-NEXT:    prmt.b32 %r18, %r1, 0, 0x7772U;
+; ENABLED-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
+; ENABLED-NEXT:    prmt.b32 %r20, %r1, 0, 0x7770U;
 ; ENABLED-NEXT:    ld.param.b64 %rd2, [combine_v16i8_param_1];
-; ENABLED-NEXT:    bfe.u32 %r5, %r1, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r6, %r1, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r7, %r1, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r8, %r1, 24, 8;
-; ENABLED-NEXT:    bfe.u32 %r9, %r2, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r10, %r2, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r11, %r2, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r12, %r2, 24, 8;
-; ENABLED-NEXT:    bfe.u32 %r13, %r3, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r14, %r3, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r15, %r3, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r16, %r3, 24, 8;
-; ENABLED-NEXT:    bfe.u32 %r17, %r4, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r18, %r4, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r19, %r4, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r20, %r4, 24, 8;
-; ENABLED-NEXT:    add.s32 %r21, %r5, %r6;
-; ENABLED-NEXT:    add.s32 %r22, %r21, %r7;
-; ENABLED-NEXT:    add.s32 %r23, %r22, %r8;
-; ENABLED-NEXT:    add.s32 %r24, %r23, %r9;
-; ENABLED-NEXT:    add.s32 %r25, %r24, %r10;
-; ENABLED-NEXT:    add.s32 %r26, %r25, %r11;
-; ENABLED-NEXT:    add.s32 %r27, %r26, %r12;
-; ENABLED-NEXT:    add.s32 %r28, %r27, %r13;
-; ENABLED-NEXT:    add.s32 %r29, %r28, %r14;
-; ENABLED-NEXT:    add.s32 %r30, %r29, %r15;
-; ENABLED-NEXT:    add.s32 %r31, %r30, %r16;
-; ENABLED-NEXT:    add.s32 %r32, %r31, %r17;
-; ENABLED-NEXT:    add.s32 %r33, %r32, %r18;
-; ENABLED-NEXT:    add.s32 %r34, %r33, %r19;
-; ENABLED-NEXT:    add.s32 %r35, %r34, %r20;
+; ENABLED-NEXT:    add.s32 %r21, %r20, %r19;
+; ENABLED-NEXT:    add.s32 %r22, %r21, %r18;
+; ENABLED-NEXT:    add.s32 %r23, %r22, %r17;
+; ENABLED-NEXT:    add.s32 %r24, %r23, %r16;
+; ENABLED-NEXT:    add.s32 %r25, %r24, %r15;
+; ENABLED-NEXT:    add.s32 %r26, %r25, %r14;
+; ENABLED-NEXT:    add.s32 %r27, %r26, %r13;
+; ENABLED-NEXT:    add.s32 %r28, %r27, %r12;
+; ENABLED-NEXT:    add.s32 %r29, %r28, %r11;
+; ENABLED-NEXT:    add.s32 %r30, %r29, %r10;
+; ENABLED-NEXT:    add.s32 %r31, %r30, %r9;
+; ENABLED-NEXT:    add.s32 %r32, %r31, %r8;
+; ENABLED-NEXT:    add.s32 %r33, %r32, %r7;
+; ENABLED-NEXT:    add.s32 %r34, %r33, %r6;
+; ENABLED-NEXT:    add.s32 %r35, %r34, %r5;
 ; ENABLED-NEXT:    st.b32 [%rd2], %r35;
 ; ENABLED-NEXT:    ret;
 ;
@@ -329,39 +329,39 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
 ; ENABLED-NEXT:  // %bb.0:
 ; ENABLED-NEXT:    ld.param.b64 %rd1, [combine_v16i8_unaligned_param_0];
 ; ENABLED-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
+; ENABLED-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
+; ENABLED-NEXT:    prmt.b32 %r4, %r2, 0, 0x7772U;
+; ENABLED-NEXT:    prmt.b32 %r5, %r2, 0, 0x7771U;
+; ENABLED-NEXT:    prmt.b32 %r6, %r2, 0, 0x7770U;
+; ENABLED-NEXT:    prmt.b32 %r7, %r1, 0, 0x7773U;
+; ENABLED-NEXT:    prmt.b32 %r8, %r1, 0, 0x7772U;
+; ENABLED-NEXT:    prmt.b32 %r9, %r1, 0, 0x7771U;
+; ENABLED-NEXT:    prmt.b32 %r10, %r1, 0, 0x7770U;
 ; ENABLED-NEXT:    ld.param.b64 %rd2, [combine_v16i8_unaligned_param_1];
-; ENABLED-NEXT:    ld.v2.b32 {%r3, %r4}, [%rd1+8];
-; ENABLED-NEXT:    bfe.u32 %r5, %r1, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r6, %r1, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r7, %r1, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r8, %r1, 24, 8;
-; ENABLED-NEXT:    bfe.u32 %r9, %r2, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r10, %r2, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r11, %r2, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r12, %r2, 24, 8;
-; ENABLED-NEXT:    bfe.u32 %r13, %r3, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r14, %r3, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r15, %r3, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r16, %r3, 24, 8;
-; ENABLED-NEXT:    bfe.u32 %r17, %r4, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r18, %r4, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r19, %r4, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r20, %r4, 24, 8;
-; ENABLED-NEXT:    add.s32 %r21, %r5, %r6;
-; ENABLED-NEXT:    add.s32 %r22, %r21, %r7;
-; ENABLED-NEXT:    add.s32 %r23, %r22, %r8;
-; ENABLED-NEXT:    add.s32 %r24, %r23, %r9;
-; ENABLED-NEXT:    add.s32 %r25, %r24, %r10;
-; ENABLED-NEXT:    add.s32 %r26, %r25, %r11;
-; ENABLED-NEXT:    add.s32 %r27, %r26, %r12;
-; ENABLED-NEXT:    add.s32 %r28, %r27, %r13;
-; ENABLED-NEXT:    add.s32 %r29, %r28, %r14;
-; ENABLED-NEXT:    add.s32 %r30, %r29, %r15;
-; ENABLED-NEXT:    add.s32 %r31, %r30, %r16;
-; ENABLED-NEXT:    add.s32 %r32, %r31, %r17;
-; ENABLED-NEXT:    add.s32 %r33, %r32, %r18;
-; ENABLED-NEXT:    add.s32 %r34, %r33, %r19;
-; ENABLED-NEXT:    add.s32 %r35, %r34, %r20;
+; ENABLED-NEXT:    ld.v2.b32 {%r11, %r12}, [%rd1+8];
+; ENABLED-NEXT:    prmt.b32 %r13, %r12, 0, 0x7773U;
+; ENABLED-NEXT:    prmt.b32 %r14, %r12, 0, 0x7772U;
+; ENABLED-NEXT:    prmt.b32 %r15, %r12, 0, 0x7771U;
+; ENABLED-NEXT:    prmt.b32 %r16, %r12, 0, 0x7770U;
+; ENABLED-NEXT:    prmt.b32 %r17, %r11, 0, 0x7773U;
+; ENABLED-NEXT:    prmt.b32 %r18, %r11, 0, 0x7772U;
+; ENABLED-NEXT:    prmt.b32 %r19, %r11, 0, 0x7771U;
+; ENABLED-NEXT:    prmt.b32 %r20, %r11, 0, 0x7770U;
+; ENABLED-NEXT:    add.s32 %r21, %r10, %r9;
+; ENABLED-NEXT:    add.s32 %r22, %r21, %r8;
+; ENABLED-NEXT:    add.s32 %r23, %r22, %r7;
+; ENABLED-NEXT:    add.s32 %r24, %r23, %r6;
+; ENABLED-NEXT:    add.s32 %r25, %r24, %r5;
+; ENABLED-NEXT:    add.s32 %r26, %r25, %r4;
+; ENABLED-NEXT:    add.s32 %r27, %r26, %r3;
+; ENABLED-NEXT:    add.s32 %r28, %r27, %r20;
+; ENABLED-NEXT:    add.s32 %r29, %r28, %r19;
+; ENABLED-NEXT:    add.s32 %r30, %r29, %r18;
+; ENABLED-NEXT:    add.s32 %r31, %r30, %r17;
+; ENABLED-NEXT:    add.s32 %r32, %r31, %r16;
+; ENABLED-NEXT:    add.s32 %r33, %r32, %r15;
+; ENABLED-NEXT:    add.s32 %r34, %r33, %r14;
+; ENABLED-NEXT:    add.s32 %r35, %r34, %r13;
 ; ENABLED-NEXT:    st.b32 [%rd2], %r35;
 ; ENABLED-NEXT:    ret;
 ;

diff  --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll
index e04732ebad66b..80980efbab05b 100644
--- a/llvm/test/CodeGen/NVPTX/extractelement.ll
+++ b/llvm/test/CodeGen/NVPTX/extractelement.ll
@@ -60,14 +60,14 @@ define i16  @test_v4i8(i32 %a) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_v4i8_param_0];
-; CHECK-NEXT:    bfe.s32 %r2, %r1, 0, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs1, %r2;
-; CHECK-NEXT:    bfe.s32 %r3, %r1, 8, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs2, %r3;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 16, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs3, %r4;
-; CHECK-NEXT:    bfe.s32 %r5, %r1, 24, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs4, %r5;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x8880U;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT:    prmt.b32 %r3, %r1, 0, 0x9991U;
+; CHECK-NEXT:    cvt.u16.u32 %rs2, %r3;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0xaaa2U;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT:    prmt.b32 %r5, %r1, 0, 0xbbb3U;
+; CHECK-NEXT:    cvt.u16.u32 %rs4, %r5;
 ; CHECK-NEXT:    add.s16 %rs5, %rs1, %rs2;
 ; CHECK-NEXT:    add.s16 %rs6, %rs3, %rs4;
 ; CHECK-NEXT:    add.s16 %rs7, %rs5, %rs6;
@@ -96,10 +96,10 @@ define i32  @test_v4i8_s32(i32 %a) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_v4i8_s32_param_0];
-; CHECK-NEXT:    bfe.s32 %r2, %r1, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r3, %r1, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r5, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x8880U;
+; CHECK-NEXT:    prmt.b32 %r3, %r1, 0, 0x9991U;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0xaaa2U;
+; CHECK-NEXT:    prmt.b32 %r5, %r1, 0, 0xbbb3U;
 ; CHECK-NEXT:    add.s32 %r6, %r2, %r3;
 ; CHECK-NEXT:    add.s32 %r7, %r4, %r5;
 ; CHECK-NEXT:    add.s32 %r8, %r6, %r7;
@@ -127,10 +127,10 @@ define i32  @test_v4i8_u32(i32 %a) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_v4i8_u32_param_0];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r3, %r1, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r5, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r3, %r1, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r5, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    add.s32 %r6, %r2, %r3;
 ; CHECK-NEXT:    add.s32 %r7, %r4, %r5;
 ; CHECK-NEXT:    add.s32 %r8, %r6, %r7;
@@ -161,22 +161,22 @@ define i16  @test_v8i8(i64 %a) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_v8i8_param_0];
-; CHECK-NEXT:    bfe.s32 %r3, %r1, 0, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs1, %r3;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 8, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs2, %r4;
-; CHECK-NEXT:    bfe.s32 %r5, %r1, 16, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs3, %r5;
-; CHECK-NEXT:    bfe.s32 %r6, %r1, 24, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs4, %r6;
-; CHECK-NEXT:    bfe.s32 %r7, %r2, 0, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs5, %r7;
-; CHECK-NEXT:    bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs6, %r8;
-; CHECK-NEXT:    bfe.s32 %r9, %r2, 16, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs7, %r9;
-; CHECK-NEXT:    bfe.s32 %r10, %r2, 24, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs8, %r10;
+; CHECK-NEXT:    prmt.b32 %r3, %r1, 0, 0x8880U;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x9991U;
+; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
+; CHECK-NEXT:    prmt.b32 %r5, %r1, 0, 0xaaa2U;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT:    prmt.b32 %r6, %r1, 0, 0xbbb3U;
+; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
+; CHECK-NEXT:    prmt.b32 %r7, %r2, 0, 0x8880U;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x9991U;
+; CHECK-NEXT:    cvt.u16.u32 %rs6, %r8;
+; CHECK-NEXT:    prmt.b32 %r9, %r2, 0, 0xaaa2U;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0xbbb3U;
+; CHECK-NEXT:    cvt.u16.u32 %rs8, %r10;
 ; CHECK-NEXT:    add.s16 %rs9, %rs1, %rs2;
 ; CHECK-NEXT:    add.s16 %rs10, %rs3, %rs4;
 ; CHECK-NEXT:    add.s16 %rs11, %rs5, %rs6;

diff  --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 1fc42d6cc02c0..410c0019c7222 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -29,7 +29,7 @@ define i8 @test_extract_0(<4 x i8> %a) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_0_param_0];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %e = extractelement <4 x i8> %a, i32 0
@@ -43,7 +43,7 @@ define i8 @test_extract_1(<4 x i8> %a) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_1_param_0];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %e = extractelement <4 x i8> %a, i32 1
@@ -57,7 +57,7 @@ define i8 @test_extract_2(<4 x i8> %a) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_2_param_0];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %e = extractelement <4 x i8> %a, i32 2
@@ -71,7 +71,7 @@ define i8 @test_extract_3(<4 x i8> %a) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_3_param_0];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %e = extractelement <4 x i8> %a, i32 3
@@ -88,8 +88,8 @@ define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 {
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_extract_i_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_i_param_0];
 ; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
-; CHECK-NEXT:    shl.b32 %r3, %r2, 3;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, %r3, 8;
+; CHECK-NEXT:    or.b32 %r3, %r2, 30576;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
   %e = extractelement <4 x i8> %a, i64 %idx
@@ -105,28 +105,28 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_add_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_add_param_0];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, %rs1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT:    bfe.u32 %r6, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r6, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, %rs4;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT:    bfe.u32 %r11, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r11, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs8, %r11;
 ; CHECK-NEXT:    add.s16 %rs9, %rs8, %rs7;
 ; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT:    bfe.u32 %r13, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r13, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs10, %r13;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r14;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, %rs10;
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs12;
@@ -146,20 +146,20 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_add_imm_0_param_0];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 4;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 3;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 2;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
@@ -179,20 +179,20 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_add_imm_1_param_0];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 4;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 3;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 2;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
@@ -213,28 +213,28 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_sub_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_sub_param_0];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
 ; CHECK-NEXT:    sub.s16 %rs3, %rs2, %rs1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT:    bfe.u32 %r6, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r6, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    sub.s16 %rs6, %rs5, %rs4;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT:    bfe.u32 %r11, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r11, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs8, %r11;
 ; CHECK-NEXT:    sub.s16 %rs9, %rs8, %rs7;
 ; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT:    bfe.u32 %r13, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r13, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs10, %r13;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r14;
 ; CHECK-NEXT:    sub.s16 %rs12, %rs11, %rs10;
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs12;
@@ -250,39 +250,31 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-LABEL: test_smax(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b32 %r<26>;
+; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_smax_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_smax_param_0];
-; CHECK-NEXT:    bfe.s32 %r3, %r2, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    setp.gt.s32 %p1, %r4, %r3;
-; CHECK-NEXT:    bfe.s32 %r5, %r2, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r6, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r6, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    setp.gt.s32 %p2, %r6, %r5;
-; CHECK-NEXT:    bfe.s32 %r7, %r2, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r8, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r2, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r8, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    setp.gt.s32 %p3, %r8, %r7;
-; CHECK-NEXT:    bfe.s32 %r9, %r2, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r2, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r10, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    setp.gt.s32 %p4, %r10, %r9;
-; CHECK-NEXT:    bfe.u32 %r11, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r12, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r13, %r2, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r14, %r2, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r15, %r1, 24, 8;
-; CHECK-NEXT:    selp.b32 %r16, %r15, %r14, %p4;
-; CHECK-NEXT:    bfe.u32 %r17, %r1, 16, 8;
-; CHECK-NEXT:    selp.b32 %r18, %r17, %r13, %p3;
-; CHECK-NEXT:    prmt.b32 %r19, %r18, %r16, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r20, %r1, 8, 8;
-; CHECK-NEXT:    selp.b32 %r21, %r20, %r12, %p2;
-; CHECK-NEXT:    bfe.u32 %r22, %r1, 0, 8;
-; CHECK-NEXT:    selp.b32 %r23, %r22, %r11, %p1;
-; CHECK-NEXT:    prmt.b32 %r24, %r23, %r21, 0x3340U;
-; CHECK-NEXT:    prmt.b32 %r25, %r24, %r19, 0x5410U;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r25;
+; CHECK-NEXT:    selp.b32 %r11, %r10, %r9, %p4;
+; CHECK-NEXT:    selp.b32 %r12, %r8, %r7, %p3;
+; CHECK-NEXT:    prmt.b32 %r13, %r12, %r11, 0x3340U;
+; CHECK-NEXT:    selp.b32 %r14, %r6, %r5, %p2;
+; CHECK-NEXT:    selp.b32 %r15, %r4, %r3, %p1;
+; CHECK-NEXT:    prmt.b32 %r16, %r15, %r14, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r17, %r16, %r13, 0x5410U;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r17;
 ; CHECK-NEXT:    ret;
   %cmp = icmp sgt <4 x i8> %a, %b
   %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -298,17 +290,17 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_umax_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_umax_param_0];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    setp.gt.u32 %p1, %r4, %r3;
-; CHECK-NEXT:    bfe.u32 %r5, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r6, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r6, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    setp.gt.u32 %p2, %r6, %r5;
-; CHECK-NEXT:    bfe.u32 %r7, %r2, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r8, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r2, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r8, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    setp.gt.u32 %p3, %r8, %r7;
-; CHECK-NEXT:    bfe.u32 %r9, %r2, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r2, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r10, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    setp.gt.u32 %p4, %r10, %r9;
 ; CHECK-NEXT:    selp.b32 %r11, %r10, %r9, %p4;
 ; CHECK-NEXT:    selp.b32 %r12, %r8, %r7, %p3;
@@ -328,39 +320,31 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-LABEL: test_smin(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b32 %r<26>;
+; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_smin_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_smin_param_0];
-; CHECK-NEXT:    bfe.s32 %r3, %r2, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    setp.le.s32 %p1, %r4, %r3;
-; CHECK-NEXT:    bfe.s32 %r5, %r2, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r6, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r6, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    setp.le.s32 %p2, %r6, %r5;
-; CHECK-NEXT:    bfe.s32 %r7, %r2, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r8, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r2, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r8, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    setp.le.s32 %p3, %r8, %r7;
-; CHECK-NEXT:    bfe.s32 %r9, %r2, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r2, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r10, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    setp.le.s32 %p4, %r10, %r9;
-; CHECK-NEXT:    bfe.u32 %r11, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r12, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r13, %r2, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r14, %r2, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r15, %r1, 24, 8;
-; CHECK-NEXT:    selp.b32 %r16, %r15, %r14, %p4;
-; CHECK-NEXT:    bfe.u32 %r17, %r1, 16, 8;
-; CHECK-NEXT:    selp.b32 %r18, %r17, %r13, %p3;
-; CHECK-NEXT:    prmt.b32 %r19, %r18, %r16, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r20, %r1, 8, 8;
-; CHECK-NEXT:    selp.b32 %r21, %r20, %r12, %p2;
-; CHECK-NEXT:    bfe.u32 %r22, %r1, 0, 8;
-; CHECK-NEXT:    selp.b32 %r23, %r22, %r11, %p1;
-; CHECK-NEXT:    prmt.b32 %r24, %r23, %r21, 0x3340U;
-; CHECK-NEXT:    prmt.b32 %r25, %r24, %r19, 0x5410U;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r25;
+; CHECK-NEXT:    selp.b32 %r11, %r10, %r9, %p4;
+; CHECK-NEXT:    selp.b32 %r12, %r8, %r7, %p3;
+; CHECK-NEXT:    prmt.b32 %r13, %r12, %r11, 0x3340U;
+; CHECK-NEXT:    selp.b32 %r14, %r6, %r5, %p2;
+; CHECK-NEXT:    selp.b32 %r15, %r4, %r3, %p1;
+; CHECK-NEXT:    prmt.b32 %r16, %r15, %r14, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r17, %r16, %r13, 0x5410U;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r17;
 ; CHECK-NEXT:    ret;
   %cmp = icmp sle <4 x i8> %a, %b
   %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -376,17 +360,17 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_umin_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_umin_param_0];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    setp.le.u32 %p1, %r4, %r3;
-; CHECK-NEXT:    bfe.u32 %r5, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r6, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r6, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    setp.le.u32 %p2, %r6, %r5;
-; CHECK-NEXT:    bfe.u32 %r7, %r2, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r8, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r2, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r8, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    setp.le.u32 %p3, %r8, %r7;
-; CHECK-NEXT:    bfe.u32 %r9, %r2, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r2, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r10, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    setp.le.u32 %p4, %r10, %r9;
 ; CHECK-NEXT:    selp.b32 %r11, %r10, %r9, %p4;
 ; CHECK-NEXT:    selp.b32 %r12, %r8, %r7, %p3;
@@ -412,26 +396,26 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
 ; CHECK-NEXT:    ld.param.b32 %r3, [test_eq_param_2];
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_eq_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_eq_param_0];
-; CHECK-NEXT:    bfe.u32 %r4, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r2, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r5, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    setp.eq.b32 %p1, %r5, %r4;
-; CHECK-NEXT:    bfe.u32 %r6, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r6, %r2, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    setp.eq.b32 %p2, %r7, %r6;
-; CHECK-NEXT:    bfe.u32 %r8, %r2, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    setp.eq.b32 %p3, %r9, %r8;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r11, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r11, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    setp.eq.b32 %p4, %r11, %r10;
-; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r3, 0, 0x7773U;
 ; CHECK-NEXT:    selp.b32 %r13, %r11, %r12, %p4;
-; CHECK-NEXT:    bfe.u32 %r14, %r3, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r3, 0, 0x7772U;
 ; CHECK-NEXT:    selp.b32 %r15, %r9, %r14, %p3;
 ; CHECK-NEXT:    prmt.b32 %r16, %r15, %r13, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r17, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r17, %r3, 0, 0x7771U;
 ; CHECK-NEXT:    selp.b32 %r18, %r7, %r17, %p2;
-; CHECK-NEXT:    bfe.u32 %r19, %r3, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r3, 0, 0x7770U;
 ; CHECK-NEXT:    selp.b32 %r20, %r5, %r19, %p1;
 ; CHECK-NEXT:    prmt.b32 %r21, %r20, %r18, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r22, %r21, %r16, 0x5410U;
@@ -452,26 +436,26 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
 ; CHECK-NEXT:    ld.param.b32 %r3, [test_ne_param_2];
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_ne_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_ne_param_0];
-; CHECK-NEXT:    bfe.u32 %r4, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r2, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r5, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    setp.ne.b32 %p1, %r5, %r4;
-; CHECK-NEXT:    bfe.u32 %r6, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r6, %r2, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    setp.ne.b32 %p2, %r7, %r6;
-; CHECK-NEXT:    bfe.u32 %r8, %r2, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    setp.ne.b32 %p3, %r9, %r8;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r11, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r11, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    setp.ne.b32 %p4, %r11, %r10;
-; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r3, 0, 0x7773U;
 ; CHECK-NEXT:    selp.b32 %r13, %r11, %r12, %p4;
-; CHECK-NEXT:    bfe.u32 %r14, %r3, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r3, 0, 0x7772U;
 ; CHECK-NEXT:    selp.b32 %r15, %r9, %r14, %p3;
 ; CHECK-NEXT:    prmt.b32 %r16, %r15, %r13, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r17, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r17, %r3, 0, 0x7771U;
 ; CHECK-NEXT:    selp.b32 %r18, %r7, %r17, %p2;
-; CHECK-NEXT:    bfe.u32 %r19, %r3, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r3, 0, 0x7770U;
 ; CHECK-NEXT:    selp.b32 %r20, %r5, %r19, %p1;
 ; CHECK-NEXT:    prmt.b32 %r21, %r20, %r18, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r22, %r21, %r16, 0x5410U;
@@ -491,28 +475,28 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_mul_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_mul_param_0];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
 ; CHECK-NEXT:    mul.lo.s16 %rs3, %rs2, %rs1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT:    bfe.u32 %r6, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r6, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    mul.lo.s16 %rs6, %rs5, %rs4;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT:    bfe.u32 %r11, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r11, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs8, %r11;
 ; CHECK-NEXT:    mul.lo.s16 %rs9, %rs8, %rs7;
 ; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT:    bfe.u32 %r13, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r13, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs10, %r13;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r14;
 ; CHECK-NEXT:    mul.lo.s16 %rs12, %rs11, %rs10;
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs12;
@@ -750,7 +734,7 @@ define void @test_ldst_v3i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v3i8_param_0];
 ; CHECK-NEXT:    ld.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.b16 [%rd2], %r1;
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    st.b8 [%rd2+2], %r2;
 ; CHECK-NEXT:    ret;
   %t1 = load <3 x i8>, ptr %a
@@ -918,30 +902,30 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8>
 ; CHECK-NEXT:    ld.param.b32 %r3, [test_select_cc_param_2];
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
-; CHECK-NEXT:    bfe.u32 %r5, %r4, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r6, %r3, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r6, %r3, 0, 0x7770U;
 ; CHECK-NEXT:    setp.ne.b32 %p1, %r6, %r5;
-; CHECK-NEXT:    bfe.u32 %r7, %r4, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r8, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r8, %r3, 0, 0x7771U;
 ; CHECK-NEXT:    setp.ne.b32 %p2, %r8, %r7;
-; CHECK-NEXT:    bfe.u32 %r9, %r4, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r10, %r3, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r4, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r10, %r3, 0, 0x7772U;
 ; CHECK-NEXT:    setp.ne.b32 %p3, %r10, %r9;
-; CHECK-NEXT:    bfe.u32 %r11, %r4, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r11, %r4, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r12, %r3, 0, 0x7773U;
 ; CHECK-NEXT:    setp.ne.b32 %p4, %r12, %r11;
-; CHECK-NEXT:    bfe.u32 %r13, %r2, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r13, %r2, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    selp.b32 %r15, %r14, %r13, %p4;
-; CHECK-NEXT:    bfe.u32 %r16, %r2, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r17, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r2, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r17, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    selp.b32 %r18, %r17, %r16, %p3;
 ; CHECK-NEXT:    prmt.b32 %r19, %r18, %r15, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r20, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r21, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r20, %r2, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
-; CHECK-NEXT:    bfe.u32 %r23, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r24, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r23, %r2, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r24, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    selp.b32 %r25, %r24, %r23, %p1;
 ; CHECK-NEXT:    prmt.b32 %r26, %r25, %r22, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r27, %r26, %r19, 0x5410U;
@@ -963,17 +947,17 @@ define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b,
 ; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0];
 ; CHECK-NEXT:    ld.param.b32 %r10, [test_select_cc_i32_i8_param_3];
 ; CHECK-NEXT:    ld.param.b32 %r9, [test_select_cc_i32_i8_param_2];
-; CHECK-NEXT:    bfe.u32 %r11, %r10, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r12, %r9, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r11, %r10, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r12, %r9, 0, 0x7770U;
 ; CHECK-NEXT:    setp.ne.b32 %p1, %r12, %r11;
-; CHECK-NEXT:    bfe.u32 %r13, %r10, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r14, %r9, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r13, %r10, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r14, %r9, 0, 0x7771U;
 ; CHECK-NEXT:    setp.ne.b32 %p2, %r14, %r13;
-; CHECK-NEXT:    bfe.u32 %r15, %r10, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r16, %r9, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r15, %r10, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r16, %r9, 0, 0x7772U;
 ; CHECK-NEXT:    setp.ne.b32 %p3, %r16, %r15;
-; CHECK-NEXT:    bfe.u32 %r17, %r10, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r18, %r9, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r17, %r10, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r18, %r9, 0, 0x7773U;
 ; CHECK-NEXT:    setp.ne.b32 %p4, %r18, %r17;
 ; CHECK-NEXT:    selp.b32 %r19, %r4, %r8, %p4;
 ; CHECK-NEXT:    selp.b32 %r20, %r3, %r7, %p3;
@@ -1002,18 +986,18 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b,
 ; CHECK-NEXT:    setp.ne.b32 %p2, %r4, %r8;
 ; CHECK-NEXT:    setp.ne.b32 %p3, %r5, %r9;
 ; CHECK-NEXT:    setp.ne.b32 %p4, %r6, %r10;
-; CHECK-NEXT:    bfe.u32 %r11, %r2, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r12, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r11, %r2, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r12, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    selp.b32 %r13, %r12, %r11, %p4;
-; CHECK-NEXT:    bfe.u32 %r14, %r2, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r15, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r2, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r15, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    selp.b32 %r16, %r15, %r14, %p3;
 ; CHECK-NEXT:    prmt.b32 %r17, %r16, %r13, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r18, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r18, %r2, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    selp.b32 %r20, %r19, %r18, %p2;
-; CHECK-NEXT:    bfe.u32 %r21, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r22, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r2, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r22, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    selp.b32 %r23, %r22, %r21, %p1;
 ; CHECK-NEXT:    prmt.b32 %r24, %r23, %r20, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r17, 0x5410U;
@@ -1071,10 +1055,10 @@ define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_zext_2xi32_param_0];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r3, %r1, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r3, %r1, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r5, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r5, %r4, %r3, %r2};
 ; CHECK-NEXT:    ret;
   %r = zext <4 x i8> %a to <4 x i32>
@@ -1085,24 +1069,20 @@ define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 {
 ; CHECK-LABEL: test_zext_2xi64(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<6>;
-; CHECK-NEXT:    .reg .b64 %rd<9>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_zext_2xi64_param_0];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u64.u32 %rd1, %r2;
-; CHECK-NEXT:    and.b64 %rd2, %rd1, 255;
-; CHECK-NEXT:    bfe.u32 %r3, %r1, 16, 8;
-; CHECK-NEXT:    cvt.u64.u32 %rd3, %r3;
-; CHECK-NEXT:    and.b64 %rd4, %rd3, 255;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
-; CHECK-NEXT:    cvt.u64.u32 %rd5, %r4;
-; CHECK-NEXT:    and.b64 %rd6, %rd5, 255;
-; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
-; CHECK-NEXT:    cvt.u64.u32 %rd7, %r5;
-; CHECK-NEXT:    and.b64 %rd8, %rd7, 255;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd8, %rd6};
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd4, %rd2};
+; CHECK-NEXT:    prmt.b32 %r3, %r1, 0, 0x7772U;
+; CHECK-NEXT:    cvt.u64.u32 %rd2, %r3;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7771U;
+; CHECK-NEXT:    cvt.u64.u32 %rd3, %r4;
+; CHECK-NEXT:    prmt.b32 %r5, %r1, 0, 0x7770U;
+; CHECK-NEXT:    cvt.u64.u32 %rd4, %r5;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd4, %rd3};
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd2, %rd1};
 ; CHECK-NEXT:    ret;
   %r = zext <4 x i8> %a to <4 x i64>
   ret <4 x i64> %r
@@ -1304,29 +1284,29 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_srem_v4i8_param_0];
 ; CHECK-NEXT:    ld.b32 %r1, [%rd1];
 ; CHECK-NEXT:    ld.b32 %r2, [%rd2];
-; CHECK-NEXT:    bfe.s32 %r3, %r2, 24, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs1, %r3;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 24, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs2, %r4;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0xbbb3U;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0xbbb3U;
+; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
 ; CHECK-NEXT:    rem.s16 %rs3, %rs2, %rs1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT:    bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs4, %r6;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 16, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs5, %r7;
+; CHECK-NEXT:    prmt.b32 %r6, %r2, 0, 0xaaa2U;
+; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0xaaa2U;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    rem.s16 %rs6, %rs5, %rs4;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 0x3340U;
-; CHECK-NEXT:    bfe.s32 %r10, %r2, 8, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs7, %r10;
-; CHECK-NEXT:    bfe.s32 %r11, %r1, 8, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs8, %r11;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x9991U;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT:    prmt.b32 %r11, %r1, 0, 0x9991U;
+; CHECK-NEXT:    cvt.u16.u32 %rs8, %r11;
 ; CHECK-NEXT:    rem.s16 %rs9, %rs8, %rs7;
 ; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
-; CHECK-NEXT:    bfe.s32 %r13, %r2, 0, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs10, %r13;
-; CHECK-NEXT:    bfe.s32 %r14, %r1, 0, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs11, %r14;
+; CHECK-NEXT:    prmt.b32 %r13, %r2, 0, 0x8880U;
+; CHECK-NEXT:    cvt.u16.u32 %rs10, %r13;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x8880U;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r14;
 ; CHECK-NEXT:    rem.s16 %rs12, %rs11, %rs10;
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r16, %r15, %r12, 0x3340U;
@@ -1370,16 +1350,16 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    or.b16 %rs9, %rs8, %rs6;
 ; CHECK-NEXT:    cvt.u32.u16 %r2, %rs9;
 ; CHECK-NEXT:    ld.s8 %rs10, [%rd2+2];
-; CHECK-NEXT:    bfe.s32 %r3, %r2, 8, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs11, %r3;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 8, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs12, %r4;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x9991U;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r3;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x9991U;
+; CHECK-NEXT:    cvt.u16.u32 %rs12, %r4;
 ; CHECK-NEXT:    rem.s16 %rs13, %rs12, %rs11;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs13;
-; CHECK-NEXT:    bfe.s32 %r6, %r2, 0, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs14, %r6;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 0, 8;
-; CHECK-NEXT:    cvt.s8.s32 %rs15, %r7;
+; CHECK-NEXT:    prmt.b32 %r6, %r2, 0, 0x8880U;
+; CHECK-NEXT:    cvt.u16.u32 %rs14, %r6;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x8880U;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r7;
 ; CHECK-NEXT:    rem.s16 %rs16, %rs15, %rs14;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs16;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 0x3340U;
@@ -1415,17 +1395,17 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_sext_v4i1_to_v4i8_param_0];
 ; CHECK-NEXT:    ld.b32 %r1, [%rd1];
 ; CHECK-NEXT:    ld.b32 %r2, [%rd2];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7770U;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    setp.gt.u32 %p1, %r4, %r3;
-; CHECK-NEXT:    bfe.u32 %r5, %r2, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r6, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7771U;
+; CHECK-NEXT:    prmt.b32 %r6, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    setp.gt.u32 %p2, %r6, %r5;
-; CHECK-NEXT:    bfe.u32 %r7, %r2, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r8, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r2, 0, 0x7772U;
+; CHECK-NEXT:    prmt.b32 %r8, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    setp.gt.u32 %p3, %r8, %r7;
-; CHECK-NEXT:    bfe.u32 %r9, %r2, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r10, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r2, 0, 0x7773U;
+; CHECK-NEXT:    prmt.b32 %r10, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    setp.gt.u32 %p4, %r10, %r9;
 ; CHECK-NEXT:    selp.b32 %r11, -1, 0, %p4;
 ; CHECK-NEXT:    selp.b32 %r12, -1, 0, %p3;

diff  --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
index d5078f5f19af6..6e42e0006af3c 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll
@@ -17,22 +17,22 @@ define i8 @ld_global_v32i8(ptr addrspace(1) %ptr) {
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b64 %rd1, [ld_global_v32i8_param_0];
 ; SM90-NEXT:    ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
-; SM90-NEXT:    bfe.u32 %r5, %r4, 0, 8;
+; SM90-NEXT:    prmt.b32 %r5, %r4, 0, 0x7770U;
 ; SM90-NEXT:    cvt.u16.u32 %rs1, %r5;
-; SM90-NEXT:    bfe.u32 %r6, %r3, 0, 8;
+; SM90-NEXT:    prmt.b32 %r6, %r3, 0, 0x7770U;
 ; SM90-NEXT:    cvt.u16.u32 %rs2, %r6;
-; SM90-NEXT:    bfe.u32 %r7, %r2, 0, 8;
+; SM90-NEXT:    prmt.b32 %r7, %r2, 0, 0x7770U;
 ; SM90-NEXT:    cvt.u16.u32 %rs3, %r7;
-; SM90-NEXT:    bfe.u32 %r8, %r1, 0, 8;
+; SM90-NEXT:    prmt.b32 %r8, %r1, 0, 0x7770U;
 ; SM90-NEXT:    cvt.u16.u32 %rs4, %r8;
 ; SM90-NEXT:    ld.global.nc.v4.b32 {%r9, %r10, %r11, %r12}, [%rd1];
-; SM90-NEXT:    bfe.u32 %r13, %r12, 0, 8;
+; SM90-NEXT:    prmt.b32 %r13, %r12, 0, 0x7770U;
 ; SM90-NEXT:    cvt.u16.u32 %rs5, %r13;
-; SM90-NEXT:    bfe.u32 %r14, %r11, 0, 8;
+; SM90-NEXT:    prmt.b32 %r14, %r11, 0, 0x7770U;
 ; SM90-NEXT:    cvt.u16.u32 %rs6, %r14;
-; SM90-NEXT:    bfe.u32 %r15, %r10, 0, 8;
+; SM90-NEXT:    prmt.b32 %r15, %r10, 0, 0x7770U;
 ; SM90-NEXT:    cvt.u16.u32 %rs7, %r15;
-; SM90-NEXT:    bfe.u32 %r16, %r9, 0, 8;
+; SM90-NEXT:    prmt.b32 %r16, %r9, 0, 0x7770U;
 ; SM90-NEXT:    cvt.u16.u32 %rs8, %r16;
 ; SM90-NEXT:    add.s16 %rs9, %rs8, %rs7;
 ; SM90-NEXT:    add.s16 %rs10, %rs6, %rs5;
@@ -54,21 +54,21 @@ define i8 @ld_global_v32i8(ptr addrspace(1) %ptr) {
 ; SM100-NEXT:  // %bb.0:
 ; SM100-NEXT:    ld.param.b64 %rd1, [ld_global_v32i8_param_0];
 ; SM100-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
-; SM100-NEXT:    bfe.u32 %r9, %r8, 0, 8;
+; SM100-NEXT:    prmt.b32 %r9, %r8, 0, 0x7770U;
 ; SM100-NEXT:    cvt.u16.u32 %rs1, %r9;
-; SM100-NEXT:    bfe.u32 %r10, %r7, 0, 8;
+; SM100-NEXT:    prmt.b32 %r10, %r7, 0, 0x7770U;
 ; SM100-NEXT:    cvt.u16.u32 %rs2, %r10;
-; SM100-NEXT:    bfe.u32 %r11, %r6, 0, 8;
+; SM100-NEXT:    prmt.b32 %r11, %r6, 0, 0x7770U;
 ; SM100-NEXT:    cvt.u16.u32 %rs3, %r11;
-; SM100-NEXT:    bfe.u32 %r12, %r5, 0, 8;
+; SM100-NEXT:    prmt.b32 %r12, %r5, 0, 0x7770U;
 ; SM100-NEXT:    cvt.u16.u32 %rs4, %r12;
-; SM100-NEXT:    bfe.u32 %r13, %r4, 0, 8;
+; SM100-NEXT:    prmt.b32 %r13, %r4, 0, 0x7770U;
 ; SM100-NEXT:    cvt.u16.u32 %rs5, %r13;
-; SM100-NEXT:    bfe.u32 %r14, %r3, 0, 8;
+; SM100-NEXT:    prmt.b32 %r14, %r3, 0, 0x7770U;
 ; SM100-NEXT:    cvt.u16.u32 %rs6, %r14;
-; SM100-NEXT:    bfe.u32 %r15, %r2, 0, 8;
+; SM100-NEXT:    prmt.b32 %r15, %r2, 0, 0x7770U;
 ; SM100-NEXT:    cvt.u16.u32 %rs7, %r15;
-; SM100-NEXT:    bfe.u32 %r16, %r1, 0, 8;
+; SM100-NEXT:    prmt.b32 %r16, %r1, 0, 0x7770U;
 ; SM100-NEXT:    cvt.u16.u32 %rs8, %r16;
 ; SM100-NEXT:    add.s16 %rs9, %rs8, %rs7;
 ; SM100-NEXT:    add.s16 %rs10, %rs6, %rs5;

diff  --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
index 3bd46000661ce..0c19490eb0904 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -134,13 +134,13 @@ define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [ld_global_v8i8_param_0];
 ; CHECK-NEXT:    ld.global.nc.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT:    bfe.u32 %r4, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
-; CHECK-NEXT:    bfe.u32 %r5, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
-; CHECK-NEXT:    bfe.u32 %r6, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r6, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
 ; CHECK-NEXT:    add.s16 %rs5, %rs4, %rs3;
 ; CHECK-NEXT:    add.s16 %rs6, %rs2, %rs1;
@@ -169,21 +169,21 @@ define i8 @ld_global_v16i8(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [ld_global_v16i8_param_0];
 ; CHECK-NEXT:    ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r5, %r4, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
-; CHECK-NEXT:    bfe.u32 %r6, %r4, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r6, %r4, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs2, %r6;
-; CHECK-NEXT:    bfe.u32 %r7, %r3, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r3, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
-; CHECK-NEXT:    bfe.u32 %r8, %r3, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r8, %r3, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs4, %r8;
-; CHECK-NEXT:    bfe.u32 %r9, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r9;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs6, %r10;
-; CHECK-NEXT:    bfe.u32 %r11, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r11, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r11;
-; CHECK-NEXT:    bfe.u32 %r12, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs8, %r12;
 ; CHECK-NEXT:    add.s16 %rs9, %rs8, %rs7;
 ; CHECK-NEXT:    add.s16 %rs10, %rs6, %rs5;

diff  --git a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
index 3c90323da01d7..7e013390a39db 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
@@ -56,20 +56,20 @@ define void @generic_4xi8(ptr %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xi8_param_0];
 ; CHECK-NEXT:    ld.b32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
@@ -93,39 +93,39 @@ define void @generic_8xi8(ptr %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_8xi8_param_0];
 ; CHECK-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
@@ -149,77 +149,77 @@ define void @generic_16xi8(ptr %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_16xi8_param_0];
 ; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
 ; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
 ; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
 ; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
 ; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
 ; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
 ; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
 ; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
 ; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
 ; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
 ; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
 ; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
@@ -473,20 +473,20 @@ define void @generic_volatile_4xi8(ptr %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
@@ -510,39 +510,39 @@ define void @generic_volatile_8xi8(ptr %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_8xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
@@ -566,77 +566,77 @@ define void @generic_volatile_16xi8(ptr %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_16xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
 ; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
 ; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
 ; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
 ; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
 ; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
 ; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
 ; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
 ; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
 ; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
 ; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
 ; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
@@ -873,20 +873,20 @@ define void @global_4xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_4xi8_param_0];
 ; CHECK-NEXT:    ld.global.b32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
@@ -910,39 +910,39 @@ define void @global_8xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_8xi8_param_0];
 ; CHECK-NEXT:    ld.global.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
@@ -966,77 +966,77 @@ define void @global_16xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_16xi8_param_0];
 ; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
 ; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
 ; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
 ; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
 ; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
 ; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
 ; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
 ; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
 ; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
 ; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
 ; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
 ; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
@@ -1271,20 +1271,20 @@ define void @global_volatile_4xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_4xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
@@ -1308,39 +1308,39 @@ define void @global_volatile_8xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_8xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
@@ -1364,77 +1364,77 @@ define void @global_volatile_16xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_16xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
 ; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
 ; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
 ; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
 ; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
 ; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
 ; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
 ; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
 ; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
 ; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
 ; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
 ; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
@@ -1671,20 +1671,20 @@ define void @shared_4xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xi8_param_0];
 ; CHECK-NEXT:    ld.shared.b32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
@@ -1708,39 +1708,39 @@ define void @shared_8xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_8xi8_param_0];
 ; CHECK-NEXT:    ld.shared.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
@@ -1764,77 +1764,77 @@ define void @shared_16xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_16xi8_param_0];
 ; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
 ; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
 ; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
 ; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
 ; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
 ; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
 ; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
 ; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
 ; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
 ; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
 ; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
 ; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
@@ -2069,20 +2069,20 @@ define void @shared_volatile_4xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
@@ -2106,39 +2106,39 @@ define void @shared_volatile_8xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_8xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
@@ -2162,77 +2162,77 @@ define void @shared_volatile_16xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_16xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
 ; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
 ; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
 ; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
 ; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
 ; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
 ; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
 ; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
 ; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
 ; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
 ; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
 ; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
@@ -2469,20 +2469,20 @@ define void @local_4xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xi8_param_0];
 ; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
@@ -2506,39 +2506,39 @@ define void @local_8xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_8xi8_param_0];
 ; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
@@ -2562,77 +2562,77 @@ define void @local_16xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_16xi8_param_0];
 ; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
 ; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
 ; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
 ; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
 ; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
 ; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
 ; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
 ; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
 ; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
 ; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
 ; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
 ; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
@@ -2867,20 +2867,20 @@ define void @local_volatile_4xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xi8_param_0];
 ; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
@@ -2904,39 +2904,39 @@ define void @local_volatile_8xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_8xi8_param_0];
 ; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
@@ -2960,77 +2960,77 @@ define void @local_volatile_16xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_16xi8_param_0];
 ; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
-; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
 ; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
-; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
 ; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
-; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
 ; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
-; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
 ; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
-; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
 ; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
 ; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
 ; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
-; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
 ; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
 ; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
-; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
 ; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
-; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
 ; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
 ; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
 ; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
 ; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;

diff  --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll
index f6e6196345fcb..9a67bdfeb067b 100644
--- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll
+++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll
@@ -29,29 +29,25 @@ define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) {
 ; CHECK-LABEL: sext_setcc_v4i1_to_v4i8(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
 ; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0];
 ; CHECK-NEXT:    ld.b32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7770U;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT:    and.b16 %rs2, %rs1, 255;
-; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 0;
-; CHECK-NEXT:    bfe.u32 %r3, %r1, 8, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs3, %r3;
-; CHECK-NEXT:    and.b16 %rs4, %rs3, 255;
-; CHECK-NEXT:    setp.eq.b16 %p2, %rs4, 0;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs5, %r4;
-; CHECK-NEXT:    and.b16 %rs6, %rs5, 255;
-; CHECK-NEXT:    setp.eq.b16 %p3, %rs6, 0;
-; CHECK-NEXT:    bfe.u32 %r5, %r1, 24, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs7, %r5;
-; CHECK-NEXT:    and.b16 %rs8, %rs7, 255;
-; CHECK-NEXT:    setp.eq.b16 %p4, %rs8, 0;
+; CHECK-NEXT:    setp.eq.b16 %p1, %rs1, 0;
+; CHECK-NEXT:    prmt.b32 %r3, %r1, 0, 0x7771U;
+; CHECK-NEXT:    cvt.u16.u32 %rs2, %r3;
+; CHECK-NEXT:    setp.eq.b16 %p2, %rs2, 0;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT:    setp.eq.b16 %p3, %rs3, 0;
+; CHECK-NEXT:    prmt.b32 %r5, %r1, 0, 0x7773U;
+; CHECK-NEXT:    cvt.u16.u32 %rs4, %r5;
+; CHECK-NEXT:    setp.eq.b16 %p4, %rs4, 0;
 ; CHECK-NEXT:    selp.b32 %r6, -1, 0, %p4;
 ; CHECK-NEXT:    selp.b32 %r7, -1, 0, %p3;
 ; CHECK-NEXT:    prmt.b32 %r8, %r7, %r6, 0x3340U;


        


More information about the llvm-commits mailing list