[llvm] [NVPTX] Fix the error in a pattern match in v4i8 comparisons. (PR #81308)

Artem Belevich via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 12 12:39:36 PST 2024


https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/81308

>From ef68de9ca6aad1f159122e085892b392e3e84e36 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 8 Feb 2024 17:21:28 -0800
Subject: [PATCH 1/2] [NVPTX] Fix the error in a pattern match.

The replacement should've had BFE() as the arguments for the comparison, not the source register.

While at it, tighten the patterns a bit, and expand them to cover variants with immediate arguments.
Also change the default lowering of bfe() to use the unsigned variant, so the value of the upper bits is predictable.
---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td      |  90 +++--
 llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 366 ++++++++++---------
 2 files changed, 258 insertions(+), 198 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 365afc6bd8c617..2f2dbea8169e41 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1886,10 +1886,12 @@ multiclass PRMT<ValueType T, RegisterClass RC> {
 }
 
 let hasSideEffects = false in {
-  defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
+  // order is somewhat important here. signed/unsigned variants match
+  // the same patterns, so the first one wins.
   defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>;
-  defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>;
+  defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
   defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>;
+  defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>;
 
   defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
   defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>;
@@ -2259,27 +2261,69 @@ def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
           (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
 
 // comparisons of i8 extracted with BFE as i32
-def: Pat<(setgt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
-         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGT)>;
-def: Pat<(setge (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
-         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGE)>;
-def: Pat<(setlt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
-         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLT)>;
-def: Pat<(setle (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
-         (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLE)>;
-
-def: Pat<(setugt (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
-         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHI)>;
-def: Pat<(setuge (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
-         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHS)>;
-def: Pat<(setult (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
-         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLO)>;
-def: Pat<(setule (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
-         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLS)>;
-def: Pat<(seteq (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
-         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpEQ)>;
-def: Pat<(setne (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
-         (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpNE)>;
+// It's faster to do comparison directly on i32 extracted by BFE,
+// instead of the long conversion and sign extending.
+def: Pat<(setgt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
+                (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
+         (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGT)>;
+def: Pat<(setgt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
+                (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
+         (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGT)>;
+def: Pat<(setge (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
+                (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
+         (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGE)>;
+def: Pat<(setge (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
+                (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
+         (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGE)>;
+def: Pat<(setlt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
+                (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
+         (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLT)>;
+def: Pat<(setlt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
+                (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
+         (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLT)>;
+def: Pat<(setle (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
+                (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
+         (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLE)>;
+def: Pat<(setle (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
+                (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
+         (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLE)>;
+
+def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
+                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
+         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHI)>;
+def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
+                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
+         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHI)>;
+def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
+                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
+         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHS)>;
+def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
+                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
+         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHS)>;
+def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
+                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
+         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLO)>;
+def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
+                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
+         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLO)>;
+def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
+                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
+         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLS)>;
+def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
+                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
+         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLS)>;
+def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
+                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
+         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpEQ)>;
+def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
+                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
+         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpEQ)>;
+def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
+                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
+         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpNE)>;
+def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
+                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
+         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpNE)>;
 
 // i1 compare -> i32
 def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 1ec68b4a271bac..6895699a1dfea1 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -106,29 +106,29 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_add_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_add_param_0];
-; CHECK-NEXT:    bfe.s32 %r3, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, %rs1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT:    bfe.s32 %r6, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r6, %r2, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, %rs4;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
 ; CHECK-NEXT:    bfi.b32 %r9, %r8, %r5, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r2, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT:    bfe.s32 %r11, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r11, %r1, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs8, %r11;
 ; CHECK-NEXT:    add.s16 %rs9, %rs8, %rs7;
 ; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
 ; CHECK-NEXT:    bfi.b32 %r13, %r12, %r9, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r14, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs10, %r14;
-; CHECK-NEXT:    bfe.s32 %r15, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r15;
 ; CHECK-NEXT:    add.s16 %rs12, %rs11, %rs10;
 ; CHECK-NEXT:    cvt.u32.u16 %r16, %rs12;
@@ -147,21 +147,21 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_add_imm_0_param_0];
-; CHECK-NEXT:    bfe.s32 %r2, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 2;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
 ; CHECK-NEXT:    bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 3;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
 ; CHECK-NEXT:    bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 4;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
@@ -180,21 +180,21 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_add_imm_1_param_0];
-; CHECK-NEXT:    bfe.s32 %r2, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
 ; CHECK-NEXT:    add.s16 %rs4, %rs3, 2;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
 ; CHECK-NEXT:    bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    add.s16 %rs6, %rs5, 3;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
 ; CHECK-NEXT:    bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
 ; CHECK-NEXT:    add.s16 %rs8, %rs7, 4;
 ; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
@@ -214,29 +214,29 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_sub_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_sub_param_0];
-; CHECK-NEXT:    bfe.s32 %r3, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
 ; CHECK-NEXT:    sub.s16 %rs3, %rs2, %rs1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT:    bfe.s32 %r6, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r6, %r2, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    sub.s16 %rs6, %rs5, %rs4;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
 ; CHECK-NEXT:    bfi.b32 %r9, %r8, %r5, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r2, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT:    bfe.s32 %r11, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r11, %r1, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs8, %r11;
 ; CHECK-NEXT:    sub.s16 %rs9, %rs8, %rs7;
 ; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
 ; CHECK-NEXT:    bfi.b32 %r13, %r12, %r9, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r14, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs10, %r14;
-; CHECK-NEXT:    bfe.s32 %r15, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r15;
 ; CHECK-NEXT:    sub.s16 %rs12, %rs11, %rs10;
 ; CHECK-NEXT:    cvt.u32.u16 %r16, %rs12;
@@ -251,31 +251,39 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-LABEL: test_smax(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b32 %r<19>;
+; CHECK-NEXT:    .reg .b32 %r<27>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_smax_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_smax_param_0];
-; CHECK-NEXT:    bfe.s32 %r3, %r1, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT:    setp.gt.s32 %p1, %r3, %r4;
-; CHECK-NEXT:    bfe.s32 %r5, %r1, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT:    setp.gt.s32 %p2, %r5, %r6;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT:    setp.gt.s32 %p3, %r7, %r8;
-; CHECK-NEXT:    bfe.s32 %r9, %r1, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT:    setp.gt.s32 %p4, %r9, %r10;
-; CHECK-NEXT:    selp.b32 %r11, %r9, %r10, %p4;
-; CHECK-NEXT:    selp.b32 %r12, %r7, %r8, %p3;
-; CHECK-NEXT:    bfi.b32 %r13, %r12, %r11, 8, 8;
-; CHECK-NEXT:    selp.b32 %r14, %r5, %r6, %p2;
-; CHECK-NEXT:    bfi.b32 %r15, %r14, %r13, 16, 8;
-; CHECK-NEXT:    selp.b32 %r16, %r3, %r4, %p1;
-; CHECK-NEXT:    bfi.b32 %r17, %r16, %r15, 24, 8;
-; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r17;
+; CHECK-NEXT:    bfe.s32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    bfe.s32 %r4, %r1, 24, 8;
+; CHECK-NEXT:    setp.gt.s32 %p1, %r4, %r3;
+; CHECK-NEXT:    bfe.s32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    bfe.s32 %r6, %r1, 16, 8;
+; CHECK-NEXT:    setp.gt.s32 %p2, %r6, %r5;
+; CHECK-NEXT:    bfe.s32 %r7, %r2, 8, 8;
+; CHECK-NEXT:    bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT:    setp.gt.s32 %p3, %r8, %r7;
+; CHECK-NEXT:    bfe.s32 %r9, %r2, 0, 8;
+; CHECK-NEXT:    bfe.s32 %r10, %r1, 0, 8;
+; CHECK-NEXT:    setp.gt.s32 %p4, %r10, %r9;
+; CHECK-NEXT:    bfe.u32 %r11, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r13, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r2, 0, 8;
+; CHECK-NEXT:    selp.b32 %r16, %r14, %r15, %p4;
+; CHECK-NEXT:    bfe.u32 %r17, %r2, 8, 8;
+; CHECK-NEXT:    selp.b32 %r18, %r13, %r17, %p3;
+; CHECK-NEXT:    bfi.b32 %r19, %r18, %r16, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r20, %r2, 16, 8;
+; CHECK-NEXT:    selp.b32 %r21, %r12, %r20, %p2;
+; CHECK-NEXT:    bfi.b32 %r22, %r21, %r19, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r23, %r2, 24, 8;
+; CHECK-NEXT:    selp.b32 %r24, %r11, %r23, %p1;
+; CHECK-NEXT:    bfi.b32 %r25, %r24, %r22, 24, 8;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r25;
 ; CHECK-NEXT:    ret;
   %cmp = icmp sgt <4 x i8> %a, %b
   %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -291,24 +299,24 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_umax_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_umax_param_0];
-; CHECK-NEXT:    bfe.s32 %r3, %r1, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT:    setp.hi.u32 %p1, %r3, %r4;
-; CHECK-NEXT:    bfe.s32 %r5, %r1, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT:    setp.hi.u32 %p2, %r5, %r6;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT:    setp.hi.u32 %p3, %r7, %r8;
-; CHECK-NEXT:    bfe.s32 %r9, %r1, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT:    setp.hi.u32 %p4, %r9, %r10;
-; CHECK-NEXT:    selp.b32 %r11, %r9, %r10, %p4;
-; CHECK-NEXT:    selp.b32 %r12, %r7, %r8, %p3;
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 24, 8;
+; CHECK-NEXT:    setp.hi.u32 %p1, %r4, %r3;
+; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r6, %r1, 16, 8;
+; CHECK-NEXT:    setp.hi.u32 %p2, %r6, %r5;
+; CHECK-NEXT:    bfe.u32 %r7, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r8, %r1, 8, 8;
+; CHECK-NEXT:    setp.hi.u32 %p3, %r8, %r7;
+; CHECK-NEXT:    bfe.u32 %r9, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r1, 0, 8;
+; CHECK-NEXT:    setp.hi.u32 %p4, %r10, %r9;
+; CHECK-NEXT:    selp.b32 %r11, %r10, %r9, %p4;
+; CHECK-NEXT:    selp.b32 %r12, %r8, %r7, %p3;
 ; CHECK-NEXT:    bfi.b32 %r13, %r12, %r11, 8, 8;
-; CHECK-NEXT:    selp.b32 %r14, %r5, %r6, %p2;
+; CHECK-NEXT:    selp.b32 %r14, %r6, %r5, %p2;
 ; CHECK-NEXT:    bfi.b32 %r15, %r14, %r13, 16, 8;
-; CHECK-NEXT:    selp.b32 %r16, %r3, %r4, %p1;
+; CHECK-NEXT:    selp.b32 %r16, %r4, %r3, %p1;
 ; CHECK-NEXT:    bfi.b32 %r17, %r16, %r15, 24, 8;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r17;
 ; CHECK-NEXT:    ret;
@@ -321,31 +329,39 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-LABEL: test_smin(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<5>;
-; CHECK-NEXT:    .reg .b32 %r<19>;
+; CHECK-NEXT:    .reg .b32 %r<27>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_smin_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_smin_param_0];
-; CHECK-NEXT:    bfe.s32 %r3, %r1, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT:    setp.le.s32 %p1, %r3, %r4;
-; CHECK-NEXT:    bfe.s32 %r5, %r1, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT:    setp.le.s32 %p2, %r5, %r6;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT:    setp.le.s32 %p3, %r7, %r8;
-; CHECK-NEXT:    bfe.s32 %r9, %r1, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT:    setp.le.s32 %p4, %r9, %r10;
-; CHECK-NEXT:    selp.b32 %r11, %r9, %r10, %p4;
-; CHECK-NEXT:    selp.b32 %r12, %r7, %r8, %p3;
-; CHECK-NEXT:    bfi.b32 %r13, %r12, %r11, 8, 8;
-; CHECK-NEXT:    selp.b32 %r14, %r5, %r6, %p2;
-; CHECK-NEXT:    bfi.b32 %r15, %r14, %r13, 16, 8;
-; CHECK-NEXT:    selp.b32 %r16, %r3, %r4, %p1;
-; CHECK-NEXT:    bfi.b32 %r17, %r16, %r15, 24, 8;
-; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r17;
+; CHECK-NEXT:    bfe.s32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    bfe.s32 %r4, %r1, 24, 8;
+; CHECK-NEXT:    setp.le.s32 %p1, %r4, %r3;
+; CHECK-NEXT:    bfe.s32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    bfe.s32 %r6, %r1, 16, 8;
+; CHECK-NEXT:    setp.le.s32 %p2, %r6, %r5;
+; CHECK-NEXT:    bfe.s32 %r7, %r2, 8, 8;
+; CHECK-NEXT:    bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT:    setp.le.s32 %p3, %r8, %r7;
+; CHECK-NEXT:    bfe.s32 %r9, %r2, 0, 8;
+; CHECK-NEXT:    bfe.s32 %r10, %r1, 0, 8;
+; CHECK-NEXT:    setp.le.s32 %p4, %r10, %r9;
+; CHECK-NEXT:    bfe.u32 %r11, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r13, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r2, 0, 8;
+; CHECK-NEXT:    selp.b32 %r16, %r14, %r15, %p4;
+; CHECK-NEXT:    bfe.u32 %r17, %r2, 8, 8;
+; CHECK-NEXT:    selp.b32 %r18, %r13, %r17, %p3;
+; CHECK-NEXT:    bfi.b32 %r19, %r18, %r16, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r20, %r2, 16, 8;
+; CHECK-NEXT:    selp.b32 %r21, %r12, %r20, %p2;
+; CHECK-NEXT:    bfi.b32 %r22, %r21, %r19, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r23, %r2, 24, 8;
+; CHECK-NEXT:    selp.b32 %r24, %r11, %r23, %p1;
+; CHECK-NEXT:    bfi.b32 %r25, %r24, %r22, 24, 8;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r25;
 ; CHECK-NEXT:    ret;
   %cmp = icmp sle <4 x i8> %a, %b
   %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -361,24 +377,24 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_umin_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_umin_param_0];
-; CHECK-NEXT:    bfe.s32 %r3, %r1, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT:    setp.ls.u32 %p1, %r3, %r4;
-; CHECK-NEXT:    bfe.s32 %r5, %r1, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT:    setp.ls.u32 %p2, %r5, %r6;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT:    setp.ls.u32 %p3, %r7, %r8;
-; CHECK-NEXT:    bfe.s32 %r9, %r1, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT:    setp.ls.u32 %p4, %r9, %r10;
-; CHECK-NEXT:    selp.b32 %r11, %r9, %r10, %p4;
-; CHECK-NEXT:    selp.b32 %r12, %r7, %r8, %p3;
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 24, 8;
+; CHECK-NEXT:    setp.ls.u32 %p1, %r4, %r3;
+; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r6, %r1, 16, 8;
+; CHECK-NEXT:    setp.ls.u32 %p2, %r6, %r5;
+; CHECK-NEXT:    bfe.u32 %r7, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r8, %r1, 8, 8;
+; CHECK-NEXT:    setp.ls.u32 %p3, %r8, %r7;
+; CHECK-NEXT:    bfe.u32 %r9, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r1, 0, 8;
+; CHECK-NEXT:    setp.ls.u32 %p4, %r10, %r9;
+; CHECK-NEXT:    selp.b32 %r11, %r10, %r9, %p4;
+; CHECK-NEXT:    selp.b32 %r12, %r8, %r7, %p3;
 ; CHECK-NEXT:    bfi.b32 %r13, %r12, %r11, 8, 8;
-; CHECK-NEXT:    selp.b32 %r14, %r5, %r6, %p2;
+; CHECK-NEXT:    selp.b32 %r14, %r6, %r5, %p2;
 ; CHECK-NEXT:    bfi.b32 %r15, %r14, %r13, 16, 8;
-; CHECK-NEXT:    selp.b32 %r16, %r3, %r4, %p1;
+; CHECK-NEXT:    selp.b32 %r16, %r4, %r3, %p1;
 ; CHECK-NEXT:    bfi.b32 %r17, %r16, %r15, 24, 8;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r17;
 ; CHECK-NEXT:    ret;
@@ -397,27 +413,27 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
 ; CHECK-NEXT:    ld.param.u32 %r3, [test_eq_param_2];
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_eq_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_eq_param_0];
-; CHECK-NEXT:    bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r5, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r5, %r1, 24, 8;
 ; CHECK-NEXT:    setp.eq.u32 %p1, %r5, %r4;
-; CHECK-NEXT:    bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r6, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
 ; CHECK-NEXT:    setp.eq.u32 %p2, %r7, %r6;
-; CHECK-NEXT:    bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r9, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r1, 8, 8;
 ; CHECK-NEXT:    setp.eq.u32 %p3, %r9, %r8;
-; CHECK-NEXT:    bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r11, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r11, %r1, 0, 8;
 ; CHECK-NEXT:    setp.eq.u32 %p4, %r11, %r10;
-; CHECK-NEXT:    bfe.s32 %r12, %r3, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 0, 8;
 ; CHECK-NEXT:    selp.b32 %r13, %r11, %r12, %p4;
-; CHECK-NEXT:    bfe.s32 %r14, %r3, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r3, 8, 8;
 ; CHECK-NEXT:    selp.b32 %r15, %r9, %r14, %p3;
 ; CHECK-NEXT:    bfi.b32 %r16, %r15, %r13, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r17, %r3, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r17, %r3, 16, 8;
 ; CHECK-NEXT:    selp.b32 %r18, %r7, %r17, %p2;
 ; CHECK-NEXT:    bfi.b32 %r19, %r18, %r16, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r20, %r3, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r20, %r3, 24, 8;
 ; CHECK-NEXT:    selp.b32 %r21, %r5, %r20, %p1;
 ; CHECK-NEXT:    bfi.b32 %r22, %r21, %r19, 24, 8;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r22;
@@ -437,27 +453,27 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
 ; CHECK-NEXT:    ld.param.u32 %r3, [test_ne_param_2];
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_ne_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_ne_param_0];
-; CHECK-NEXT:    bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r5, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r5, %r1, 24, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p1, %r5, %r4;
-; CHECK-NEXT:    bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r6, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p2, %r7, %r6;
-; CHECK-NEXT:    bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r9, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r1, 8, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p3, %r9, %r8;
-; CHECK-NEXT:    bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r11, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r11, %r1, 0, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p4, %r11, %r10;
-; CHECK-NEXT:    bfe.s32 %r12, %r3, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 0, 8;
 ; CHECK-NEXT:    selp.b32 %r13, %r11, %r12, %p4;
-; CHECK-NEXT:    bfe.s32 %r14, %r3, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r3, 8, 8;
 ; CHECK-NEXT:    selp.b32 %r15, %r9, %r14, %p3;
 ; CHECK-NEXT:    bfi.b32 %r16, %r15, %r13, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r17, %r3, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r17, %r3, 16, 8;
 ; CHECK-NEXT:    selp.b32 %r18, %r7, %r17, %p2;
 ; CHECK-NEXT:    bfi.b32 %r19, %r18, %r16, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r20, %r3, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r20, %r3, 24, 8;
 ; CHECK-NEXT:    selp.b32 %r21, %r5, %r20, %p1;
 ; CHECK-NEXT:    bfi.b32 %r22, %r21, %r19, 24, 8;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r22;
@@ -476,29 +492,29 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_mul_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_mul_param_0];
-; CHECK-NEXT:    bfe.s32 %r3, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
 ; CHECK-NEXT:    mul.lo.s16 %rs3, %rs2, %rs1;
 ; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
-; CHECK-NEXT:    bfe.s32 %r6, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r6, %r2, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
 ; CHECK-NEXT:    mul.lo.s16 %rs6, %rs5, %rs4;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
 ; CHECK-NEXT:    bfi.b32 %r9, %r8, %r5, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r2, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT:    bfe.s32 %r11, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r11, %r1, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs8, %r11;
 ; CHECK-NEXT:    mul.lo.s16 %rs9, %rs8, %rs7;
 ; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
 ; CHECK-NEXT:    bfi.b32 %r13, %r12, %r9, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r14, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs10, %r14;
-; CHECK-NEXT:    bfe.s32 %r15, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs11, %r15;
 ; CHECK-NEXT:    mul.lo.s16 %rs12, %rs11, %rs10;
 ; CHECK-NEXT:    cvt.u32.u16 %r16, %rs12;
@@ -732,7 +748,7 @@ define void @test_ldst_v3i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v3i8_param_0];
 ; CHECK-NEXT:    ld.u32 %r1, [%rd1];
 ; CHECK-NEXT:    st.u16 [%rd2], %r1;
-; CHECK-NEXT:    bfe.s32 %r3, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r3, %r1, 16, 8;
 ; CHECK-NEXT:    st.u8 [%rd2+2], %r3;
 ; CHECK-NEXT:    ret;
   %t1 = load <3 x i8>, ptr %a
@@ -920,31 +936,31 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8>
 ; CHECK-NEXT:    ld.param.u32 %r3, [test_select_cc_param_2];
 ; CHECK-NEXT:    ld.param.u32 %r2, [test_select_cc_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_select_cc_param_0];
-; CHECK-NEXT:    bfe.s32 %r5, %r4, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r6, %r3, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r6, %r3, 24, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p1, %r6, %r5;
-; CHECK-NEXT:    bfe.s32 %r7, %r4, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r8, %r3, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r8, %r3, 16, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p2, %r8, %r7;
-; CHECK-NEXT:    bfe.s32 %r9, %r4, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r3, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r4, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r3, 8, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p3, %r10, %r9;
-; CHECK-NEXT:    bfe.s32 %r11, %r4, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r12, %r3, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r11, %r4, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 0, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p4, %r12, %r11;
-; CHECK-NEXT:    bfe.s32 %r13, %r2, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r14, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r13, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r1, 0, 8;
 ; CHECK-NEXT:    selp.b32 %r15, %r14, %r13, %p4;
-; CHECK-NEXT:    bfe.s32 %r16, %r2, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r17, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r16, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r17, %r1, 8, 8;
 ; CHECK-NEXT:    selp.b32 %r18, %r17, %r16, %p3;
 ; CHECK-NEXT:    bfi.b32 %r19, %r18, %r15, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r20, %r2, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r21, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r20, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r21, %r1, 16, 8;
 ; CHECK-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
 ; CHECK-NEXT:    bfi.b32 %r23, %r22, %r19, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r24, %r2, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r25, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r24, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r25, %r1, 24, 8;
 ; CHECK-NEXT:    selp.b32 %r26, %r25, %r24, %p1;
 ; CHECK-NEXT:    bfi.b32 %r27, %r26, %r23, 24, 8;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r27;
@@ -965,17 +981,17 @@ define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b,
 ; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0];
 ; CHECK-NEXT:    ld.param.u32 %r10, [test_select_cc_i32_i8_param_3];
 ; CHECK-NEXT:    ld.param.u32 %r9, [test_select_cc_i32_i8_param_2];
-; CHECK-NEXT:    bfe.s32 %r11, %r10, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r12, %r9, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r11, %r10, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r9, 0, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p1, %r12, %r11;
-; CHECK-NEXT:    bfe.s32 %r13, %r10, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r14, %r9, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r13, %r10, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r9, 8, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p2, %r14, %r13;
-; CHECK-NEXT:    bfe.s32 %r15, %r10, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r16, %r9, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r10, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r16, %r9, 16, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p3, %r16, %r15;
-; CHECK-NEXT:    bfe.s32 %r17, %r10, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r18, %r9, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r17, %r10, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r18, %r9, 24, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p4, %r18, %r17;
 ; CHECK-NEXT:    selp.b32 %r19, %r4, %r8, %p4;
 ; CHECK-NEXT:    selp.b32 %r20, %r3, %r7, %p3;
@@ -1004,19 +1020,19 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b,
 ; CHECK-NEXT:    setp.ne.s32 %p2, %r5, %r9;
 ; CHECK-NEXT:    setp.ne.s32 %p3, %r4, %r8;
 ; CHECK-NEXT:    setp.ne.s32 %p4, %r3, %r7;
-; CHECK-NEXT:    bfe.s32 %r11, %r2, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r12, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r11, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r1, 0, 8;
 ; CHECK-NEXT:    selp.b32 %r13, %r12, %r11, %p4;
-; CHECK-NEXT:    bfe.s32 %r14, %r2, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r15, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r1, 8, 8;
 ; CHECK-NEXT:    selp.b32 %r16, %r15, %r14, %p3;
 ; CHECK-NEXT:    bfi.b32 %r17, %r16, %r13, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r18, %r2, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r19, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r18, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r19, %r1, 16, 8;
 ; CHECK-NEXT:    selp.b32 %r20, %r19, %r18, %p2;
 ; CHECK-NEXT:    bfi.b32 %r21, %r20, %r17, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r22, %r2, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r23, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r22, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r23, %r1, 24, 8;
 ; CHECK-NEXT:    selp.b32 %r24, %r23, %r22, %p1;
 ; CHECK-NEXT:    bfi.b32 %r25, %r24, %r21, 24, 8;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r25;
@@ -1091,16 +1107,16 @@ define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r1, [test_zext_2xi64_param_0];
-; CHECK-NEXT:    bfe.s32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u64.u32 %rd1, %r2;
 ; CHECK-NEXT:    and.b64 %rd2, %rd1, 255;
-; CHECK-NEXT:    bfe.s32 %r3, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r3, %r1, 16, 8;
 ; CHECK-NEXT:    cvt.u64.u32 %rd3, %r3;
 ; CHECK-NEXT:    and.b64 %rd4, %rd3, 255;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
 ; CHECK-NEXT:    cvt.u64.u32 %rd5, %r4;
 ; CHECK-NEXT:    and.b64 %rd6, %rd5, 255;
-; CHECK-NEXT:    bfe.s32 %r5, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.u64.u32 %rd7, %r5;
 ; CHECK-NEXT:    and.b64 %rd8, %rd7, 255;
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd8, %rd6};
@@ -1424,17 +1440,17 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0];
 ; CHECK-NEXT:    ld.u32 %r1, [%rd1];
 ; CHECK-NEXT:    ld.u32 %r2, [%rd2];
-; CHECK-NEXT:    bfe.s32 %r3, %r2, 24, 8;
-; CHECK-NEXT:    bfe.s32 %r4, %r1, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 24, 8;
 ; CHECK-NEXT:    setp.hi.u32 %p1, %r4, %r3;
-; CHECK-NEXT:    bfe.s32 %r5, %r2, 16, 8;
-; CHECK-NEXT:    bfe.s32 %r6, %r1, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r6, %r1, 16, 8;
 ; CHECK-NEXT:    setp.hi.u32 %p2, %r6, %r5;
-; CHECK-NEXT:    bfe.s32 %r7, %r2, 8, 8;
-; CHECK-NEXT:    bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r7, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r8, %r1, 8, 8;
 ; CHECK-NEXT:    setp.hi.u32 %p3, %r8, %r7;
-; CHECK-NEXT:    bfe.s32 %r9, %r2, 0, 8;
-; CHECK-NEXT:    bfe.s32 %r10, %r1, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r10, %r1, 0, 8;
 ; CHECK-NEXT:    setp.hi.u32 %p4, %r10, %r9;
 ; CHECK-NEXT:    selp.s32 %r11, -1, 0, %p4;
 ; CHECK-NEXT:    selp.s32 %r12, -1, 0, %p3;

>From 3305a0514656e8a0c2f6a3486b0205303504393b Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Mon, 12 Feb 2024 12:29:56 -0800
Subject: [PATCH 2/2] Improved a comment.

---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 2f2dbea8169e41..4322eaef9f467e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1886,8 +1886,10 @@ multiclass PRMT<ValueType T, RegisterClass RC> {
 }
 
 let hasSideEffects = false in {
-  // order is somewhat importent here. signed/unsigned variants match
-  // the same patterns, so the first one wins.
+  // order is somewhat important here. signed/unsigned variants match
+  // the same patterns, so the first one wins. Having unsigned byte extraction
+  // has the benefit of always having zero in unused bits, which makes some
+  // optimizations easier (e.g. no need to mask them).
   defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>;
   defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
   defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>;



More information about the llvm-commits mailing list