[llvm] [NVPTX] Improve lowering of v2i16 logical ops. (PR #67365)

Artem Belevich via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 25 14:19:03 PDT 2023


https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/67365

From bb46cfdc5b48663f1bc692760340762ebc7e783c Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 21 Sep 2023 16:23:25 -0700
Subject: [PATCH 1/3] [NVPTX] Improve lowering of v2i16 logical ops.

Bitwise logical ops can always be done as b32, regardless of the availability
of other v2i16 ops, which would require a newer GPU.
---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   |  7 +-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       | 11 ++++
 llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 64 +++++++++++++++++++
 3 files changed, 78 insertions(+), 4 deletions(-)
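
As a rough sketch of the intended effect (illustrative only, not part of the
patch; the function name and virtual register numbers below are assumptions),
a v2i16 bitwise op can now be selected as a single 32-bit instruction instead
of being split into per-element i16 ops:

  ; Input IR
  define <2 x i16> @or_v2i16(<2 x i16> %a, <2 x i16> %b) {
    %r = or <2 x i16> %a, %b
    ret <2 x i16> %r
  }

  ; Expected PTX after this patch (register numbers are illustrative):
  ;   ld.param.u32  %r1, [or_v2i16_param_0];
  ;   ld.param.u32  %r2, [or_v2i16_param_1];
  ;   or.b32        %r3, %r1, %r2;
  ;   st.param.b32  [func_retval0+0], %r3;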

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 22a72ae24b440a5..b24aae4792ce6a6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -642,10 +642,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
 
   // Other arithmetic and logic ops are unsupported.
-  setOperationAction({ISD::AND, ISD::OR, ISD::XOR, ISD::SDIV, ISD::UDIV,
-                      ISD::SRA, ISD::SRL, ISD::MULHS, ISD::MULHU,
-                      ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP,
-                      ISD::UINT_TO_FP},
+  setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
+                      ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
+                      ISD::SINT_TO_FP, ISD::UINT_TO_FP},
                      MVT::v2i16, Expand);
 
   setOperationAction(ISD::ADDC, MVT::i32, Legal);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index ad10d7938ef12e4..8f88b06372b245d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1486,6 +1486,17 @@ defm OR  : BITWISE<"or", or>;
 defm AND : BITWISE<"and", and>;
 defm XOR : BITWISE<"xor", xor>;
 
+// Lower logical ops as bitwise ops on b32.
+// By this point the constants get legalized into a bitcast from i32, so that's
+// what we need to match here.
+def: Pat<(or Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
+         (ORb32ri Int32Regs:$a, imm:$b)>;
+def: Pat<(xor Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
+         (XORb32ri Int32Regs:$a, imm:$b)>;
+def: Pat<(and Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
+         (ANDb32ri Int32Regs:$a, imm:$b)>;
+
+
 def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
                       "not.pred \t$dst, $src;",
                       [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index 59d1d84b191c31b..d14514cb53d3a93 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -235,6 +235,70 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 {
   ret <2 x i16> %r
 }
 
+;; Logical ops are available on all GPUs as regular 32-bit logical ops
+; COMMON-LABEL: test_or(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_or_param_0];
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_or_param_1];
+; COMMON-NEXT: or.b32          [[R:%r[0-9]+]], [[A]], [[B]];
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 {
+  %r = or <2 x i16> %a, %b
+  ret <2 x i16> %r
+}
+
+; Check that we can lower or with immediate arguments.
+; COMMON-LABEL: test_or_imm_0(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_or_imm_0_param_0];
+; COMMON-NEXT: or.b32          [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 {
+  %r = or <2 x i16> <i16 1, i16 2>, %a
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_or_imm_1(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_or_imm_1_param_0];
+; COMMON-NEXT: or.b32          [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 {
+  %r = or <2 x i16> %a, <i16 1, i16 2>
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_xor(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_xor_param_0];
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_xor_param_1];
+; COMMON-NEXT: xor.b32         [[R:%r[0-9]+]], [[A]], [[B]];
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 {
+  %r = xor <2 x i16> %a, %b
+  ret <2 x i16> %r
+}
+
+; Check that we can lower xor with immediate arguments.
+; COMMON-LABEL: test_xor_imm_0(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_xor_imm_0_param_0];
+; COMMON-NEXT: xor.b32         [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 {
+  %r = xor <2 x i16> <i16 1, i16 2>, %a
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_xor_imm_1(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_xor_imm_1_param_0];
+; COMMON-NEXT: xor.b32         [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 {
+  %r = xor <2 x i16> %a, <i16 1, i16 2>
+  ret <2 x i16> %r
+}
 
 ; COMMON-LABEL: .func test_ldst_v2i16(
 ; COMMON-DAG:    ld.param.u64    [[A:%rd[0-9]+]], [test_ldst_v2i16_param_0];

From d2bac0ce2fd54b2d376a5899af1c598ff28f4dff Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Mon, 25 Sep 2023 13:00:21 -0700
Subject: [PATCH 2/3] Added lowering of 2-register variants of the operations

Previously we only matched i32 arguments and were lucky that that was what we
often got when an argument was loaded from memory. For computed arguments, we
do need to match v2i16.
---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       | 13 ++-
 llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 81 +++++++++++++++++++
 2 files changed, 91 insertions(+), 3 deletions(-)
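
To illustrate the case this addresses (a sketch, not part of the patch; the
function name and register numbers are assumptions): when both operands live in
v2i16 registers rather than coming from a load or an immediate, the
register-register patterns added here are what lets ISel still pick a single
or.b32:

  ; Operands are computed in registers, not loaded as i32 or materialized
  ; as an immediate.
  define <2 x i16> @or_computed(i16 %a) {
    %v0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
    %v1 = insertelement <2 x i16> %v0, i16 5, i32 1
    %r  = or <2 x i16> %v1, %v0
    ret <2 x i16> %r
  }

  ; Expected to select a single 32-bit op on the packed registers
  ; (register numbers illustrative):
  ;   or.b32  %r3, %r2, %r1;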

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 8f88b06372b245d..28c4cadb303ad4f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1486,9 +1486,16 @@ defm OR  : BITWISE<"or", or>;
 defm AND : BITWISE<"and", and>;
 defm XOR : BITWISE<"xor", xor>;
 
-// Lower logical ops as bitwise ops on b32.
-// By this point the constants get legalized into a bitcast from i32, so that's
-// what we need to match here.
+// Lower logical v2i16 ops as bitwise ops on b32.
+def: Pat<(or (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
+         (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
+def: Pat<(xor (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
+         (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
+def: Pat<(and (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
+         (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
+
+// The constants get legalized into a bitcast from i32, so that's what we need
+// to match here.
 def: Pat<(or Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
          (ORb32ri Int32Regs:$a, imm:$b)>;
 def: Pat<(xor Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index d14514cb53d3a93..5a22bbcf7416c17 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -247,6 +247,23 @@ define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 {
   ret <2 x i16> %r
 }
 
+; Ops that operate on computed arguments go through a different lowering path
+; compared to the ones that operate on loaded data, so we test them separately.
+; COMMON-LABEL: test_or_computed(
+; COMMON:        ld.param.u16    [[A:%rs[0-9]+]], [test_or_computed_param_0];
+; COMMON-DAG:    mov.u16         [[C0:%rs[0-9]+]], 0;
+; COMMON-DAG:    mov.b32         [[R1:%r[0-9]+]], {[[A]], [[C0]]};
+; COMMON-DAG:    mov.u16         [[C5:%rs[0-9]+]], 5;
+; COMMON-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[A]], [[C5]]};
+; COMMON:        or.b32          [[R:%r[0-9]+]], [[R2]], [[R1]];
+; COMMON-NEXT:   st.param.b32    [func_retval0+0], [[R]];
+define <2 x i16> @test_or_computed(i16 %a) {
+  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
+  %r = or <2 x i16> %ins.1, %ins.0
+  ret <2 x i16> %r
+}
+
 ; Check that we can lower or with immediate arguments.
 ; COMMON-LABEL: test_or_imm_0(
 ; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_or_imm_0_param_0];
@@ -279,6 +296,21 @@ define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 {
   ret <2 x i16> %r
 }
 
+; COMMON-LABEL: test_xor_computed(
+; COMMON:        ld.param.u16    [[A:%rs[0-9]+]], [test_xor_computed_param_0];
+; COMMON-DAG:    mov.u16         [[C0:%rs[0-9]+]], 0;
+; COMMON-DAG:    mov.b32         [[R1:%r[0-9]+]], {[[A]], [[C0]]};
+; COMMON-DAG:    mov.u16         [[C5:%rs[0-9]+]], 5;
+; COMMON-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[A]], [[C5]]};
+; COMMON:        xor.b32         [[R:%r[0-9]+]], [[R2]], [[R1]];
+; COMMON-NEXT:   st.param.b32    [func_retval0+0], [[R]];
+define <2 x i16> @test_xor_computed(i16 %a) {
+  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
+  %r = xor <2 x i16> %ins.1, %ins.0
+  ret <2 x i16> %r
+}
+
 ; Check that we can lower xor with immediate arguments.
 ; COMMON-LABEL: test_xor_imm_0(
 ; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_xor_imm_0_param_0];
@@ -300,6 +332,55 @@ define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 {
   ret <2 x i16> %r
 }
 
+; COMMON-LABEL: test_and(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_and_param_0];
+; COMMON-DAG:  ld.param.u32    [[B:%r[0-9]+]], [test_and_param_1];
+; COMMON-NEXT: and.b32          [[R:%r[0-9]+]], [[A]], [[B]];
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 {
+  %r = and <2 x i16> %a, %b
+  ret <2 x i16> %r
+}
+
+; Ops that operate on computed arguments go through a different lowering path
+; compared to the ones that operate on loaded data, so we test them separately.
+; COMMON-LABEL: test_and_computed(
+; COMMON:        ld.param.u16    [[A:%rs[0-9]+]], [test_and_computed_param_0];
+; COMMON-DAG:    mov.u16         [[C0:%rs[0-9]+]], 0;
+; COMMON-DAG:    mov.b32         [[R1:%r[0-9]+]], {[[A]], [[C0]]};
+; COMMON-DAG:    mov.u16         [[C5:%rs[0-9]+]], 5;
+; COMMON-DAG:    mov.b32         [[R2:%r[0-9]+]], {[[A]], [[C5]]};
+; COMMON:        and.b32          [[R:%r[0-9]+]], [[R2]], [[R1]];
+; COMMON-NEXT:   st.param.b32    [func_retval0+0], [[R]];
+define <2 x i16> @test_and_computed(i16 %a) {
+  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
+  %r = and <2 x i16> %ins.1, %ins.0
+  ret <2 x i16> %r
+}
+
+; Check that we can lower and with immediate arguments.
+; COMMON-LABEL: test_and_imm_0(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_and_imm_0_param_0];
+; COMMON-NEXT: and.b32          [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 {
+  %r = and <2 x i16> <i16 1, i16 2>, %a
+  ret <2 x i16> %r
+}
+
+; COMMON-LABEL: test_and_imm_1(
+; COMMON-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_and_imm_1_param_0];
+; COMMON-NEXT: and.b32          [[R:%r[0-9]+]], [[A]], 131073;
+; COMMON-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; COMMON-NEXT: ret;
+define <2 x i16> @test_and_imm_1(<2 x i16> %a) #0 {
+  %r = and <2 x i16> %a, <i16 1, i16 2>
+  ret <2 x i16> %r
+}
+
 ; COMMON-LABEL: .func test_ldst_v2i16(
 ; COMMON-DAG:    ld.param.u64    [[A:%rd[0-9]+]], [test_ldst_v2i16_param_0];
 ; COMMON-DAG:    ld.param.u64    [[B:%rd[0-9]+]], [test_ldst_v2i16_param_1];

From a9518b285b99b9e964ce2f421ecf0dd3551b63b7 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Mon, 25 Sep 2023 14:18:24 -0700
Subject: [PATCH 3/3] Updated another test affected by the patch.

---
 ...unfold-masked-merge-vector-variablemask.ll | 912 ++++++++----------
 1 file changed, 405 insertions(+), 507 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index 9d60e1992c890c1..16579de882ed4b6 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -63,47 +63,43 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-NEXT:    .local .align 2 .b8 __local_depot2[4];
 ; CHECK-NEXT:    .reg .b64 %SP;
 ; CHECK-NEXT:    .reg .b64 %SPL;
-; CHECK-NEXT:    .reg .b16 %rs<36>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b16 %rs<20>;
+; CHECK-NEXT:    .reg .b32 %r<21>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    mov.u64 %SPL, __local_depot2;
 ; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v4i8_param_0];
+; CHECK-NEXT:    mov.b32 %r1, {%rs3, %rs4};
+; CHECK-NEXT:    mov.b32 %r2, {%rs1, %rs2};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v4i8_param_2];
-; CHECK-NEXT:    and.b16 %rs9, %rs1, %rs5;
-; CHECK-NEXT:    and.b16 %rs10, %rs2, %rs6;
-; CHECK-NEXT:    and.b16 %rs11, %rs3, %rs7;
-; CHECK-NEXT:    and.b16 %rs12, %rs4, %rs8;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [out_v4i8_param_1];
-; CHECK-NEXT:    xor.b16 %rs17, %rs8, 255;
-; CHECK-NEXT:    xor.b16 %rs18, %rs7, 255;
-; CHECK-NEXT:    xor.b16 %rs19, %rs6, 255;
-; CHECK-NEXT:    xor.b16 %rs20, %rs5, 255;
-; CHECK-NEXT:    and.b16 %rs21, %rs13, %rs20;
-; CHECK-NEXT:    and.b16 %rs22, %rs14, %rs19;
-; CHECK-NEXT:    and.b16 %rs23, %rs15, %rs18;
-; CHECK-NEXT:    and.b16 %rs24, %rs16, %rs17;
-; CHECK-NEXT:    or.b16 %rs25, %rs12, %rs24;
-; CHECK-NEXT:    or.b16 %rs26, %rs11, %rs23;
-; CHECK-NEXT:    mov.b32 %r1, {%rs26, %rs25};
-; CHECK-NEXT:    mov.b32 {%rs27, %rs28}, %r1;
-; CHECK-NEXT:    st.v2.u8 [%SP+0], {%rs27, %rs28};
-; CHECK-NEXT:    or.b16 %rs29, %rs10, %rs22;
-; CHECK-NEXT:    or.b16 %rs30, %rs9, %rs21;
-; CHECK-NEXT:    mov.b32 %r2, {%rs30, %rs29};
-; CHECK-NEXT:    mov.b32 {%rs31, %rs32}, %r2;
-; CHECK-NEXT:    st.v2.u8 [%SP+2], {%rs31, %rs32};
-; CHECK-NEXT:    ld.u16 %r3, [%SP+0];
-; CHECK-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-NEXT:    ld.u16 %r5, [%SP+2];
-; CHECK-NEXT:    or.b32 %r6, %r5, %r4;
-; CHECK-NEXT:    shr.u32 %r7, %r6, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs33, %r7;
-; CHECK-NEXT:    cvt.u16.u32 %rs34, %r3;
-; CHECK-NEXT:    bfe.s32 %r8, %r3, 8, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs35, %r8;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs31, %rs33, %rs34, %rs35};
+; CHECK-NEXT:    mov.b32 %r3, {%rs5, %rs6};
+; CHECK-NEXT:    and.b32 %r4, %r2, %r3;
+; CHECK-NEXT:    mov.b32 %r5, {%rs7, %rs8};
+; CHECK-NEXT:    and.b32 %r6, %r1, %r5;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v4i8_param_1];
+; CHECK-NEXT:    mov.b32 %r7, {%rs11, %rs12};
+; CHECK-NEXT:    mov.b32 %r8, {%rs9, %rs10};
+; CHECK-NEXT:    xor.b32 %r9, %r5, 16711935;
+; CHECK-NEXT:    xor.b32 %r10, %r3, 16711935;
+; CHECK-NEXT:    and.b32 %r11, %r8, %r10;
+; CHECK-NEXT:    and.b32 %r12, %r7, %r9;
+; CHECK-NEXT:    or.b32 %r13, %r6, %r12;
+; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r13;
+; CHECK-NEXT:    st.v2.u8 [%SP+0], {%rs13, %rs14};
+; CHECK-NEXT:    or.b32 %r14, %r4, %r11;
+; CHECK-NEXT:    mov.b32 {%rs15, %rs16}, %r14;
+; CHECK-NEXT:    st.v2.u8 [%SP+2], {%rs15, %rs16};
+; CHECK-NEXT:    ld.u16 %r15, [%SP+0];
+; CHECK-NEXT:    shl.b32 %r16, %r15, 16;
+; CHECK-NEXT:    ld.u16 %r17, [%SP+2];
+; CHECK-NEXT:    or.b32 %r18, %r17, %r16;
+; CHECK-NEXT:    shr.u32 %r19, %r18, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs17, %r19;
+; CHECK-NEXT:    cvt.u16.u32 %rs18, %r15;
+; CHECK-NEXT:    bfe.s32 %r20, %r15, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs19, %r20;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs15, %rs17, %rs18, %rs19};
 ; CHECK-NEXT:    ret;
   %mx = and <4 x i8> %x, %mask
   %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
@@ -118,44 +114,45 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 ; CHECK-NEXT:    .local .align 2 .b8 __local_depot3[4];
 ; CHECK-NEXT:    .reg .b64 %SP;
 ; CHECK-NEXT:    .reg .b64 %SPL;
-; CHECK-NEXT:    .reg .b16 %rs<33>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b16 %rs<22>;
+; CHECK-NEXT:    .reg .b32 %r<22>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    mov.u64 %SPL, __local_depot3;
 ; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v4i8_undef_param_0];
+; CHECK-NEXT:    mov.b32 %r1, {%rs3, %rs4};
+; CHECK-NEXT:    mov.b32 %r2, {%rs1, %rs2};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v4i8_undef_param_2];
-; CHECK-NEXT:    and.b16 %rs9, %rs1, %rs5;
-; CHECK-NEXT:    and.b16 %rs10, %rs2, %rs6;
-; CHECK-NEXT:    and.b16 %rs11, %rs3, %rs7;
-; CHECK-NEXT:    and.b16 %rs12, %rs4, %rs8;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [out_v4i8_undef_param_1];
-; CHECK-NEXT:    xor.b16 %rs17, %rs8, 255;
-; CHECK-NEXT:    xor.b16 %rs18, %rs6, 255;
-; CHECK-NEXT:    xor.b16 %rs19, %rs5, 255;
-; CHECK-NEXT:    and.b16 %rs20, %rs13, %rs19;
-; CHECK-NEXT:    and.b16 %rs21, %rs14, %rs18;
-; CHECK-NEXT:    and.b16 %rs22, %rs16, %rs17;
-; CHECK-NEXT:    or.b16 %rs23, %rs12, %rs22;
-; CHECK-NEXT:    mov.b32 %r1, {%rs11, %rs23};
-; CHECK-NEXT:    mov.b32 {%rs24, %rs25}, %r1;
-; CHECK-NEXT:    st.v2.u8 [%SP+0], {%rs24, %rs25};
-; CHECK-NEXT:    or.b16 %rs26, %rs10, %rs21;
-; CHECK-NEXT:    or.b16 %rs27, %rs9, %rs20;
-; CHECK-NEXT:    mov.b32 %r2, {%rs27, %rs26};
-; CHECK-NEXT:    mov.b32 {%rs28, %rs29}, %r2;
-; CHECK-NEXT:    st.v2.u8 [%SP+2], {%rs28, %rs29};
-; CHECK-NEXT:    ld.u16 %r3, [%SP+0];
-; CHECK-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-NEXT:    ld.u16 %r5, [%SP+2];
-; CHECK-NEXT:    or.b32 %r6, %r5, %r4;
-; CHECK-NEXT:    shr.u32 %r7, %r6, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs30, %r7;
-; CHECK-NEXT:    cvt.u16.u32 %rs31, %r3;
-; CHECK-NEXT:    bfe.s32 %r8, %r3, 8, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs32, %r8;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs28, %rs30, %rs31, %rs32};
+; CHECK-NEXT:    mov.b32 %r3, {%rs5, %rs6};
+; CHECK-NEXT:    and.b32 %r4, %r2, %r3;
+; CHECK-NEXT:    mov.b32 %r5, {%rs7, %rs8};
+; CHECK-NEXT:    and.b32 %r6, %r1, %r5;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v4i8_undef_param_1];
+; CHECK-NEXT:    mov.b32 %r7, {%rs11, %rs12};
+; CHECK-NEXT:    mov.b32 %r8, {%rs9, %rs10};
+; CHECK-NEXT:    mov.u16 %rs13, 255;
+; CHECK-NEXT:    mov.b32 %r9, {%rs14, %rs13};
+; CHECK-NEXT:    xor.b32 %r10, %r5, %r9;
+; CHECK-NEXT:    xor.b32 %r11, %r3, 16711935;
+; CHECK-NEXT:    and.b32 %r12, %r8, %r11;
+; CHECK-NEXT:    and.b32 %r13, %r7, %r10;
+; CHECK-NEXT:    or.b32 %r14, %r6, %r13;
+; CHECK-NEXT:    mov.b32 {%rs15, %rs16}, %r14;
+; CHECK-NEXT:    st.v2.u8 [%SP+0], {%rs15, %rs16};
+; CHECK-NEXT:    or.b32 %r15, %r4, %r12;
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    st.v2.u8 [%SP+2], {%rs17, %rs18};
+; CHECK-NEXT:    ld.u16 %r16, [%SP+0];
+; CHECK-NEXT:    shl.b32 %r17, %r16, 16;
+; CHECK-NEXT:    ld.u16 %r18, [%SP+2];
+; CHECK-NEXT:    or.b32 %r19, %r18, %r17;
+; CHECK-NEXT:    shr.u32 %r20, %r19, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs19, %r20;
+; CHECK-NEXT:    cvt.u16.u32 %rs20, %r16;
+; CHECK-NEXT:    bfe.s32 %r21, %r16, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs21, %r21;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs17, %rs19, %rs20, %rs21};
 ; CHECK-NEXT:    ret;
   %mx = and <4 x i8> %x, %mask
   %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
@@ -167,25 +164,17 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v2i16(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<13>;
-; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-NEXT:    .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u32 %r1, [out_v2i16_param_2];
 ; CHECK-NEXT:    ld.param.u32 %r3, [out_v2i16_param_1];
 ; CHECK-NEXT:    ld.param.u32 %r4, [out_v2i16_param_0];
 ; CHECK-NEXT:    and.b32 %r5, %r4, %r1;
-; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    not.b16 %rs3, %rs2;
-; CHECK-NEXT:    not.b16 %rs4, %rs1;
-; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
-; CHECK-NEXT:    and.b16 %rs7, %rs5, %rs4;
-; CHECK-NEXT:    and.b16 %rs8, %rs6, %rs3;
-; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r5;
-; CHECK-NEXT:    or.b16 %rs11, %rs10, %rs8;
-; CHECK-NEXT:    or.b16 %rs12, %rs9, %rs7;
-; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
-; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r7;
+; CHECK-NEXT:    xor.b32 %r7, %r1, -1;
+; CHECK-NEXT:    and.b32 %r8, %r3, %r7;
+; CHECK-NEXT:    or.b32 %r9, %r5, %r8;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r9;
 ; CHECK-NEXT:    ret;
   %mx = and <2 x i16> %x, %mask
   %notmask = xor <2 x i16> %mask, <i16 -1, i16 -1>
@@ -226,89 +215,81 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-NEXT:    .local .align 2 .b8 __local_depot6[8];
 ; CHECK-NEXT:    .reg .b64 %SP;
 ; CHECK-NEXT:    .reg .b64 %SPL;
-; CHECK-NEXT:    .reg .b16 %rs<72>;
-; CHECK-NEXT:    .reg .b32 %r<14>;
+; CHECK-NEXT:    .reg .b16 %rs<40>;
+; CHECK-NEXT:    .reg .b32 %r<38>;
 ; CHECK-NEXT:    .reg .b64 %rd<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    mov.u64 %SPL, __local_depot6;
 ; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v8i8_param_0];
+; CHECK-NEXT:    mov.b32 %r1, {%rs3, %rs4};
+; CHECK-NEXT:    mov.b32 %r2, {%rs1, %rs2};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v8i8_param_0+4];
+; CHECK-NEXT:    mov.b32 %r3, {%rs7, %rs8};
+; CHECK-NEXT:    mov.b32 %r4, {%rs5, %rs6};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v8i8_param_2+4];
-; CHECK-NEXT:    and.b16 %rs13, %rs5, %rs9;
-; CHECK-NEXT:    and.b16 %rs14, %rs6, %rs10;
-; CHECK-NEXT:    and.b16 %rs15, %rs7, %rs11;
-; CHECK-NEXT:    and.b16 %rs16, %rs8, %rs12;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [out_v8i8_param_2];
-; CHECK-NEXT:    and.b16 %rs21, %rs1, %rs17;
-; CHECK-NEXT:    and.b16 %rs22, %rs2, %rs18;
-; CHECK-NEXT:    and.b16 %rs23, %rs3, %rs19;
-; CHECK-NEXT:    and.b16 %rs24, %rs4, %rs20;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [out_v8i8_param_1];
-; CHECK-NEXT:    ld.param.v4.u8 {%rs29, %rs30, %rs31, %rs32}, [out_v8i8_param_1+4];
-; CHECK-NEXT:    xor.b16 %rs33, %rs20, 255;
-; CHECK-NEXT:    xor.b16 %rs34, %rs19, 255;
-; CHECK-NEXT:    xor.b16 %rs35, %rs18, 255;
-; CHECK-NEXT:    xor.b16 %rs36, %rs17, 255;
-; CHECK-NEXT:    xor.b16 %rs37, %rs12, 255;
-; CHECK-NEXT:    xor.b16 %rs38, %rs11, 255;
-; CHECK-NEXT:    xor.b16 %rs39, %rs10, 255;
-; CHECK-NEXT:    xor.b16 %rs40, %rs9, 255;
-; CHECK-NEXT:    and.b16 %rs41, %rs29, %rs40;
-; CHECK-NEXT:    and.b16 %rs42, %rs30, %rs39;
-; CHECK-NEXT:    and.b16 %rs43, %rs31, %rs38;
-; CHECK-NEXT:    and.b16 %rs44, %rs32, %rs37;
-; CHECK-NEXT:    and.b16 %rs45, %rs25, %rs36;
-; CHECK-NEXT:    and.b16 %rs46, %rs26, %rs35;
-; CHECK-NEXT:    and.b16 %rs47, %rs27, %rs34;
-; CHECK-NEXT:    and.b16 %rs48, %rs28, %rs33;
-; CHECK-NEXT:    or.b16 %rs49, %rs24, %rs48;
-; CHECK-NEXT:    or.b16 %rs50, %rs23, %rs47;
-; CHECK-NEXT:    mov.b32 %r1, {%rs50, %rs49};
-; CHECK-NEXT:    mov.b32 {%rs51, %rs52}, %r1;
-; CHECK-NEXT:    st.v2.u8 [%SP+0], {%rs51, %rs52};
-; CHECK-NEXT:    or.b16 %rs53, %rs22, %rs46;
-; CHECK-NEXT:    or.b16 %rs54, %rs21, %rs45;
-; CHECK-NEXT:    mov.b32 %r2, {%rs54, %rs53};
-; CHECK-NEXT:    mov.b32 {%rs55, %rs56}, %r2;
-; CHECK-NEXT:    st.v2.u8 [%SP+2], {%rs55, %rs56};
-; CHECK-NEXT:    or.b16 %rs57, %rs16, %rs44;
-; CHECK-NEXT:    or.b16 %rs58, %rs15, %rs43;
-; CHECK-NEXT:    mov.b32 %r3, {%rs58, %rs57};
-; CHECK-NEXT:    mov.b32 {%rs59, %rs60}, %r3;
-; CHECK-NEXT:    st.v2.u8 [%SP+4], {%rs59, %rs60};
-; CHECK-NEXT:    or.b16 %rs61, %rs14, %rs42;
-; CHECK-NEXT:    or.b16 %rs62, %rs13, %rs41;
-; CHECK-NEXT:    mov.b32 %r4, {%rs62, %rs61};
-; CHECK-NEXT:    mov.b32 {%rs63, %rs64}, %r4;
-; CHECK-NEXT:    st.v2.u8 [%SP+6], {%rs63, %rs64};
-; CHECK-NEXT:    ld.u16 %r5, [%SP+0];
-; CHECK-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-NEXT:    ld.u16 %r7, [%SP+2];
-; CHECK-NEXT:    or.b32 %r8, %r7, %r6;
-; CHECK-NEXT:    cvt.u64.u32 %rd1, %r8;
-; CHECK-NEXT:    ld.u16 %r9, [%SP+4];
-; CHECK-NEXT:    shl.b32 %r10, %r9, 16;
-; CHECK-NEXT:    ld.u16 %r11, [%SP+6];
-; CHECK-NEXT:    or.b32 %r12, %r11, %r10;
-; CHECK-NEXT:    cvt.u64.u32 %rd2, %r12;
+; CHECK-NEXT:    mov.b32 %r5, {%rs9, %rs10};
+; CHECK-NEXT:    and.b32 %r6, %r4, %r5;
+; CHECK-NEXT:    mov.b32 %r7, {%rs11, %rs12};
+; CHECK-NEXT:    and.b32 %r8, %r3, %r7;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [out_v8i8_param_2];
+; CHECK-NEXT:    mov.b32 %r9, {%rs13, %rs14};
+; CHECK-NEXT:    and.b32 %r10, %r2, %r9;
+; CHECK-NEXT:    mov.b32 %r11, {%rs15, %rs16};
+; CHECK-NEXT:    and.b32 %r12, %r1, %r11;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [out_v8i8_param_1];
+; CHECK-NEXT:    mov.b32 %r13, {%rs19, %rs20};
+; CHECK-NEXT:    mov.b32 %r14, {%rs17, %rs18};
+; CHECK-NEXT:    ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [out_v8i8_param_1+4];
+; CHECK-NEXT:    mov.b32 %r15, {%rs23, %rs24};
+; CHECK-NEXT:    mov.b32 %r16, {%rs21, %rs22};
+; CHECK-NEXT:    xor.b32 %r17, %r11, 16711935;
+; CHECK-NEXT:    xor.b32 %r18, %r9, 16711935;
+; CHECK-NEXT:    xor.b32 %r19, %r7, 16711935;
+; CHECK-NEXT:    xor.b32 %r20, %r5, 16711935;
+; CHECK-NEXT:    and.b32 %r21, %r16, %r20;
+; CHECK-NEXT:    and.b32 %r22, %r15, %r19;
+; CHECK-NEXT:    and.b32 %r23, %r14, %r18;
+; CHECK-NEXT:    and.b32 %r24, %r13, %r17;
+; CHECK-NEXT:    or.b32 %r25, %r12, %r24;
+; CHECK-NEXT:    mov.b32 {%rs25, %rs26}, %r25;
+; CHECK-NEXT:    st.v2.u8 [%SP+0], {%rs25, %rs26};
+; CHECK-NEXT:    or.b32 %r26, %r10, %r23;
+; CHECK-NEXT:    mov.b32 {%rs27, %rs28}, %r26;
+; CHECK-NEXT:    st.v2.u8 [%SP+2], {%rs27, %rs28};
+; CHECK-NEXT:    or.b32 %r27, %r8, %r22;
+; CHECK-NEXT:    mov.b32 {%rs29, %rs30}, %r27;
+; CHECK-NEXT:    st.v2.u8 [%SP+4], {%rs29, %rs30};
+; CHECK-NEXT:    or.b32 %r28, %r6, %r21;
+; CHECK-NEXT:    mov.b32 {%rs31, %rs32}, %r28;
+; CHECK-NEXT:    st.v2.u8 [%SP+6], {%rs31, %rs32};
+; CHECK-NEXT:    ld.u16 %r29, [%SP+0];
+; CHECK-NEXT:    shl.b32 %r30, %r29, 16;
+; CHECK-NEXT:    ld.u16 %r31, [%SP+2];
+; CHECK-NEXT:    or.b32 %r32, %r31, %r30;
+; CHECK-NEXT:    cvt.u64.u32 %rd1, %r32;
+; CHECK-NEXT:    ld.u16 %r33, [%SP+4];
+; CHECK-NEXT:    shl.b32 %r34, %r33, 16;
+; CHECK-NEXT:    ld.u16 %r35, [%SP+6];
+; CHECK-NEXT:    or.b32 %r36, %r35, %r34;
+; CHECK-NEXT:    cvt.u64.u32 %rd2, %r36;
 ; CHECK-NEXT:    shl.b64 %rd3, %rd2, 32;
 ; CHECK-NEXT:    or.b64 %rd4, %rd1, %rd3;
-; CHECK-NEXT:    shr.u32 %r13, %r12, 8;
+; CHECK-NEXT:    shr.u32 %r37, %r36, 8;
 ; CHECK-NEXT:    shr.u64 %rd5, %rd4, 24;
-; CHECK-NEXT:    cvt.u16.u64 %rs65, %rd5;
+; CHECK-NEXT:    cvt.u16.u64 %rs33, %rd5;
 ; CHECK-NEXT:    shr.u64 %rd6, %rd1, 16;
-; CHECK-NEXT:    cvt.u16.u64 %rs66, %rd6;
+; CHECK-NEXT:    cvt.u16.u64 %rs34, %rd6;
 ; CHECK-NEXT:    shr.u64 %rd7, %rd1, 8;
-; CHECK-NEXT:    cvt.u16.u64 %rs67, %rd7;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs55, %rs67, %rs66, %rs65};
-; CHECK-NEXT:    cvt.u16.u32 %rs68, %r13;
+; CHECK-NEXT:    cvt.u16.u64 %rs35, %rd7;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs27, %rs35, %rs34, %rs33};
+; CHECK-NEXT:    cvt.u16.u32 %rs36, %r37;
 ; CHECK-NEXT:    bfe.s64 %rd8, %rd2, 24, 8;
-; CHECK-NEXT:    cvt.u16.u64 %rs69, %rd8;
-; CHECK-NEXT:    cvt.u16.u32 %rs70, %r9;
-; CHECK-NEXT:    cvt.u16.u32 %rs71, %r11;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+4], {%rs71, %rs68, %rs70, %rs69};
+; CHECK-NEXT:    cvt.u16.u64 %rs37, %rd8;
+; CHECK-NEXT:    cvt.u16.u32 %rs38, %r33;
+; CHECK-NEXT:    cvt.u16.u32 %rs39, %r35;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+4], {%rs39, %rs36, %rs38, %rs37};
 ; CHECK-NEXT:    ret;
   %mx = and <8 x i8> %x, %mask
   %notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -320,8 +301,7 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v4i16(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<25>;
-; CHECK-NEXT:    .reg .b32 %r<17>;
+; CHECK-NEXT:    .reg .b32 %r<21>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [out_v4i16_param_1];
@@ -329,27 +309,13 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-NEXT:    ld.param.v2.u32 {%r9, %r10}, [out_v4i16_param_0];
 ; CHECK-NEXT:    and.b32 %r11, %r9, %r5;
 ; CHECK-NEXT:    and.b32 %r13, %r10, %r6;
-; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r6;
-; CHECK-NEXT:    not.b16 %rs3, %rs2;
-; CHECK-NEXT:    not.b16 %rs4, %rs1;
-; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r5;
-; CHECK-NEXT:    not.b16 %rs7, %rs6;
-; CHECK-NEXT:    not.b16 %rs8, %rs5;
-; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r1;
-; CHECK-NEXT:    and.b16 %rs11, %rs9, %rs8;
-; CHECK-NEXT:    and.b16 %rs12, %rs10, %rs7;
-; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r2;
-; CHECK-NEXT:    and.b16 %rs15, %rs13, %rs4;
-; CHECK-NEXT:    and.b16 %rs16, %rs14, %rs3;
-; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r13;
-; CHECK-NEXT:    or.b16 %rs19, %rs18, %rs16;
-; CHECK-NEXT:    or.b16 %rs20, %rs17, %rs15;
-; CHECK-NEXT:    mov.b32 %r15, {%rs20, %rs19};
-; CHECK-NEXT:    mov.b32 {%rs21, %rs22}, %r11;
-; CHECK-NEXT:    or.b16 %rs23, %rs22, %rs12;
-; CHECK-NEXT:    or.b16 %rs24, %rs21, %rs11;
-; CHECK-NEXT:    mov.b32 %r16, {%rs24, %rs23};
-; CHECK-NEXT:    st.param.v2.b32 [func_retval0+0], {%r16, %r15};
+; CHECK-NEXT:    xor.b32 %r15, %r6, -1;
+; CHECK-NEXT:    xor.b32 %r16, %r5, -1;
+; CHECK-NEXT:    and.b32 %r17, %r1, %r16;
+; CHECK-NEXT:    and.b32 %r18, %r2, %r15;
+; CHECK-NEXT:    or.b32 %r19, %r13, %r18;
+; CHECK-NEXT:    or.b32 %r20, %r11, %r17;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0+0], {%r20, %r19};
 ; CHECK-NEXT:    ret;
   %mx = and <4 x i16> %x, %mask
   %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1>
@@ -361,33 +327,24 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v4i16_undef(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<20>;
-; CHECK-NEXT:    .reg .b32 %r<17>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<22>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [out_v4i16_undef_param_1];
 ; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [out_v4i16_undef_param_2];
 ; CHECK-NEXT:    ld.param.v2.u32 {%r9, %r10}, [out_v4i16_undef_param_0];
-; CHECK-NEXT:    and.b32 %r11, %r10, %r6;
-; CHECK-NEXT:    and.b32 %r13, %r9, %r5;
-; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r5;
-; CHECK-NEXT:    not.b16 %rs3, %rs2;
-; CHECK-NEXT:    not.b16 %rs4, %rs1;
-; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs5}, %r6; }
-; CHECK-NEXT:    not.b16 %rs6, %rs5;
-; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r2; }
-; CHECK-NEXT:    and.b16 %rs8, %rs7, %rs6;
-; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r1;
-; CHECK-NEXT:    and.b16 %rs11, %rs9, %rs4;
-; CHECK-NEXT:    and.b16 %rs12, %rs10, %rs3;
-; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r13;
-; CHECK-NEXT:    or.b16 %rs15, %rs14, %rs12;
-; CHECK-NEXT:    or.b16 %rs16, %rs13, %rs11;
-; CHECK-NEXT:    mov.b32 %r15, {%rs16, %rs15};
-; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r11;
-; CHECK-NEXT:    or.b16 %rs19, %rs18, %rs8;
-; CHECK-NEXT:    mov.b32 %r16, {%rs17, %rs19};
-; CHECK-NEXT:    st.param.v2.b32 [func_retval0+0], {%r15, %r16};
+; CHECK-NEXT:    and.b32 %r11, %r9, %r5;
+; CHECK-NEXT:    and.b32 %r13, %r10, %r6;
+; CHECK-NEXT:    mov.u16 %rs1, -1;
+; CHECK-NEXT:    mov.b32 %r15, {%rs2, %rs1};
+; CHECK-NEXT:    xor.b32 %r16, %r6, %r15;
+; CHECK-NEXT:    xor.b32 %r17, %r5, -1;
+; CHECK-NEXT:    and.b32 %r18, %r1, %r17;
+; CHECK-NEXT:    and.b32 %r19, %r2, %r16;
+; CHECK-NEXT:    or.b32 %r20, %r13, %r19;
+; CHECK-NEXT:    or.b32 %r21, %r11, %r18;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0+0], {%r21, %r20};
 ; CHECK-NEXT:    ret;
   %mx = and <4 x i16> %x, %mask
   %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 undef, i16 -1>
@@ -451,106 +408,90 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwin
 define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v16i8(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<129>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b16 %rs<65>;
+; CHECK-NEXT:    .reg .b32 %r<57>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v16i8_param_0+12];
+; CHECK-NEXT:    mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT:    mov.b32 %r2, {%rs3, %rs4};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v16i8_param_0+8];
+; CHECK-NEXT:    mov.b32 %r3, {%rs5, %rs6};
+; CHECK-NEXT:    mov.b32 %r4, {%rs7, %rs8};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v16i8_param_0+4];
+; CHECK-NEXT:    mov.b32 %r5, {%rs9, %rs10};
+; CHECK-NEXT:    mov.b32 %r6, {%rs11, %rs12};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [out_v16i8_param_0];
+; CHECK-NEXT:    mov.b32 %r7, {%rs13, %rs14};
+; CHECK-NEXT:    mov.b32 %r8, {%rs15, %rs16};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [out_v16i8_param_2];
-; CHECK-NEXT:    and.b16 %rs21, %rs15, %rs19;
-; CHECK-NEXT:    and.b16 %rs22, %rs16, %rs20;
-; CHECK-NEXT:    and.b16 %rs23, %rs13, %rs17;
-; CHECK-NEXT:    and.b16 %rs24, %rs14, %rs18;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [out_v16i8_param_2+4];
-; CHECK-NEXT:    and.b16 %rs29, %rs11, %rs27;
-; CHECK-NEXT:    and.b16 %rs30, %rs12, %rs28;
-; CHECK-NEXT:    and.b16 %rs31, %rs9, %rs25;
-; CHECK-NEXT:    and.b16 %rs32, %rs10, %rs26;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs33, %rs34, %rs35, %rs36}, [out_v16i8_param_2+8];
-; CHECK-NEXT:    and.b16 %rs37, %rs7, %rs35;
-; CHECK-NEXT:    and.b16 %rs38, %rs8, %rs36;
-; CHECK-NEXT:    and.b16 %rs39, %rs5, %rs33;
-; CHECK-NEXT:    and.b16 %rs40, %rs6, %rs34;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs41, %rs42, %rs43, %rs44}, [out_v16i8_param_2+12];
-; CHECK-NEXT:    and.b16 %rs45, %rs3, %rs43;
-; CHECK-NEXT:    and.b16 %rs46, %rs4, %rs44;
-; CHECK-NEXT:    and.b16 %rs47, %rs1, %rs41;
-; CHECK-NEXT:    and.b16 %rs48, %rs2, %rs42;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs49, %rs50, %rs51, %rs52}, [out_v16i8_param_1+12];
-; CHECK-NEXT:    ld.param.v4.u8 {%rs53, %rs54, %rs55, %rs56}, [out_v16i8_param_1+8];
-; CHECK-NEXT:    ld.param.v4.u8 {%rs57, %rs58, %rs59, %rs60}, [out_v16i8_param_1+4];
-; CHECK-NEXT:    ld.param.v4.u8 {%rs61, %rs62, %rs63, %rs64}, [out_v16i8_param_1];
-; CHECK-NEXT:    xor.b16 %rs65, %rs42, 255;
-; CHECK-NEXT:    xor.b16 %rs66, %rs41, 255;
-; CHECK-NEXT:    xor.b16 %rs67, %rs44, 255;
-; CHECK-NEXT:    xor.b16 %rs68, %rs43, 255;
-; CHECK-NEXT:    xor.b16 %rs69, %rs34, 255;
-; CHECK-NEXT:    xor.b16 %rs70, %rs33, 255;
-; CHECK-NEXT:    xor.b16 %rs71, %rs36, 255;
-; CHECK-NEXT:    xor.b16 %rs72, %rs35, 255;
-; CHECK-NEXT:    xor.b16 %rs73, %rs26, 255;
-; CHECK-NEXT:    xor.b16 %rs74, %rs25, 255;
-; CHECK-NEXT:    xor.b16 %rs75, %rs28, 255;
-; CHECK-NEXT:    xor.b16 %rs76, %rs27, 255;
-; CHECK-NEXT:    xor.b16 %rs77, %rs18, 255;
-; CHECK-NEXT:    xor.b16 %rs78, %rs17, 255;
-; CHECK-NEXT:    xor.b16 %rs79, %rs20, 255;
-; CHECK-NEXT:    xor.b16 %rs80, %rs19, 255;
-; CHECK-NEXT:    and.b16 %rs81, %rs63, %rs80;
-; CHECK-NEXT:    and.b16 %rs82, %rs64, %rs79;
-; CHECK-NEXT:    and.b16 %rs83, %rs61, %rs78;
-; CHECK-NEXT:    and.b16 %rs84, %rs62, %rs77;
-; CHECK-NEXT:    and.b16 %rs85, %rs59, %rs76;
-; CHECK-NEXT:    and.b16 %rs86, %rs60, %rs75;
-; CHECK-NEXT:    and.b16 %rs87, %rs57, %rs74;
-; CHECK-NEXT:    and.b16 %rs88, %rs58, %rs73;
-; CHECK-NEXT:    and.b16 %rs89, %rs55, %rs72;
-; CHECK-NEXT:    and.b16 %rs90, %rs56, %rs71;
-; CHECK-NEXT:    and.b16 %rs91, %rs53, %rs70;
-; CHECK-NEXT:    and.b16 %rs92, %rs54, %rs69;
-; CHECK-NEXT:    and.b16 %rs93, %rs51, %rs68;
-; CHECK-NEXT:    and.b16 %rs94, %rs52, %rs67;
-; CHECK-NEXT:    and.b16 %rs95, %rs49, %rs66;
-; CHECK-NEXT:    and.b16 %rs96, %rs50, %rs65;
-; CHECK-NEXT:    or.b16 %rs97, %rs48, %rs96;
-; CHECK-NEXT:    or.b16 %rs98, %rs47, %rs95;
-; CHECK-NEXT:    mov.b32 %r1, {%rs98, %rs97};
-; CHECK-NEXT:    or.b16 %rs99, %rs46, %rs94;
-; CHECK-NEXT:    or.b16 %rs100, %rs45, %rs93;
-; CHECK-NEXT:    mov.b32 %r2, {%rs100, %rs99};
-; CHECK-NEXT:    or.b16 %rs101, %rs40, %rs92;
-; CHECK-NEXT:    or.b16 %rs102, %rs39, %rs91;
-; CHECK-NEXT:    mov.b32 %r3, {%rs102, %rs101};
-; CHECK-NEXT:    or.b16 %rs103, %rs38, %rs90;
-; CHECK-NEXT:    or.b16 %rs104, %rs37, %rs89;
-; CHECK-NEXT:    mov.b32 %r4, {%rs104, %rs103};
-; CHECK-NEXT:    or.b16 %rs105, %rs32, %rs88;
-; CHECK-NEXT:    or.b16 %rs106, %rs31, %rs87;
-; CHECK-NEXT:    mov.b32 %r5, {%rs106, %rs105};
-; CHECK-NEXT:    or.b16 %rs107, %rs30, %rs86;
-; CHECK-NEXT:    or.b16 %rs108, %rs29, %rs85;
-; CHECK-NEXT:    mov.b32 %r6, {%rs108, %rs107};
-; CHECK-NEXT:    or.b16 %rs109, %rs24, %rs84;
-; CHECK-NEXT:    or.b16 %rs110, %rs23, %rs83;
-; CHECK-NEXT:    mov.b32 %r7, {%rs110, %rs109};
-; CHECK-NEXT:    or.b16 %rs111, %rs22, %rs82;
-; CHECK-NEXT:    or.b16 %rs112, %rs21, %rs81;
-; CHECK-NEXT:    mov.b32 %r8, {%rs112, %rs111};
-; CHECK-NEXT:    mov.b32 {%rs113, %rs114}, %r8;
-; CHECK-NEXT:    mov.b32 {%rs115, %rs116}, %r7;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs115, %rs116, %rs113, %rs114};
-; CHECK-NEXT:    mov.b32 {%rs117, %rs118}, %r6;
-; CHECK-NEXT:    mov.b32 {%rs119, %rs120}, %r5;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+4], {%rs119, %rs120, %rs117, %rs118};
-; CHECK-NEXT:    mov.b32 {%rs121, %rs122}, %r4;
-; CHECK-NEXT:    mov.b32 {%rs123, %rs124}, %r3;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+8], {%rs123, %rs124, %rs121, %rs122};
-; CHECK-NEXT:    mov.b32 {%rs125, %rs126}, %r2;
-; CHECK-NEXT:    mov.b32 {%rs127, %rs128}, %r1;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+12], {%rs127, %rs128, %rs125, %rs126};
+; CHECK-NEXT:    mov.b32 %r9, {%rs19, %rs20};
+; CHECK-NEXT:    and.b32 %r10, %r8, %r9;
+; CHECK-NEXT:    mov.b32 %r11, {%rs17, %rs18};
+; CHECK-NEXT:    and.b32 %r12, %r7, %r11;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [out_v16i8_param_2+4];
+; CHECK-NEXT:    mov.b32 %r13, {%rs23, %rs24};
+; CHECK-NEXT:    and.b32 %r14, %r6, %r13;
+; CHECK-NEXT:    mov.b32 %r15, {%rs21, %rs22};
+; CHECK-NEXT:    and.b32 %r16, %r5, %r15;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [out_v16i8_param_2+8];
+; CHECK-NEXT:    mov.b32 %r17, {%rs27, %rs28};
+; CHECK-NEXT:    and.b32 %r18, %r4, %r17;
+; CHECK-NEXT:    mov.b32 %r19, {%rs25, %rs26};
+; CHECK-NEXT:    and.b32 %r20, %r3, %r19;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs29, %rs30, %rs31, %rs32}, [out_v16i8_param_2+12];
+; CHECK-NEXT:    mov.b32 %r21, {%rs31, %rs32};
+; CHECK-NEXT:    and.b32 %r22, %r2, %r21;
+; CHECK-NEXT:    mov.b32 %r23, {%rs29, %rs30};
+; CHECK-NEXT:    and.b32 %r24, %r1, %r23;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs33, %rs34, %rs35, %rs36}, [out_v16i8_param_1+12];
+; CHECK-NEXT:    mov.b32 %r25, {%rs33, %rs34};
+; CHECK-NEXT:    mov.b32 %r26, {%rs35, %rs36};
+; CHECK-NEXT:    ld.param.v4.u8 {%rs37, %rs38, %rs39, %rs40}, [out_v16i8_param_1+8];
+; CHECK-NEXT:    mov.b32 %r27, {%rs37, %rs38};
+; CHECK-NEXT:    mov.b32 %r28, {%rs39, %rs40};
+; CHECK-NEXT:    ld.param.v4.u8 {%rs41, %rs42, %rs43, %rs44}, [out_v16i8_param_1+4];
+; CHECK-NEXT:    mov.b32 %r29, {%rs41, %rs42};
+; CHECK-NEXT:    mov.b32 %r30, {%rs43, %rs44};
+; CHECK-NEXT:    ld.param.v4.u8 {%rs45, %rs46, %rs47, %rs48}, [out_v16i8_param_1];
+; CHECK-NEXT:    mov.b32 %r31, {%rs45, %rs46};
+; CHECK-NEXT:    mov.b32 %r32, {%rs47, %rs48};
+; CHECK-NEXT:    xor.b32 %r33, %r23, 16711935;
+; CHECK-NEXT:    xor.b32 %r34, %r21, 16711935;
+; CHECK-NEXT:    xor.b32 %r35, %r19, 16711935;
+; CHECK-NEXT:    xor.b32 %r36, %r17, 16711935;
+; CHECK-NEXT:    xor.b32 %r37, %r15, 16711935;
+; CHECK-NEXT:    xor.b32 %r38, %r13, 16711935;
+; CHECK-NEXT:    xor.b32 %r39, %r11, 16711935;
+; CHECK-NEXT:    xor.b32 %r40, %r9, 16711935;
+; CHECK-NEXT:    and.b32 %r41, %r32, %r40;
+; CHECK-NEXT:    and.b32 %r42, %r31, %r39;
+; CHECK-NEXT:    and.b32 %r43, %r30, %r38;
+; CHECK-NEXT:    and.b32 %r44, %r29, %r37;
+; CHECK-NEXT:    and.b32 %r45, %r28, %r36;
+; CHECK-NEXT:    and.b32 %r46, %r27, %r35;
+; CHECK-NEXT:    and.b32 %r47, %r26, %r34;
+; CHECK-NEXT:    and.b32 %r48, %r25, %r33;
+; CHECK-NEXT:    or.b32 %r49, %r24, %r48;
+; CHECK-NEXT:    or.b32 %r50, %r22, %r47;
+; CHECK-NEXT:    or.b32 %r51, %r20, %r46;
+; CHECK-NEXT:    or.b32 %r52, %r18, %r45;
+; CHECK-NEXT:    or.b32 %r53, %r16, %r44;
+; CHECK-NEXT:    or.b32 %r54, %r14, %r43;
+; CHECK-NEXT:    or.b32 %r55, %r12, %r42;
+; CHECK-NEXT:    or.b32 %r56, %r10, %r41;
+; CHECK-NEXT:    mov.b32 {%rs49, %rs50}, %r56;
+; CHECK-NEXT:    mov.b32 {%rs51, %rs52}, %r55;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs51, %rs52, %rs49, %rs50};
+; CHECK-NEXT:    mov.b32 {%rs53, %rs54}, %r54;
+; CHECK-NEXT:    mov.b32 {%rs55, %rs56}, %r53;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+4], {%rs55, %rs56, %rs53, %rs54};
+; CHECK-NEXT:    mov.b32 {%rs57, %rs58}, %r52;
+; CHECK-NEXT:    mov.b32 {%rs59, %rs60}, %r51;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+8], {%rs59, %rs60, %rs57, %rs58};
+; CHECK-NEXT:    mov.b32 {%rs61, %rs62}, %r50;
+; CHECK-NEXT:    mov.b32 {%rs63, %rs64}, %r49;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+12], {%rs63, %rs64, %rs61, %rs62};
 ; CHECK-NEXT:    ret;
   %mx = and <16 x i8> %x, %mask
   %notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -562,8 +503,7 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
 define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v8i16(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<49>;
-; CHECK-NEXT:    .reg .b32 %r<33>;
+; CHECK-NEXT:    .reg .b32 %r<41>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v8i16_param_1];
@@ -573,47 +513,19 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-NEXT:    and.b32 %r23, %r18, %r10;
 ; CHECK-NEXT:    and.b32 %r25, %r19, %r11;
 ; CHECK-NEXT:    and.b32 %r27, %r20, %r12;
-; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r12;
-; CHECK-NEXT:    not.b16 %rs3, %rs2;
-; CHECK-NEXT:    not.b16 %rs4, %rs1;
-; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r11;
-; CHECK-NEXT:    not.b16 %rs7, %rs6;
-; CHECK-NEXT:    not.b16 %rs8, %rs5;
-; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r10;
-; CHECK-NEXT:    not.b16 %rs11, %rs10;
-; CHECK-NEXT:    not.b16 %rs12, %rs9;
-; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r9;
-; CHECK-NEXT:    not.b16 %rs15, %rs14;
-; CHECK-NEXT:    not.b16 %rs16, %rs13;
-; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r1;
-; CHECK-NEXT:    and.b16 %rs19, %rs17, %rs16;
-; CHECK-NEXT:    and.b16 %rs20, %rs18, %rs15;
-; CHECK-NEXT:    mov.b32 {%rs21, %rs22}, %r2;
-; CHECK-NEXT:    and.b16 %rs23, %rs21, %rs12;
-; CHECK-NEXT:    and.b16 %rs24, %rs22, %rs11;
-; CHECK-NEXT:    mov.b32 {%rs25, %rs26}, %r3;
-; CHECK-NEXT:    and.b16 %rs27, %rs25, %rs8;
-; CHECK-NEXT:    and.b16 %rs28, %rs26, %rs7;
-; CHECK-NEXT:    mov.b32 {%rs29, %rs30}, %r4;
-; CHECK-NEXT:    and.b16 %rs31, %rs29, %rs4;
-; CHECK-NEXT:    and.b16 %rs32, %rs30, %rs3;
-; CHECK-NEXT:    mov.b32 {%rs33, %rs34}, %r27;
-; CHECK-NEXT:    or.b16 %rs35, %rs34, %rs32;
-; CHECK-NEXT:    or.b16 %rs36, %rs33, %rs31;
-; CHECK-NEXT:    mov.b32 %r29, {%rs36, %rs35};
-; CHECK-NEXT:    mov.b32 {%rs37, %rs38}, %r25;
-; CHECK-NEXT:    or.b16 %rs39, %rs38, %rs28;
-; CHECK-NEXT:    or.b16 %rs40, %rs37, %rs27;
-; CHECK-NEXT:    mov.b32 %r30, {%rs40, %rs39};
-; CHECK-NEXT:    mov.b32 {%rs41, %rs42}, %r23;
-; CHECK-NEXT:    or.b16 %rs43, %rs42, %rs24;
-; CHECK-NEXT:    or.b16 %rs44, %rs41, %rs23;
-; CHECK-NEXT:    mov.b32 %r31, {%rs44, %rs43};
-; CHECK-NEXT:    mov.b32 {%rs45, %rs46}, %r21;
-; CHECK-NEXT:    or.b16 %rs47, %rs46, %rs20;
-; CHECK-NEXT:    or.b16 %rs48, %rs45, %rs19;
-; CHECK-NEXT:    mov.b32 %r32, {%rs48, %rs47};
-; CHECK-NEXT:    st.param.v4.b32 [func_retval0+0], {%r32, %r31, %r30, %r29};
+; CHECK-NEXT:    xor.b32 %r29, %r12, -1;
+; CHECK-NEXT:    xor.b32 %r30, %r11, -1;
+; CHECK-NEXT:    xor.b32 %r31, %r10, -1;
+; CHECK-NEXT:    xor.b32 %r32, %r9, -1;
+; CHECK-NEXT:    and.b32 %r33, %r1, %r32;
+; CHECK-NEXT:    and.b32 %r34, %r2, %r31;
+; CHECK-NEXT:    and.b32 %r35, %r3, %r30;
+; CHECK-NEXT:    and.b32 %r36, %r4, %r29;
+; CHECK-NEXT:    or.b32 %r37, %r27, %r36;
+; CHECK-NEXT:    or.b32 %r38, %r25, %r35;
+; CHECK-NEXT:    or.b32 %r39, %r23, %r34;
+; CHECK-NEXT:    or.b32 %r40, %r21, %r33;
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0+0], {%r40, %r39, %r38, %r37};
 ; CHECK-NEXT:    ret;
   %mx = and <8 x i16> %x, %mask
   %notmask = xor <8 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
@@ -775,43 +687,41 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-NEXT:    .local .align 2 .b8 __local_depot18[4];
 ; CHECK-NEXT:    .reg .b64 %SP;
 ; CHECK-NEXT:    .reg .b64 %SPL;
-; CHECK-NEXT:    .reg .b16 %rs<32>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b16 %rs<20>;
+; CHECK-NEXT:    .reg .b32 %r<19>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    mov.u64 %SPL, __local_depot18;
 ; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [in_v4i8_param_0];
+; CHECK-NEXT:    mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT:    mov.b32 %r2, {%rs3, %rs4};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [in_v4i8_param_1];
-; CHECK-NEXT:    xor.b16 %rs9, %rs4, %rs8;
-; CHECK-NEXT:    xor.b16 %rs10, %rs3, %rs7;
-; CHECK-NEXT:    xor.b16 %rs11, %rs2, %rs6;
-; CHECK-NEXT:    xor.b16 %rs12, %rs1, %rs5;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [in_v4i8_param_2];
-; CHECK-NEXT:    and.b16 %rs17, %rs12, %rs13;
-; CHECK-NEXT:    and.b16 %rs18, %rs11, %rs14;
-; CHECK-NEXT:    and.b16 %rs19, %rs10, %rs15;
-; CHECK-NEXT:    and.b16 %rs20, %rs9, %rs16;
-; CHECK-NEXT:    xor.b16 %rs21, %rs20, %rs8;
-; CHECK-NEXT:    xor.b16 %rs22, %rs19, %rs7;
-; CHECK-NEXT:    mov.b32 %r1, {%rs22, %rs21};
-; CHECK-NEXT:    mov.b32 {%rs23, %rs24}, %r1;
-; CHECK-NEXT:    st.v2.u8 [%SP+0], {%rs23, %rs24};
-; CHECK-NEXT:    xor.b16 %rs25, %rs18, %rs6;
-; CHECK-NEXT:    xor.b16 %rs26, %rs17, %rs5;
-; CHECK-NEXT:    mov.b32 %r2, {%rs26, %rs25};
-; CHECK-NEXT:    mov.b32 {%rs27, %rs28}, %r2;
-; CHECK-NEXT:    st.v2.u8 [%SP+2], {%rs27, %rs28};
-; CHECK-NEXT:    ld.u16 %r3, [%SP+0];
-; CHECK-NEXT:    shl.b32 %r4, %r3, 16;
-; CHECK-NEXT:    ld.u16 %r5, [%SP+2];
-; CHECK-NEXT:    or.b32 %r6, %r5, %r4;
-; CHECK-NEXT:    shr.u32 %r7, %r6, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs29, %r7;
-; CHECK-NEXT:    cvt.u16.u32 %rs30, %r3;
-; CHECK-NEXT:    bfe.s32 %r8, %r3, 8, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs31, %r8;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs27, %rs29, %rs30, %rs31};
+; CHECK-NEXT:    mov.b32 %r3, {%rs7, %rs8};
+; CHECK-NEXT:    xor.b32 %r4, %r2, %r3;
+; CHECK-NEXT:    mov.b32 %r5, {%rs5, %rs6};
+; CHECK-NEXT:    xor.b32 %r6, %r1, %r5;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [in_v4i8_param_2];
+; CHECK-NEXT:    mov.b32 %r7, {%rs9, %rs10};
+; CHECK-NEXT:    and.b32 %r8, %r6, %r7;
+; CHECK-NEXT:    mov.b32 %r9, {%rs11, %rs12};
+; CHECK-NEXT:    and.b32 %r10, %r4, %r9;
+; CHECK-NEXT:    xor.b32 %r11, %r10, %r3;
+; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r11;
+; CHECK-NEXT:    st.v2.u8 [%SP+0], {%rs13, %rs14};
+; CHECK-NEXT:    xor.b32 %r12, %r8, %r5;
+; CHECK-NEXT:    mov.b32 {%rs15, %rs16}, %r12;
+; CHECK-NEXT:    st.v2.u8 [%SP+2], {%rs15, %rs16};
+; CHECK-NEXT:    ld.u16 %r13, [%SP+0];
+; CHECK-NEXT:    shl.b32 %r14, %r13, 16;
+; CHECK-NEXT:    ld.u16 %r15, [%SP+2];
+; CHECK-NEXT:    or.b32 %r16, %r15, %r14;
+; CHECK-NEXT:    shr.u32 %r17, %r16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs17, %r17;
+; CHECK-NEXT:    cvt.u16.u32 %rs18, %r13;
+; CHECK-NEXT:    bfe.s32 %r18, %r13, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs19, %r18;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs15, %rs17, %rs18, %rs19};
 ; CHECK-NEXT:    ret;
   %n0 = xor <4 x i8> %x, %y
   %n1 = and <4 x i8> %n0, %mask
@@ -869,81 +779,77 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-NEXT:    .local .align 2 .b8 __local_depot21[8];
 ; CHECK-NEXT:    .reg .b64 %SP;
 ; CHECK-NEXT:    .reg .b64 %SPL;
-; CHECK-NEXT:    .reg .b16 %rs<64>;
-; CHECK-NEXT:    .reg .b32 %r<14>;
+; CHECK-NEXT:    .reg .b16 %rs<40>;
+; CHECK-NEXT:    .reg .b32 %r<34>;
 ; CHECK-NEXT:    .reg .b64 %rd<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    mov.u64 %SPL, __local_depot21;
 ; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [in_v8i8_param_0+4];
+; CHECK-NEXT:    mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT:    mov.b32 %r2, {%rs3, %rs4};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [in_v8i8_param_0];
+; CHECK-NEXT:    mov.b32 %r3, {%rs5, %rs6};
+; CHECK-NEXT:    mov.b32 %r4, {%rs7, %rs8};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [in_v8i8_param_1];
-; CHECK-NEXT:    xor.b16 %rs13, %rs8, %rs12;
-; CHECK-NEXT:    xor.b16 %rs14, %rs7, %rs11;
-; CHECK-NEXT:    xor.b16 %rs15, %rs6, %rs10;
-; CHECK-NEXT:    xor.b16 %rs16, %rs5, %rs9;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [in_v8i8_param_1+4];
-; CHECK-NEXT:    xor.b16 %rs21, %rs4, %rs20;
-; CHECK-NEXT:    xor.b16 %rs22, %rs3, %rs19;
-; CHECK-NEXT:    xor.b16 %rs23, %rs2, %rs18;
-; CHECK-NEXT:    xor.b16 %rs24, %rs1, %rs17;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [in_v8i8_param_2+4];
-; CHECK-NEXT:    and.b16 %rs29, %rs24, %rs25;
-; CHECK-NEXT:    and.b16 %rs30, %rs23, %rs26;
-; CHECK-NEXT:    and.b16 %rs31, %rs22, %rs27;
-; CHECK-NEXT:    and.b16 %rs32, %rs21, %rs28;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs33, %rs34, %rs35, %rs36}, [in_v8i8_param_2];
-; CHECK-NEXT:    and.b16 %rs37, %rs16, %rs33;
-; CHECK-NEXT:    and.b16 %rs38, %rs15, %rs34;
-; CHECK-NEXT:    and.b16 %rs39, %rs14, %rs35;
-; CHECK-NEXT:    and.b16 %rs40, %rs13, %rs36;
-; CHECK-NEXT:    xor.b16 %rs41, %rs40, %rs12;
-; CHECK-NEXT:    xor.b16 %rs42, %rs39, %rs11;
-; CHECK-NEXT:    mov.b32 %r1, {%rs42, %rs41};
-; CHECK-NEXT:    mov.b32 {%rs43, %rs44}, %r1;
-; CHECK-NEXT:    st.v2.u8 [%SP+0], {%rs43, %rs44};
-; CHECK-NEXT:    xor.b16 %rs45, %rs38, %rs10;
-; CHECK-NEXT:    xor.b16 %rs46, %rs37, %rs9;
-; CHECK-NEXT:    mov.b32 %r2, {%rs46, %rs45};
-; CHECK-NEXT:    mov.b32 {%rs47, %rs48}, %r2;
-; CHECK-NEXT:    st.v2.u8 [%SP+2], {%rs47, %rs48};
-; CHECK-NEXT:    xor.b16 %rs49, %rs32, %rs20;
-; CHECK-NEXT:    xor.b16 %rs50, %rs31, %rs19;
-; CHECK-NEXT:    mov.b32 %r3, {%rs50, %rs49};
-; CHECK-NEXT:    mov.b32 {%rs51, %rs52}, %r3;
-; CHECK-NEXT:    st.v2.u8 [%SP+4], {%rs51, %rs52};
-; CHECK-NEXT:    xor.b16 %rs53, %rs30, %rs18;
-; CHECK-NEXT:    xor.b16 %rs54, %rs29, %rs17;
-; CHECK-NEXT:    mov.b32 %r4, {%rs54, %rs53};
-; CHECK-NEXT:    mov.b32 {%rs55, %rs56}, %r4;
-; CHECK-NEXT:    st.v2.u8 [%SP+6], {%rs55, %rs56};
-; CHECK-NEXT:    ld.u16 %r5, [%SP+0];
-; CHECK-NEXT:    shl.b32 %r6, %r5, 16;
-; CHECK-NEXT:    ld.u16 %r7, [%SP+2];
-; CHECK-NEXT:    or.b32 %r8, %r7, %r6;
-; CHECK-NEXT:    cvt.u64.u32 %rd1, %r8;
-; CHECK-NEXT:    ld.u16 %r9, [%SP+4];
-; CHECK-NEXT:    shl.b32 %r10, %r9, 16;
-; CHECK-NEXT:    ld.u16 %r11, [%SP+6];
-; CHECK-NEXT:    or.b32 %r12, %r11, %r10;
-; CHECK-NEXT:    cvt.u64.u32 %rd2, %r12;
+; CHECK-NEXT:    mov.b32 %r5, {%rs11, %rs12};
+; CHECK-NEXT:    xor.b32 %r6, %r4, %r5;
+; CHECK-NEXT:    mov.b32 %r7, {%rs9, %rs10};
+; CHECK-NEXT:    xor.b32 %r8, %r3, %r7;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [in_v8i8_param_1+4];
+; CHECK-NEXT:    mov.b32 %r9, {%rs15, %rs16};
+; CHECK-NEXT:    xor.b32 %r10, %r2, %r9;
+; CHECK-NEXT:    mov.b32 %r11, {%rs13, %rs14};
+; CHECK-NEXT:    xor.b32 %r12, %r1, %r11;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [in_v8i8_param_2+4];
+; CHECK-NEXT:    mov.b32 %r13, {%rs17, %rs18};
+; CHECK-NEXT:    and.b32 %r14, %r12, %r13;
+; CHECK-NEXT:    mov.b32 %r15, {%rs19, %rs20};
+; CHECK-NEXT:    and.b32 %r16, %r10, %r15;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [in_v8i8_param_2];
+; CHECK-NEXT:    mov.b32 %r17, {%rs21, %rs22};
+; CHECK-NEXT:    and.b32 %r18, %r8, %r17;
+; CHECK-NEXT:    mov.b32 %r19, {%rs23, %rs24};
+; CHECK-NEXT:    and.b32 %r20, %r6, %r19;
+; CHECK-NEXT:    xor.b32 %r21, %r20, %r5;
+; CHECK-NEXT:    mov.b32 {%rs25, %rs26}, %r21;
+; CHECK-NEXT:    st.v2.u8 [%SP+0], {%rs25, %rs26};
+; CHECK-NEXT:    xor.b32 %r22, %r18, %r7;
+; CHECK-NEXT:    mov.b32 {%rs27, %rs28}, %r22;
+; CHECK-NEXT:    st.v2.u8 [%SP+2], {%rs27, %rs28};
+; CHECK-NEXT:    xor.b32 %r23, %r16, %r9;
+; CHECK-NEXT:    mov.b32 {%rs29, %rs30}, %r23;
+; CHECK-NEXT:    st.v2.u8 [%SP+4], {%rs29, %rs30};
+; CHECK-NEXT:    xor.b32 %r24, %r14, %r11;
+; CHECK-NEXT:    mov.b32 {%rs31, %rs32}, %r24;
+; CHECK-NEXT:    st.v2.u8 [%SP+6], {%rs31, %rs32};
+; CHECK-NEXT:    ld.u16 %r25, [%SP+0];
+; CHECK-NEXT:    shl.b32 %r26, %r25, 16;
+; CHECK-NEXT:    ld.u16 %r27, [%SP+2];
+; CHECK-NEXT:    or.b32 %r28, %r27, %r26;
+; CHECK-NEXT:    cvt.u64.u32 %rd1, %r28;
+; CHECK-NEXT:    ld.u16 %r29, [%SP+4];
+; CHECK-NEXT:    shl.b32 %r30, %r29, 16;
+; CHECK-NEXT:    ld.u16 %r31, [%SP+6];
+; CHECK-NEXT:    or.b32 %r32, %r31, %r30;
+; CHECK-NEXT:    cvt.u64.u32 %rd2, %r32;
 ; CHECK-NEXT:    shl.b64 %rd3, %rd2, 32;
 ; CHECK-NEXT:    or.b64 %rd4, %rd1, %rd3;
-; CHECK-NEXT:    shr.u32 %r13, %r12, 8;
+; CHECK-NEXT:    shr.u32 %r33, %r32, 8;
 ; CHECK-NEXT:    shr.u64 %rd5, %rd4, 24;
-; CHECK-NEXT:    cvt.u16.u64 %rs57, %rd5;
+; CHECK-NEXT:    cvt.u16.u64 %rs33, %rd5;
 ; CHECK-NEXT:    shr.u64 %rd6, %rd1, 16;
-; CHECK-NEXT:    cvt.u16.u64 %rs58, %rd6;
+; CHECK-NEXT:    cvt.u16.u64 %rs34, %rd6;
 ; CHECK-NEXT:    shr.u64 %rd7, %rd1, 8;
-; CHECK-NEXT:    cvt.u16.u64 %rs59, %rd7;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs47, %rs59, %rs58, %rs57};
-; CHECK-NEXT:    cvt.u16.u32 %rs60, %r13;
+; CHECK-NEXT:    cvt.u16.u64 %rs35, %rd7;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs27, %rs35, %rs34, %rs33};
+; CHECK-NEXT:    cvt.u16.u32 %rs36, %r33;
 ; CHECK-NEXT:    bfe.s64 %rd8, %rd2, 24, 8;
-; CHECK-NEXT:    cvt.u16.u64 %rs61, %rd8;
-; CHECK-NEXT:    cvt.u16.u32 %rs62, %r9;
-; CHECK-NEXT:    cvt.u16.u32 %rs63, %r11;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+4], {%rs63, %rs60, %rs62, %rs61};
+; CHECK-NEXT:    cvt.u16.u64 %rs37, %rd8;
+; CHECK-NEXT:    cvt.u16.u32 %rs38, %r29;
+; CHECK-NEXT:    cvt.u16.u32 %rs39, %r31;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+4], {%rs39, %rs36, %rs38, %rs37};
 ; CHECK-NEXT:    ret;
   %n0 = xor <8 x i8> %x, %y
   %n1 = and <8 x i8> %n0, %mask
@@ -959,14 +865,14 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [in_v4i16_param_0];
 ; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [in_v4i16_param_1];
-; CHECK-NEXT:    xor.b32 %r5, %r2, %r4;
-; CHECK-NEXT:    xor.b32 %r6, %r1, %r3;
-; CHECK-NEXT:    ld.param.v2.u32 {%r7, %r8}, [in_v4i16_param_2];
-; CHECK-NEXT:    and.b32 %r9, %r6, %r7;
-; CHECK-NEXT:    and.b32 %r10, %r5, %r8;
-; CHECK-NEXT:    xor.b32 %r11, %r10, %r4;
-; CHECK-NEXT:    xor.b32 %r13, %r9, %r3;
-; CHECK-NEXT:    st.param.v2.b32 [func_retval0+0], {%r13, %r11};
+; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [in_v4i16_param_2];
+; CHECK-NEXT:    xor.b32 %r7, %r2, %r4;
+; CHECK-NEXT:    and.b32 %r8, %r7, %r6;
+; CHECK-NEXT:    xor.b32 %r9, %r8, %r4;
+; CHECK-NEXT:    xor.b32 %r11, %r1, %r3;
+; CHECK-NEXT:    and.b32 %r12, %r11, %r5;
+; CHECK-NEXT:    xor.b32 %r13, %r12, %r3;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0+0], {%r13, %r9};
 ; CHECK-NEXT:    ret;
   %n0 = xor <4 x i16> %x, %y
   %n1 = and <4 x i16> %n0, %mask
@@ -1024,90 +930,82 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind
 define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v16i8(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<113>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b16 %rs<65>;
+; CHECK-NEXT:    .reg .b32 %r<49>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [in_v16i8_param_0];
+; CHECK-NEXT:    mov.b32 %r1, {%rs3, %rs4};
+; CHECK-NEXT:    mov.b32 %r2, {%rs1, %rs2};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [in_v16i8_param_0+4];
+; CHECK-NEXT:    mov.b32 %r3, {%rs7, %rs8};
+; CHECK-NEXT:    mov.b32 %r4, {%rs5, %rs6};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [in_v16i8_param_0+8];
+; CHECK-NEXT:    mov.b32 %r5, {%rs11, %rs12};
+; CHECK-NEXT:    mov.b32 %r6, {%rs9, %rs10};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [in_v16i8_param_0+12];
+; CHECK-NEXT:    mov.b32 %r7, {%rs15, %rs16};
+; CHECK-NEXT:    mov.b32 %r8, {%rs13, %rs14};
 ; CHECK-NEXT:    ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [in_v16i8_param_1+12];
-; CHECK-NEXT:    xor.b16 %rs21, %rs14, %rs18;
-; CHECK-NEXT:    xor.b16 %rs22, %rs13, %rs17;
-; CHECK-NEXT:    xor.b16 %rs23, %rs16, %rs20;
-; CHECK-NEXT:    xor.b16 %rs24, %rs15, %rs19;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [in_v16i8_param_1+8];
-; CHECK-NEXT:    xor.b16 %rs29, %rs10, %rs26;
-; CHECK-NEXT:    xor.b16 %rs30, %rs9, %rs25;
-; CHECK-NEXT:    xor.b16 %rs31, %rs12, %rs28;
-; CHECK-NEXT:    xor.b16 %rs32, %rs11, %rs27;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs33, %rs34, %rs35, %rs36}, [in_v16i8_param_1+4];
-; CHECK-NEXT:    xor.b16 %rs37, %rs6, %rs34;
-; CHECK-NEXT:    xor.b16 %rs38, %rs5, %rs33;
-; CHECK-NEXT:    xor.b16 %rs39, %rs8, %rs36;
-; CHECK-NEXT:    xor.b16 %rs40, %rs7, %rs35;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs41, %rs42, %rs43, %rs44}, [in_v16i8_param_1];
-; CHECK-NEXT:    xor.b16 %rs45, %rs2, %rs42;
-; CHECK-NEXT:    xor.b16 %rs46, %rs1, %rs41;
-; CHECK-NEXT:    xor.b16 %rs47, %rs4, %rs44;
-; CHECK-NEXT:    xor.b16 %rs48, %rs3, %rs43;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs49, %rs50, %rs51, %rs52}, [in_v16i8_param_2];
-; CHECK-NEXT:    and.b16 %rs53, %rs48, %rs51;
-; CHECK-NEXT:    and.b16 %rs54, %rs47, %rs52;
-; CHECK-NEXT:    and.b16 %rs55, %rs46, %rs49;
-; CHECK-NEXT:    and.b16 %rs56, %rs45, %rs50;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs57, %rs58, %rs59, %rs60}, [in_v16i8_param_2+4];
-; CHECK-NEXT:    and.b16 %rs61, %rs40, %rs59;
-; CHECK-NEXT:    and.b16 %rs62, %rs39, %rs60;
-; CHECK-NEXT:    and.b16 %rs63, %rs38, %rs57;
-; CHECK-NEXT:    and.b16 %rs64, %rs37, %rs58;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs65, %rs66, %rs67, %rs68}, [in_v16i8_param_2+8];
-; CHECK-NEXT:    and.b16 %rs69, %rs32, %rs67;
-; CHECK-NEXT:    and.b16 %rs70, %rs31, %rs68;
-; CHECK-NEXT:    and.b16 %rs71, %rs30, %rs65;
-; CHECK-NEXT:    and.b16 %rs72, %rs29, %rs66;
-; CHECK-NEXT:    ld.param.v4.u8 {%rs73, %rs74, %rs75, %rs76}, [in_v16i8_param_2+12];
-; CHECK-NEXT:    and.b16 %rs77, %rs24, %rs75;
-; CHECK-NEXT:    and.b16 %rs78, %rs23, %rs76;
-; CHECK-NEXT:    and.b16 %rs79, %rs22, %rs73;
-; CHECK-NEXT:    and.b16 %rs80, %rs21, %rs74;
-; CHECK-NEXT:    xor.b16 %rs81, %rs80, %rs18;
-; CHECK-NEXT:    xor.b16 %rs82, %rs79, %rs17;
-; CHECK-NEXT:    mov.b32 %r1, {%rs82, %rs81};
-; CHECK-NEXT:    xor.b16 %rs83, %rs78, %rs20;
-; CHECK-NEXT:    xor.b16 %rs84, %rs77, %rs19;
-; CHECK-NEXT:    mov.b32 %r2, {%rs84, %rs83};
-; CHECK-NEXT:    xor.b16 %rs85, %rs72, %rs26;
-; CHECK-NEXT:    xor.b16 %rs86, %rs71, %rs25;
-; CHECK-NEXT:    mov.b32 %r3, {%rs86, %rs85};
-; CHECK-NEXT:    xor.b16 %rs87, %rs70, %rs28;
-; CHECK-NEXT:    xor.b16 %rs88, %rs69, %rs27;
-; CHECK-NEXT:    mov.b32 %r4, {%rs88, %rs87};
-; CHECK-NEXT:    xor.b16 %rs89, %rs64, %rs34;
-; CHECK-NEXT:    xor.b16 %rs90, %rs63, %rs33;
-; CHECK-NEXT:    mov.b32 %r5, {%rs90, %rs89};
-; CHECK-NEXT:    xor.b16 %rs91, %rs62, %rs36;
-; CHECK-NEXT:    xor.b16 %rs92, %rs61, %rs35;
-; CHECK-NEXT:    mov.b32 %r6, {%rs92, %rs91};
-; CHECK-NEXT:    xor.b16 %rs93, %rs56, %rs42;
-; CHECK-NEXT:    xor.b16 %rs94, %rs55, %rs41;
-; CHECK-NEXT:    mov.b32 %r7, {%rs94, %rs93};
-; CHECK-NEXT:    xor.b16 %rs95, %rs54, %rs44;
-; CHECK-NEXT:    xor.b16 %rs96, %rs53, %rs43;
-; CHECK-NEXT:    mov.b32 %r8, {%rs96, %rs95};
-; CHECK-NEXT:    mov.b32 {%rs97, %rs98}, %r8;
-; CHECK-NEXT:    mov.b32 {%rs99, %rs100}, %r7;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs99, %rs100, %rs97, %rs98};
-; CHECK-NEXT:    mov.b32 {%rs101, %rs102}, %r6;
-; CHECK-NEXT:    mov.b32 {%rs103, %rs104}, %r5;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+4], {%rs103, %rs104, %rs101, %rs102};
-; CHECK-NEXT:    mov.b32 {%rs105, %rs106}, %r4;
-; CHECK-NEXT:    mov.b32 {%rs107, %rs108}, %r3;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+8], {%rs107, %rs108, %rs105, %rs106};
-; CHECK-NEXT:    mov.b32 {%rs109, %rs110}, %r2;
-; CHECK-NEXT:    mov.b32 {%rs111, %rs112}, %r1;
-; CHECK-NEXT:    st.param.v4.b8 [func_retval0+12], {%rs111, %rs112, %rs109, %rs110};
+; CHECK-NEXT:    mov.b32 %r9, {%rs17, %rs18};
+; CHECK-NEXT:    xor.b32 %r10, %r8, %r9;
+; CHECK-NEXT:    mov.b32 %r11, {%rs19, %rs20};
+; CHECK-NEXT:    xor.b32 %r12, %r7, %r11;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [in_v16i8_param_1+8];
+; CHECK-NEXT:    mov.b32 %r13, {%rs21, %rs22};
+; CHECK-NEXT:    xor.b32 %r14, %r6, %r13;
+; CHECK-NEXT:    mov.b32 %r15, {%rs23, %rs24};
+; CHECK-NEXT:    xor.b32 %r16, %r5, %r15;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [in_v16i8_param_1+4];
+; CHECK-NEXT:    mov.b32 %r17, {%rs25, %rs26};
+; CHECK-NEXT:    xor.b32 %r18, %r4, %r17;
+; CHECK-NEXT:    mov.b32 %r19, {%rs27, %rs28};
+; CHECK-NEXT:    xor.b32 %r20, %r3, %r19;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs29, %rs30, %rs31, %rs32}, [in_v16i8_param_1];
+; CHECK-NEXT:    mov.b32 %r21, {%rs29, %rs30};
+; CHECK-NEXT:    xor.b32 %r22, %r2, %r21;
+; CHECK-NEXT:    mov.b32 %r23, {%rs31, %rs32};
+; CHECK-NEXT:    xor.b32 %r24, %r1, %r23;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs33, %rs34, %rs35, %rs36}, [in_v16i8_param_2];
+; CHECK-NEXT:    mov.b32 %r25, {%rs35, %rs36};
+; CHECK-NEXT:    and.b32 %r26, %r24, %r25;
+; CHECK-NEXT:    mov.b32 %r27, {%rs33, %rs34};
+; CHECK-NEXT:    and.b32 %r28, %r22, %r27;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs37, %rs38, %rs39, %rs40}, [in_v16i8_param_2+4];
+; CHECK-NEXT:    mov.b32 %r29, {%rs39, %rs40};
+; CHECK-NEXT:    and.b32 %r30, %r20, %r29;
+; CHECK-NEXT:    mov.b32 %r31, {%rs37, %rs38};
+; CHECK-NEXT:    and.b32 %r32, %r18, %r31;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs41, %rs42, %rs43, %rs44}, [in_v16i8_param_2+8];
+; CHECK-NEXT:    mov.b32 %r33, {%rs43, %rs44};
+; CHECK-NEXT:    and.b32 %r34, %r16, %r33;
+; CHECK-NEXT:    mov.b32 %r35, {%rs41, %rs42};
+; CHECK-NEXT:    and.b32 %r36, %r14, %r35;
+; CHECK-NEXT:    ld.param.v4.u8 {%rs45, %rs46, %rs47, %rs48}, [in_v16i8_param_2+12];
+; CHECK-NEXT:    mov.b32 %r37, {%rs47, %rs48};
+; CHECK-NEXT:    and.b32 %r38, %r12, %r37;
+; CHECK-NEXT:    mov.b32 %r39, {%rs45, %rs46};
+; CHECK-NEXT:    and.b32 %r40, %r10, %r39;
+; CHECK-NEXT:    xor.b32 %r41, %r40, %r9;
+; CHECK-NEXT:    xor.b32 %r42, %r38, %r11;
+; CHECK-NEXT:    xor.b32 %r43, %r36, %r13;
+; CHECK-NEXT:    xor.b32 %r44, %r34, %r15;
+; CHECK-NEXT:    xor.b32 %r45, %r32, %r17;
+; CHECK-NEXT:    xor.b32 %r46, %r30, %r19;
+; CHECK-NEXT:    xor.b32 %r47, %r28, %r21;
+; CHECK-NEXT:    xor.b32 %r48, %r26, %r23;
+; CHECK-NEXT:    mov.b32 {%rs49, %rs50}, %r48;
+; CHECK-NEXT:    mov.b32 {%rs51, %rs52}, %r47;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+0], {%rs51, %rs52, %rs49, %rs50};
+; CHECK-NEXT:    mov.b32 {%rs53, %rs54}, %r46;
+; CHECK-NEXT:    mov.b32 {%rs55, %rs56}, %r45;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+4], {%rs55, %rs56, %rs53, %rs54};
+; CHECK-NEXT:    mov.b32 {%rs57, %rs58}, %r44;
+; CHECK-NEXT:    mov.b32 {%rs59, %rs60}, %r43;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+8], {%rs59, %rs60, %rs57, %rs58};
+; CHECK-NEXT:    mov.b32 {%rs61, %rs62}, %r42;
+; CHECK-NEXT:    mov.b32 {%rs63, %rs64}, %r41;
+; CHECK-NEXT:    st.param.v4.b8 [func_retval0+12], {%rs63, %rs64, %rs61, %rs62};
 ; CHECK-NEXT:    ret;
   %n0 = xor <16 x i8> %x, %y
   %n1 = and <16 x i8> %n0, %mask
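(For readers skimming the test diffs: the in_* tests above appear to exercise the masked-merge idiom, ((x ^ y) & mask) ^ y. Below is a minimal, self-contained IR sketch of that idiom for the v2i16 case; the function name is illustrative only and not part of the patch, and the trailing xor/ret are inferred from the CHECK output since the hunk context is cut off. With this change, such code should lower to plain b32 xor/and/xor rather than per-lane b16 ops.)

define <2 x i16> @masked_merge_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) {
  %n0 = xor <2 x i16> %x, %y        ; bits where x and y differ
  %n1 = and <2 x i16> %n0, %mask    ; keep differing bits selected by mask
  %r  = xor <2 x i16> %n1, %y       ; merge: x where mask=1, y where mask=0
  ret <2 x i16> %r
}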
