[llvm] 60ba539 - [DAG] PromoteIntRes_ADDSUBSHLSAT - use promoted ISD::USUBSAT directly
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 13 04:35:19 PST 2021
Author: Simon Pilgrim
Date: 2021-02-13T12:35:10Z
New Revision: 60ba5397dfbf28ffe6ec670f0cb29cf892591106
URL: https://github.com/llvm/llvm-project/commit/60ba5397dfbf28ffe6ec670f0cb29cf892591106
DIFF: https://github.com/llvm/llvm-project/commit/60ba5397dfbf28ffe6ec670f0cb29cf892591106.diff
LOG: [DAG] PromoteIntRes_ADDSUBSHLSAT - use promoted ISD::USUBSAT directly
As discussed on D96413, as long as the promoted bits of the args are zero we can use the basic ISD::USUBSAT pattern directly, without the shifting that we do for other ops.
I think something similar should be possible for ISD::UADDSAT as well, which I'll look at later.
Also, create an ISD::USUBSAT node directly - this will be expanded back by the legalizer later on if necessary.
Differential Revision: https://reviews.llvm.org/D96622
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
llvm/test/CodeGen/AArch64/usub_sat_vec.ll
llvm/test/CodeGen/AMDGPU/usubsat.ll
llvm/test/CodeGen/ARM/usub_sat_plus.ll
llvm/test/CodeGen/X86/usub_sat_plus.ll
llvm/test/CodeGen/X86/usub_sat_vec.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 92d98a84b8ab..c3250e8f43b2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -772,6 +772,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
EVT PromotedType = Op1Promoted.getValueType();
unsigned NewBits = PromotedType.getScalarSizeInBits();
+ // USUBSAT can always be promoted as long as we have zero-extended the args.
+ if (Opcode == ISD::USUBSAT)
+ return DAG.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted,
+ Op2Promoted);
+
// Shift cannot use a min/max expansion, we can't detect overflow if all of
// the bits have been shifted out.
if (IsShift || TLI.isOperationLegalOrCustom(Opcode, PromotedType)) {
@@ -783,7 +788,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
ShiftOp = ISD::SRA;
break;
case ISD::UADDSAT:
- case ISD::USUBSAT:
case ISD::USHLSAT:
ShiftOp = ISD::SRL;
break;
@@ -806,12 +810,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
}
- if (Opcode == ISD::USUBSAT) {
- SDValue Max =
- DAG.getNode(ISD::UMAX, dl, PromotedType, Op1Promoted, Op2Promoted);
- return DAG.getNode(ISD::SUB, dl, PromotedType, Max, Op2Promoted);
- }
-
if (Opcode == ISD::UADDSAT) {
APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits);
SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 08114f49bdeb..a361314126a1 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -129,10 +129,7 @@ define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
; CHECK-NEXT: mov v1.h[2], w9
; CHECK-NEXT: mov v0.h[3], w10
; CHECK-NEXT: mov v1.h[3], w11
-; CHECK-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ushr v0.4h, v0.4h, #8
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: str s0, [x2]
; CHECK-NEXT: ret
@@ -154,10 +151,7 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
; CHECK-NEXT: fmov s1, w9
; CHECK-NEXT: mov v0.s[1], w10
; CHECK-NEXT: mov v1.s[1], w11
-; CHECK-NEXT: shl v1.2s, v1.2s, #24
-; CHECK-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ushr v0.2s, v0.2s, #24
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: strb w9, [x2]
@@ -196,10 +190,7 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
; CHECK-NEXT: fmov s1, w9
; CHECK-NEXT: mov v0.s[1], w10
; CHECK-NEXT: mov v1.s[1], w11
-; CHECK-NEXT: shl v1.2s, v1.2s, #16
-; CHECK-NEXT: shl v0.2s, v0.2s, #16
; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ushr v0.2s, v0.2s, #16
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: strh w9, [x2]
@@ -272,12 +263,9 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; CHECK-LABEL: v16i4:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v2.16b, #15
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: shl v1.16b, v1.16b, #4
-; CHECK-NEXT: shl v0.16b, v0.16b, #4
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: uqsub v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushr v0.16b, v0.16b, #4
; CHECK-NEXT: ret
%z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
ret <16 x i4> %z
@@ -287,12 +275,9 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
; CHECK-LABEL: v16i1:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v2.16b, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: shl v1.16b, v1.16b, #7
-; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: uqsub v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushr v0.16b, v0.16b, #7
; CHECK-NEXT: ret
%z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
ret <16 x i1> %z
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 8216c4769858..7b3e91093855 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -17,19 +17,13 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
; GFX8-LABEL: v_usubsat_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
+; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
-; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
diff --git a/llvm/test/CodeGen/ARM/usub_sat_plus.ll b/llvm/test/CodeGen/ARM/usub_sat_plus.ll
index bd83f3c704e3..6c6eff4bdf73 100644
--- a/llvm/test/CodeGen/ARM/usub_sat_plus.ll
+++ b/llvm/test/CodeGen/ARM/usub_sat_plus.ll
@@ -104,34 +104,31 @@ define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y, i16 zeroext %z) nounw
; CHECK-T1-LABEL: func16:
; CHECK-T1: @ %bb.0:
; CHECK-T1-NEXT: muls r1, r2, r1
-; CHECK-T1-NEXT: uxth r2, r1
-; CHECK-T1-NEXT: cmp r0, r2
-; CHECK-T1-NEXT: bhi .LBB2_2
+; CHECK-T1-NEXT: uxth r1, r1
+; CHECK-T1-NEXT: subs r0, r0, r1
+; CHECK-T1-NEXT: bhs .LBB2_2
; CHECK-T1-NEXT: @ %bb.1:
-; CHECK-T1-NEXT: mov r0, r2
+; CHECK-T1-NEXT: movs r0, #0
; CHECK-T1-NEXT: .LBB2_2:
-; CHECK-T1-NEXT: subs r0, r0, r1
; CHECK-T1-NEXT: uxth r0, r0
; CHECK-T1-NEXT: bx lr
;
; CHECK-T2-LABEL: func16:
; CHECK-T2: @ %bb.0:
-; CHECK-T2-NEXT: mul r3, r1, r2
-; CHECK-T2-NEXT: uxth r3, r3
-; CHECK-T2-NEXT: cmp r0, r3
-; CHECK-T2-NEXT: it hi
-; CHECK-T2-NEXT: movhi r3, r0
-; CHECK-T2-NEXT: mls r0, r1, r2, r3
+; CHECK-T2-NEXT: muls r1, r2, r1
+; CHECK-T2-NEXT: uxth r1, r1
+; CHECK-T2-NEXT: subs r0, r0, r1
+; CHECK-T2-NEXT: it lo
+; CHECK-T2-NEXT: movlo r0, #0
; CHECK-T2-NEXT: uxth r0, r0
; CHECK-T2-NEXT: bx lr
;
; CHECK-ARM-LABEL: func16:
; CHECK-ARM: @ %bb.0:
-; CHECK-ARM-NEXT: mul r3, r1, r2
-; CHECK-ARM-NEXT: uxth r3, r3
-; CHECK-ARM-NEXT: cmp r0, r3
-; CHECK-ARM-NEXT: movhi r3, r0
-; CHECK-ARM-NEXT: mls r0, r1, r2, r3
+; CHECK-ARM-NEXT: mul r1, r1, r2
+; CHECK-ARM-NEXT: uxth r1, r1
+; CHECK-ARM-NEXT: subs r0, r0, r1
+; CHECK-ARM-NEXT: movlo r0, #0
; CHECK-ARM-NEXT: uxth r0, r0
; CHECK-ARM-NEXT: bx lr
%a = mul i16 %y, %z
@@ -143,34 +140,31 @@ define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y, i8 zeroext %z) nounwind {
; CHECK-T1-LABEL: func8:
; CHECK-T1: @ %bb.0:
; CHECK-T1-NEXT: muls r1, r2, r1
-; CHECK-T1-NEXT: uxtb r2, r1
-; CHECK-T1-NEXT: cmp r0, r2
-; CHECK-T1-NEXT: bhi .LBB3_2
+; CHECK-T1-NEXT: uxtb r1, r1
+; CHECK-T1-NEXT: subs r0, r0, r1
+; CHECK-T1-NEXT: bhs .LBB3_2
; CHECK-T1-NEXT: @ %bb.1:
-; CHECK-T1-NEXT: mov r0, r2
+; CHECK-T1-NEXT: movs r0, #0
; CHECK-T1-NEXT: .LBB3_2:
-; CHECK-T1-NEXT: subs r0, r0, r1
; CHECK-T1-NEXT: uxtb r0, r0
; CHECK-T1-NEXT: bx lr
;
; CHECK-T2-LABEL: func8:
; CHECK-T2: @ %bb.0:
-; CHECK-T2-NEXT: mul r3, r1, r2
-; CHECK-T2-NEXT: uxtb r3, r3
-; CHECK-T2-NEXT: cmp r0, r3
-; CHECK-T2-NEXT: it hi
-; CHECK-T2-NEXT: movhi r3, r0
-; CHECK-T2-NEXT: mls r0, r1, r2, r3
+; CHECK-T2-NEXT: muls r1, r2, r1
+; CHECK-T2-NEXT: uxtb r1, r1
+; CHECK-T2-NEXT: subs r0, r0, r1
+; CHECK-T2-NEXT: it lo
+; CHECK-T2-NEXT: movlo r0, #0
; CHECK-T2-NEXT: uxtb r0, r0
; CHECK-T2-NEXT: bx lr
;
; CHECK-ARM-LABEL: func8:
; CHECK-ARM: @ %bb.0:
-; CHECK-ARM-NEXT: smulbb r3, r1, r2
-; CHECK-ARM-NEXT: uxtb r3, r3
-; CHECK-ARM-NEXT: cmp r0, r3
-; CHECK-ARM-NEXT: movhi r3, r0
-; CHECK-ARM-NEXT: mls r0, r1, r2, r3
+; CHECK-ARM-NEXT: smulbb r1, r1, r2
+; CHECK-ARM-NEXT: uxtb r1, r1
+; CHECK-ARM-NEXT: subs r0, r0, r1
+; CHECK-ARM-NEXT: movlo r0, #0
; CHECK-ARM-NEXT: uxtb r0, r0
; CHECK-ARM-NEXT: bx lr
%a = mul i8 %y, %z
@@ -183,35 +177,31 @@ define zeroext i4 @func4(i4 zeroext %x, i4 zeroext %y, i4 zeroext %z) nounwind {
; CHECK-T1: @ %bb.0:
; CHECK-T1-NEXT: muls r1, r2, r1
; CHECK-T1-NEXT: movs r2, #15
-; CHECK-T1-NEXT: mov r3, r1
-; CHECK-T1-NEXT: ands r3, r2
-; CHECK-T1-NEXT: cmp r0, r3
-; CHECK-T1-NEXT: bhi .LBB4_2
+; CHECK-T1-NEXT: ands r1, r2
+; CHECK-T1-NEXT: subs r0, r0, r1
+; CHECK-T1-NEXT: bhs .LBB4_2
; CHECK-T1-NEXT: @ %bb.1:
-; CHECK-T1-NEXT: mov r0, r3
+; CHECK-T1-NEXT: movs r0, #0
; CHECK-T1-NEXT: .LBB4_2:
-; CHECK-T1-NEXT: subs r0, r0, r1
; CHECK-T1-NEXT: ands r0, r2
; CHECK-T1-NEXT: bx lr
;
; CHECK-T2-LABEL: func4:
; CHECK-T2: @ %bb.0:
-; CHECK-T2-NEXT: mul r3, r1, r2
-; CHECK-T2-NEXT: and r3, r3, #15
-; CHECK-T2-NEXT: cmp r0, r3
-; CHECK-T2-NEXT: it hi
-; CHECK-T2-NEXT: movhi r3, r0
-; CHECK-T2-NEXT: mls r0, r1, r2, r3
+; CHECK-T2-NEXT: muls r1, r2, r1
+; CHECK-T2-NEXT: and r1, r1, #15
+; CHECK-T2-NEXT: subs r0, r0, r1
+; CHECK-T2-NEXT: it lo
+; CHECK-T2-NEXT: movlo r0, #0
; CHECK-T2-NEXT: and r0, r0, #15
; CHECK-T2-NEXT: bx lr
;
; CHECK-ARM-LABEL: func4:
; CHECK-ARM: @ %bb.0:
-; CHECK-ARM-NEXT: smulbb r3, r1, r2
-; CHECK-ARM-NEXT: and r3, r3, #15
-; CHECK-ARM-NEXT: cmp r0, r3
-; CHECK-ARM-NEXT: movhi r3, r0
-; CHECK-ARM-NEXT: mls r0, r1, r2, r3
+; CHECK-ARM-NEXT: smulbb r1, r1, r2
+; CHECK-ARM-NEXT: and r1, r1, #15
+; CHECK-ARM-NEXT: subs r0, r0, r1
+; CHECK-ARM-NEXT: movlo r0, #0
; CHECK-ARM-NEXT: and r0, r0, #15
; CHECK-ARM-NEXT: bx lr
%a = mul i4 %y, %z
diff --git a/llvm/test/CodeGen/X86/usub_sat_plus.ll b/llvm/test/CodeGen/X86/usub_sat_plus.ll
index 41fdf426c4c9..d7baa39ed536 100644
--- a/llvm/test/CodeGen/X86/usub_sat_plus.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_plus.ll
@@ -111,22 +111,15 @@ define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y, i8 zeroext %z) nounwind {
define zeroext i4 @func4(i4 zeroext %x, i4 zeroext %y, i4 zeroext %z) nounwind {
; X86-LABEL: func4:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %esi
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: mulb {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: andb $15, %dl
-; X86-NEXT: movzbl %dl, %esi
-; X86-NEXT: movzbl %cl, %ebx
-; X86-NEXT: cmpb %dl, %cl
-; X86-NEXT: cmovbel %esi, %ebx
-; X86-NEXT: subb %al, %bl
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: andb $15, %al
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: subb %al, %cl
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: cmovbl %edx, %eax
; X86-NEXT: andl $15, %eax
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: func4:
@@ -134,13 +127,11 @@ define zeroext i4 @func4(i4 zeroext %x, i4 zeroext %y, i4 zeroext %z) nounwind {
; X64-NEXT: movl %esi, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: mulb %dl
-; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: andb $15, %cl
-; X64-NEXT: movzbl %cl, %ecx
-; X64-NEXT: cmpb %cl, %dil
-; X64-NEXT: cmoval %edi, %ecx
-; X64-NEXT: subb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: andb $15, %al
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: subb %al, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: cmovbl %ecx, %eax
; X64-NEXT: andl $15, %eax
; X64-NEXT: retq
%a = mul i4 %y, %z
diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll
index efd5b0d3895c..247f09d04644 100644
--- a/llvm/test/CodeGen/X86/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll
@@ -481,26 +481,18 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; SSE-LABEL: v16i4:
; SSE: # %bb.0:
-; SSE-NEXT: psllw $4, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: psllw $4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm1, %xmm0
-; SSE-NEXT: psrlw $4, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v16i4:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
ret <16 x i4> %z
@@ -509,38 +501,26 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
; SSE-LABEL: v16i1:
; SSE: # %bb.0:
-; SSE-NEXT: psllw $7, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: psllw $7, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm1, %xmm0
-; SSE-NEXT: psrlw $7, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: v16i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i1:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v16i1:
More information about the llvm-commits
mailing list