[llvm] [DAG] Expand vXi1 add/sub overflow operations as xor/and (PR #69191)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 16 05:15:04 PDT 2023
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/69191
>From c0c9a5016f1c7e8f7c3ba924ff0171ffbef764cf Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 16 Oct 2023 12:35:09 +0100
Subject: [PATCH] [DAG] Expand vXi1 add/sub overflow operations as xor/and
Similar to what we already do for add/sub + saturation variants.
Alive2: https://alive2.llvm.org/ce/z/rBDrNE
Fixes #69080
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 21 +++++++
llvm/test/CodeGen/AArch64/vec_uaddo.ll | 20 +++---
llvm/test/CodeGen/X86/pr69080.ll | 38 +++++++++++
llvm/test/CodeGen/X86/vec_saddo.ll | 35 ++++-------
llvm/test/CodeGen/X86/vec_ssubo.ll | 36 ++++-------
llvm/test/CodeGen/X86/vec_uaddo.ll | 63 ++++++-------------
llvm/test/CodeGen/X86/vec_usubo.ll | 63 ++++++-------------
7 files changed, 132 insertions(+), 144 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/pr69080.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9a37627e36b9ffa..f4dabbe6c9c737a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9854,6 +9854,27 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
SDValue ZeroOverFlow = getConstant(0, DL, VTList.VTs[1]);
return getNode(ISD::MERGE_VALUES, DL, VTList, {N1, ZeroOverFlow}, Flags);
}
+
+ if (VTList.VTs[0].isVector() &&
+ VTList.VTs[0].getVectorElementType() == MVT::i1 &&
+ VTList.VTs[1].getVectorElementType() == MVT::i1) {
+ SDValue F1 = getFreeze(N1);
+ SDValue F2 = getFreeze(N2);
+ // {vXi1,vXi1} (u/s)addo(vXi1 x, vXi1 y) -> {xor(x,y),and(x,y)}
+ if (Opcode == ISD::UADDO || Opcode == ISD::SADDO)
+ return getNode(ISD::MERGE_VALUES, DL, VTList,
+ {getNode(ISD::XOR, DL, VTList.VTs[0], F1, F2),
+ getNode(ISD::AND, DL, VTList.VTs[1], F1, F2)},
+ Flags);
+ // {vXi1,vXi1} (u/s)subo(vXi1 x, vXi1 y) -> {xor(x,y),and(~x,y)}
+ if (Opcode == ISD::USUBO || Opcode == ISD::SSUBO) {
+ SDValue NotF1 = getNOT(DL, F1, VTList.VTs[0]);
+ return getNode(ISD::MERGE_VALUES, DL, VTList,
+ {getNode(ISD::XOR, DL, VTList.VTs[0], F1, F2),
+ getNode(ISD::AND, DL, VTList.VTs[1], NotF1, F2)},
+ Flags);
+ }
+ }
break;
}
case ISD::SMUL_LOHI:
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
index 6ad880020cc664a..00609b0df9b4e15 100644
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -245,21 +245,17 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.4h, #1
+; CHECK-NEXT: eor v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: adrp x8, .LCPI10_0
+; CHECK-NEXT: shl v2.4h, v2.4h, #15
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: cmlt v1.4h, v2.4h, #0
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT: shl v0.4s, v0.4s, #31
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: fmov d1, d0
-; CHECK-NEXT: shl v2.4h, v0.4h, #15
-; CHECK-NEXT: cmlt v2.4h, v2.4h, #0
-; CHECK-NEXT: bic v1.4h, #2
-; CHECK-NEXT: cmeq v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0]
-; CHECK-NEXT: and v1.8b, v2.8b, v1.8b
-; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: addv h1, v1.4h
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: strb w8, [x0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/X86/pr69080.ll b/llvm/test/CodeGen/X86/pr69080.ll
new file mode 100644
index 000000000000000..1b27adcb1ae7ce8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr69080.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=AVX
+
+define { <4 x i1>, <4 x i1> } @uaddo(<4 x i1> %a) {
+; SSE-LABEL: uaddo:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uaddo:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %f = call { <4 x i1>, <4 x i1> } @llvm.uadd.with.overflow.v4i1(<4 x i1> %a, <4 x i1> %a)
+ ret { <4 x i1>, <4 x i1> } %f
+}
+declare { <4 x i1>, <4 x i1> } @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+
+define { <4 x i1>, <4 x i1> } @saddo(<4 x i1> %a) {
+; SSE-LABEL: saddo:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: saddo:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %f = call { <4 x i1>, <4 x i1> } @llvm.sadd.with.overflow.v4i1(<4 x i1> %a, <4 x i1> %a)
+ ret { <4 x i1>, <4 x i1> } %f
+}
+declare { <4 x i1>, <4 x i1> } @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 7631367ba5d667c..eae9b969211f66c 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -976,34 +976,24 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: saddo_v4i1:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $31, %xmm1
-; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: pslld $31, %xmm2
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: psrad $31, %xmm0
-; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pslld $31, %xmm1
-; SSE-NEXT: movmskps %xmm1, %eax
-; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movb %al, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: saddo_v4i1:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX-NEXT: vmovmskps %xmm2, %eax
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpslld $31, %xmm0, %xmm1
-; AVX-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovmskps %xmm1, %eax
; AVX-NEXT: movb %al, (%rdi)
; AVX-NEXT: retq
;
@@ -1011,11 +1001,10 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT: kxorw %k1, %k0, %k2
-; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 {%k2}
-; AVX512-NEXT: kxorw %k0, %k1, %k1
+; AVX512-NEXT: kandw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: kshiftlw $12, %k2, %k0
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index d634457069c0daf..f8cf543cb9fab30 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -985,34 +985,24 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: ssubo_v4i1:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $31, %xmm1
-; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: pslld $31, %xmm2
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: psrad $31, %xmm0
-; SSE-NEXT: psubd %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pslld $31, %xmm1
-; SSE-NEXT: movmskps %xmm1, %eax
-; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movb %al, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: ssubo_v4i1:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX-NEXT: vmovmskps %xmm2, %eax
+; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpslld $31, %xmm0, %xmm1
-; AVX-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovmskps %xmm1, %eax
; AVX-NEXT: movb %al, (%rdi)
; AVX-NEXT: retq
;
@@ -1022,11 +1012,11 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k1
-; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1}
+; AVX512-NEXT: kxorw %k1, %k0, %k0
+; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT: kshiftlw $12, %k1, %k0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kshiftlw $12, %k0, %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 653c3a996915114..950e943bd902013 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1075,49 +1075,26 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: uaddo_v4i1:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: pslld $31, %xmm2
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
-; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: movb %al, (%rdi)
-; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: uaddo_v4i1:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: movb %al, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uaddo_v4i1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: movb %al, (%rdi)
-; AVX2-NEXT: retq
+; AVX-LABEL: uaddo_v4i1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX-NEXT: vmovmskps %xmm2, %eax
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT: movb %al, (%rdi)
+; AVX-NEXT: retq
;
; AVX512-LABEL: uaddo_v4i1:
; AVX512: # %bb.0:
@@ -1125,11 +1102,11 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k1
-; AVX512-NEXT: kandnw %k0, %k1, %k2
+; AVX512-NEXT: kxorw %k1, %k0, %k2
+; AVX512-NEXT: kandw %k1, %k0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT: kshiftlw $12, %k1, %k0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kshiftlw $12, %k2, %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index a58c3dd0d530734..7de972770d8da4f 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -1122,49 +1122,26 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; SSE-LABEL: usubo_v4i1:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: psubd %xmm1, %xmm0
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: pslld $31, %xmm2
+; SSE-NEXT: movmskps %xmm2, %eax
+; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: pslld $31, %xmm0
-; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: movb %al, (%rdi)
-; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: usubo_v4i1:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: movb %al, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: usubo_v4i1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: movb %al, (%rdi)
-; AVX2-NEXT: retq
+; AVX-LABEL: usubo_v4i1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX-NEXT: vmovmskps %xmm2, %eax
+; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT: movb %al, (%rdi)
+; AVX-NEXT: retq
;
; AVX512-LABEL: usubo_v4i1:
; AVX512: # %bb.0:
@@ -1172,11 +1149,11 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k1
-; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1}
+; AVX512-NEXT: kxorw %k1, %k0, %k0
+; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT: kshiftlw $12, %k1, %k0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kshiftlw $12, %k0, %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
More information about the llvm-commits
mailing list