[llvm] [CodeGenPrepare] Unfold slow ctpop when used in power-of-two test (PR #102731)
Sergei Barannikov via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 19 23:16:22 PDT 2025
https://github.com/s-barannikov updated https://github.com/llvm/llvm-project/pull/102731
From 23e323709aaeef178d2407dca76b4a91dae00fc3 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88 at gmail.com>
Date: Sat, 10 Aug 2024 09:53:14 +0300
Subject: [PATCH 1/4] Precommit test
---
llvm/test/CodeGen/RISCV/pr101786.ll | 60 ++++++++++++
llvm/test/CodeGen/X86/pr94829.ll | 49 ++++++++++
.../CodeGenPrepare/unfold-pow2-test.ll | 97 +++++++++++++++++++
3 files changed, 206 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/pr101786.ll
create mode 100644 llvm/test/CodeGen/X86/pr94829.ll
create mode 100644 llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
diff --git a/llvm/test/CodeGen/RISCV/pr101786.ll b/llvm/test/CodeGen/RISCV/pr101786.ll
new file mode 100644
index 0000000000000..afac9e18da1ee
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr101786.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=riscv64 -o - %s | FileCheck %s
+
+define i64 @test(i64 %x, ptr %p) {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: bgtz a2, .LBB0_3
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: srli a3, a2, 1
+; CHECK-NEXT: lui a4, 349525
+; CHECK-NEXT: addiw a4, a4, 1365
+; CHECK-NEXT: slli a5, a4, 32
+; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: and a3, a3, a4
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: lui a3, 209715
+; CHECK-NEXT: addiw a3, a3, 819
+; CHECK-NEXT: slli a4, a3, 32
+; CHECK-NEXT: add a3, a3, a4
+; CHECK-NEXT: and a4, a2, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: srli a3, a2, 4
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: lui a3, 61681
+; CHECK-NEXT: addiw a3, a3, -241
+; CHECK-NEXT: slli a4, a3, 32
+; CHECK-NEXT: add a3, a3, a4
+; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: slli a3, a2, 8
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: slli a3, a2, 16
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: slli a3, a2, 32
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: srli a2, a2, 56
+; CHECK-NEXT: li a3, 1
+; CHECK-NEXT: bltu a3, a2, .LBB0_3
+; CHECK-NEXT: # %bb.2: # %if.else
+; CHECK-NEXT: ld a0, 0(a1)
+; CHECK-NEXT: .LBB0_3: # %if.end
+; CHECK-NEXT: ret
+entry:
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %x)
+ %cmp1 = icmp ugt i64 %ctpop, 1
+ %cmp2 = icmp sgt i64 %x, 0
+ %or = or i1 %cmp2, %cmp1
+ br i1 %or, label %if.end, label %if.else
+
+if.else:
+ %load = load i64, ptr %p, align 8
+ br label %if.end
+
+if.end:
+ %res = phi i64 [0, %entry], [%load, %if.else]
+ ret i64 %res
+}
diff --git a/llvm/test/CodeGen/X86/pr94829.ll b/llvm/test/CodeGen/X86/pr94829.ll
new file mode 100644
index 0000000000000..3fc5db7907410
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr94829.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=x86_64 -o - %s | FileCheck %s
+
+define ptr @test(i64 %x) {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: subq %rcx, %rdi
+; CHECK-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; CHECK-NEXT: movq %rdi, %rcx
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: shrq $2, %rdi
+; CHECK-NEXT: andq %rax, %rdi
+; CHECK-NEXT: addq %rcx, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: shrq $4, %rax
+; CHECK-NEXT: addq %rdi, %rax
+; CHECK-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; CHECK-NEXT: imulq %rcx, %rax
+; CHECK-NEXT: shrq $56, %rax
+; CHECK-NEXT: cmpq $2, %rax
+; CHECK-NEXT: jb .LBB0_2
+; CHECK-NEXT: # %bb.1: # %if.else
+; CHECK-NEXT: cmpl $2, %eax
+; CHECK-NEXT: .LBB0_2: # %exit1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+entry:
+ %ctpop = tail call i64 @llvm.ctpop.i64(i64 %x)
+ %cmp = icmp ult i64 %ctpop, 2
+ br i1 %cmp, label %exit1, label %if.else
+
+if.else:
+ br i1 %cmp, label %exit2, label %exit3
+
+exit1:
+ ret ptr null
+
+exit2:
+ ret ptr null
+
+exit3:
+ ret ptr null
+}
diff --git a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
new file mode 100644
index 0000000000000..dd9f51cec9524
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p 'require<profile-summary>,function(codegenprepare)' -S %s \
+; RUN: | FileCheck %s --check-prefix=SLOW
+; RUN: opt -p 'require<profile-summary>,function(codegenprepare)' -S --mattr=+zbb %s \
+; RUN: | FileCheck %s --check-prefix=FAST
+; REQUIRES: riscv64-registered-target
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64"
+
+define i64 @test_ult_2(i64 %x, i64 %y, i64 %a, i64 %b) {
+; SLOW-LABEL: define i64 @test_ult_2(
+; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; SLOW-NEXT: [[ENTRY:.*]]:
+; SLOW-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
+; SLOW-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
+; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
+; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; SLOW: [[IF_THEN]]:
+; SLOW-NEXT: br label %[[IF_END]]
+; SLOW: [[IF_END]]:
+; SLOW-NEXT: [[RES:%.*]] = phi i64 [ [[A]], %[[IF_THEN]] ], [ [[B]], %[[ENTRY]] ]
+; SLOW-NEXT: ret i64 [[RES]]
+;
+; FAST-LABEL: define i64 @test_ult_2(
+; FAST-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; FAST-NEXT: [[ENTRY:.*]]:
+; FAST-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
+; FAST-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; FAST-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
+; FAST-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
+; FAST-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; FAST: [[IF_THEN]]:
+; FAST-NEXT: br label %[[IF_END]]
+; FAST: [[IF_END]]:
+; FAST-NEXT: [[RES:%.*]] = phi i64 [ [[A]], %[[IF_THEN]] ], [ [[B]], %[[ENTRY]] ]
+; FAST-NEXT: ret i64 [[RES]]
+;
+entry:
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %x)
+ %cmp1 = icmp ugt i64 %ctpop, 1
+ %cmp2 = icmp sgt i64 %y, 0
+ %cmp = or i1 %cmp2, %cmp1
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ br label %if.end
+
+if.end:
+ %res = phi i64 [ %a, %if.then ], [ %b, %entry ]
+ ret i64 %res
+}
+
+define i64 @test_ugt_1(i64 %x, i64 %y, i64 %a, i64 %b) {
+; SLOW-LABEL: define i64 @test_ugt_1(
+; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; SLOW-NEXT: [[ENTRY:.*]]:
+; SLOW-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
+; SLOW-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
+; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
+; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; SLOW: [[IF_THEN]]:
+; SLOW-NEXT: br label %[[IF_END]]
+; SLOW: [[IF_END]]:
+; SLOW-NEXT: [[RES:%.*]] = phi i64 [ [[A]], %[[IF_THEN]] ], [ [[B]], %[[ENTRY]] ]
+; SLOW-NEXT: ret i64 [[RES]]
+;
+; FAST-LABEL: define i64 @test_ugt_1(
+; FAST-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
+; FAST-NEXT: [[ENTRY:.*]]:
+; FAST-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
+; FAST-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; FAST-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
+; FAST-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
+; FAST-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; FAST: [[IF_THEN]]:
+; FAST-NEXT: br label %[[IF_END]]
+; FAST: [[IF_END]]:
+; FAST-NEXT: [[RES:%.*]] = phi i64 [ [[A]], %[[IF_THEN]] ], [ [[B]], %[[ENTRY]] ]
+; FAST-NEXT: ret i64 [[RES]]
+;
+entry:
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %x)
+ %cmp1 = icmp ugt i64 %ctpop, 1
+ %cmp2 = icmp sgt i64 %y, 0
+ %cmp = or i1 %cmp2, %cmp1
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ br label %if.end
+
+if.end:
+ %res = phi i64 [ %a, %if.then ], [ %b, %entry ]
+ ret i64 %res
+}
From 8dc0faba4527794367f5f640a48b53c5abe3b8bd Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88 at gmail.com>
Date: Sat, 10 Aug 2024 10:30:27 +0300
Subject: [PATCH 2/4] [CodeGenPrepare] Unfold slow ctpop when used in
power-of-two test
The DAG combiner already performs this transformation, but in some cases
it does not get the chance because either CodeGenPrepare or
SelectionDAGBuilder moves the icmp to a different basic block.
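For reference, the scalar identity this unfolding relies on can be checked
with a minimal C++ sketch (illustration only, not part of the patch; the
helper name is hypothetical):

  #include <bit>
  #include <cassert>
  #include <cstdint>
  #include <initializer_list>

  // ctpop(x) u< 2 holds exactly when clearing the lowest set bit of x,
  // i.e. x & (x - 1), yields zero; ctpop(x) u> 1 is the negation.
  static bool hasAtMostOneBitSet(uint64_t x) {
    return (x & (x - 1)) == 0;
  }

  int main() {
    for (uint64_t x : {0ull, 1ull, 2ull, 3ull, 64ull, 65ull})
      assert(hasAtMostOneBitSet(x) == (std::popcount(x) < 2));
  }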
---
llvm/lib/CodeGen/CodeGenPrepare.cpp | 38 +++++++
llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll | 103 +++---------------
llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll | 28 +----
llvm/test/CodeGen/RISCV/pr101786.ll | 32 +-----
llvm/test/CodeGen/RISCV/rv32zbb.ll | 24 ++--
llvm/test/CodeGen/X86/pr94829.ll | 26 +----
.../CodeGenPrepare/unfold-pow2-test.ll | 10 +-
7 files changed, 79 insertions(+), 182 deletions(-)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 12a668507fe65..91f50f87cc711 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -474,6 +474,7 @@ class CodeGenPrepare {
bool optimizeURem(Instruction *Rem);
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool unfoldPow2Test(CmpInst *Cmp);
void verifyBFIUpdates(Function &F);
bool _run(Function &F);
};
@@ -1762,6 +1763,40 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
return true;
}
+// Decanonicalizes icmp+ctpop power-of-two test if ctpop is slow.
+bool CodeGenPrepare::unfoldPow2Test(CmpInst *Cmp) {
+ CmpPredicate Pred;
+ Value *X;
+ uint64_t C;
+
+ if (!match(Cmp, m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(X)),
+ m_ConstantInt(C))))
+ return false;
+
+ Type *Ty = X->getType();
+ if (Ty->isVectorTy() || TTI->getPopcntSupport(Ty->getIntegerBitWidth()) ==
+ TargetTransformInfo::PSK_FastHardware)
+ return false;
+
+ // (ctpop x) u< 2 -> (x & (x - 1)) == 0
+ // (ctpop x) u> 1 -> (x & (x - 1)) != 0
+ if ((Pred == CmpInst::ICMP_ULT && C == 2) ||
+ (Pred == CmpInst::ICMP_UGT && C == 1)) {
+ IRBuilder<> Builder(Cmp);
+ Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(Ty));
+ Value *And = Builder.CreateAnd(X, Sub);
+ CmpInst::Predicate NewPred =
+ Pred == CmpInst::ICMP_ULT ? CmpInst::ICMP_EQ : CmpInst::ICMP_NE;
+ Value *NewCmp =
+ Builder.CreateICmp(NewPred, And, ConstantInt::getNullValue(Ty));
+ Cmp->replaceAllUsesWith(NewCmp);
+ RecursivelyDeleteTriviallyDeadInstructions(Cmp);
+ return true;
+ }
+
+ return false;
+}
+
/// Sink the given CmpInst into user blocks to reduce the number of virtual
/// registers that must be created and coalesced. This is a clear win except on
/// targets with multiple condition code registers (PowerPC), where it might
@@ -2183,6 +2218,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (combineToUSubWithOverflow(Cmp, ModifiedDT))
return true;
+ if (unfoldPow2Test(Cmp))
+ return true;
+
if (foldICmpWithDominatingICmp(Cmp, *TLI))
return true;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
index 95af7861d4798..dcbf9bfb8874a 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
@@ -357,49 +357,14 @@ define i64 @ctpop_i64(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_two:
; RV32I: # %bb.0:
-; RV32I-NEXT: j .LBB6_2
-; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: sltiu a0, zero, 0
-; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB6_2:
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a3, a5, a3
-; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: addi a5, a5, -241
-; RV32I-NEXT: sub a0, a0, a2
-; RV32I-NEXT: sub a1, a1, a3
-; RV32I-NEXT: srli a2, a0, 2
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: and a1, a1, a4
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: srli a2, a0, 4
-; RV32I-NEXT: srli a3, a1, 4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: slli a2, a0, 16
-; RV32I-NEXT: slli a3, a1, 16
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: add a0, a1, a0
-; RV32I-NEXT: sltiu a0, a0, 2
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: sltiu a4, a2, -1
+; RV32I-NEXT: add a3, a3, a4
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: seqz a0, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_i64_ugt_two:
@@ -422,50 +387,14 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: j .LBB7_2
-; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: snez a0, zero
-; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB7_2:
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a3, a5, a3
-; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: addi a5, a5, -241
-; RV32I-NEXT: sub a0, a0, a2
-; RV32I-NEXT: sub a1, a1, a3
-; RV32I-NEXT: srli a2, a0, 2
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: and a1, a1, a4
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: srli a2, a0, 4
-; RV32I-NEXT: srli a3, a1, 4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: slli a2, a0, 16
-; RV32I-NEXT: slli a3, a1, 16
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: add a0, a1, a0
-; RV32I-NEXT: sltiu a0, a0, 2
-; RV32I-NEXT: xori a0, a0, 1
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: sltiu a4, a2, -1
+; RV32I-NEXT: add a3, a3, a4
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: and a1, a1, a3
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_i64_ugt_one:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
index 9a6c718703a27..8549a7c526e45 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
@@ -701,31 +701,9 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
; RV64I-LABEL: ctpop_i32_ult_two:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -16
-; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: srliw a1, a0, 1
-; RV64I-NEXT: lui a2, 349525
-; RV64I-NEXT: addi a2, a2, 1365
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: lui a2, 209715
-; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 2
-; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: lui a2, 61681
-; RV64I-NEXT: add a0, a1, a0
-; RV64I-NEXT: sraiw a1, a0, 4
-; RV64I-NEXT: addw a0, a1, a0
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a2, a2, -241
-; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: call __muldi3
-; RV64I-NEXT: srliw a0, a0, 24
-; RV64I-NEXT: sltiu a0, a0, 2
-; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: addiw a1, a0, -1
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: ctpop_i32_ult_two:
diff --git a/llvm/test/CodeGen/RISCV/pr101786.ll b/llvm/test/CodeGen/RISCV/pr101786.ll
index afac9e18da1ee..6d0736edd3e89 100644
--- a/llvm/test/CodeGen/RISCV/pr101786.ll
+++ b/llvm/test/CodeGen/RISCV/pr101786.ll
@@ -8,37 +8,9 @@ define i64 @test(i64 %x, ptr %p) {
; CHECK-NEXT: li a0, 0
; CHECK-NEXT: bgtz a2, .LBB0_3
; CHECK-NEXT: # %bb.1: # %entry
-; CHECK-NEXT: srli a3, a2, 1
-; CHECK-NEXT: lui a4, 349525
-; CHECK-NEXT: addiw a4, a4, 1365
-; CHECK-NEXT: slli a5, a4, 32
-; CHECK-NEXT: add a4, a4, a5
-; CHECK-NEXT: and a3, a3, a4
-; CHECK-NEXT: sub a2, a2, a3
-; CHECK-NEXT: lui a3, 209715
-; CHECK-NEXT: addiw a3, a3, 819
-; CHECK-NEXT: slli a4, a3, 32
-; CHECK-NEXT: add a3, a3, a4
-; CHECK-NEXT: and a4, a2, a3
-; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: addi a3, a2, -1
; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: add a2, a4, a2
-; CHECK-NEXT: srli a3, a2, 4
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: lui a3, 61681
-; CHECK-NEXT: addiw a3, a3, -241
-; CHECK-NEXT: slli a4, a3, 32
-; CHECK-NEXT: add a3, a3, a4
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: slli a3, a2, 8
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: slli a3, a2, 16
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: slli a3, a2, 32
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: srli a2, a2, 56
-; CHECK-NEXT: li a3, 1
-; CHECK-NEXT: bltu a3, a2, .LBB0_3
+; CHECK-NEXT: bnez a2, .LBB0_3
; CHECK-NEXT: # %bb.2: # %if.else
; CHECK-NEXT: ld a0, 0(a1)
; CHECK-NEXT: .LBB0_3: # %if.end
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 1b9b1b89aeb7e..946d5db7be8b9 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -571,12 +571,12 @@ define i64 @ctpop_i64(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_two:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi a2, a0, -1
-; RV32I-NEXT: and a2, a0, a2
-; RV32I-NEXT: seqz a0, a0
-; RV32I-NEXT: sub a0, a1, a0
-; RV32I-NEXT: and a0, a1, a0
-; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: seqz a2, a0
+; RV32I-NEXT: addi a3, a0, -1
+; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: seqz a0, a0
; RV32I-NEXT: ret
;
@@ -595,12 +595,12 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ugt_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi a2, a0, -1
-; RV32I-NEXT: and a2, a0, a2
-; RV32I-NEXT: seqz a0, a0
-; RV32I-NEXT: sub a0, a1, a0
-; RV32I-NEXT: and a0, a1, a0
-; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: seqz a2, a0
+; RV32I-NEXT: addi a3, a0, -1
+; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: and a0, a0, a3
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/X86/pr94829.ll b/llvm/test/CodeGen/X86/pr94829.ll
index 3fc5db7907410..8d078b2f8e4b9 100644
--- a/llvm/test/CodeGen/X86/pr94829.ll
+++ b/llvm/test/CodeGen/X86/pr94829.ll
@@ -4,30 +4,8 @@
define ptr @test(i64 %x) {
; CHECK-LABEL: test:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: subq %rcx, %rdi
-; CHECK-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; CHECK-NEXT: movq %rdi, %rcx
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: shrq $2, %rdi
-; CHECK-NEXT: andq %rax, %rdi
-; CHECK-NEXT: addq %rcx, %rdi
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: addq %rdi, %rax
-; CHECK-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; CHECK-NEXT: imulq %rcx, %rax
-; CHECK-NEXT: shrq $56, %rax
-; CHECK-NEXT: cmpq $2, %rax
-; CHECK-NEXT: jb .LBB0_2
-; CHECK-NEXT: # %bb.1: # %if.else
-; CHECK-NEXT: cmpl $2, %eax
-; CHECK-NEXT: .LBB0_2: # %exit1
+; CHECK-NEXT: leaq -1(%rdi), %rax
+; CHECK-NEXT: testq %rax, %rdi
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
entry:
diff --git a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
index dd9f51cec9524..716bfdfe19411 100644
--- a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
@@ -12,8 +12,9 @@ define i64 @test_ult_2(i64 %x, i64 %y, i64 %a, i64 %b) {
; SLOW-LABEL: define i64 @test_ult_2(
; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
; SLOW-NEXT: [[ENTRY:.*]]:
-; SLOW-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
-; SLOW-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
+; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]]
+; SLOW-NEXT: [[CMP1:%.*]] = icmp ne i64 [[TMP1]], 0
; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
@@ -56,8 +57,9 @@ define i64 @test_ugt_1(i64 %x, i64 %y, i64 %a, i64 %b) {
; SLOW-LABEL: define i64 @test_ugt_1(
; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
; SLOW-NEXT: [[ENTRY:.*]]:
-; SLOW-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
-; SLOW-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
+; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]]
+; SLOW-NEXT: [[CMP1:%.*]] = icmp ne i64 [[TMP1]], 0
; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
From 42dbb28a14079ed5322f106ab977cb8773325a6e Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88 at gmail.com>
Date: Sun, 20 Apr 2025 08:04:14 +0300
Subject: [PATCH 3/4] Handle vectors and ==/!= 1 patterns
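The ==/!= 1 forms for possibly-zero values rely on a different identity,
sketched below in plain C++ (illustration only, not part of the patch; the
helper name is hypothetical):

  #include <bit>
  #include <cassert>
  #include <cstdint>
  #include <initializer_list>

  // ctpop(x) == 1 holds exactly when x ^ (x - 1) -- the lowest set bit of x
  // together with all bits below it -- compares (unsigned) greater than x - 1.
  // For x == 0 both sides equal ~0, so the test correctly fails.
  static bool hasExactlyOneBitSet(uint64_t x) {
    return (x ^ (x - 1)) > (x - 1);
  }

  int main() {
    for (uint64_t x : {0ull, 1ull, 2ull, 3ull, 6ull, 64ull})
      assert(hasExactlyOneBitSet(x) == (std::popcount(x) == 1));
  }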
---
llvm/lib/CodeGen/CodeGenPrepare.cpp | 45 ++-
.../PowerPC/vector-popcnt-128-ult-ugt.ll | 16 +-
llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll | 102 ++-----
llvm/test/CodeGen/RISCV/rv32zbb.ll | 72 ++---
llvm/test/CodeGen/RISCV/rv64zbb.ll | 48 +--
llvm/test/CodeGen/X86/ispow2.ll | 44 +--
llvm/test/CodeGen/X86/vector-popcnt-128.ll | 34 +--
.../CodeGen/X86/vector-popcnt-256-ult-ugt.ll | 136 ++++-----
llvm/test/CodeGen/X86/vector-popcnt-256.ll | 288 +++++++++---------
.../CodeGen/X86/vector-popcnt-512-ult-ugt.ll | 80 ++---
llvm/test/CodeGen/X86/vector-popcnt-512.ll | 112 +++----
11 files changed, 471 insertions(+), 506 deletions(-)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 91f50f87cc711..d9fcfcdff3481 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1767,28 +1767,49 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
bool CodeGenPrepare::unfoldPow2Test(CmpInst *Cmp) {
CmpPredicate Pred;
Value *X;
- uint64_t C;
+ const APInt *C;
+ // (icmp (ctpop x), c)
if (!match(Cmp, m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(X)),
- m_ConstantInt(C))))
+ m_APIntAllowPoison(C))))
return false;
- Type *Ty = X->getType();
- if (Ty->isVectorTy() || TTI->getPopcntSupport(Ty->getIntegerBitWidth()) ==
- TargetTransformInfo::PSK_FastHardware)
+ // This transformation increases the number of instructions, don't do it if
+ // ctpop is fast.
+ Type *OpTy = X->getType();
+ if (TLI->isCtpopFast(TLI->getValueType(*DL, OpTy)))
return false;
- // (ctpop x) u< 2 -> (x & (x - 1)) == 0
- // (ctpop x) u> 1 -> (x & (x - 1)) != 0
- if ((Pred == CmpInst::ICMP_ULT && C == 2) ||
- (Pred == CmpInst::ICMP_UGT && C == 1)) {
+ // ctpop(x) u< 2 -> (x & (x - 1)) == 0
+ // ctpop(x) u> 1 -> (x & (x - 1)) != 0
+ // Also handles ctpop(x) == 1 and ctpop(x) != 1 if ctpop(x) is known non-zero.
+ if ((Pred == CmpInst::ICMP_ULT && *C == 2) ||
+ (Pred == CmpInst::ICMP_UGT && *C == 1) ||
+ (ICmpInst::isEquality(Pred) && *C == 1 &&
+ isKnownNonZero(Cmp->getOperand(0), *DL))) {
IRBuilder<> Builder(Cmp);
- Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(Ty));
+ Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(OpTy));
Value *And = Builder.CreateAnd(X, Sub);
CmpInst::Predicate NewPred =
- Pred == CmpInst::ICMP_ULT ? CmpInst::ICMP_EQ : CmpInst::ICMP_NE;
+ (Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_EQ)
+ ? CmpInst::ICMP_EQ
+ : CmpInst::ICMP_NE;
Value *NewCmp =
- Builder.CreateICmp(NewPred, And, ConstantInt::getNullValue(Ty));
+ Builder.CreateICmp(NewPred, And, ConstantInt::getNullValue(OpTy));
+ Cmp->replaceAllUsesWith(NewCmp);
+ RecursivelyDeleteTriviallyDeadInstructions(Cmp);
+ return true;
+ }
+
+ // ctpop(x) == 1 -> (x ^ (x - 1)) u> (x - 1)
+ // ctpop(x) != 1 -> (x ^ (x - 1)) u<= (x - 1)
+ if (ICmpInst::isEquality(Pred) && *C == 1) {
+ IRBuilder<> Builder(Cmp);
+ Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(OpTy));
+ Value *Xor = Builder.CreateXor(X, Sub);
+ CmpInst::Predicate NewPred =
+ Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT : CmpInst::ICMP_ULE;
+ Value *NewCmp = Builder.CreateICmp(NewPred, Xor, Sub);
Cmp->replaceAllUsesWith(NewCmp);
RecursivelyDeleteTriviallyDeadInstructions(Cmp);
return true;
diff --git a/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
index ff7f1fc902981..04351346745b3 100644
--- a/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
@@ -11945,11 +11945,11 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) {
; PWR5-LABEL: ugt_1_v2i64:
; PWR5: # %bb.0:
; PWR5-NEXT: addi 5, 3, -1
+; PWR5-NEXT: addi 6, 4, -1
; PWR5-NEXT: and 3, 3, 5
-; PWR5-NEXT: addi 5, 4, -1
+; PWR5-NEXT: and 4, 4, 6
; PWR5-NEXT: subfic 3, 3, 0
; PWR5-NEXT: subfe 3, 3, 3
-; PWR5-NEXT: and 4, 4, 5
; PWR5-NEXT: subfic 4, 4, 0
; PWR5-NEXT: subfe 4, 4, 4
; PWR5-NEXT: blr
@@ -11957,11 +11957,11 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) {
; PWR6-LABEL: ugt_1_v2i64:
; PWR6: # %bb.0:
; PWR6-NEXT: addi 5, 3, -1
+; PWR6-NEXT: addi 6, 4, -1
; PWR6-NEXT: and 3, 3, 5
-; PWR6-NEXT: addi 5, 4, -1
+; PWR6-NEXT: and 4, 4, 6
; PWR6-NEXT: subfic 3, 3, 0
; PWR6-NEXT: subfe 3, 3, 3
-; PWR6-NEXT: and 4, 4, 5
; PWR6-NEXT: subfic 4, 4, 0
; PWR6-NEXT: subfe 4, 4, 4
; PWR6-NEXT: blr
@@ -12016,11 +12016,11 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) {
; PWR5-LABEL: ult_2_v2i64:
; PWR5: # %bb.0:
; PWR5-NEXT: addi 5, 3, -1
+; PWR5-NEXT: addi 6, 4, -1
; PWR5-NEXT: and 3, 3, 5
-; PWR5-NEXT: addi 5, 4, -1
+; PWR5-NEXT: and 4, 4, 6
; PWR5-NEXT: addic 3, 3, -1
; PWR5-NEXT: subfe 3, 3, 3
-; PWR5-NEXT: and 4, 4, 5
; PWR5-NEXT: addic 4, 4, -1
; PWR5-NEXT: subfe 4, 4, 4
; PWR5-NEXT: blr
@@ -12028,11 +12028,11 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) {
; PWR6-LABEL: ult_2_v2i64:
; PWR6: # %bb.0:
; PWR6-NEXT: addi 5, 3, -1
+; PWR6-NEXT: addi 6, 4, -1
; PWR6-NEXT: and 3, 3, 5
-; PWR6-NEXT: addi 5, 4, -1
+; PWR6-NEXT: and 4, 4, 6
; PWR6-NEXT: addic 3, 3, -1
; PWR6-NEXT: subfe 3, 3, 3
-; PWR6-NEXT: and 4, 4, 5
; PWR6-NEXT: addic 4, 4, -1
; PWR6-NEXT: subfe 4, 4, 4
; PWR6-NEXT: blr
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
index dcbf9bfb8874a..f9af74d6ec323 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
@@ -418,45 +418,18 @@ define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
define i1 @ctpop_i64_eq_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_eq_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a3, a5, a3
-; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: addi a5, a5, -241
-; RV32I-NEXT: sub a0, a0, a2
-; RV32I-NEXT: sub a1, a1, a3
-; RV32I-NEXT: srli a2, a0, 2
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: and a1, a1, a4
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: srli a2, a0, 4
-; RV32I-NEXT: srli a3, a1, 4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: slli a2, a0, 16
-; RV32I-NEXT: slli a3, a1, 16
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: add a0, a1, a0
-; RV32I-NEXT: xori a0, a0, 1
-; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: sltiu a3, a2, -1
+; RV32I-NEXT: addi a4, a1, -1
+; RV32I-NEXT: add a3, a4, a3
+; RV32I-NEXT: xor a1, a1, a3
+; RV32I-NEXT: beq a1, a3, .LBB8_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sltu a0, a3, a1
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB8_2:
+; RV32I-NEXT: xor a0, a0, a2
+; RV32I-NEXT: sltu a0, a2, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_i64_eq_one:
@@ -475,45 +448,20 @@ define i1 @ctpop_i64_eq_one(i64 %a) nounwind {
define i1 @ctpop_i64_ne_one(i64 %a) nounwind {
; RV32I-LABEL: ctpop_i64_ne_one:
; RV32I: # %bb.0:
-; RV32I-NEXT: srli a2, a0, 1
-; RV32I-NEXT: lui a3, 349525
-; RV32I-NEXT: lui a4, 209715
-; RV32I-NEXT: srli a5, a1, 1
-; RV32I-NEXT: addi a3, a3, 1365
-; RV32I-NEXT: and a2, a2, a3
-; RV32I-NEXT: and a3, a5, a3
-; RV32I-NEXT: lui a5, 61681
-; RV32I-NEXT: addi a4, a4, 819
-; RV32I-NEXT: addi a5, a5, -241
-; RV32I-NEXT: sub a0, a0, a2
-; RV32I-NEXT: sub a1, a1, a3
-; RV32I-NEXT: srli a2, a0, 2
-; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: srli a3, a1, 2
-; RV32I-NEXT: and a1, a1, a4
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: and a3, a3, a4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: srli a2, a0, 4
-; RV32I-NEXT: srli a3, a1, 4
-; RV32I-NEXT: add a0, a2, a0
-; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: and a0, a0, a5
-; RV32I-NEXT: and a1, a1, a5
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: slli a3, a1, 8
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: slli a2, a0, 16
-; RV32I-NEXT: slli a3, a1, 16
-; RV32I-NEXT: add a0, a0, a2
-; RV32I-NEXT: add a1, a1, a3
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: sltiu a3, a2, -1
+; RV32I-NEXT: addi a4, a1, -1
+; RV32I-NEXT: add a3, a4, a3
+; RV32I-NEXT: xor a1, a1, a3
+; RV32I-NEXT: beq a1, a3, .LBB9_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sltu a0, a3, a1
+; RV32I-NEXT: xori a0, a0, 1
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB9_2:
+; RV32I-NEXT: xor a0, a0, a2
+; RV32I-NEXT: sltu a0, a2, a0
; RV32I-NEXT: xori a0, a0, 1
-; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_i64_ne_one:
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 946d5db7be8b9..98c86da41afa1 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -416,9 +416,9 @@ define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind {
; RV32I-LABEL: ctpop_v2i32_ult_two:
; RV32I: # %bb.0:
; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a2
-; RV32I-NEXT: addi a2, a1, -1
-; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: seqz a0, a0
; RV32I-NEXT: seqz a1, a1
; RV32I-NEXT: ret
@@ -439,9 +439,9 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
; RV32I-LABEL: ctpop_v2i32_ugt_one:
; RV32I: # %bb.0:
; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: and a1, a1, a3
; RV32I-NEXT: and a0, a0, a2
-; RV32I-NEXT: addi a2, a1, -1
-; RV32I-NEXT: and a1, a1, a2
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: snez a1, a1
; RV32I-NEXT: ret
@@ -464,11 +464,11 @@ define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind {
; RV32I-LABEL: ctpop_v2i32_eq_one:
; RV32I: # %bb.0:
; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: xor a1, a1, a3
; RV32I-NEXT: xor a0, a0, a2
; RV32I-NEXT: sltu a0, a2, a0
-; RV32I-NEXT: addi a2, a1, -1
-; RV32I-NEXT: xor a1, a1, a2
-; RV32I-NEXT: sltu a1, a2, a1
+; RV32I-NEXT: sltu a1, a3, a1
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: ctpop_v2i32_eq_one:
@@ -489,11 +489,11 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
; RV32I-LABEL: ctpop_v2i32_ne_one:
; RV32I: # %bb.0:
; RV32I-NEXT: addi a2, a0, -1
+; RV32I-NEXT: addi a3, a1, -1
+; RV32I-NEXT: xor a1, a1, a3
; RV32I-NEXT: xor a0, a0, a2
; RV32I-NEXT: sltu a0, a2, a0
-; RV32I-NEXT: addi a2, a1, -1
-; RV32I-NEXT: xor a1, a1, a2
-; RV32I-NEXT: sltu a1, a2, a1
+; RV32I-NEXT: sltu a1, a3, a1
; RV32I-NEXT: xori a0, a0, 1
; RV32I-NEXT: xori a1, a1, 1
; RV32I-NEXT: ret
@@ -785,20 +785,20 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
; RV32I-LABEL: ctpop_v2i64_ult_two:
; RV32I: # %bb.0:
; RV32I-NEXT: lw a1, 0(a0)
-; RV32I-NEXT: lw a2, 4(a0)
-; RV32I-NEXT: lw a3, 8(a0)
+; RV32I-NEXT: lw a2, 8(a0)
+; RV32I-NEXT: lw a3, 4(a0)
; RV32I-NEXT: lw a0, 12(a0)
-; RV32I-NEXT: addi a4, a1, -1
-; RV32I-NEXT: and a4, a1, a4
-; RV32I-NEXT: seqz a1, a1
-; RV32I-NEXT: sub a1, a2, a1
-; RV32I-NEXT: and a1, a2, a1
-; RV32I-NEXT: addi a2, a3, -1
-; RV32I-NEXT: and a2, a3, a2
-; RV32I-NEXT: seqz a3, a3
-; RV32I-NEXT: sub a3, a0, a3
-; RV32I-NEXT: and a0, a0, a3
-; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: seqz a4, a1
+; RV32I-NEXT: seqz a5, a2
+; RV32I-NEXT: addi a6, a1, -1
+; RV32I-NEXT: addi a7, a2, -1
+; RV32I-NEXT: sub a4, a3, a4
+; RV32I-NEXT: sub a5, a0, a5
+; RV32I-NEXT: and a2, a2, a7
+; RV32I-NEXT: and a1, a1, a6
+; RV32I-NEXT: and a0, a0, a5
+; RV32I-NEXT: and a3, a3, a4
+; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: or a2, a2, a0
; RV32I-NEXT: seqz a0, a1
; RV32I-NEXT: seqz a1, a2
@@ -828,20 +828,20 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
; RV32I-LABEL: ctpop_v2i64_ugt_one:
; RV32I: # %bb.0:
; RV32I-NEXT: lw a1, 0(a0)
-; RV32I-NEXT: lw a2, 4(a0)
-; RV32I-NEXT: lw a3, 8(a0)
+; RV32I-NEXT: lw a2, 8(a0)
+; RV32I-NEXT: lw a3, 4(a0)
; RV32I-NEXT: lw a0, 12(a0)
-; RV32I-NEXT: addi a4, a1, -1
-; RV32I-NEXT: and a4, a1, a4
-; RV32I-NEXT: seqz a1, a1
-; RV32I-NEXT: sub a1, a2, a1
-; RV32I-NEXT: and a1, a2, a1
-; RV32I-NEXT: addi a2, a3, -1
-; RV32I-NEXT: and a2, a3, a2
-; RV32I-NEXT: seqz a3, a3
-; RV32I-NEXT: sub a3, a0, a3
-; RV32I-NEXT: and a0, a0, a3
-; RV32I-NEXT: or a1, a4, a1
+; RV32I-NEXT: seqz a4, a1
+; RV32I-NEXT: seqz a5, a2
+; RV32I-NEXT: addi a6, a1, -1
+; RV32I-NEXT: addi a7, a2, -1
+; RV32I-NEXT: sub a4, a3, a4
+; RV32I-NEXT: sub a5, a0, a5
+; RV32I-NEXT: and a2, a2, a7
+; RV32I-NEXT: and a1, a1, a6
+; RV32I-NEXT: and a0, a0, a5
+; RV32I-NEXT: and a3, a3, a4
+; RV32I-NEXT: or a1, a1, a3
; RV32I-NEXT: or a2, a2, a0
; RV32I-NEXT: snez a0, a1
; RV32I-NEXT: snez a1, a2
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 25325ad7d50a4..17eb0817d548a 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -719,11 +719,11 @@ define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind {
; RV64I-LABEL: ctpop_v2i32_ult_two:
; RV64I: # %bb.0:
; RV64I-NEXT: addi a2, a0, -1
+; RV64I-NEXT: addi a3, a1, -1
+; RV64I-NEXT: and a1, a1, a3
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: addi a2, a1, -1
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: sext.w a1, a1
+; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: seqz a1, a1
; RV64I-NEXT: ret
@@ -744,11 +744,11 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
; RV64I-LABEL: ctpop_v2i32_ugt_one:
; RV64I: # %bb.0:
; RV64I-NEXT: addi a2, a0, -1
+; RV64I-NEXT: addi a3, a1, -1
+; RV64I-NEXT: and a1, a1, a3
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: addi a2, a1, -1
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: sext.w a1, a1
+; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: snez a0, a0
; RV64I-NEXT: snez a1, a1
; RV64I-NEXT: ret
@@ -771,13 +771,13 @@ define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind {
; RV64I-LABEL: ctpop_v2i32_eq_one:
; RV64I: # %bb.0:
; RV64I-NEXT: addiw a2, a0, -1
+; RV64I-NEXT: addiw a3, a1, -1
+; RV64I-NEXT: xor a1, a1, a3
; RV64I-NEXT: xor a0, a0, a2
+; RV64I-NEXT: sext.w a1, a1
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: sltu a0, a2, a0
-; RV64I-NEXT: addiw a2, a1, -1
-; RV64I-NEXT: xor a1, a1, a2
-; RV64I-NEXT: sext.w a1, a1
-; RV64I-NEXT: sltu a1, a2, a1
+; RV64I-NEXT: sltu a1, a3, a1
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: ctpop_v2i32_eq_one:
@@ -798,13 +798,13 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
; RV64I-LABEL: ctpop_v2i32_ne_one:
; RV64I: # %bb.0:
; RV64I-NEXT: addiw a2, a0, -1
+; RV64I-NEXT: addiw a3, a1, -1
+; RV64I-NEXT: xor a1, a1, a3
; RV64I-NEXT: xor a0, a0, a2
+; RV64I-NEXT: sext.w a1, a1
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: sltu a0, a2, a0
-; RV64I-NEXT: addiw a2, a1, -1
-; RV64I-NEXT: xor a1, a1, a2
-; RV64I-NEXT: sext.w a1, a1
-; RV64I-NEXT: sltu a1, a2, a1
+; RV64I-NEXT: sltu a1, a3, a1
; RV64I-NEXT: xori a0, a0, 1
; RV64I-NEXT: xori a1, a1, 1
; RV64I-NEXT: ret
@@ -1009,9 +1009,9 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
; RV64I-LABEL: ctpop_v2i64_ult_two:
; RV64I: # %bb.0:
; RV64I-NEXT: addi a2, a0, -1
+; RV64I-NEXT: addi a3, a1, -1
+; RV64I-NEXT: and a1, a1, a3
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: addi a2, a1, -1
-; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: seqz a1, a1
; RV64I-NEXT: ret
@@ -1032,9 +1032,9 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
; RV64I-LABEL: ctpop_v2i64_ugt_one:
; RV64I: # %bb.0:
; RV64I-NEXT: addi a2, a0, -1
+; RV64I-NEXT: addi a3, a1, -1
+; RV64I-NEXT: and a1, a1, a3
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: addi a2, a1, -1
-; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: snez a0, a0
; RV64I-NEXT: snez a1, a1
; RV64I-NEXT: ret
@@ -1057,11 +1057,11 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
; RV64I-LABEL: ctpop_v2i64_eq_one:
; RV64I: # %bb.0:
; RV64I-NEXT: addi a2, a0, -1
+; RV64I-NEXT: addi a3, a1, -1
+; RV64I-NEXT: xor a1, a1, a3
; RV64I-NEXT: xor a0, a0, a2
; RV64I-NEXT: sltu a0, a2, a0
-; RV64I-NEXT: addi a2, a1, -1
-; RV64I-NEXT: xor a1, a1, a2
-; RV64I-NEXT: sltu a1, a2, a1
+; RV64I-NEXT: sltu a1, a3, a1
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: ctpop_v2i64_eq_one:
@@ -1082,11 +1082,11 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind {
; RV64I-LABEL: ctpop_v2i64_ne_one:
; RV64I: # %bb.0:
; RV64I-NEXT: addi a2, a0, -1
+; RV64I-NEXT: addi a3, a1, -1
+; RV64I-NEXT: xor a1, a1, a3
; RV64I-NEXT: xor a0, a0, a2
; RV64I-NEXT: sltu a0, a2, a0
-; RV64I-NEXT: addi a2, a1, -1
-; RV64I-NEXT: xor a1, a1, a2
-; RV64I-NEXT: sltu a1, a2, a1
+; RV64I-NEXT: sltu a1, a3, a1
; RV64I-NEXT: xori a0, a0, 1
; RV64I-NEXT: xori a1, a1, 1
; RV64I-NEXT: ret
diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll
index 649d257b28d76..badfd1af940ca 100644
--- a/llvm/test/CodeGen/X86/ispow2.ll
+++ b/llvm/test/CodeGen/X86/ispow2.ll
@@ -72,11 +72,11 @@ define <4 x i1> @is_pow2_non_zero_4xv64(<4 x i64> %xin) {
; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2
; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3
; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3
+; CHECK-NOBMI-NEXT: paddq %xmm0, %xmm2
+; CHECK-NOBMI-NEXT: pand %xmm2, %xmm0
; CHECK-NOBMI-NEXT: pand %xmm1, %xmm3
; CHECK-NOBMI-NEXT: pxor %xmm1, %xmm1
; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm3
-; CHECK-NOBMI-NEXT: paddq %xmm0, %xmm2
-; CHECK-NOBMI-NEXT: pand %xmm2, %xmm0
; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm1
; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
@@ -122,12 +122,12 @@ define <4 x i1> @neither_pow2_non_zero_4xv64(<4 x i64> %xin) {
; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2
; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3
; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3
-; CHECK-NOBMI-NEXT: pand %xmm1, %xmm3
-; CHECK-NOBMI-NEXT: pxor %xmm1, %xmm1
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm3
; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4
; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm4
; CHECK-NOBMI-NEXT: pand %xmm4, %xmm0
+; CHECK-NOBMI-NEXT: pand %xmm1, %xmm3
+; CHECK-NOBMI-NEXT: pxor %xmm1, %xmm1
+; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm3
; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm1
; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
@@ -170,27 +170,27 @@ define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) {
; CHECK-NOBMI-LABEL: neither_pow2_non_zero_4xv64_x_maybe_z:
; CHECK-NOBMI: # %bb.0:
; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2
-; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3
+; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm3
; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3
-; CHECK-NOBMI-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; CHECK-NOBMI-NEXT: pxor %xmm4, %xmm3
-; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm1
-; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm5
+; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm4
+; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm4
+; CHECK-NOBMI-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; CHECK-NOBMI-NEXT: pxor %xmm5, %xmm4
+; CHECK-NOBMI-NEXT: pxor %xmm4, %xmm1
+; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm6
+; CHECK-NOBMI-NEXT: pcmpgtd %xmm4, %xmm6
+; CHECK-NOBMI-NEXT: pxor %xmm5, %xmm3
+; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm0
+; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm5
; CHECK-NOBMI-NEXT: pcmpgtd %xmm3, %xmm5
-; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm6
-; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm6
-; CHECK-NOBMI-NEXT: pxor %xmm4, %xmm6
-; CHECK-NOBMI-NEXT: pxor %xmm6, %xmm0
-; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4
-; CHECK-NOBMI-NEXT: pcmpgtd %xmm6, %xmm4
-; CHECK-NOBMI-NEXT: movdqa %xmm4, %xmm7
-; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm5[0,2]
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm3, %xmm1
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm6, %xmm0
+; CHECK-NOBMI-NEXT: movdqa %xmm5, %xmm7
+; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2]
+; CHECK-NOBMI-NEXT: pcmpeqd %xmm4, %xmm1
+; CHECK-NOBMI-NEXT: pcmpeqd %xmm3, %xmm0
; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; CHECK-NOBMI-NEXT: andps %xmm7, %xmm0
-; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm5[1,3]
-; CHECK-NOBMI-NEXT: orps %xmm4, %xmm0
+; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3]
+; CHECK-NOBMI-NEXT: orps %xmm5, %xmm0
; CHECK-NOBMI-NEXT: xorps %xmm2, %xmm0
; CHECK-NOBMI-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
index 741d70a369022..c1d30b6d5a995 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
@@ -803,11 +803,10 @@ define <2 x i64> @eq_1_v2i64(<2 x i64> %0) {
; BITALG-LABEL: eq_1_v2i64:
; BITALG: # %bb.0:
; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; BITALG-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpminuq %xmm1, %xmm0, %xmm1
-; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
+; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm2
+; BITALG-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpnleuq %xmm2, %xmm0, %k1
+; BITALG-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
%3 = icmp eq <2 x i64> %2, <i64 1, i64 1>
@@ -883,10 +882,10 @@ define <2 x i64> @ne_1_v2i64(<2 x i64> %0) {
; BITALG-LABEL: ne_1_v2i64:
; BITALG: # %bb.0:
; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; BITALG-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpminuq %xmm1, %xmm0, %xmm1
-; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm2
+; BITALG-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpleuq %xmm2, %xmm0, %k1
+; BITALG-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
%3 = icmp ne <2 x i64> %2, <i64 1, i64 1>
@@ -982,11 +981,10 @@ define <4 x i32> @eq_1_v4i32(<4 x i32> %0) {
; BITALG-LABEL: eq_1_v4i32:
; BITALG: # %bb.0:
; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; BITALG-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpminud %xmm1, %xmm0, %xmm1
-; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
+; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; BITALG-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpnleud %xmm2, %xmm0, %k1
+; BITALG-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
%3 = icmp eq <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
@@ -1085,10 +1083,10 @@ define <4 x i32> @ne_1_v4i32(<4 x i32> %0) {
; BITALG-LABEL: ne_1_v4i32:
; BITALG: # %bb.0:
; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; BITALG-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpminud %xmm1, %xmm0, %xmm1
-; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; BITALG-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpleud %xmm2, %xmm0, %k1
+; BITALG-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
%3 = icmp ne <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
index f72ad6d70522f..487f9a5d326cf 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
@@ -10,18 +10,18 @@
define <32 x i8> @ugt_1_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ugt_1_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_1_v32i8:
@@ -76,13 +76,13 @@ define <32 x i8> @ugt_1_v32i8(<32 x i8> %0) {
define <32 x i8> @ult_2_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ult_2_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1003,18 +1003,18 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) {
define <16 x i16> @ugt_1_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ugt_1_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_1_v16i16:
@@ -1069,13 +1069,13 @@ define <16 x i16> @ugt_1_v16i16(<16 x i16> %0) {
define <16 x i16> @ult_2_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ult_2_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -3302,18 +3302,18 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
define <8 x i32> @ugt_1_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_1_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_1_v8i32:
@@ -3370,13 +3370,13 @@ define <8 x i32> @ugt_1_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_2_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_2_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -9377,18 +9377,18 @@ define <8 x i32> @ult_31_v8i32(<8 x i32> %0) {
define <4 x i64> @ugt_1_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ugt_1_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_1_v4i64:
@@ -9445,13 +9445,13 @@ define <4 x i64> @ugt_1_v4i64(<4 x i64> %0) {
define <4 x i64> @ult_2_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ult_2_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll
index 701b9622089db..7fb60b987d95d 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll
@@ -507,15 +507,15 @@ define <32 x i8> @foldv32i8() nounwind {
define <4 x i64> @eq_1_v4i64(<4 x i64> %0) {
; AVX1-LABEL: eq_1_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm4 = mem[0,0]
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
@@ -534,12 +534,12 @@ define <4 x i64> @eq_1_v4i64(<4 x i64> %0) {
;
; XOP-LABEL: eq_1_v4i64:
; XOP: # %bb.0:
-; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpaddq %xmm2, %xmm1, %xmm3
-; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpcomgtuq %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2
+; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm2
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; XOP-NEXT: vpcomgtuq %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpcomgtuq %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -574,11 +574,10 @@ define <4 x i64> @eq_1_v4i64(<4 x i64> %0) {
; BITALG-LABEL: eq_1_v4i64:
; BITALG: # %bb.0:
; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm1
-; BITALG-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; BITALG-NEXT: vpminuq %ymm1, %ymm0, %ymm1
-; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; BITALG-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0
+; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm2
+; BITALG-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpcmpnleuq %ymm2, %ymm0, %k1
+; BITALG-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
; BITALG-NEXT: retq
%2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0)
%3 = icmp eq <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
@@ -589,21 +588,21 @@ define <4 x i64> @eq_1_v4i64(<4 x i64> %0) {
define <4 x i64> @ne_1_v4i64(<4 x i64> %0) {
; AVX1-LABEL: ne_1_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: # xmm4 = mem[0,0]
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm5 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ne_1_v4i64:
@@ -619,12 +618,12 @@ define <4 x i64> @ne_1_v4i64(<4 x i64> %0) {
;
; XOP-LABEL: ne_1_v4i64:
; XOP: # %bb.0:
-; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpaddq %xmm2, %xmm1, %xmm3
-; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpcomleuq %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2
+; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm2
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; XOP-NEXT: vpcomleuq %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpcomleuq %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -660,10 +659,10 @@ define <4 x i64> @ne_1_v4i64(<4 x i64> %0) {
; BITALG-LABEL: ne_1_v4i64:
; BITALG: # %bb.0:
; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm1
-; BITALG-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; BITALG-NEXT: vpminuq %ymm1, %ymm0, %ymm1
-; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm2
+; BITALG-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpcmpleuq %ymm2, %ymm0, %k1
+; BITALG-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
; BITALG-NEXT: retq
%2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0)
%3 = icmp ne <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
@@ -674,19 +673,19 @@ define <4 x i64> @ne_1_v4i64(<4 x i64> %0) {
define <8 x i32> @eq_1_v8i32(<8 x i32> %0) {
; AVX1-LABEL: eq_1_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: eq_1_v8i32:
@@ -701,12 +700,12 @@ define <8 x i32> @eq_1_v8i32(<8 x i32> %0) {
;
; XOP-LABEL: eq_1_v8i32:
; XOP: # %bb.0:
-; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm3
-; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpcomgtud %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpaddd %xmm2, %xmm0, %xmm2
+; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOP-NEXT: vpaddd %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; XOP-NEXT: vpcomgtud %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpcomgtud %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -741,11 +740,10 @@ define <8 x i32> @eq_1_v8i32(<8 x i32> %0) {
; BITALG-LABEL: eq_1_v8i32:
; BITALG: # %bb.0:
; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm1
-; BITALG-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; BITALG-NEXT: vpminud %ymm1, %ymm0, %ymm1
-; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
-; BITALG-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0
+; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm2
+; BITALG-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpcmpnleud %ymm2, %ymm0, %k1
+; BITALG-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} {z}
; BITALG-NEXT: retq
%2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0)
%3 = icmp eq <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -756,13 +754,13 @@ define <8 x i32> @eq_1_v8i32(<8 x i32> %0) {
define <8 x i32> @ne_1_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ne_1_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
@@ -780,12 +778,12 @@ define <8 x i32> @ne_1_v8i32(<8 x i32> %0) {
;
; XOP-LABEL: ne_1_v8i32:
; XOP: # %bb.0:
-; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm3
-; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpcomleud %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpaddd %xmm2, %xmm0, %xmm2
+; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOP-NEXT: vpaddd %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; XOP-NEXT: vpcomleud %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpcomleud %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -821,10 +819,10 @@ define <8 x i32> @ne_1_v8i32(<8 x i32> %0) {
; BITALG-LABEL: ne_1_v8i32:
; BITALG: # %bb.0:
; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm1
-; BITALG-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; BITALG-NEXT: vpminud %ymm1, %ymm0, %ymm1
-; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm2
+; BITALG-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpcmpleud %ymm2, %ymm0, %k1
+; BITALG-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} {z}
; BITALG-NEXT: retq
%2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0)
%3 = icmp ne <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -835,19 +833,19 @@ define <8 x i32> @ne_1_v8i32(<8 x i32> %0) {
define <16 x i16> @eq_1_v16i16(<16 x i16> %0) {
; AVX1-LABEL: eq_1_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpminuw %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpminuw %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: eq_1_v16i16:
@@ -862,12 +860,12 @@ define <16 x i16> @eq_1_v16i16(<16 x i16> %0) {
;
; XOP-LABEL: eq_1_v16i16:
; XOP: # %bb.0:
-; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm3
-; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpcomgtuw %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpaddw %xmm2, %xmm0, %xmm2
+; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm2
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOP-NEXT: vpaddw %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; XOP-NEXT: vpcomgtuw %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpcomgtuw %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -915,13 +913,13 @@ define <16 x i16> @eq_1_v16i16(<16 x i16> %0) {
define <16 x i16> @ne_1_v16i16(<16 x i16> %0) {
; AVX1-LABEL: ne_1_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpminuw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
@@ -939,12 +937,12 @@ define <16 x i16> @ne_1_v16i16(<16 x i16> %0) {
;
; XOP-LABEL: ne_1_v16i16:
; XOP: # %bb.0:
-; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm3
-; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpcomleuw %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpaddw %xmm2, %xmm0, %xmm2
+; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm2
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOP-NEXT: vpaddw %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; XOP-NEXT: vpcomleuw %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpcomleuw %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -992,19 +990,19 @@ define <16 x i16> @ne_1_v16i16(<16 x i16> %0) {
define <32 x i8> @eq_1_v32i8(<32 x i8> %0) {
; AVX1-LABEL: eq_1_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpminub %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpminub %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: eq_1_v32i8:
@@ -1019,12 +1017,12 @@ define <32 x i8> @eq_1_v32i8(<32 x i8> %0) {
;
; XOP-LABEL: eq_1_v32i8:
; XOP: # %bb.0:
-; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpaddb %xmm2, %xmm1, %xmm3
-; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpcomgtub %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm2
+; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm2
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; XOP-NEXT: vpcomgtub %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpcomgtub %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1072,13 +1070,13 @@ define <32 x i8> @eq_1_v32i8(<32 x i8> %0) {
define <32 x i8> @ne_1_v32i8(<32 x i8> %0) {
; AVX1-LABEL: ne_1_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpminub %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
@@ -1096,12 +1094,12 @@ define <32 x i8> @ne_1_v32i8(<32 x i8> %0) {
;
; XOP-LABEL: ne_1_v32i8:
; XOP: # %bb.0:
-; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpaddb %xmm2, %xmm1, %xmm3
-; XOP-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpcomleub %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm2
+; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm2
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpxor %xmm1, %xmm3, %xmm3
+; XOP-NEXT: vpcomleub %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpcomleub %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
index 828c97de3a079..1618a647a4062 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
@@ -9,13 +9,13 @@
define <64 x i8> @ugt_1_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ugt_1_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -32,13 +32,13 @@ define <64 x i8> @ugt_1_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_1_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -68,13 +68,13 @@ define <64 x i8> @ugt_1_v64i8(<64 x i8> %0) {
define <64 x i8> @ult_2_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ult_2_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -90,13 +90,13 @@ define <64 x i8> @ult_2_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ult_2_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -1035,13 +1035,13 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
define <32 x i16> @ugt_1_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_1_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -1058,13 +1058,13 @@ define <32 x i16> @ugt_1_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_1_v32i16:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -1094,13 +1094,13 @@ define <32 x i16> @ugt_1_v32i16(<32 x i16> %0) {
define <32 x i16> @ult_2_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ult_2_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -1116,13 +1116,13 @@ define <32 x i16> @ult_2_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ult_2_v32i16:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512.ll b/llvm/test/CodeGen/X86/vector-popcnt-512.ll
index 0a5f16a0f635f..f470a2be8aee8 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-512.ll
@@ -451,13 +451,13 @@ define <16 x i32> @ne_1_v16i32(<16 x i32> %0) {
define <32 x i16> @eq_1_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: eq_1_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpminuw %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpxor %ymm1, %ymm3, %ymm3
+; AVX512F-NEXT: vpminuw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
@@ -476,13 +476,13 @@ define <32 x i16> @eq_1_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: eq_1_v32i16:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm3, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm1, %ymm3, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm1, %ymm3, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm2, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
@@ -514,13 +514,13 @@ define <32 x i16> @eq_1_v32i16(<32 x i16> %0) {
define <32 x i16> @ne_1_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ne_1_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpminuw %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpxor %ymm1, %ymm3, %ymm3
+; AVX512F-NEXT: vpminuw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
@@ -538,13 +538,13 @@ define <32 x i16> @ne_1_v32i16(<32 x i16> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ne_1_v32i16:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm3, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm2, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm1, %ymm3, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm1, %ymm3, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminuw %ymm2, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
@@ -575,13 +575,13 @@ define <32 x i16> @ne_1_v32i16(<32 x i16> %0) {
define <64 x i8> @eq_1_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: eq_1_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpminub %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpxor %ymm1, %ymm3, %ymm3
+; AVX512F-NEXT: vpminub %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpminub %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
@@ -600,13 +600,13 @@ define <64 x i8> @eq_1_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: eq_1_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm3, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm1, %ymm3, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm3, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm2, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
@@ -638,13 +638,13 @@ define <64 x i8> @eq_1_v64i8(<64 x i8> %0) {
define <64 x i8> @ne_1_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ne_1_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpminub %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpxor %ymm1, %ymm3, %ymm3
+; AVX512F-NEXT: vpminub %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpminub %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
@@ -662,13 +662,13 @@ define <64 x i8> @ne_1_v64i8(<64 x i8> %0) {
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ne_1_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm3, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm1, %ymm3, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm3, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm2, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
>From c1e79fc878c75c9a9a84a9bd9870f194bd33f312 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88 at gmail.com>
Date: Sun, 20 Apr 2025 09:16:09 +0300
Subject: [PATCH 4/4] Update CGP test
* Drop seemingly unnecessary IR
* Check vector and ==/!= 1 cases
* Check known-non-zero case for scalar types
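
A minimal IR sketch of the unfolded forms these checks expect (function names
are illustrative only; the same patterns appear in the SLOW check lines below):

  ; ctpop(x) u< 2 becomes a power-of-two-or-zero test
  define i1 @popcount_ult_2(i64 %x) {
    %m1 = add i64 %x, -1          ; x - 1
    %and = and i64 %x, %m1        ; clears the lowest set bit
    %cmp = icmp eq i64 %and, 0    ; zero iff at most one bit was set
    ret i1 %cmp
  }

  ; ctpop(x) == 1 without a known-non-zero range becomes a mask compare
  define i1 @popcount_eq_1(i64 %x) {
    %m1 = add i64 %x, -1
    %msk = xor i64 %x, %m1        ; mask up to and including the lowest set bit
    %cmp = icmp ugt i64 %msk, %m1 ; true iff exactly one bit is set
    ret i1 %cmp
  }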
---
.../CodeGenPrepare/unfold-pow2-test-vec.ll | 85 +++++++++
.../CodeGenPrepare/unfold-pow2-test.ll | 166 ++++++++++--------
2 files changed, 180 insertions(+), 71 deletions(-)
create mode 100644 llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll
diff --git a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll
new file mode 100644
index 0000000000000..9e4a10d9eb864
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test-vec.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p 'require<profile-summary>,function(codegenprepare)' -S %s \
+; RUN: | FileCheck %s --check-prefix=SLOW
+; RUN: opt -p 'require<profile-summary>,function(codegenprepare)' -S --mattr=+zvbb %s \
+; RUN: | FileCheck %s --check-prefix=FAST
+; REQUIRES: riscv-registered-target
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64"
+
+define <4 x i1> @test_ult_2(<4 x i64> %x) {
+; SLOW-LABEL: define <4 x i1> @test_ult_2(
+; SLOW-SAME: <4 x i64> [[X:%.*]]) {
+; SLOW-NEXT: [[TMP0:%.*]] = add <4 x i64> [[X]], splat (i64 -1)
+; SLOW-NEXT: [[TMP1:%.*]] = and <4 x i64> [[X]], [[TMP0]]
+; SLOW-NEXT: [[CMP1:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer
+; SLOW-NEXT: ret <4 x i1> [[CMP1]]
+;
+; FAST-LABEL: define <4 x i1> @test_ult_2(
+; FAST-SAME: <4 x i64> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; FAST-NEXT: [[CTPOP:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[X]])
+; FAST-NEXT: [[CMP1:%.*]] = icmp ult <4 x i64> [[CTPOP]], splat (i64 2)
+; FAST-NEXT: ret <4 x i1> [[CMP1]]
+;
+ %ctpop = call <4 x i64> @llvm.ctpop(<4 x i64> %x)
+ %cmp = icmp ult <4 x i64> %ctpop, splat (i64 2)
+ ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_ugt_1(<4 x i64> %x) {
+; SLOW-LABEL: define <4 x i1> @test_ugt_1(
+; SLOW-SAME: <4 x i64> [[X:%.*]]) {
+; SLOW-NEXT: [[TMP0:%.*]] = add <4 x i64> [[X]], splat (i64 -1)
+; SLOW-NEXT: [[TMP1:%.*]] = and <4 x i64> [[X]], [[TMP0]]
+; SLOW-NEXT: [[CMP1:%.*]] = icmp ne <4 x i64> [[TMP1]], zeroinitializer
+; SLOW-NEXT: ret <4 x i1> [[CMP1]]
+;
+; FAST-LABEL: define <4 x i1> @test_ugt_1(
+; FAST-SAME: <4 x i64> [[X:%.*]]) #[[ATTR0]] {
+; FAST-NEXT: [[CTPOP:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[X]])
+; FAST-NEXT: [[CMP1:%.*]] = icmp ugt <4 x i64> [[CTPOP]], splat (i64 1)
+; FAST-NEXT: ret <4 x i1> [[CMP1]]
+;
+ %ctpop = call <4 x i64> @llvm.ctpop(<4 x i64> %x)
+ %cmp = icmp ugt <4 x i64> %ctpop, splat (i64 1)
+ ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_eq_1(<4 x i64> %x) {
+; SLOW-LABEL: define <4 x i1> @test_eq_1(
+; SLOW-SAME: <4 x i64> [[X:%.*]]) {
+; SLOW-NEXT: [[TMP0:%.*]] = add <4 x i64> [[X]], splat (i64 -1)
+; SLOW-NEXT: [[TMP1:%.*]] = xor <4 x i64> [[X]], [[TMP0]]
+; SLOW-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i64> [[TMP1]], [[TMP0]]
+; SLOW-NEXT: ret <4 x i1> [[TMP2]]
+;
+; FAST-LABEL: define <4 x i1> @test_eq_1(
+; FAST-SAME: <4 x i64> [[X:%.*]]) #[[ATTR0]] {
+; FAST-NEXT: [[CTPOP:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[X]])
+; FAST-NEXT: [[CMP1:%.*]] = icmp eq <4 x i64> [[CTPOP]], splat (i64 1)
+; FAST-NEXT: ret <4 x i1> [[CMP1]]
+;
+ %ctpop = call <4 x i64> @llvm.ctpop(<4 x i64> %x)
+ %cmp = icmp eq <4 x i64> %ctpop, splat (i64 1)
+ ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_ne_1(<4 x i64> %x) {
+; SLOW-LABEL: define <4 x i1> @test_ne_1(
+; SLOW-SAME: <4 x i64> [[X:%.*]]) {
+; SLOW-NEXT: [[TMP0:%.*]] = add <4 x i64> [[X]], splat (i64 -1)
+; SLOW-NEXT: [[TMP1:%.*]] = xor <4 x i64> [[X]], [[TMP0]]
+; SLOW-NEXT: [[TMP2:%.*]] = icmp ule <4 x i64> [[TMP1]], [[TMP0]]
+; SLOW-NEXT: ret <4 x i1> [[TMP2]]
+;
+; FAST-LABEL: define <4 x i1> @test_ne_1(
+; FAST-SAME: <4 x i64> [[X:%.*]]) #[[ATTR0]] {
+; FAST-NEXT: [[CTPOP:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[X]])
+; FAST-NEXT: [[CMP1:%.*]] = icmp ne <4 x i64> [[CTPOP]], splat (i64 1)
+; FAST-NEXT: ret <4 x i1> [[CMP1]]
+;
+ %ctpop = call <4 x i64> @llvm.ctpop(<4 x i64> %x)
+ %cmp = icmp ne <4 x i64> %ctpop, splat (i64 1)
+ ret <4 x i1> %cmp
+}
diff --git a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
index 716bfdfe19411..ce4a152b16dc4 100644
--- a/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll
@@ -3,97 +3,121 @@
; RUN: | FileCheck %s --check-prefix=SLOW
; RUN: opt -p 'require<profile-summary>,function(codegenprepare)' -S --mattr=+zbb %s \
; RUN: | FileCheck %s --check-prefix=FAST
-; REQUIRES: riscv64-registered-target
+; REQUIRES: riscv-registered-target
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64"
-define i64 @test_ult_2(i64 %x, i64 %y, i64 %a, i64 %b) {
-; SLOW-LABEL: define i64 @test_ult_2(
-; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
-; SLOW-NEXT: [[ENTRY:.*]]:
+define i1 @test_ult_2(i64 %x) {
+; SLOW-LABEL: define i1 @test_ult_2(
+; SLOW-SAME: i64 [[X:%.*]]) {
; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]]
-; SLOW-NEXT: [[CMP1:%.*]] = icmp ne i64 [[TMP1]], 0
-; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
-; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
-; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
-; SLOW: [[IF_THEN]]:
-; SLOW-NEXT: br label %[[IF_END]]
-; SLOW: [[IF_END]]:
-; SLOW-NEXT: [[RES:%.*]] = phi i64 [ [[A]], %[[IF_THEN]] ], [ [[B]], %[[ENTRY]] ]
-; SLOW-NEXT: ret i64 [[RES]]
+; SLOW-NEXT: [[CMP1:%.*]] = icmp eq i64 [[TMP1]], 0
+; SLOW-NEXT: ret i1 [[CMP1]]
;
-; FAST-LABEL: define i64 @test_ult_2(
-; FAST-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; FAST-NEXT: [[ENTRY:.*]]:
+; FAST-LABEL: define i1 @test_ult_2(
+; FAST-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
; FAST-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
-; FAST-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
-; FAST-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
-; FAST-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
-; FAST-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
-; FAST: [[IF_THEN]]:
-; FAST-NEXT: br label %[[IF_END]]
-; FAST: [[IF_END]]:
-; FAST-NEXT: [[RES:%.*]] = phi i64 [ [[A]], %[[IF_THEN]] ], [ [[B]], %[[ENTRY]] ]
-; FAST-NEXT: ret i64 [[RES]]
+; FAST-NEXT: [[CMP1:%.*]] = icmp ult i64 [[CTPOP]], 2
+; FAST-NEXT: ret i1 [[CMP1]]
;
-entry:
%ctpop = call i64 @llvm.ctpop.i64(i64 %x)
- %cmp1 = icmp ugt i64 %ctpop, 1
- %cmp2 = icmp sgt i64 %y, 0
- %cmp = or i1 %cmp2, %cmp1
- br i1 %cmp, label %if.then, label %if.end
-
-if.then:
- br label %if.end
-
-if.end:
- %res = phi i64 [ %a, %if.then ], [ %b, %entry ]
- ret i64 %res
+ %cmp = icmp ult i64 %ctpop, 2
+ ret i1 %cmp
}
-define i64 @test_ugt_1(i64 %x, i64 %y, i64 %a, i64 %b) {
-; SLOW-LABEL: define i64 @test_ugt_1(
-; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
-; SLOW-NEXT: [[ENTRY:.*]]:
+define i1 @test_ugt_1(i64 %x) {
+; SLOW-LABEL: define i1 @test_ugt_1(
+; SLOW-SAME: i64 [[X:%.*]]) {
; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]]
; SLOW-NEXT: [[CMP1:%.*]] = icmp ne i64 [[TMP1]], 0
-; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
-; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
-; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
-; SLOW: [[IF_THEN]]:
-; SLOW-NEXT: br label %[[IF_END]]
-; SLOW: [[IF_END]]:
-; SLOW-NEXT: [[RES:%.*]] = phi i64 [ [[A]], %[[IF_THEN]] ], [ [[B]], %[[ENTRY]] ]
-; SLOW-NEXT: ret i64 [[RES]]
+; SLOW-NEXT: ret i1 [[CMP1]]
;
-; FAST-LABEL: define i64 @test_ugt_1(
-; FAST-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
-; FAST-NEXT: [[ENTRY:.*]]:
+; FAST-LABEL: define i1 @test_ugt_1(
+; FAST-SAME: i64 [[X:%.*]]) #[[ATTR0]] {
; FAST-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
; FAST-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
-; FAST-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
-; FAST-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
-; FAST-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
-; FAST: [[IF_THEN]]:
-; FAST-NEXT: br label %[[IF_END]]
-; FAST: [[IF_END]]:
-; FAST-NEXT: [[RES:%.*]] = phi i64 [ [[A]], %[[IF_THEN]] ], [ [[B]], %[[ENTRY]] ]
-; FAST-NEXT: ret i64 [[RES]]
+; FAST-NEXT: ret i1 [[CMP1]]
;
-entry:
%ctpop = call i64 @llvm.ctpop.i64(i64 %x)
- %cmp1 = icmp ugt i64 %ctpop, 1
- %cmp2 = icmp sgt i64 %y, 0
- %cmp = or i1 %cmp2, %cmp1
- br i1 %cmp, label %if.then, label %if.end
+ %cmp = icmp ugt i64 %ctpop, 1
+ ret i1 %cmp
+}
-if.then:
- br label %if.end
+define i1 @test_eq_1_nz(i64 %x) {
+; SLOW-LABEL: define i1 @test_eq_1_nz(
+; SLOW-SAME: i64 [[X:%.*]]) {
+; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
+; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]]
+; SLOW-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0
+; SLOW-NEXT: ret i1 [[TMP2]]
+;
+; FAST-LABEL: define i1 @test_eq_1_nz(
+; FAST-SAME: i64 [[X:%.*]]) #[[ATTR0]] {
+; FAST-NEXT: [[CTPOP:%.*]] = call range(i64 1, 33) i64 @llvm.ctpop.i64(i64 [[X]])
+; FAST-NEXT: [[CMP1:%.*]] = icmp ult i64 [[CTPOP]], 2
+; FAST-NEXT: ret i1 [[CMP1]]
+;
+ %ctpop = call range(i64 1, 33) i64 @llvm.ctpop.i64(i64 %x)
+ %cmp = icmp eq i64 %ctpop, 1
+ ret i1 %cmp
+}
-if.end:
- %res = phi i64 [ %a, %if.then ], [ %b, %entry ]
- ret i64 %res
+define i1 @test_ne_1_nz(i64 %x) {
+; SLOW-LABEL: define i1 @test_ne_1_nz(
+; SLOW-SAME: i64 [[X:%.*]]) {
+; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
+; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]]
+; SLOW-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; SLOW-NEXT: ret i1 [[TMP2]]
+;
+; FAST-LABEL: define i1 @test_ne_1_nz(
+; FAST-SAME: i64 [[X:%.*]]) #[[ATTR0]] {
+; FAST-NEXT: [[CTPOP:%.*]] = call range(i64 1, 33) i64 @llvm.ctpop.i64(i64 [[X]])
+; FAST-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
+; FAST-NEXT: ret i1 [[CMP1]]
+;
+ %ctpop = call range(i64 1, 33) i64 @llvm.ctpop.i64(i64 %x)
+ %cmp = icmp ne i64 %ctpop, 1
+ ret i1 %cmp
+}
+
+define i1 @test_eq_1(i64 %x) {
+; SLOW-LABEL: define i1 @test_eq_1(
+; SLOW-SAME: i64 [[X:%.*]]) {
+; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
+; SLOW-NEXT: [[TMP1:%.*]] = xor i64 [[X]], [[TMP0]]
+; SLOW-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], [[TMP0]]
+; SLOW-NEXT: ret i1 [[TMP2]]
+;
+; FAST-LABEL: define i1 @test_eq_1(
+; FAST-SAME: i64 [[X:%.*]]) #[[ATTR0]] {
+; FAST-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
+; FAST-NEXT: [[CMP1:%.*]] = icmp eq i64 [[CTPOP]], 1
+; FAST-NEXT: ret i1 [[CMP1]]
+;
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %x)
+ %cmp = icmp eq i64 %ctpop, 1
+ ret i1 %cmp
+}
+
+define i1 @test_ne_1(i64 %x) {
+; SLOW-LABEL: define i1 @test_ne_1(
+; SLOW-SAME: i64 [[X:%.*]]) {
+; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
+; SLOW-NEXT: [[TMP1:%.*]] = xor i64 [[X]], [[TMP0]]
+; SLOW-NEXT: [[TMP2:%.*]] = icmp ule i64 [[TMP1]], [[TMP0]]
+; SLOW-NEXT: ret i1 [[TMP2]]
+;
+; FAST-LABEL: define i1 @test_ne_1(
+; FAST-SAME: i64 [[X:%.*]]) #[[ATTR0]] {
+; FAST-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
+; FAST-NEXT: [[CMP1:%.*]] = icmp ne i64 [[CTPOP]], 1
+; FAST-NEXT: ret i1 [[CMP1]]
+;
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %x)
+ %cmp = icmp ne i64 %ctpop, 1
+ ret i1 %cmp
}