[llvm] [CGP] Despeculate ctlz/cttz with "illegal" integer types (PR #137197)
Sergei Barannikov via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 24 08:33:26 PDT 2025
https://github.com/s-barannikov created https://github.com/llvm/llvm-project/pull/137197
In `despeculateCountZeros`, the code below the removed check (`SizeInBits > DL->getLargestLegalIntTypeSizeInBits()`) looks generic enough to support arbitrary integer widths. This change helps 32-bit targets avoid expensive expansion/libcalls in the case of zero input.
From 2af83da29314ef7bb4248691c3c08173caa783c9 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88 at gmail.com>
Date: Thu, 24 Apr 2025 18:31:15 +0300
Subject: [PATCH] [CGP] Despeculate ctlz/cttz with "illegal" integer types
The code below the removed check looks generic enough to support
arbitrary integer widths. This helps 32-bit targets avoid expensive
expansion/libcalls in the case of zero input.
---
llvm/lib/CodeGen/CodeGenPrepare.cpp | 4 +-
llvm/test/CodeGen/ARM/cttz.ll | 140 ++++++++++--------
llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll | 26 +++-
llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll | 130 +++++++++-------
llvm/test/CodeGen/RISCV/rv32xtheadbb.ll | 85 ++++++-----
llvm/test/CodeGen/RISCV/rv32zbb.ll | 68 +++++----
llvm/test/CodeGen/SPARC/ctlz.ll | 110 +++++---------
llvm/test/CodeGen/X86/ctlo.ll | 27 ++--
llvm/test/CodeGen/X86/ctlz.ll | 32 ++--
llvm/test/CodeGen/X86/cttz.ll | 33 ++---
llvm/test/CodeGen/X86/lzcnt-cmp.ll | 86 +++++++++--
11 files changed, 425 insertions(+), 316 deletions(-)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index e8dc7752b23c0..f9dcb472ed1d2 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2552,9 +2552,9 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
(IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty)))
return false;
- // Only handle legal scalar cases. Anything else requires too much work.
+ // Only handle scalar cases. Anything else requires too much work.
unsigned SizeInBits = Ty->getScalarSizeInBits();
- if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
+ if (Ty->isVectorTy())
return false;
// Bail if the value is never zero.
diff --git a/llvm/test/CodeGen/ARM/cttz.ll b/llvm/test/CodeGen/ARM/cttz.ll
index 76adc61c5971f..1146ad64ee709 100644
--- a/llvm/test/CodeGen/ARM/cttz.ll
+++ b/llvm/test/CodeGen/ARM/cttz.ll
@@ -221,43 +221,49 @@ define i64 @test_i64(i64 %a) {
;
; CHECK-6M-LABEL: test_i64:
; CHECK-6M: @ %bb.0:
-; CHECK-6M-NEXT: .save {r4, r5, r7, lr}
-; CHECK-6M-NEXT: push {r4, r5, r7, lr}
+; CHECK-6M-NEXT: .save {r4, r5, r6, lr}
+; CHECK-6M-NEXT: push {r4, r5, r6, lr}
+; CHECK-6M-NEXT: mov r3, r1
; CHECK-6M-NEXT: mov r2, r0
-; CHECK-6M-NEXT: ldr r5, .LCPI3_0
-; CHECK-6M-NEXT: adr r3, .LCPI3_1
+; CHECK-6M-NEXT: movs r1, #0
+; CHECK-6M-NEXT: orrs r0, r3
+; CHECK-6M-NEXT: beq .LBB3_6
+; CHECK-6M-NEXT: @ %bb.1: @ %cond.false
+; CHECK-6M-NEXT: ldr r6, .LCPI3_0
+; CHECK-6M-NEXT: adr r4, .LCPI3_1
; CHECK-6M-NEXT: movs r0, #32
-; CHECK-6M-NEXT: cmp r1, #0
-; CHECK-6M-NEXT: mov r4, r0
-; CHECK-6M-NEXT: beq .LBB3_2
-; CHECK-6M-NEXT: @ %bb.1:
-; CHECK-6M-NEXT: rsbs r4, r1, #0
-; CHECK-6M-NEXT: ands r4, r1
-; CHECK-6M-NEXT: muls r4, r5, r4
-; CHECK-6M-NEXT: lsrs r1, r4, #27
-; CHECK-6M-NEXT: ldrb r4, [r3, r1]
-; CHECK-6M-NEXT: .LBB3_2:
-; CHECK-6M-NEXT: adds r4, #32
-; CHECK-6M-NEXT: rsbs r1, r2, #0
-; CHECK-6M-NEXT: ands r1, r2
-; CHECK-6M-NEXT: muls r5, r1, r5
-; CHECK-6M-NEXT: lsrs r1, r5, #27
+; CHECK-6M-NEXT: cmp r3, #0
+; CHECK-6M-NEXT: mov r5, r0
+; CHECK-6M-NEXT: beq .LBB3_3
+; CHECK-6M-NEXT: @ %bb.2: @ %cond.false
+; CHECK-6M-NEXT: rsbs r5, r3, #0
+; CHECK-6M-NEXT: ands r5, r3
+; CHECK-6M-NEXT: muls r5, r6, r5
+; CHECK-6M-NEXT: lsrs r3, r5, #27
+; CHECK-6M-NEXT: ldrb r5, [r4, r3]
+; CHECK-6M-NEXT: .LBB3_3: @ %cond.false
+; CHECK-6M-NEXT: adds r5, #32
+; CHECK-6M-NEXT: rsbs r3, r2, #0
+; CHECK-6M-NEXT: ands r3, r2
+; CHECK-6M-NEXT: muls r6, r3, r6
+; CHECK-6M-NEXT: lsrs r3, r6, #27
; CHECK-6M-NEXT: cmp r2, #0
-; CHECK-6M-NEXT: bne .LBB3_5
-; CHECK-6M-NEXT: @ %bb.3:
-; CHECK-6M-NEXT: beq .LBB3_6
-; CHECK-6M-NEXT: .LBB3_4:
-; CHECK-6M-NEXT: movs r1, #0
-; CHECK-6M-NEXT: pop {r4, r5, r7, pc}
-; CHECK-6M-NEXT: .LBB3_5:
-; CHECK-6M-NEXT: ldrb r0, [r3, r1]
-; CHECK-6M-NEXT: bne .LBB3_4
+; CHECK-6M-NEXT: bne .LBB3_7
+; CHECK-6M-NEXT: @ %bb.4: @ %cond.false
+; CHECK-6M-NEXT: beq .LBB3_8
+; CHECK-6M-NEXT: .LBB3_5: @ %cond.end
+; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
; CHECK-6M-NEXT: .LBB3_6:
-; CHECK-6M-NEXT: mov r0, r4
-; CHECK-6M-NEXT: movs r1, #0
-; CHECK-6M-NEXT: pop {r4, r5, r7, pc}
+; CHECK-6M-NEXT: movs r0, #64
+; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
+; CHECK-6M-NEXT: .LBB3_7: @ %cond.false
+; CHECK-6M-NEXT: ldrb r0, [r4, r3]
+; CHECK-6M-NEXT: bne .LBB3_5
+; CHECK-6M-NEXT: .LBB3_8: @ %cond.false
+; CHECK-6M-NEXT: mov r0, r5
+; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
; CHECK-6M-NEXT: .p2align 2
-; CHECK-6M-NEXT: @ %bb.7:
+; CHECK-6M-NEXT: @ %bb.9:
; CHECK-6M-NEXT: .LCPI3_0:
; CHECK-6M-NEXT: .long 125613361 @ 0x77cb531
; CHECK-6M-NEXT: .LCPI3_1:
@@ -265,43 +271,49 @@ define i64 @test_i64(i64 %a) {
;
; CHECK-8MBASE-LABEL: test_i64:
; CHECK-8MBASE: @ %bb.0:
-; CHECK-8MBASE-NEXT: .save {r4, r5, r7, lr}
-; CHECK-8MBASE-NEXT: push {r4, r5, r7, lr}
+; CHECK-8MBASE-NEXT: .save {r4, r5, r6, lr}
+; CHECK-8MBASE-NEXT: push {r4, r5, r6, lr}
+; CHECK-8MBASE-NEXT: mov r3, r1
; CHECK-8MBASE-NEXT: mov r2, r0
-; CHECK-8MBASE-NEXT: movw r5, #46385
-; CHECK-8MBASE-NEXT: movt r5, #1916
-; CHECK-8MBASE-NEXT: adr r3, .LCPI3_0
+; CHECK-8MBASE-NEXT: movs r1, #0
+; CHECK-8MBASE-NEXT: orrs r0, r3
+; CHECK-8MBASE-NEXT: beq .LBB3_6
+; CHECK-8MBASE-NEXT: @ %bb.1: @ %cond.false
+; CHECK-8MBASE-NEXT: movw r6, #46385
+; CHECK-8MBASE-NEXT: movt r6, #1916
+; CHECK-8MBASE-NEXT: adr r4, .LCPI3_0
; CHECK-8MBASE-NEXT: movs r0, #32
-; CHECK-8MBASE-NEXT: mov r4, r0
-; CHECK-8MBASE-NEXT: cbz r1, .LBB3_2
-; CHECK-8MBASE-NEXT: @ %bb.1:
-; CHECK-8MBASE-NEXT: rsbs r4, r1, #0
-; CHECK-8MBASE-NEXT: ands r4, r1
-; CHECK-8MBASE-NEXT: muls r4, r5, r4
-; CHECK-8MBASE-NEXT: lsrs r1, r4, #27
-; CHECK-8MBASE-NEXT: ldrb r4, [r3, r1]
-; CHECK-8MBASE-NEXT: .LBB3_2:
-; CHECK-8MBASE-NEXT: adds r4, #32
-; CHECK-8MBASE-NEXT: rsbs r1, r2, #0
-; CHECK-8MBASE-NEXT: ands r1, r2
-; CHECK-8MBASE-NEXT: muls r5, r1, r5
-; CHECK-8MBASE-NEXT: lsrs r1, r5, #27
+; CHECK-8MBASE-NEXT: mov r5, r0
+; CHECK-8MBASE-NEXT: cbz r3, .LBB3_3
+; CHECK-8MBASE-NEXT: @ %bb.2: @ %cond.false
+; CHECK-8MBASE-NEXT: rsbs r5, r3, #0
+; CHECK-8MBASE-NEXT: ands r5, r3
+; CHECK-8MBASE-NEXT: muls r5, r6, r5
+; CHECK-8MBASE-NEXT: lsrs r3, r5, #27
+; CHECK-8MBASE-NEXT: ldrb r5, [r4, r3]
+; CHECK-8MBASE-NEXT: .LBB3_3: @ %cond.false
+; CHECK-8MBASE-NEXT: adds r5, #32
+; CHECK-8MBASE-NEXT: rsbs r3, r2, #0
+; CHECK-8MBASE-NEXT: ands r3, r2
+; CHECK-8MBASE-NEXT: muls r6, r3, r6
+; CHECK-8MBASE-NEXT: lsrs r3, r6, #27
; CHECK-8MBASE-NEXT: cmp r2, #0
-; CHECK-8MBASE-NEXT: bne .LBB3_5
-; CHECK-8MBASE-NEXT: @ %bb.3:
-; CHECK-8MBASE-NEXT: beq .LBB3_6
-; CHECK-8MBASE-NEXT: .LBB3_4:
-; CHECK-8MBASE-NEXT: movs r1, #0
-; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc}
-; CHECK-8MBASE-NEXT: .LBB3_5:
-; CHECK-8MBASE-NEXT: ldrb r0, [r3, r1]
-; CHECK-8MBASE-NEXT: bne .LBB3_4
+; CHECK-8MBASE-NEXT: bne .LBB3_7
+; CHECK-8MBASE-NEXT: @ %bb.4: @ %cond.false
+; CHECK-8MBASE-NEXT: beq .LBB3_8
+; CHECK-8MBASE-NEXT: .LBB3_5: @ %cond.end
+; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
; CHECK-8MBASE-NEXT: .LBB3_6:
-; CHECK-8MBASE-NEXT: mov r0, r4
-; CHECK-8MBASE-NEXT: movs r1, #0
-; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc}
+; CHECK-8MBASE-NEXT: movs r0, #64
+; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
+; CHECK-8MBASE-NEXT: .LBB3_7: @ %cond.false
+; CHECK-8MBASE-NEXT: ldrb r0, [r4, r3]
+; CHECK-8MBASE-NEXT: bne .LBB3_5
+; CHECK-8MBASE-NEXT: .LBB3_8: @ %cond.false
+; CHECK-8MBASE-NEXT: mov r0, r5
+; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
; CHECK-8MBASE-NEXT: .p2align 2
-; CHECK-8MBASE-NEXT: @ %bb.7:
+; CHECK-8MBASE-NEXT: @ %bb.9:
; CHECK-8MBASE-NEXT: .LCPI3_0:
; CHECK-8MBASE-NEXT: .ascii "\000\001\034\002\035\016\030\003\036\026\024\017\031\021\004\b\037\033\r\027\025\023\020\007\032\f\022\006\013\005\n\t"
%tmp = call i64 @llvm.cttz.i64(i64 %a, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
index f9af74d6ec323..0632caecf8907 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
@@ -62,6 +62,9 @@ declare i64 @llvm.ctlz.i64(i64, i1)
define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-LABEL: ctlz_i64:
; RV32I: # %bb.0:
+; RV32I-NEXT: or a2, a0, a1
+; RV32I-NEXT: beqz a2, .LBB1_3
+; RV32I-NEXT: # %bb.1: # %cond.false
; RV32I-NEXT: lui a2, 349525
; RV32I-NEXT: lui a3, 209715
; RV32I-NEXT: lui a6, 61681
@@ -69,8 +72,8 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: addi a4, a3, 819
; RV32I-NEXT: addi a3, a6, -241
; RV32I-NEXT: li a2, 32
-; RV32I-NEXT: beqz a1, .LBB1_2
-; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: beqz a1, .LBB1_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: srli a0, a1, 1
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
@@ -99,7 +102,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: sub a0, a2, a0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB1_2:
+; RV32I-NEXT: .LBB1_3:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB1_4:
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 2
@@ -195,14 +202,17 @@ declare i64 @llvm.cttz.i64(i64, i1)
define i64 @cttz_i64(i64 %a) nounwind {
; RV32I-LABEL: cttz_i64:
; RV32I: # %bb.0:
+; RV32I-NEXT: or a2, a0, a1
+; RV32I-NEXT: beqz a2, .LBB3_3
+; RV32I-NEXT: # %bb.1: # %cond.false
; RV32I-NEXT: lui a2, 349525
; RV32I-NEXT: lui a3, 209715
; RV32I-NEXT: lui a5, 61681
; RV32I-NEXT: addi a4, a2, 1365
; RV32I-NEXT: addi a3, a3, 819
; RV32I-NEXT: addi a2, a5, -241
-; RV32I-NEXT: beqz a0, .LBB3_2
-; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: beqz a0, .LBB3_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: not a1, a0
; RV32I-NEXT: addi a0, a0, -1
; RV32I-NEXT: and a0, a1, a0
@@ -223,7 +233,11 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: srli a0, a0, 24
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB3_2:
+; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB3_4:
; RV32I-NEXT: not a0, a1
; RV32I-NEXT: addi a1, a1, -1
; RV32I-NEXT: and a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index a46168f114bb9..3a7d31253b05d 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -374,39 +374,42 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: neg a0, a0
-; RV32I-NEXT: and a0, s0, a0
-; RV32I-NEXT: lui a1, 30667
-; RV32I-NEXT: addi s3, a1, 1329
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: or a1, a0, a1
+; RV32I-NEXT: beqz a1, .LBB3_3
+; RV32I-NEXT: # %bb.1: # %cond.false
+; RV32I-NEXT: neg a1, a0
+; RV32I-NEXT: and a1, a0, a1
+; RV32I-NEXT: lui a2, 30667
+; RV32I-NEXT: addi s2, a2, 1329
+; RV32I-NEXT: mv s4, a0
+; RV32I-NEXT: mv a0, a1
+; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __mulsi3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: lui s4, %hi(.LCPI3_0)
-; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0)
-; RV32I-NEXT: neg a0, s2
-; RV32I-NEXT: and a0, s2, a0
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: lui s3, %hi(.LCPI3_0)
+; RV32I-NEXT: addi s3, s3, %lo(.LCPI3_0)
+; RV32I-NEXT: neg a0, s0
+; RV32I-NEXT: and a0, s0, a0
+; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __mulsi3
-; RV32I-NEXT: bnez s2, .LBB3_3
-; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: li a0, 32
-; RV32I-NEXT: beqz s0, .LBB3_4
-; RV32I-NEXT: .LBB3_2:
-; RV32I-NEXT: srli s1, s1, 27
-; RV32I-NEXT: add s1, s4, s1
-; RV32I-NEXT: lbu a0, 0(s1)
-; RV32I-NEXT: j .LBB3_5
-; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: bnez s4, .LBB3_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: add a0, s4, a0
+; RV32I-NEXT: add a0, s3, a0
; RV32I-NEXT: lbu a0, 0(a0)
-; RV32I-NEXT: bnez s0, .LBB3_2
-; RV32I-NEXT: .LBB3_4:
; RV32I-NEXT: addi a0, a0, 32
-; RV32I-NEXT: .LBB3_5:
+; RV32I-NEXT: j .LBB3_5
+; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: j .LBB3_6
+; RV32I-NEXT: .LBB3_4:
+; RV32I-NEXT: srli s1, s1, 27
+; RV32I-NEXT: add s1, s3, s1
+; RV32I-NEXT: lbu a0, 0(s1)
+; RV32I-NEXT: .LBB3_5: # %cond.false
; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: .LBB3_6: # %cond.end
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -441,33 +444,35 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
;
; RV32M-LABEL: test_cttz_i64:
; RV32M: # %bb.0:
+; RV32M-NEXT: or a2, a0, a1
+; RV32M-NEXT: beqz a2, .LBB3_3
+; RV32M-NEXT: # %bb.1: # %cond.false
; RV32M-NEXT: lui a2, 30667
; RV32M-NEXT: addi a3, a2, 1329
; RV32M-NEXT: lui a2, %hi(.LCPI3_0)
; RV32M-NEXT: addi a2, a2, %lo(.LCPI3_0)
-; RV32M-NEXT: bnez a1, .LBB3_3
-; RV32M-NEXT: # %bb.1:
-; RV32M-NEXT: li a1, 32
-; RV32M-NEXT: beqz a0, .LBB3_4
-; RV32M-NEXT: .LBB3_2:
-; RV32M-NEXT: neg a1, a0
-; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: bnez a0, .LBB3_4
+; RV32M-NEXT: # %bb.2: # %cond.false
+; RV32M-NEXT: neg a0, a1
+; RV32M-NEXT: and a0, a1, a0
; RV32M-NEXT: mul a0, a0, a3
; RV32M-NEXT: srli a0, a0, 27
; RV32M-NEXT: add a0, a2, a0
; RV32M-NEXT: lbu a0, 0(a0)
+; RV32M-NEXT: addi a0, a0, 32
; RV32M-NEXT: li a1, 0
; RV32M-NEXT: ret
; RV32M-NEXT: .LBB3_3:
-; RV32M-NEXT: neg a4, a1
-; RV32M-NEXT: and a1, a1, a4
-; RV32M-NEXT: mul a1, a1, a3
-; RV32M-NEXT: srli a1, a1, 27
-; RV32M-NEXT: add a1, a2, a1
-; RV32M-NEXT: lbu a1, 0(a1)
-; RV32M-NEXT: bnez a0, .LBB3_2
+; RV32M-NEXT: li a1, 0
+; RV32M-NEXT: li a0, 64
+; RV32M-NEXT: ret
; RV32M-NEXT: .LBB3_4:
-; RV32M-NEXT: addi a0, a1, 32
+; RV32M-NEXT: neg a1, a0
+; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: mul a0, a0, a3
+; RV32M-NEXT: srli a0, a0, 27
+; RV32M-NEXT: add a0, a2, a0
+; RV32M-NEXT: lbu a0, 0(a0)
; RV32M-NEXT: li a1, 0
; RV32M-NEXT: ret
;
@@ -510,21 +515,28 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
;
; RV32XTHEADBB-LABEL: test_cttz_i64:
; RV32XTHEADBB: # %bb.0:
-; RV32XTHEADBB-NEXT: bnez a0, .LBB3_2
-; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: or a2, a0, a1
+; RV32XTHEADBB-NEXT: beqz a2, .LBB3_3
+; RV32XTHEADBB-NEXT: # %bb.1: # %cond.false
+; RV32XTHEADBB-NEXT: bnez a0, .LBB3_4
+; RV32XTHEADBB-NEXT: # %bb.2: # %cond.false
; RV32XTHEADBB-NEXT: addi a0, a1, -1
; RV32XTHEADBB-NEXT: not a1, a1
; RV32XTHEADBB-NEXT: and a0, a1, a0
; RV32XTHEADBB-NEXT: th.ff1 a0, a0
; RV32XTHEADBB-NEXT: li a1, 64
-; RV32XTHEADBB-NEXT: j .LBB3_3
-; RV32XTHEADBB-NEXT: .LBB3_2:
+; RV32XTHEADBB-NEXT: j .LBB3_5
+; RV32XTHEADBB-NEXT: .LBB3_3:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: li a0, 64
+; RV32XTHEADBB-NEXT: ret
+; RV32XTHEADBB-NEXT: .LBB3_4:
; RV32XTHEADBB-NEXT: addi a1, a0, -1
; RV32XTHEADBB-NEXT: not a0, a0
; RV32XTHEADBB-NEXT: and a0, a0, a1
; RV32XTHEADBB-NEXT: th.ff1 a0, a0
; RV32XTHEADBB-NEXT: li a1, 32
-; RV32XTHEADBB-NEXT: .LBB3_3:
+; RV32XTHEADBB-NEXT: .LBB3_5: # %cond.false
; RV32XTHEADBB-NEXT: sub a0, a1, a0
; RV32XTHEADBB-NEXT: li a1, 0
; RV32XTHEADBB-NEXT: ret
@@ -1348,14 +1360,17 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32I-LABEL: test_ctlz_i64:
; RV32I: # %bb.0:
+; RV32I-NEXT: or a2, a0, a1
+; RV32I-NEXT: beqz a2, .LBB11_3
+; RV32I-NEXT: # %bb.1: # %cond.false
; RV32I-NEXT: lui a2, 349525
; RV32I-NEXT: lui a3, 209715
; RV32I-NEXT: lui a5, 61681
; RV32I-NEXT: addi a4, a2, 1365
; RV32I-NEXT: addi a3, a3, 819
; RV32I-NEXT: addi a2, a5, -241
-; RV32I-NEXT: bnez a1, .LBB11_2
-; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: bnez a1, .LBB11_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 2
@@ -1385,7 +1400,11 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: addi a0, a0, 32
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB11_2:
+; RV32I-NEXT: .LBB11_3:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB11_4:
; RV32I-NEXT: srli a0, a1, 1
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
@@ -1468,6 +1487,9 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
;
; RV32M-LABEL: test_ctlz_i64:
; RV32M: # %bb.0:
+; RV32M-NEXT: or a2, a0, a1
+; RV32M-NEXT: beqz a2, .LBB11_3
+; RV32M-NEXT: # %bb.1: # %cond.false
; RV32M-NEXT: lui a2, 349525
; RV32M-NEXT: lui a3, 209715
; RV32M-NEXT: lui a6, 61681
@@ -1476,8 +1498,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32M-NEXT: addi a4, a3, 819
; RV32M-NEXT: addi a3, a6, -241
; RV32M-NEXT: addi a2, a7, 257
-; RV32M-NEXT: bnez a1, .LBB11_2
-; RV32M-NEXT: # %bb.1:
+; RV32M-NEXT: bnez a1, .LBB11_4
+; RV32M-NEXT: # %bb.2: # %cond.false
; RV32M-NEXT: srli a1, a0, 1
; RV32M-NEXT: or a0, a0, a1
; RV32M-NEXT: srli a1, a0, 2
@@ -1504,7 +1526,11 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32M-NEXT: addi a0, a0, 32
; RV32M-NEXT: li a1, 0
; RV32M-NEXT: ret
-; RV32M-NEXT: .LBB11_2:
+; RV32M-NEXT: .LBB11_3:
+; RV32M-NEXT: li a1, 0
+; RV32M-NEXT: li a0, 64
+; RV32M-NEXT: ret
+; RV32M-NEXT: .LBB11_4:
; RV32M-NEXT: srli a0, a1, 1
; RV32M-NEXT: or a0, a1, a0
; RV32M-NEXT: srli a1, a0, 2
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
index 04a2f67c4942b..723437a610ff8 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
@@ -61,14 +61,17 @@ declare i64 @llvm.ctlz.i64(i64, i1)
define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-LABEL: ctlz_i64:
; RV32I: # %bb.0:
+; RV32I-NEXT: or a2, a0, a1
+; RV32I-NEXT: beqz a2, .LBB1_3
+; RV32I-NEXT: # %bb.1: # %cond.false
; RV32I-NEXT: lui a2, 349525
; RV32I-NEXT: lui a3, 209715
; RV32I-NEXT: lui a5, 61681
; RV32I-NEXT: addi a4, a2, 1365
; RV32I-NEXT: addi a3, a3, 819
; RV32I-NEXT: addi a2, a5, -241
-; RV32I-NEXT: bnez a1, .LBB1_2
-; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: bnez a1, .LBB1_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 2
@@ -98,7 +101,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: addi a0, a0, 32
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB1_2:
+; RV32I-NEXT: .LBB1_3:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB1_4:
; RV32I-NEXT: srli a0, a1, 1
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
@@ -200,39 +207,42 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: neg a0, a0
-; RV32I-NEXT: and a0, s0, a0
-; RV32I-NEXT: lui a1, 30667
-; RV32I-NEXT: addi s3, a1, 1329
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: or a1, a0, a1
+; RV32I-NEXT: beqz a1, .LBB3_3
+; RV32I-NEXT: # %bb.1: # %cond.false
+; RV32I-NEXT: neg a1, a0
+; RV32I-NEXT: and a1, a0, a1
+; RV32I-NEXT: lui a2, 30667
+; RV32I-NEXT: addi s2, a2, 1329
+; RV32I-NEXT: mv s4, a0
+; RV32I-NEXT: mv a0, a1
+; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __mulsi3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: lui s4, %hi(.LCPI3_0)
-; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0)
-; RV32I-NEXT: neg a0, s2
-; RV32I-NEXT: and a0, s2, a0
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: lui s3, %hi(.LCPI3_0)
+; RV32I-NEXT: addi s3, s3, %lo(.LCPI3_0)
+; RV32I-NEXT: neg a0, s0
+; RV32I-NEXT: and a0, s0, a0
+; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __mulsi3
-; RV32I-NEXT: bnez s2, .LBB3_3
-; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: li a0, 32
-; RV32I-NEXT: beqz s0, .LBB3_4
-; RV32I-NEXT: .LBB3_2:
-; RV32I-NEXT: srli s1, s1, 27
-; RV32I-NEXT: add s1, s4, s1
-; RV32I-NEXT: lbu a0, 0(s1)
-; RV32I-NEXT: j .LBB3_5
-; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: bnez s4, .LBB3_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: add a0, s4, a0
+; RV32I-NEXT: add a0, s3, a0
; RV32I-NEXT: lbu a0, 0(a0)
-; RV32I-NEXT: bnez s0, .LBB3_2
-; RV32I-NEXT: .LBB3_4:
; RV32I-NEXT: addi a0, a0, 32
-; RV32I-NEXT: .LBB3_5:
+; RV32I-NEXT: j .LBB3_5
+; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: j .LBB3_6
+; RV32I-NEXT: .LBB3_4:
+; RV32I-NEXT: srli s1, s1, 27
+; RV32I-NEXT: add s1, s3, s1
+; RV32I-NEXT: lbu a0, 0(s1)
+; RV32I-NEXT: .LBB3_5: # %cond.false
; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: .LBB3_6: # %cond.end
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -244,21 +254,28 @@ define i64 @cttz_i64(i64 %a) nounwind {
;
; RV32XTHEADBB-LABEL: cttz_i64:
; RV32XTHEADBB: # %bb.0:
-; RV32XTHEADBB-NEXT: bnez a0, .LBB3_2
-; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: or a2, a0, a1
+; RV32XTHEADBB-NEXT: beqz a2, .LBB3_3
+; RV32XTHEADBB-NEXT: # %bb.1: # %cond.false
+; RV32XTHEADBB-NEXT: bnez a0, .LBB3_4
+; RV32XTHEADBB-NEXT: # %bb.2: # %cond.false
; RV32XTHEADBB-NEXT: addi a0, a1, -1
; RV32XTHEADBB-NEXT: not a1, a1
; RV32XTHEADBB-NEXT: and a0, a1, a0
; RV32XTHEADBB-NEXT: th.ff1 a0, a0
; RV32XTHEADBB-NEXT: li a1, 64
-; RV32XTHEADBB-NEXT: j .LBB3_3
-; RV32XTHEADBB-NEXT: .LBB3_2:
+; RV32XTHEADBB-NEXT: j .LBB3_5
+; RV32XTHEADBB-NEXT: .LBB3_3:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: li a0, 64
+; RV32XTHEADBB-NEXT: ret
+; RV32XTHEADBB-NEXT: .LBB3_4:
; RV32XTHEADBB-NEXT: addi a1, a0, -1
; RV32XTHEADBB-NEXT: not a0, a0
; RV32XTHEADBB-NEXT: and a0, a0, a1
; RV32XTHEADBB-NEXT: th.ff1 a0, a0
; RV32XTHEADBB-NEXT: li a1, 32
-; RV32XTHEADBB-NEXT: .LBB3_3:
+; RV32XTHEADBB-NEXT: .LBB3_5: # %cond.false
; RV32XTHEADBB-NEXT: sub a0, a1, a0
; RV32XTHEADBB-NEXT: li a1, 0
; RV32XTHEADBB-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 98c86da41afa1..0f2284637ca6a 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -61,14 +61,17 @@ declare i64 @llvm.ctlz.i64(i64, i1)
define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-LABEL: ctlz_i64:
; RV32I: # %bb.0:
+; RV32I-NEXT: or a2, a0, a1
+; RV32I-NEXT: beqz a2, .LBB1_3
+; RV32I-NEXT: # %bb.1: # %cond.false
; RV32I-NEXT: lui a2, 349525
; RV32I-NEXT: lui a3, 209715
; RV32I-NEXT: lui a5, 61681
; RV32I-NEXT: addi a4, a2, 1365
; RV32I-NEXT: addi a3, a3, 819
; RV32I-NEXT: addi a2, a5, -241
-; RV32I-NEXT: bnez a1, .LBB1_2
-; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: bnez a1, .LBB1_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 2
@@ -98,7 +101,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: addi a0, a0, 32
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB1_2:
+; RV32I-NEXT: .LBB1_3:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB1_4:
; RV32I-NEXT: srli a0, a1, 1
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
@@ -190,39 +197,42 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: neg a0, a0
-; RV32I-NEXT: and a0, s0, a0
-; RV32I-NEXT: lui a1, 30667
-; RV32I-NEXT: addi s3, a1, 1329
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: or a1, a0, a1
+; RV32I-NEXT: beqz a1, .LBB3_3
+; RV32I-NEXT: # %bb.1: # %cond.false
+; RV32I-NEXT: neg a1, a0
+; RV32I-NEXT: and a1, a0, a1
+; RV32I-NEXT: lui a2, 30667
+; RV32I-NEXT: addi s2, a2, 1329
+; RV32I-NEXT: mv s4, a0
+; RV32I-NEXT: mv a0, a1
+; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __mulsi3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: lui s4, %hi(.LCPI3_0)
-; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0)
-; RV32I-NEXT: neg a0, s2
-; RV32I-NEXT: and a0, s2, a0
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: lui s3, %hi(.LCPI3_0)
+; RV32I-NEXT: addi s3, s3, %lo(.LCPI3_0)
+; RV32I-NEXT: neg a0, s0
+; RV32I-NEXT: and a0, s0, a0
+; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __mulsi3
-; RV32I-NEXT: bnez s2, .LBB3_3
-; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: li a0, 32
-; RV32I-NEXT: beqz s0, .LBB3_4
-; RV32I-NEXT: .LBB3_2:
-; RV32I-NEXT: srli s1, s1, 27
-; RV32I-NEXT: add s1, s4, s1
-; RV32I-NEXT: lbu a0, 0(s1)
-; RV32I-NEXT: j .LBB3_5
-; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: bnez s4, .LBB3_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: add a0, s4, a0
+; RV32I-NEXT: add a0, s3, a0
; RV32I-NEXT: lbu a0, 0(a0)
-; RV32I-NEXT: bnez s0, .LBB3_2
-; RV32I-NEXT: .LBB3_4:
; RV32I-NEXT: addi a0, a0, 32
-; RV32I-NEXT: .LBB3_5:
+; RV32I-NEXT: j .LBB3_5
+; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: j .LBB3_6
+; RV32I-NEXT: .LBB3_4:
+; RV32I-NEXT: srli s1, s1, 27
+; RV32I-NEXT: add s1, s3, s1
+; RV32I-NEXT: lbu a0, 0(s1)
+; RV32I-NEXT: .LBB3_5: # %cond.false
; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: .LBB3_6: # %cond.end
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/SPARC/ctlz.ll b/llvm/test/CodeGen/SPARC/ctlz.ll
index 72505f221469e..75930190f5166 100644
--- a/llvm/test/CodeGen/SPARC/ctlz.ll
+++ b/llvm/test/CodeGen/SPARC/ctlz.ll
@@ -156,96 +156,54 @@ define i64 @i64_nopoison(i64 %x) nounwind {
; SPARC-LABEL: i64_nopoison:
; SPARC: ! %bb.0:
; SPARC-NEXT: save %sp, -96, %sp
+; SPARC-NEXT: or %i1, %i0, %i2
+; SPARC-NEXT: cmp %i2, 0
+; SPARC-NEXT: be .LBB2_4
+; SPARC-NEXT: nop
+; SPARC-NEXT: ! %bb.1: ! %cond.false
+; SPARC-NEXT: call __clzsi2
+; SPARC-NEXT: mov %i1, %o0
+; SPARC-NEXT: mov %o0, %i2
; SPARC-NEXT: call __clzsi2
; SPARC-NEXT: mov %i0, %o0
; SPARC-NEXT: cmp %i0, 0
-; SPARC-NEXT: bne .LBB2_2
-; SPARC-NEXT: nop
-; SPARC-NEXT: ! %bb.1:
-; SPARC-NEXT: srl %i1, 1, %i0
-; SPARC-NEXT: or %i1, %i0, %i0
-; SPARC-NEXT: srl %i0, 2, %i1
-; SPARC-NEXT: or %i0, %i1, %i0
-; SPARC-NEXT: srl %i0, 4, %i1
-; SPARC-NEXT: or %i0, %i1, %i0
-; SPARC-NEXT: srl %i0, 8, %i1
-; SPARC-NEXT: or %i0, %i1, %i0
-; SPARC-NEXT: srl %i0, 16, %i1
-; SPARC-NEXT: or %i0, %i1, %i0
-; SPARC-NEXT: xor %i0, -1, %i0
-; SPARC-NEXT: srl %i0, 1, %i1
-; SPARC-NEXT: sethi 1398101, %i2
-; SPARC-NEXT: or %i2, 341, %i2
-; SPARC-NEXT: and %i1, %i2, %i1
-; SPARC-NEXT: sub %i0, %i1, %i0
-; SPARC-NEXT: sethi 838860, %i1
-; SPARC-NEXT: or %i1, 819, %i1
-; SPARC-NEXT: and %i0, %i1, %i2
-; SPARC-NEXT: srl %i0, 2, %i0
-; SPARC-NEXT: and %i0, %i1, %i0
-; SPARC-NEXT: add %i2, %i0, %i0
-; SPARC-NEXT: srl %i0, 4, %i1
-; SPARC-NEXT: add %i0, %i1, %i0
-; SPARC-NEXT: sethi 246723, %i1
-; SPARC-NEXT: or %i1, 783, %i1
-; SPARC-NEXT: and %i0, %i1, %i0
-; SPARC-NEXT: sll %i0, 8, %i1
-; SPARC-NEXT: add %i0, %i1, %i0
-; SPARC-NEXT: sll %i0, 16, %i1
-; SPARC-NEXT: add %i0, %i1, %i0
-; SPARC-NEXT: srl %i0, 24, %i0
-; SPARC-NEXT: add %i0, 32, %o0
-; SPARC-NEXT: .LBB2_2:
+; SPARC-NEXT: bne .LBB2_3
+; SPARC-NEXT: mov %o0, %i1
+; SPARC-NEXT: ! %bb.2: ! %cond.false
+; SPARC-NEXT: add %i2, 32, %i1
+; SPARC-NEXT: .LBB2_3: ! %cond.false
+; SPARC-NEXT: ret
+; SPARC-NEXT: restore %g0, %g0, %o0
+; SPARC-NEXT: .LBB2_4:
; SPARC-NEXT: mov %g0, %i0
; SPARC-NEXT: ret
-; SPARC-NEXT: restore %g0, %o0, %o1
+; SPARC-NEXT: restore %g0, 64, %o1
;
; SPARC-POPC-LABEL: i64_nopoison:
; SPARC-POPC: ! %bb.0:
; SPARC-POPC-NEXT: save %sp, -96, %sp
+; SPARC-POPC-NEXT: or %i1, %i0, %i2
+; SPARC-POPC-NEXT: cmp %i2, 0
+; SPARC-POPC-NEXT: be .LBB2_4
+; SPARC-POPC-NEXT: nop
+; SPARC-POPC-NEXT: ! %bb.1: ! %cond.false
+; SPARC-POPC-NEXT: call __clzsi2
+; SPARC-POPC-NEXT: mov %i1, %o0
+; SPARC-POPC-NEXT: mov %o0, %i2
; SPARC-POPC-NEXT: call __clzsi2
; SPARC-POPC-NEXT: mov %i0, %o0
; SPARC-POPC-NEXT: cmp %i0, 0
-; SPARC-POPC-NEXT: bne .LBB2_2
-; SPARC-POPC-NEXT: nop
-; SPARC-POPC-NEXT: ! %bb.1:
-; SPARC-POPC-NEXT: srl %i1, 1, %i0
-; SPARC-POPC-NEXT: or %i1, %i0, %i0
-; SPARC-POPC-NEXT: srl %i0, 2, %i1
-; SPARC-POPC-NEXT: or %i0, %i1, %i0
-; SPARC-POPC-NEXT: srl %i0, 4, %i1
-; SPARC-POPC-NEXT: or %i0, %i1, %i0
-; SPARC-POPC-NEXT: srl %i0, 8, %i1
-; SPARC-POPC-NEXT: or %i0, %i1, %i0
-; SPARC-POPC-NEXT: srl %i0, 16, %i1
-; SPARC-POPC-NEXT: or %i0, %i1, %i0
-; SPARC-POPC-NEXT: xor %i0, -1, %i0
-; SPARC-POPC-NEXT: srl %i0, 1, %i1
-; SPARC-POPC-NEXT: sethi 1398101, %i2
-; SPARC-POPC-NEXT: or %i2, 341, %i2
-; SPARC-POPC-NEXT: and %i1, %i2, %i1
-; SPARC-POPC-NEXT: sub %i0, %i1, %i0
-; SPARC-POPC-NEXT: sethi 838860, %i1
-; SPARC-POPC-NEXT: or %i1, 819, %i1
-; SPARC-POPC-NEXT: and %i0, %i1, %i2
-; SPARC-POPC-NEXT: srl %i0, 2, %i0
-; SPARC-POPC-NEXT: and %i0, %i1, %i0
-; SPARC-POPC-NEXT: add %i2, %i0, %i0
-; SPARC-POPC-NEXT: srl %i0, 4, %i1
-; SPARC-POPC-NEXT: add %i0, %i1, %i0
-; SPARC-POPC-NEXT: sethi 246723, %i1
-; SPARC-POPC-NEXT: or %i1, 783, %i1
-; SPARC-POPC-NEXT: and %i0, %i1, %i0
-; SPARC-POPC-NEXT: sll %i0, 8, %i1
-; SPARC-POPC-NEXT: add %i0, %i1, %i0
-; SPARC-POPC-NEXT: sll %i0, 16, %i1
-; SPARC-POPC-NEXT: add %i0, %i1, %i0
-; SPARC-POPC-NEXT: srl %i0, 24, %i0
-; SPARC-POPC-NEXT: add %i0, 32, %o0
-; SPARC-POPC-NEXT: .LBB2_2:
+; SPARC-POPC-NEXT: bne .LBB2_3
+; SPARC-POPC-NEXT: mov %o0, %i1
+; SPARC-POPC-NEXT: ! %bb.2: ! %cond.false
+; SPARC-POPC-NEXT: add %i2, 32, %i1
+; SPARC-POPC-NEXT: .LBB2_3: ! %cond.false
+; SPARC-POPC-NEXT: ret
+; SPARC-POPC-NEXT: restore %g0, %g0, %o0
+; SPARC-POPC-NEXT: .LBB2_4:
; SPARC-POPC-NEXT: mov %g0, %i0
; SPARC-POPC-NEXT: ret
-; SPARC-POPC-NEXT: restore %g0, %o0, %o1
+; SPARC-POPC-NEXT: restore %g0, 64, %o1
;
; SPARC-VIS3-LABEL: i64_nopoison:
; SPARC-VIS3: ! %bb.0:
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index fecb62fbc5aea..752f6659948e6 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -285,30 +285,35 @@ define i32 @ctlo_i32_undef(i32 %x) {
ret i32 %tmp2
}
-define i64 @ctlo_i64(i64 %x) {
+define i64 @ctlo_i64(i64 %x) nounwind {
; X86-NOCMOV-LABEL: ctlo_i64:
; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: pushl %esi
; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NOCMOV-NEXT: notl %ecx
; X86-NOCMOV-NEXT: notl %eax
-; X86-NOCMOV-NEXT: bsrl %eax, %edx
-; X86-NOCMOV-NEXT: movl $63, %eax
-; X86-NOCMOV-NEXT: je .LBB6_2
-; X86-NOCMOV-NEXT: # %bb.1:
-; X86-NOCMOV-NEXT: movl %edx, %eax
-; X86-NOCMOV-NEXT: .LBB6_2:
+; X86-NOCMOV-NEXT: xorl %edx, %edx
+; X86-NOCMOV-NEXT: movl %eax, %esi
+; X86-NOCMOV-NEXT: orl %ecx, %esi
+; X86-NOCMOV-NEXT: je .LBB6_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
; X86-NOCMOV-NEXT: testl %ecx, %ecx
; X86-NOCMOV-NEXT: jne .LBB6_3
-; X86-NOCMOV-NEXT: # %bb.4:
+; X86-NOCMOV-NEXT: # %bb.4: # %cond.false
+; X86-NOCMOV-NEXT: bsrl %eax, %eax
; X86-NOCMOV-NEXT: xorl $31, %eax
-; X86-NOCMOV-NEXT: addl $32, %eax
-; X86-NOCMOV-NEXT: xorl %edx, %edx
+; X86-NOCMOV-NEXT: orl $32, %eax
+; X86-NOCMOV-NEXT: popl %esi
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB6_1:
+; X86-NOCMOV-NEXT: movl $64, %eax
+; X86-NOCMOV-NEXT: popl %esi
; X86-NOCMOV-NEXT: retl
; X86-NOCMOV-NEXT: .LBB6_3:
; X86-NOCMOV-NEXT: bsrl %ecx, %eax
; X86-NOCMOV-NEXT: xorl $31, %eax
-; X86-NOCMOV-NEXT: xorl %edx, %edx
+; X86-NOCMOV-NEXT: popl %esi
; X86-NOCMOV-NEXT: retl
;
; X86-CMOV-LABEL: ctlo_i64:
diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll
index 0eabfeae853f7..1267fe9033454 100644
--- a/llvm/test/CodeGen/X86/ctlz.ll
+++ b/llvm/test/CodeGen/X86/ctlz.ll
@@ -399,27 +399,33 @@ define i32 @ctlz_i32_zero_test(i32 %n) {
}
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
-define i64 @ctlz_i64_zero_test(i64 %n) {
+define i64 @ctlz_i64_zero_test(i64 %n) nounwind {
; X86-NOCMOV-LABEL: ctlz_i64_zero_test:
; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: pushl %esi
+; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NOCMOV-NEXT: bsrl {{[0-9]+}}(%esp), %edx
-; X86-NOCMOV-NEXT: movl $63, %eax
-; X86-NOCMOV-NEXT: je .LBB7_2
-; X86-NOCMOV-NEXT: # %bb.1:
-; X86-NOCMOV-NEXT: movl %edx, %eax
-; X86-NOCMOV-NEXT: .LBB7_2:
-; X86-NOCMOV-NEXT: testl %ecx, %ecx
+; X86-NOCMOV-NEXT: xorl %edx, %edx
+; X86-NOCMOV-NEXT: movl %ecx, %esi
+; X86-NOCMOV-NEXT: orl %eax, %esi
+; X86-NOCMOV-NEXT: je .LBB7_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
+; X86-NOCMOV-NEXT: testl %eax, %eax
; X86-NOCMOV-NEXT: jne .LBB7_3
-; X86-NOCMOV-NEXT: # %bb.4:
+; X86-NOCMOV-NEXT: # %bb.4: # %cond.false
+; X86-NOCMOV-NEXT: bsrl %ecx, %eax
; X86-NOCMOV-NEXT: xorl $31, %eax
-; X86-NOCMOV-NEXT: addl $32, %eax
-; X86-NOCMOV-NEXT: xorl %edx, %edx
+; X86-NOCMOV-NEXT: orl $32, %eax
+; X86-NOCMOV-NEXT: popl %esi
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB7_1:
+; X86-NOCMOV-NEXT: movl $64, %eax
+; X86-NOCMOV-NEXT: popl %esi
; X86-NOCMOV-NEXT: retl
; X86-NOCMOV-NEXT: .LBB7_3:
-; X86-NOCMOV-NEXT: bsrl %ecx, %eax
+; X86-NOCMOV-NEXT: bsrl %eax, %eax
; X86-NOCMOV-NEXT: xorl $31, %eax
-; X86-NOCMOV-NEXT: xorl %edx, %edx
+; X86-NOCMOV-NEXT: popl %esi
; X86-NOCMOV-NEXT: retl
;
; X86-CMOV-LABEL: ctlz_i64_zero_test:
diff --git a/llvm/test/CodeGen/X86/cttz.ll b/llvm/test/CodeGen/X86/cttz.ll
index db949827af007..4afa337e93686 100644
--- a/llvm/test/CodeGen/X86/cttz.ll
+++ b/llvm/test/CodeGen/X86/cttz.ll
@@ -352,37 +352,40 @@ define i32 @cttz_i32_zero_test(i32 %n) {
}
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
-define i64 @cttz_i64_zero_test(i64 %n) {
+define i64 @cttz_i64_zero_test(i64 %n) nounwind {
; X86-NOCMOV-LABEL: cttz_i64_zero_test:
; X86-NOCMOV: # %bb.0:
+; X86-NOCMOV-NEXT: pushl %esi
+; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NOCMOV-NOT: rep
-; X86-NOCMOV-NEXT: bsfl {{[0-9]+}}(%esp), %edx
-; X86-NOCMOV-NEXT: movl $32, %eax
-; X86-NOCMOV-NEXT: je .LBB7_2
-; X86-NOCMOV-NEXT: # %bb.1:
-; X86-NOCMOV-NEXT: movl %edx, %eax
-; X86-NOCMOV-NEXT: .LBB7_2:
+; X86-NOCMOV-NEXT: xorl %edx, %edx
+; X86-NOCMOV-NEXT: movl %ecx, %esi
+; X86-NOCMOV-NEXT: orl %eax, %esi
+; X86-NOCMOV-NEXT: je .LBB7_1
+; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
; X86-NOCMOV-NEXT: testl %ecx, %ecx
; X86-NOCMOV-NEXT: jne .LBB7_3
-; X86-NOCMOV-NEXT: # %bb.4:
+; X86-NOCMOV-NEXT: # %bb.4: # %cond.false
+; X86-NOCMOV-NEXT: rep bsfl %eax, %eax
; X86-NOCMOV-NEXT: addl $32, %eax
-; X86-NOCMOV-NEXT: xorl %edx, %edx
+; X86-NOCMOV-NEXT: popl %esi
+; X86-NOCMOV-NEXT: retl
+; X86-NOCMOV-NEXT: .LBB7_1:
+; X86-NOCMOV-NEXT: movl $64, %eax
+; X86-NOCMOV-NEXT: popl %esi
; X86-NOCMOV-NEXT: retl
; X86-NOCMOV-NEXT: .LBB7_3:
; X86-NOCMOV-NEXT: rep bsfl %ecx, %eax
-; X86-NOCMOV-NEXT: xorl %edx, %edx
+; X86-NOCMOV-NEXT: popl %esi
; X86-NOCMOV-NEXT: retl
;
; X86-CMOV-LABEL: cttz_i64_zero_test:
; X86-CMOV: # %bb.0:
; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-CMOV-NOT: rep
; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %ecx
; X86-CMOV-NEXT: movl $32, %edx
; X86-CMOV-NEXT: cmovnel %ecx, %edx
; X86-CMOV-NEXT: addl $32, %edx
-; X86-CMOV-NOT: rep
; X86-CMOV-NEXT: bsfl %eax, %eax
; X86-CMOV-NEXT: cmovel %edx, %eax
; X86-CMOV-NEXT: xorl %edx, %edx
@@ -589,13 +592,11 @@ define i64 @cttz_i64_zero_test_knownneverzero(i64 %n) {
define i32 @cttz_i32_osize(i32 %x) optsize {
; X86-LABEL: cttz_i32_osize:
; X86: # %bb.0:
-; X86-NOT: rep
; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-LABEL: cttz_i32_osize:
; X64: # %bb.0:
-; X64-NOT: rep
; X64-NEXT: bsfl %edi, %eax
; X64-NEXT: retq
;
@@ -625,13 +626,11 @@ define i32 @cttz_i32_osize(i32 %x) optsize {
define i32 @cttz_i32_msize(i32 %x) minsize {
; X86-LABEL: cttz_i32_msize:
; X86: # %bb.0:
-; X86-NOT: rep
; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-LABEL: cttz_i32_msize:
; X64: # %bb.0:
-; X64-NOT: rep
; X64-NEXT: bsfl %edi, %eax
; X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll
index 4f65739cc70dd..e5e75790a01c2 100644
--- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll
+++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll
@@ -5,12 +5,44 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+lzcnt | FileCheck %s --check-prefixes=X64,X64-LZCNT
define i1 @lshr_ctlz_cmpeq_one_i64(i64 %in) nounwind {
-; X86-LABEL: lshr_ctlz_cmpeq_one_i64:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-BSR-LABEL: lshr_ctlz_cmpeq_one_i64:
+; X86-BSR: # %bb.0:
+; X86-BSR-NEXT: pushl %esi
+; X86-BSR-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BSR-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BSR-NEXT: xorl %eax, %eax
+; X86-BSR-NEXT: movl %edx, %esi
+; X86-BSR-NEXT: orl %ecx, %esi
+; X86-BSR-NEXT: je .LBB0_1
+; X86-BSR-NEXT: # %bb.2: # %cond.false
+; X86-BSR-NEXT: testl %ecx, %ecx
+; X86-BSR-NEXT: jne .LBB0_3
+; X86-BSR-NEXT: # %bb.4: # %cond.false
+; X86-BSR-NEXT: bsrl %edx, %ecx
+; X86-BSR-NEXT: xorl $31, %ecx
+; X86-BSR-NEXT: orl $32, %ecx
+; X86-BSR-NEXT: jmp .LBB0_5
+; X86-BSR-NEXT: .LBB0_1:
+; X86-BSR-NEXT: movl $64, %ecx
+; X86-BSR-NEXT: jmp .LBB0_5
+; X86-BSR-NEXT: .LBB0_3:
+; X86-BSR-NEXT: bsrl %ecx, %ecx
+; X86-BSR-NEXT: xorl $31, %ecx
+; X86-BSR-NEXT: .LBB0_5: # %cond.end
+; X86-BSR-NEXT: shrdl $6, %eax, %ecx
+; X86-BSR-NEXT: shrl $6, %eax
+; X86-BSR-NEXT: xorl $1, %ecx
+; X86-BSR-NEXT: orl %eax, %ecx
+; X86-BSR-NEXT: sete %al
+; X86-BSR-NEXT: popl %esi
+; X86-BSR-NEXT: retl
+;
+; X86-LZCNT-LABEL: lshr_ctlz_cmpeq_one_i64:
+; X86-LZCNT: # %bb.0:
+; X86-LZCNT-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-LZCNT-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-LZCNT-NEXT: sete %al
+; X86-LZCNT-NEXT: retl
;
; X64-LABEL: lshr_ctlz_cmpeq_one_i64:
; X64: # %bb.0:
@@ -58,12 +90,42 @@ define i1 @lshr_ctlz_undef_cmpeq_one_i64(i64 %in) nounwind {
}
define i1 @lshr_ctlz_cmpne_zero_i64(i64 %in) nounwind {
-; X86-LABEL: lshr_ctlz_cmpne_zero_i64:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-BSR-LABEL: lshr_ctlz_cmpne_zero_i64:
+; X86-BSR: # %bb.0:
+; X86-BSR-NEXT: pushl %esi
+; X86-BSR-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BSR-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BSR-NEXT: xorl %eax, %eax
+; X86-BSR-NEXT: movl %edx, %esi
+; X86-BSR-NEXT: orl %ecx, %esi
+; X86-BSR-NEXT: je .LBB2_1
+; X86-BSR-NEXT: # %bb.2: # %cond.false
+; X86-BSR-NEXT: testl %ecx, %ecx
+; X86-BSR-NEXT: jne .LBB2_3
+; X86-BSR-NEXT: # %bb.4: # %cond.false
+; X86-BSR-NEXT: bsrl %edx, %ecx
+; X86-BSR-NEXT: xorl $31, %ecx
+; X86-BSR-NEXT: orl $32, %ecx
+; X86-BSR-NEXT: jmp .LBB2_5
+; X86-BSR-NEXT: .LBB2_1:
+; X86-BSR-NEXT: movl $64, %ecx
+; X86-BSR-NEXT: jmp .LBB2_5
+; X86-BSR-NEXT: .LBB2_3:
+; X86-BSR-NEXT: bsrl %ecx, %ecx
+; X86-BSR-NEXT: xorl $31, %ecx
+; X86-BSR-NEXT: .LBB2_5: # %cond.end
+; X86-BSR-NEXT: shrl $6, %ecx
+; X86-BSR-NEXT: orl %eax, %ecx
+; X86-BSR-NEXT: setne %al
+; X86-BSR-NEXT: popl %esi
+; X86-BSR-NEXT: retl
+;
+; X86-LZCNT-LABEL: lshr_ctlz_cmpne_zero_i64:
+; X86-LZCNT: # %bb.0:
+; X86-LZCNT-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-LZCNT-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-LZCNT-NEXT: sete %al
+; X86-LZCNT-NEXT: retl
;
; X64-LABEL: lshr_ctlz_cmpne_zero_i64:
; X64: # %bb.0:
More information about the llvm-commits
mailing list