[llvm] [X86] Remove redundant TEST after shifts when count is non-zero (PR #169069)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 21 09:27:18 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: None (GrumpyPigSkin)
<details>
<summary>Changes</summary>
Off the back of #<!-- -->168867, this patch implements an optimization for x86 shift instructions (shl, shr, sar) when the shift amount is provably non-zero.
On x86, shift instructions update EFLAGS (ZF, SF, PF, etc.) based on the result, unless the shift count is 0, in which case flags are preserved. Consequently, the backend conservatively emits a test instruction after variable shifts to ensure flags are correct.
If the shift count is guaranteed to be non-zero, the test instruction is redundant.
Example cases where the shift amount is provably non-zero but a TEST instruction is still emitted:
```cpp
char foo(unsigned x, int y) {
__builtin_assume(y > 0);
return 0 == (x << y);
}
char bar(unsigned x, int y) {
return (x << (y | 1)) == 0;
}
```
The distilled results using llvm-mca for foo:
| Metric | Baseline (With TEST) | Optimized (No TEST) | Improvement |
| :--- | :--- | :--- | :--- |
| **Instructions** | 500 | 400 | **-20%** |
| **Total uOps** | 800 | 700 | **-12.5%** |
| **Scheduler Stalls** | 72 cycles | 52 cycles | **-27.7%** |
| **Flag Latency** | 4 cycles | 2 cycles | **-50%** |
Closes #<!-- -->168867
---
Full diff: https://github.com/llvm/llvm-project/pull/169069.diff
8 Files Affected:
- (modified) llvm/include/llvm/CodeGen/SelectionDAGNodes.h (+3)
- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+25-1)
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (+20)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+51)
- (modified) llvm/lib/Target/X86/X86ISelLowering.h (+3)
- (modified) llvm/lib/Target/X86/X86InstrShiftRotate.td (+49)
- (modified) llvm/test/CodeGen/X86/apx/shift-eflags.ll (+17-6)
- (modified) llvm/test/CodeGen/X86/shift-eflags.ll (+17-6)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index cfc8a4243e894..082e7b4371d27 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -427,6 +427,7 @@ struct SDNodeFlags {
// implied by its producer as, e.g, operations between producer and PTRADD
// that affect the provenance may have been optimized away.
InBounds = 1 << 15,
+ NoZero = 1 << 16,
// NOTE: Please update LargestValue in LLVM_DECLARE_ENUM_AS_BITMASK below
// the class definition when adding new flags.
@@ -468,6 +469,7 @@ struct SDNodeFlags {
void setNoFPExcept(bool b) { setFlag<NoFPExcept>(b); }
void setUnpredictable(bool b) { setFlag<Unpredictable>(b); }
void setInBounds(bool b) { setFlag<InBounds>(b); }
+ void setNoZero(bool b) { setFlag<NoZero>(b); }
// These are accessors for each flag.
bool hasNoUnsignedWrap() const { return Flags & NoUnsignedWrap; }
@@ -486,6 +488,7 @@ struct SDNodeFlags {
bool hasNoFPExcept() const { return Flags & NoFPExcept; }
bool hasUnpredictable() const { return Flags & Unpredictable; }
bool hasInBounds() const { return Flags & InBounds; }
+ bool hasNoZero() const { return Flags & NoZero; }
bool operator==(const SDNodeFlags &Other) const {
return Flags == Other.Flags;
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 587c1372b19cb..b4b4879c515c5 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -786,8 +786,32 @@ bool CodeGenPrepare::eliminateAssumptions(Function &F) {
if (auto *Assume = dyn_cast<AssumeInst>(I)) {
MadeChange = true;
Value *Operand = Assume->getOperand(0);
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(Operand)) {
+ // Check if we are comparing an Argument against a Constant
+ if (Argument *Arg = dyn_cast<Argument>(Cmp->getOperand(0))) {
+ if (ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1))) {
+ // We found "assume(Arg <pred> Constant)"
+ ConstantRange Constraint = ConstantRange::makeAllowedICmpRegion(
+ Cmp->getPredicate(), C->getValue());
+ // If the argument already has a range attribute, intersect with
+ // it
+ ConstantRange FinalRange = Constraint;
+ Attribute ExistingRangeAttr = Arg->getAttribute(Attribute::Range);
+ if (ExistingRangeAttr.isValid()) {
+ FinalRange =
+ FinalRange.intersectWith(ExistingRangeAttr.getRange());
+ }
+ // If the range provides new information (and isn't empty), save
+ // it.
+ if (!FinalRange.isFullSet() && !FinalRange.isEmptySet()) {
+ AttrBuilder AB(Arg->getContext());
+ AB.addRangeAttr(FinalRange);
+ Arg->addAttrs(AB);
+ }
+ }
+ }
+ }
Assume->eraseFromParent();
-
resetIteratorIfInvalidatedWhileCalling(&BB, [&]() {
RecursivelyDeleteTriviallyDeadInstructions(Operand, TLInfo, nullptr);
});
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 985a54ca83256..51d3fc0d78975 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3687,6 +3687,7 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
bool nuw = false;
bool nsw = false;
bool exact = false;
+ bool noZero = false;
if (Opcode == ISD::SRL || Opcode == ISD::SRA || Opcode == ISD::SHL) {
@@ -3698,11 +3699,30 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
if (const PossiblyExactOperator *ExactOp =
dyn_cast<const PossiblyExactOperator>(&I))
exact = ExactOp->isExact();
+
+ const Value *ShiftAmt = I.getOperand(1);
+
+ // Look through zext as computeConstantRange does not do this.
+ const Value *InnerVal = ShiftAmt;
+ if (auto *ZExt = dyn_cast<ZExtInst>(ShiftAmt)) {
+ InnerVal = ZExt->getOperand(0);
+ }
+
+ // Get the constant range and check it excludes 0
+ ConstantRange CR = llvm::computeConstantRange(
+ InnerVal, true, true, AC, dyn_cast<Instruction>(&I), nullptr);
+
+ if (!CR.isEmptySet() && !CR.contains(APInt(CR.getBitWidth(), 0))) {
+ // We can guarantee that we will not be shifted by 0
+ noZero = true;
+ }
}
+
SDNodeFlags Flags;
Flags.setExact(exact);
Flags.setNoSignedWrap(nsw);
Flags.setNoUnsignedWrap(nuw);
+ Flags.setNoZero(noZero);
SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2,
Flags);
setValue(&I, Res);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dc84025c166a3..87e637ee4c272 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23625,6 +23625,54 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
Op->getOperand(1)).getValue(1);
}
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA: {
+ SDValue Amt = ArithOp.getOperand(1);
+
+ // Skip Constants
+ if (isa<ConstantSDNode>(Amt))
+ break;
+
+ // Check we can make this optimization
+ if (ArithOp->getFlags().hasNoZero() || DAG.isKnownNeverZero(Amt)) {
+ SDLoc DL(ArithOp);
+
+ // CopyToReg(CL, Amt)
+ SDValue Chain = DAG.getEntryNode();
+ SDValue Glue;
+
+ Chain = DAG.getCopyToReg(Chain, DL, X86::CL, Amt, Glue);
+ Glue = Chain.getValue(1);
+
+ // Select Opcode
+ unsigned X86Opcode;
+ switch (ArithOp.getOpcode()) {
+ case ISD::SHL:
+ X86Opcode = X86ISD::SHL;
+ break;
+ case ISD::SRL:
+ X86Opcode = X86ISD::SRL;
+ break;
+ case ISD::SRA:
+ X86Opcode = X86ISD::SRA;
+ break;
+ default:
+ llvm_unreachable("Unexpected shift opcode");
+ }
+
+ // Create Node [ValueToShift, Glue]
+ SDVTList VTs = DAG.getVTList(ArithOp.getValueType(), MVT::i32);
+ SDValue Ops[] = {ArithOp.getOperand(0), Glue};
+
+ SDValue NewNode = DAG.getNode(X86Opcode, DL, VTs, Ops);
+
+ // Replace and Return
+ DAG.ReplaceAllUsesOfValueWith(ArithOp, NewNode.getValue(0));
+ return NewNode.getValue(1);
+ }
+ break;
+ }
default:
break;
}
@@ -35467,6 +35515,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVTTP2UIS)
NODE_NAME_CASE(MCVTTP2UIS)
NODE_NAME_CASE(POP_FROM_X87_REG)
+ NODE_NAME_CASE(SHL)
+ NODE_NAME_CASE(SRL)
+ NODE_NAME_CASE(SRA)
}
return nullptr;
#undef NODE_NAME_CASE
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..00830804213f7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -996,6 +996,9 @@ namespace llvm {
CLOAD,
CSTORE,
LAST_MEMORY_OPCODE = CSTORE,
+ SHL,
+ SRL,
+ SRA,
};
} // end namespace X86ISD
diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td
index 2a5488847e648..e81bc33d3a459 100644
--- a/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -689,3 +689,52 @@ let Predicates = [HasBMI2, HasEGPR] in {
defm SHRX : ShiftX_Pats<srl, "_EVEX">;
defm SHLX : ShiftX_Pats<shl, "_EVEX">;
}
+
+def SDTX86ShiftWithFlags : SDTypeProfile<2, 1, [
+ SDTCisSameAs<0, 2>,
+ SDTCisVT<1, i32>,
+]>;
+
+def X86shl : SDNode<"X86ISD::SHL", SDTX86ShiftWithFlags, [SDNPInGlue]>;
+def X86srl : SDNode<"X86ISD::SRL", SDTX86ShiftWithFlags, [SDNPInGlue]>;
+def X86sra : SDNode<"X86ISD::SRA", SDTX86ShiftWithFlags, [SDNPInGlue]>;
+
+let Predicates = [NoNDD] in {
+ // SHL
+ def : Pat<(X86shl GR8:$src1), (SHL8rCL GR8:$src1)>;
+ def : Pat<(X86shl GR16:$src1), (SHL16rCL GR16:$src1)>;
+ def : Pat<(X86shl GR32:$src1), (SHL32rCL GR32:$src1)>;
+ def : Pat<(X86shl GR64:$src1), (SHL64rCL GR64:$src1)>;
+
+ // SHR
+ def : Pat<(X86srl GR8:$src1), (SHR8rCL GR8:$src1)>;
+ def : Pat<(X86srl GR16:$src1), (SHR16rCL GR16:$src1)>;
+ def : Pat<(X86srl GR32:$src1), (SHR32rCL GR32:$src1)>;
+ def : Pat<(X86srl GR64:$src1), (SHR64rCL GR64:$src1)>;
+
+ // SAR
+ def : Pat<(X86sra GR8:$src1), (SAR8rCL GR8:$src1)>;
+ def : Pat<(X86sra GR16:$src1), (SAR16rCL GR16:$src1)>;
+ def : Pat<(X86sra GR32:$src1), (SAR32rCL GR32:$src1)>;
+ def : Pat<(X86sra GR64:$src1), (SAR64rCL GR64:$src1)>;
+}
+
+let Predicates = [HasNDD, In64BitMode] in {
+ // SHL
+ def : Pat<(X86shl GR8:$src1), (SHL8rCL_ND GR8:$src1)>;
+ def : Pat<(X86shl GR16:$src1), (SHL16rCL_ND GR16:$src1)>;
+ def : Pat<(X86shl GR32:$src1), (SHL32rCL_ND GR32:$src1)>;
+ def : Pat<(X86shl GR64:$src1), (SHL64rCL_ND GR64:$src1)>;
+
+ // SHR
+ def : Pat<(X86srl GR8:$src1), (SHR8rCL_ND GR8:$src1)>;
+ def : Pat<(X86srl GR16:$src1), (SHR16rCL_ND GR16:$src1)>;
+ def : Pat<(X86srl GR32:$src1), (SHR32rCL_ND GR32:$src1)>;
+ def : Pat<(X86srl GR64:$src1), (SHR64rCL_ND GR64:$src1)>;
+
+ // SAR
+ def : Pat<(X86sra GR8:$src1), (SAR8rCL_ND GR8:$src1)>;
+ def : Pat<(X86sra GR16:$src1), (SAR16rCL_ND GR16:$src1)>;
+ def : Pat<(X86sra GR32:$src1), (SAR32rCL_ND GR32:$src1)>;
+ def : Pat<(X86sra GR64:$src1), (SAR64rCL_ND GR64:$src1)>;
+}
diff --git a/llvm/test/CodeGen/X86/apx/shift-eflags.ll b/llvm/test/CodeGen/X86/apx/shift-eflags.ll
index 2659f8031ef77..01f4689369eac 100644
--- a/llvm/test/CodeGen/X86/apx/shift-eflags.ll
+++ b/llvm/test/CodeGen/X86/apx/shift-eflags.ll
@@ -265,7 +265,6 @@ define i32 @ashr_var_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: orb $1, %sil, %cl
; CHECK-NEXT: sarl %cl, %edi
-; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovel %edx, %eax
; CHECK-NEXT: retq
%a = or i32 %a1, 1
@@ -282,7 +281,6 @@ define i32 @lshr_var_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: orb $1, %sil, %cl
; CHECK-NEXT: shrl %cl, %edi
-; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovel %edx, %eax
; CHECK-NEXT: retq
%a = or i32 %a1, 1
@@ -299,7 +297,6 @@ define i32 @shl_var_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: orb $1, %sil, %cl
; CHECK-NEXT: shll %cl, %edi
-; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovel %edx, %eax
; CHECK-NEXT: retq
%a = or i32 %a1, 1
@@ -315,7 +312,6 @@ define i32 @ashr_var_self_select_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %
; CHECK: # %bb.0:
; CHECK-NEXT: orb $1, %sil, %cl
; CHECK-NEXT: shrl %cl, %edi, %eax
-; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%a = or i32 %a1, 1
@@ -331,7 +327,6 @@ define i32 @lshr_var_self_select_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %
; CHECK: # %bb.0:
; CHECK-NEXT: orb $1, %sil, %cl
; CHECK-NEXT: shrl %cl, %edi, %eax
-; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%a = or i32 %a1, 1
@@ -347,7 +342,6 @@ define i32 @shl_var_self_select_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a
; CHECK: # %bb.0:
; CHECK-NEXT: orb $1, %sil, %cl
; CHECK-NEXT: shrl %cl, %edi, %eax
-; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%a = or i32 %a1, 1
@@ -356,3 +350,20 @@ define i32 @shl_var_self_select_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a
%r = select i1 %c, i32 %s, i32 %a2
ret i32 %r
}
+
+define range(i8 0, 2) i8 @no_test_emitted_when_assume_shift_amt_gt_zero(i64 noundef %0, i32 noundef %1) {
+; CHECK-LABEL: no_test_emitted_when_assume_shift_amt_gt_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: shlq %cl, %rdi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %3 = icmp sgt i32 %1, 0
+ tail call void @llvm.assume(i1 %3)
+ %4 = zext nneg i32 %1 to i64
+ %5 = shl i64 %0, %4
+ %6 = icmp eq i64 %5, 0
+ %7 = zext i1 %6 to i8
+ ret i8 %7
+}
diff --git a/llvm/test/CodeGen/X86/shift-eflags.ll b/llvm/test/CodeGen/X86/shift-eflags.ll
index 6eddf50ce5c9d..badcf5470ab64 100644
--- a/llvm/test/CodeGen/X86/shift-eflags.ll
+++ b/llvm/test/CodeGen/X86/shift-eflags.ll
@@ -282,7 +282,6 @@ define i32 @ashr_var_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; CHECK-NEXT: orb $1, %cl
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NEXT: sarl %cl, %edi
-; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovel %edx, %eax
; CHECK-NEXT: retq
%a = or i32 %a1, 1
@@ -301,7 +300,6 @@ define i32 @lshr_var_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; CHECK-NEXT: orb $1, %cl
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NEXT: shrl %cl, %edi
-; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovel %edx, %eax
; CHECK-NEXT: retq
%a = or i32 %a1, 1
@@ -320,7 +318,6 @@ define i32 @shl_var_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; CHECK-NEXT: orb $1, %cl
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NEXT: shll %cl, %edi
-; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovel %edx, %eax
; CHECK-NEXT: retq
%a = or i32 %a1, 1
@@ -339,7 +336,6 @@ define i32 @ashr_var_self_select_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %
; CHECK-NEXT: orb $1, %cl
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NEXT: shrl %cl, %eax
-; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%a = or i32 %a1, 1
@@ -358,7 +354,6 @@ define i32 @lshr_var_self_select_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %
; CHECK-NEXT: orb $1, %cl
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NEXT: shrl %cl, %eax
-; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%a = or i32 %a1, 1
@@ -377,7 +372,6 @@ define i32 @shl_var_self_select_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a
; CHECK-NEXT: orb $1, %cl
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NEXT: shrl %cl, %eax
-; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%a = or i32 %a1, 1
@@ -386,3 +380,20 @@ define i32 @shl_var_self_select_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a
%r = select i1 %c, i32 %s, i32 %a2
ret i32 %r
}
+
+define range(i8 0, 2) i8 @no_test_emitted_when_assume_shift_amt_gt_zero(i64 noundef %0, i32 noundef %1) {
+; CHECK-LABEL: no_test_emitted_when_assume_shift_amt_gt_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: shlq %cl, %rdi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %3 = icmp sgt i32 %1, 0
+ tail call void @llvm.assume(i1 %3)
+ %4 = zext nneg i32 %1 to i64
+ %5 = shl i64 %0, %4
+ %6 = icmp eq i64 %5, 0
+ %7 = zext i1 %6 to i8
+ ret i8 %7
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/169069
More information about the llvm-commits
mailing list