[llvm] a209ff8 - [InstCombine] Limit canonicalization of extractelement(cast) to constant index or same basic block (#166227)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 8 13:43:17 PST 2026
Author: azwolski
Date: 2026-01-08T22:43:13+01:00
New Revision: a209ff855bfaafe7930dd35a1731220e65aebaf9
URL: https://github.com/llvm/llvm-project/commit/a209ff855bfaafe7930dd35a1731220e65aebaf9
DIFF: https://github.com/llvm/llvm-project/commit/a209ff855bfaafe7930dd35a1731220e65aebaf9.diff
LOG: [InstCombine] Limit canonicalization of extractelement(cast) to constant index or same basic block (#166227)
The current canonicalization of extractelement(cast) requires that the
CastInst has only one use. However, when that use occurs inside a loop,
it still satisfies this condition, even though the cast is effectively
used multiple times, once per iteration, rather than truly being used
once.
```cpp
} else if (auto *CI = dyn_cast<CastInst>(I)) {
// Canonicalize extractelement(cast) -> cast(extractelement).
// Bitcasts can change the number of vector elements, and they cost
// nothing.
if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)){
```
Before
```llvm
%34 = fptosi <4 x float> %33 to <4 x i32>
;/loop{
%40 = extractelement <4 x i32> %34, i32 %36
```
After
```llvm
;/loop{
%37 = extractelement <4 x float> %30, i32 %32
%38 = fptosi float %37 to i32
```
After canonicalization, this particular example no longer casts the entire vector at once with a single instruction; instead, the cast is performed separately for each extracted element inside the loop, which is slower.
Ideally, we would like to check that the cast instruction **has one use and that this use is not inside a loop**. However, InstCombine/InstCombineVectorOps.cpp does not have access to utilities like `LoopInfo` to check that. It might be possible to approximate this by analyzing basic block successors or by building a dominator tree, but that may be a costly solution.
A cheaper way to avoid the unprofitable cases is to apply the canonicalization only if the index is a constant (immediate) value or the use is in the same basic block as the cast instruction:
```cpp
if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
Instruction *U = cast<Instruction>(*CI->user_begin());
if (U->getParent() == CI->getParent() || isa<ConstantInt>(Index)){
```
Fixes: https://github.com/llvm/llvm-project/issues/165793
Added:
Modified:
llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 4dcd9751ad0f5..3b034f6c37f66 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -594,7 +594,15 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
// Canonicalize extractelement(cast) -> cast(extractelement).
// Bitcasts can change the number of vector elements, and they cost
// nothing.
- if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
+ // If the CI has only one use, but that use is inside a loop, this
+ // canonicalization is not profitable because it would turn a vector
+ // operation into scalar operations inside the loop. Apply the transform
+ // when:
+ // - the index is constant and CI has one use, or
+ // - the CI and EI are in the same basic block, so the cast won't be sunk
+ // into a loop.
+ if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast) &&
+ (EI.getParent() == CI->getParent() || isa<ConstantInt>(Index))) {
Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
return CastInst::Create(CI->getOpcode(), EE, EI.getType());
}
diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
index 205b4b88c473a..d99f7c88d83d5 100644
--- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
+++ b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
@@ -40,19 +40,50 @@ define i32 @test_bitcast(i32 %i) {
declare void @use(i32)
+define void @test_branch(<4 x float> %in, i32 %a, i1 %cond) {
+; CHECK-LABEL: define void @test_branch(
+; CHECK-SAME: <4 x float> [[IN:%.*]], i32 [[A:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[I:%.*]] = add i32 [[A]], -2
+; CHECK-NEXT: br i1 [[COND]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; CHECK: [[TRUE]]:
+; CHECK-NEXT: call void @use(i32 [[I]])
+; CHECK-NEXT: br label %[[DONE:.*]]
+; CHECK: [[FALSE]]:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[IN]], i32 [[I]]
+; CHECK-NEXT: [[ELEM:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT: call void @use(i32 [[ELEM]])
+; CHECK-NEXT: br label %[[DONE]]
+; CHECK: [[DONE]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %vi = fptosi <4 x float> %in to <4 x i32>
+ %i = add i32 %a, -2
+ br i1 %cond, label %true, label %false
+true:
+ call void @use(i32 %i)
+ br label %done
+false:
+ %elem = extractelement <4 x i32> %vi, i32 %i
+ call void @use(i32 %elem)
+ br label %done
+done:
+ ret void
+}
+
define void @test_loop(<4 x float> %in) {
; CHECK-LABEL: define void @test_loop(
; CHECK-SAME: <4 x float> [[IN:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[R:%.*]] = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> [[IN]], i32 9)
+; CHECK-NEXT: [[VI:%.*]] = fptosi <4 x float> [[IN]] to <4 x i32>
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LATCH:.*]] ]
; CHECK-NEXT: [[COND:%.*]] = icmp samesign ult i32 [[I]], 4
; CHECK-NEXT: br i1 [[COND]], label %[[BODY:.*]], label %[[DONE:.*]]
; CHECK: [[BODY]]:
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[R]], i32 [[I]]
-; CHECK-NEXT: [[ELEM:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT: [[ELEM:%.*]] = extractelement <4 x i32> [[VI]], i32 [[I]]
; CHECK-NEXT: call void @use(i32 [[ELEM]])
; CHECK-NEXT: br label %[[LATCH]]
; CHECK: [[LATCH]]:
@@ -62,8 +93,7 @@ define void @test_loop(<4 x float> %in) {
; CHECK-NEXT: ret void
;
entry:
- %r = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %in, i32 9)
- %vi = fptosi <4 x float> %r to <4 x i32>
+ %vi = fptosi <4 x float> %in to <4 x i32>
br label %loop
loop:
%i = phi i32 [ 0, %entry ], [ %next, %latch ]
More information about the llvm-commits
mailing list