[llvm] [InstCombine] Limit canonicalization of extractelement(cast) to constant index or same basic block. (PR #166227)

via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 3 12:38:28 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: None (azwolski)

<details>
<summary>Changes</summary>

The current canonicalization of extractelement(cast) requires that the CastInst has only one use. However, when that use occurs inside a loop, it still satisfies this condition, even though the cast is effectively used multiple times, once per iteration, rather than truly being used once.

```cpp
} else if (auto *CI = dyn_cast<CastInst>(I)) {
  // Canonicalize extractelement(cast) -> cast(extractelement).
  // Bitcasts can change the number of vector elements, and they cost
  // nothing.
  if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)){
 ```
Before
```llvm
%34 = fptosi <4 x float> %33 to <4 x i32>
;/loop{
%40 = extractelement <4 x i32> %34, i32 %36
```
 
  After
 ```llvm
 ;/loop{
 %37 = extractelement <4 x float> %30, i32 %32
 %38 = fptosi float %37 to i32
  ```

After canonicalization, for this particular example, it no longer uses a single instruction to cast the entire vector at once, but instead performs the cast for every element separately, which is less performant.

Ideally, we would like to check if the cast instruction **has one use and that this use is not called inside a loop**. However, InstCombine/InstCombineVectorOps.cpp does not provide utilities like `LoopInfo` to check that. It might be possible to approximate this by analyzing basic block successors or by building a dominance tree, but that may be a costly solution.

A solution to prevent this optimization could be to check if the index is an immediate value and if the use is inside the same basic block as the cast instruction:
```cpp
if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
    Instruction *U = cast<Instruction>(*CI->user_begin());
    if (U->getParent() == CI->getParent() || isa<ConstantInt>(Index)){
```

Fix: https://github.com/llvm/llvm-project/issues/165793

---
Full diff: https://github.com/llvm/llvm-project/pull/166227.diff


2 Files Affected:

- (modified) llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp (+5-2) 
- (modified) llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll (+35-5) 


``````````diff
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 18a45c6799bac..44c3863dd97b5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -589,8 +589,11 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
       // Bitcasts can change the number of vector elements, and they cost
       // nothing.
       if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
-        Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
-        return CastInst::Create(CI->getOpcode(), EE, EI.getType());
+        Instruction *U = cast<Instruction>(*CI->user_begin());
+        if (U->getParent() == CI->getParent() || isa<ConstantInt>(Index)) {
+          Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
+          return CastInst::Create(CI->getOpcode(), EE, EI.getType());
+        }
       }
     }
   }
diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
index 205b4b88c473a..f96b7070f9f2a 100644
--- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
+++ b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
@@ -40,19 +40,50 @@ define i32 @test_bitcast(i32 %i) {
 
 declare void @use(i32)
 
+define void @test_poison_branch(<4 x float> %in, i32 %a, i1 %cond) {
+; CHECK-LABEL: define void @test_poison_branch(
+; CHECK-SAME: <4 x float> [[IN:%.*]], i32 [[A:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[I:%.*]] = add i32 [[A]], -2
+; CHECK-NEXT:    br i1 [[COND]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; CHECK:       [[TRUE]]:
+; CHECK-NEXT:    call void @use(i32 [[I]])
+; CHECK-NEXT:    br label %[[DONE:.*]]
+; CHECK:       [[FALSE]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[IN]], i32 [[I]]
+; CHECK-NEXT:    [[ELEM:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT:    call void @use(i32 [[ELEM]])
+; CHECK-NEXT:    br label %[[DONE]]
+; CHECK:       [[DONE]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %vi = fptosi <4 x float> %in to <4 x i32>
+  %i = add i32 %a, -2
+  br i1 %cond, label %true, label %false
+true:
+  call void @use(i32 %i)
+  br label %done
+false:
+  %elem = extractelement <4 x i32> %vi, i32 %i
+  call void @use(i32 %elem)
+  br label %done
+done:
+  ret void
+}
+
 define void @test_loop(<4 x float> %in) {
 ; CHECK-LABEL: define void @test_loop(
 ; CHECK-SAME: <4 x float> [[IN:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[R:%.*]] = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> [[IN]], i32 9)
+; CHECK-NEXT:    [[VI:%.*]] = fptosi <4 x float> [[IN]] to <4 x i32>
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LATCH:.*]] ]
 ; CHECK-NEXT:    [[COND:%.*]] = icmp samesign ult i32 [[I]], 4
 ; CHECK-NEXT:    br i1 [[COND]], label %[[BODY:.*]], label %[[DONE:.*]]
 ; CHECK:       [[BODY]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[R]], i32 [[I]]
-; CHECK-NEXT:    [[ELEM:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <4 x i32> [[VI]], i32 [[I]]
 ; CHECK-NEXT:    call void @use(i32 [[ELEM]])
 ; CHECK-NEXT:    br label %[[LATCH]]
 ; CHECK:       [[LATCH]]:
@@ -62,8 +93,7 @@ define void @test_loop(<4 x float> %in) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %r = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %in, i32 9)
-  %vi = fptosi <4 x float> %r to <4 x i32>
+  %vi = fptosi <4 x float> %in to <4 x i32>
   br label %loop
 loop:
   %i = phi i32 [ 0, %entry ], [ %next, %latch ]

``````````

</details>


https://github.com/llvm/llvm-project/pull/166227


More information about the llvm-commits mailing list