[llvm] [InstCombine] Limit canonicalization of extractelement(cast) to constant index or same basic block. (PR #166227)

via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 3 12:15:44 PST 2025


https://github.com/azwolski created https://github.com/llvm/llvm-project/pull/166227

The current canonicalization of extractelement(cast) requires the CastInst to have only one use. However, a cast whose single use is inside a loop still satisfies this condition, even though that use is executed once per iteration, so the cast's result is effectively used multiple times rather than just once.

```cpp
} else if (auto *CI = dyn_cast<CastInst>(I)) {
  // Canonicalize extractelement(cast) -> cast(extractelement).
  // Bitcasts can change the number of vector elements, and they cost
  // nothing.
  if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)){
 ```
Before
```llvm
%34 = fptosi <4 x float> %33 to <4 x i32>
;/loop{
%40 = extractelement <4 x i32> %34, i32 %36
```
 
  After
 ```llvm
 ;/loop{
 %37 = extractelement <4 x float> %30, i32 %32
 %38 = fptosi float %37 to i32
  ```

After canonicalization, this particular example no longer casts the entire vector at once with a single instruction; instead, the cast is performed separately for each extracted element inside the loop, which is less performant.

Ideally, we would like to check that the cast instruction **has one use and that this use is not inside a loop**. However, InstCombine/InstCombineVectorOps.cpp does not have access to utilities like `LoopInfo` to check that. It might be possible to approximate this by analyzing basic block successors or by building a dominator tree, but that may be too costly.

A possible solution is to restrict the canonicalization to cases where the index is a constant (`ConstantInt`) or the use is in the same basic block as the cast instruction:
```cpp
if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
    Instruction *U = cast<Instruction>(*CI->user_begin());
    if (U->getParent() == CI->getParent() || isa<ConstantInt>(Index)){
```

Fix: https://github.com/llvm/llvm-project/issues/165793

>From aa49b2bcda4b664c4cc6a65731f53c39cd39c7f0 Mon Sep 17 00:00:00 2001
From: Antoni Zwolski <antoni.zwolski at intel.com>
Date: Mon, 3 Nov 2025 11:53:52 +0100
Subject: [PATCH 1/5] [InstCombine] Limit canonicalization of
 extractelement(cast) to constant index

---
 llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 18a45c6799bac..87b5f3a58e72b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -588,7 +588,7 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
       // Canonicalize extractelement(cast) -> cast(extractelement).
       // Bitcasts can change the number of vector elements, and they cost
       // nothing.
-      if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
+      if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast) && isa<ConstantInt>(Index)) {
         Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
         return CastInst::Create(CI->getOpcode(), EE, EI.getType());
       }

>From 298613cffab3be828db15b6025ba310bb7e5a716 Mon Sep 17 00:00:00 2001
From: Antoni Zwolski <antoni.zwolski at intel.com>
Date: Mon, 3 Nov 2025 11:55:30 +0100
Subject: [PATCH 2/5] [InstCombine] Update vec_extract_var_elt.ll test checks

---
 llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
index 205b4b88c473a..c01cdae81b81c 100644
--- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
+++ b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
@@ -5,10 +5,10 @@ define void @test_poison(float %b, ptr %p) {
 ; CHECK-LABEL: define void @test_poison(
 ; CHECK-SAME: float [[B:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[P]], align 32
+; CHECK-NEXT:    [[A:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[B]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], -2
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fptosi float [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[A]], i32 [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 7
 ; CHECK-NEXT:    [[TMP7:%.*]] = sitofp <8 x i32> [[TMP6]] to <8 x float>
 ; CHECK-NEXT:    store <8 x float> [[TMP7]], ptr [[P]], align 32
@@ -45,14 +45,14 @@ define void @test_loop(<4 x float> %in) {
 ; CHECK-SAME: <4 x float> [[IN:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> [[IN]], i32 9)
+; CHECK-NEXT:    [[VI:%.*]] = fptosi <4 x float> [[R]] to <4 x i32>
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LATCH:.*]] ]
 ; CHECK-NEXT:    [[COND:%.*]] = icmp samesign ult i32 [[I]], 4
 ; CHECK-NEXT:    br i1 [[COND]], label %[[BODY:.*]], label %[[DONE:.*]]
 ; CHECK:       [[BODY]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[R]], i32 [[I]]
-; CHECK-NEXT:    [[ELEM:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <4 x i32> [[VI]], i32 [[I]]
 ; CHECK-NEXT:    call void @use(i32 [[ELEM]])
 ; CHECK-NEXT:    br label %[[LATCH]]
 ; CHECK:       [[LATCH]]:

>From 91e0c1b2cdd8564470cacfbf9c45ab6ca6713ee7 Mon Sep 17 00:00:00 2001
From: Antoni Zwolski <antoni.zwolski at intel.com>
Date: Mon, 3 Nov 2025 14:52:47 +0100
Subject: [PATCH 3/5] [InstCombine] Refactor canonicalization of
 extractelement(cast) to constant index or same basic block.

---
 llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 87b5f3a58e72b..5af7c4caab074 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -588,9 +588,12 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
       // Canonicalize extractelement(cast) -> cast(extractelement).
       // Bitcasts can change the number of vector elements, and they cost
       // nothing.
-      if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast) && isa<ConstantInt>(Index)) {
-        Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
-        return CastInst::Create(CI->getOpcode(), EE, EI.getType());
+      if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)){
+        Instruction *U = cast<Instruction>(*CI->user_begin());
+        if (U->getParent() == CI->getParent() || isa<ConstantInt>(Index)){
+          Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
+          return CastInst::Create(CI->getOpcode(), EE, EI.getType());
+        }
       }
     }
   }

>From 742d97e9a1168669b22bbb03680489416b48595b Mon Sep 17 00:00:00 2001
From: Antoni Zwolski <antoni.zwolski at intel.com>
Date: Mon, 3 Nov 2025 21:11:27 +0100
Subject: [PATCH 4/5] [InstCombine] Add test_poison_branch test and update
 vec_extract_var_elt.ll test checks

---
 .../InstCombine/vec_extract_var_elt.ll        | 43 ++++++++++++++++---
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
index c01cdae81b81c..35d11b0cdf43f 100644
--- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
+++ b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
@@ -5,10 +5,10 @@ define void @test_poison(float %b, ptr %p) {
 ; CHECK-LABEL: define void @test_poison(
 ; CHECK-SAME: float [[B:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[P]], align 32
-; CHECK-NEXT:    [[A:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[B]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], -2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[A]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fptosi float [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 7
 ; CHECK-NEXT:    [[TMP7:%.*]] = sitofp <8 x i32> [[TMP6]] to <8 x float>
 ; CHECK-NEXT:    store <8 x float> [[TMP7]], ptr [[P]], align 32
@@ -39,13 +39,45 @@ define i32 @test_bitcast(i32 %i) {
 }
 
 declare void @use(i32)
+declare void @use_vi(<4 x i32>)
+
+define void @test_poison_branch(<4 x float> %in, i32 %a, i1 %cond) {
+; CHECK-LABEL: define void @test_poison_branch(
+; CHECK-SAME: <4 x float> [[IN:%.*]], i32 [[A:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[I:%.*]] = add i32 [[A]], -2
+; CHECK-NEXT:    br i1 [[COND]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; CHECK:       [[TRUE]]:
+; CHECK-NEXT:    call void @use(i32 [[I]])
+; CHECK-NEXT:    br label %[[DONE:.*]]
+; CHECK:       [[FALSE]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[IN]], i32 [[I]]
+; CHECK-NEXT:    [[ELEM:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT:    call void @use(i32 [[ELEM]])
+; CHECK-NEXT:    br label %[[DONE]]
+; CHECK:       [[DONE]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %vi = fptosi <4 x float> %in to <4 x i32>
+  %i = add i32 %a, -2
+  br i1 %cond, label %true, label %false
+true:
+  call void @use(i32 %i)
+  br label %done
+false:
+  %elem = extractelement <4 x i32> %vi, i32 %i
+  call void @use(i32 %elem)
+  br label %done
+done:
+  ret void
+}
 
 define void @test_loop(<4 x float> %in) {
 ; CHECK-LABEL: define void @test_loop(
 ; CHECK-SAME: <4 x float> [[IN:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[R:%.*]] = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> [[IN]], i32 9)
-; CHECK-NEXT:    [[VI:%.*]] = fptosi <4 x float> [[R]] to <4 x i32>
+; CHECK-NEXT:    [[VI:%.*]] = fptosi <4 x float> [[IN]] to <4 x i32>
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LATCH:.*]] ]
@@ -62,8 +94,7 @@ define void @test_loop(<4 x float> %in) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %r = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %in, i32 9)
-  %vi = fptosi <4 x float> %r to <4 x i32>
+  %vi = fptosi <4 x float> %in to <4 x i32>
   br label %loop
 loop:
   %i = phi i32 [ 0, %entry ], [ %next, %latch ]

>From 4d8d8fdd9099e4384a215ed5d2e90e9aa690975e Mon Sep 17 00:00:00 2001
From: Antoni Zwolski <antoni.zwolski at intel.com>
Date: Mon, 3 Nov 2025 21:13:40 +0100
Subject: [PATCH 5/5] [InstCombine] Remove unused declaration of @use_vi in
 vec_extract_var_elt.ll

---
 llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
index 35d11b0cdf43f..f96b7070f9f2a 100644
--- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
+++ b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
@@ -39,7 +39,6 @@ define i32 @test_bitcast(i32 %i) {
 }
 
 declare void @use(i32)
-declare void @use_vi(<4 x i32>)
 
 define void @test_poison_branch(<4 x float> %in, i32 %a, i1 %cond) {
 ; CHECK-LABEL: define void @test_poison_branch(



More information about the llvm-commits mailing list