[llvm] [DAGCombiner] Relax condition for extract_vector_elt combine (PR #157658)

Wed Sep 10 18:53:52 PDT 2025

zhaoqi5 wrote:

> this is causing hangs on the following IR:
> 
> ```
> $ cat /tmp/a.ll
> target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-apple-ios17.0.0-simulator"
> 
> declare void @llvm.memset.p0.i64(ptr writeonly captures(none), i8, i64, i1 immarg)
> 
> declare void @llvm.memcpy.p0.p0.i64(ptr noalias writeonly captures(none), ptr noalias readonly captures(none), i64, i1 immarg)
> 
> define ptr @_ZN5SkM449setRotateE4SkV3f(ptr noundef returned writeonly align 4 captures(ret: address, provenance) dereferenceable_or_null(64) initializes((0, 64)) %this, <2 x float> %axis.coerce0, float %axis.coerce1, float noundef %radians) {
> entry:
>   %0 = fmul <2 x float> %axis.coerce0, %axis.coerce0
>   %shift = shufflevector <2 x float> %0, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
>   %foldExtExtBinop = fadd <2 x float> %0, %shift
>   %add.i.i = extractelement <2 x float> %foldExtExtBinop, i64 0
>   %mul5.i.i = fmul float %axis.coerce1, %axis.coerce1
>   %add6.i.i = fadd float %mul5.i.i, %add.i.i
>   %1 = tail call noundef float @llvm.sqrt.f32(float %add6.i.i)
>   %cmp = fcmp ogt float %add6.i.i, 0.000000e+00
>   %sub.i = fsub float %1, %1
>   %cmp.i = fcmp ord float %sub.i, 0.000000e+00
>   %or.cond = and i1 %cmp, %cmp.i
>   %div = fdiv float 1.000000e+00, %1
>   %mul5.i = fmul float %axis.coerce1, %div
>   %2 = tail call noundef float @llvm.sin.f32(float %radians)
>   %3 = tail call noundef float @llvm.cos.f32(float %radians)
>   %sub.i.i = fsub float 1.000000e+00, %3
>   %mul8.i.i = fmul float %2, %mul5.i
>   %mul33.i.i = fmul float %sub.i.i, %mul5.i
>   %mul34.i.i = fmul float %mul5.i, %mul33.i.i
>   %add35.i.i = fadd float %3, %mul34.i.i
>   %4 = insertelement <2 x float> poison, float %div, i64 0
>   %5 = shufflevector <2 x float> %4, <2 x float> poison, <2 x i32> zeroinitializer
>   %6 = fmul <2 x float> %axis.coerce0, %5
>   %7 = extractelement <2 x float> %6, i64 0
>   %mul.i.i8 = fmul float %sub.i.i, %7
>   %8 = insertelement <2 x float> poison, float %mul.i.i8, i64 0
>   %9 = shufflevector <2 x float> %8, <2 x float> poison, <2 x i32> zeroinitializer
>   %10 = fmul <2 x float> %6, %9
>   %11 = extractelement <2 x float> %10, i64 1
>   %sub9.i.i = fsub float %11, %mul8.i.i
>   %mul11.i.i = fmul float %mul5.i, %mul.i.i8
>   %12 = extractelement <2 x float> %6, i64 1
>   %mul12.i.i = fmul float %2, %12
>   %add13.i.i = fadd float %mul12.i.i, %mul11.i.i
>   %13 = insertelement <2 x float> poison, float %3, i64 0
>   %14 = insertelement <2 x float> %13, float %mul8.i.i, i64 1
>   %15 = fadd <2 x float> %14, %10
>   %mul18.i.i = fmul float %sub.i.i, %12
>   %mul22.i.i = fmul float %mul5.i, %mul18.i.i
>   %sub28.i.i = fsub float %mul11.i.i, %mul12.i.i
>   store <2 x float> %15, ptr %this, align 4
>   %ref.tmp.sroa.5.0.this.sroa_idx.i.i = getelementptr inbounds nuw i8, ptr %this, i64 8
>   store float %sub28.i.i, ptr %ref.tmp.sroa.5.0.this.sroa_idx.i.i, align 4
>   %ref.tmp.sroa.6.0.this.sroa_idx.i.i = getelementptr inbounds nuw i8, ptr %this, i64 12
>   store float 0.000000e+00, ptr %ref.tmp.sroa.6.0.this.sroa_idx.i.i, align 4
>   %ref.tmp.sroa.7.0.this.sroa_idx.i.i = getelementptr inbounds nuw i8, ptr %this, i64 16
>   store float %sub9.i.i, ptr %ref.tmp.sroa.7.0.this.sroa_idx.i.i, align 4
>   %ref.tmp.sroa.8.0.this.sroa_idx.i.i = getelementptr inbounds nuw i8, ptr %this, i64 20
>   %16 = insertelement <2 x float> poison, float %2, i64 0
>   %17 = insertelement <2 x float> %16, float %mul18.i.i, i64 1
>   %18 = fmul <2 x float> %6, %17
>   %19 = extractelement <2 x float> %18, i64 0
>   %sub24.i.i = fsub float %mul22.i.i, %19
>   %20 = insertelement <2 x float> poison, float %mul22.i.i, i64 0
>   %21 = insertelement <2 x float> %20, float %3, i64 1
>   %22 = fadd <2 x float> %21, %18
>   %23 = shufflevector <2 x float> %22, <2 x float> poison, <2 x i32> <i32 1, i32 0>
>   store <2 x float> %23, ptr %ref.tmp.sroa.8.0.this.sroa_idx.i.i, align 4
>   %ref.tmp.sroa.10.0.this.sroa_idx.i.i = getelementptr inbounds nuw i8, ptr %this, i64 28
>   store float 0.000000e+00, ptr %ref.tmp.sroa.10.0.this.sroa_idx.i.i, align 4
>   %ref.tmp.sroa.11.0.this.sroa_idx.i.i = getelementptr inbounds nuw i8, ptr %this, i64 32
>   store float %add13.i.i, ptr %ref.tmp.sroa.11.0.this.sroa_idx.i.i, align 4
>   %ref.tmp.sroa.12.0.this.sroa_idx.i.i = getelementptr inbounds nuw i8, ptr %this, i64 36
>   store float %sub24.i.i, ptr %ref.tmp.sroa.12.0.this.sroa_idx.i.i, align 4
>   %ref.tmp.sroa.13.0.this.sroa_idx.i.i = getelementptr inbounds nuw i8, ptr %this, i64 40
>   store float %add35.i.i, ptr %ref.tmp.sroa.13.0.this.sroa_idx.i.i, align 4
>   %ref.tmp.sroa.14.0.this.sroa_idx.i.i = getelementptr inbounds nuw i8, ptr %this, i64 44
>   %ref.tmp.sroa.18.0.this.sroa_idx.i.i = getelementptr inbounds nuw i8, ptr %this, i64 60
>   tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) %ref.tmp.sroa.14.0.this.sroa_idx.i.i, i8 0, i64 16, i1 false)
>   store float 1.000000e+00, ptr %ref.tmp.sroa.18.0.this.sroa_idx.i.i, align 4
>   ret ptr null
> }
> 
> declare float @llvm.sqrt.f32(float)
> 
> declare float @llvm.sin.f32(float)
> 
> declare float @llvm.cos.f32(float)
> 
> $ llc -o /dev/null /tmp/a.ll
> hang ... 
> ```
> 
> I'll revert this in the meantime

Thank you for pointing out this potential issue and providing an example.

If possible, I think it would be better to address this in the affected targets, since this change enables broader optimization opportunities for all targets.

If anyone is willing to take a look, that would be great. I’ll also keep investigating this issue as time permits.

https://github.com/llvm/llvm-project/pull/157658