[llvm] [X86] combineINSERT_SUBVECTOR - fold insert_subvector(base,extract_subvector(broadcast)) -> blend shuffle(base,broadcast) (PR #133083)

Jordan Rupprecht via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 28 14:57:30 PDT 2025


rupprecht wrote:

The timeout (probably an infinite loop) happens when a tool lowers to MCJIT machine code with this snippet:

```c++
  llvm::MCContext* mc_context;
  llvm::legacy::PassManager codegen_passes;
  target_machine->addPassesToEmitMC(codegen_passes, mc_context, ostream);
  codegen_passes.run(module);  // Timeout/infinite loop happens here
```

([source](https://github.com/openxla/xla/blob/1808c0eb86b5da1dd82394c16bb650318deaccf4/xla/backends/cpu/codegen/ir_compiler.cc#L358))

I dumped the IR before that snippet, and it looks something like:

```llvm
; ModuleID = '/tmp/repro.ll'
source_filename = "__repro"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) uwtable
define noalias noundef ptr @widget(ptr readonly captures(none) %arg) local_unnamed_addr #0 {
bb:
  %getelementptr = getelementptr inbounds nuw i8, ptr %arg, i64 24
  %load = load ptr, ptr %getelementptr, align 8
  %load1 = load ptr, ptr %load, align 8, !invariant.load !2, !dereferenceable !3, !align !4
  %getelementptr2 = getelementptr i8, ptr %load, i64 16
  %load3 = load ptr, ptr %getelementptr2, align 8, !invariant.load !2, !dereferenceable !5, !align !4
  %load4 = load i64, ptr %load1, align 32, !invariant.load !2, !noalias !6
  %add = add i64 %load4, 1
  %insertelement = insertelement <16 x i64> poison, i64 %load4, i64 0
  %insertelement5 = insertelement <16 x i64> %insertelement, i64 %add, i64 1
  %shufflevector = shufflevector <16 x i64> %insertelement5, <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %icmp = icmp ugt <16 x i64> %shufflevector, <i64 9223372036854775806, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>
  %icmp6 = icmp slt <16 x i64> %shufflevector, <i64 9223372036854775806, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>
  %shufflevector7 = shufflevector <16 x i1> %icmp, <16 x i1> %icmp6, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %zext = zext <16 x i1> %shufflevector7 to <16 x i8>
  store <16 x i8> %zext, ptr %load3, align 32, !alias.scope !6
  %getelementptr8 = getelementptr inbounds nuw i8, ptr %load3, i64 16
  %insertelement9 = insertelement <8 x i64> poison, i64 %add, i64 0
  %shufflevector10 = shufflevector <8 x i64> %insertelement9, <8 x i64> poison, <8 x i32> zeroinitializer
  %icmp11 = icmp slt <8 x i64> %shufflevector10, <i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24>
  %zext12 = zext <8 x i1> %icmp11 to <8 x i8>
  store <8 x i8> %zext12, ptr %getelementptr8, align 16, !alias.scope !6
  %icmp13 = icmp slt i64 %add, 25
  %zext14 = zext i1 %icmp13 to i8
  %getelementptr15 = getelementptr inbounds nuw i8, ptr %load3, i64 24
  store i8 %zext14, ptr %getelementptr15, align 8, !alias.scope !6
  ret ptr null
}

attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) uwtable "frame-pointer"="all" "prefer-vector-width"="256" }

!llvm.module.flags = !{!0, !1}

!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 1, !"xx", i64 18}
!2 = !{}
!3 = !{i64 8}
!4 = !{i64 32}
!5 = !{i64 25}
!6 = !{!7}
!7 = !{!"result slice: {index:1, offset:0, size:25}", !8}
!8 = !{!"xx"}
```

When running the original test, it fails ~half the time. (A single test case takes 2 seconds when it passes, but otherwise gets killed by the test runner at 60s.) However, I can't get any standard LLVM tool (opt, lli, ...) to fail when directly processing this IR. I'm not sure what the difference is yet.

https://github.com/llvm/llvm-project/pull/133083


More information about the llvm-commits mailing list