[llvm] [LoopInterchange] Motivating example for interchange enablement. NFC. (PR #171631)

Wed Dec 10 07:03:40 PST 2025

https://github.com/sjoerdmeijer created https://github.com/llvm/llvm-project/pull/171631

This is precommitting a full reproducer of one of our motivating examples.  Looking at a full reproducer is helpful for further discussion on DependenceAnalysis and Delinearization issues and the runtime predicates discussion. I appreciate that this is a larger than usual test case, but that is by design, because I think it is useful to look at the whole thing with all of its complexities. 

I have given useful names to all the relevant loop variables, and the relevant blocks in these loops and their functions, but have intentionally not done that for others as there are quite a few more.

From 8844688c647f288ae5aa6049c7e8f66cf3e55f18 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <smeijer at nvidia.com>
Date: Wed, 10 Dec 2025 06:54:22 -0800
Subject: [PATCH] [LoopInterchange] Add motivating example for interchange
 enablement. NFC.

This is precommitting a full reproducer of one of our motivating
examples.  Looking at a full reproducer is helpful for further
discussion on DependenceAnalysis and Delinearization issues.

I have given useful names to all the relevant loop variables, and the
relevant blocks in these loops and their functions, but have
intentionally not done that for others as there are quite a few more.
---
 .../LoopInterchange/large-nested-6d.ll        | 543 ++++++++++++++++++
 1 file changed, 543 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopInterchange/large-nested-6d.ll

diff --git a/llvm/test/Transforms/LoopInterchange/large-nested-6d.ll b/llvm/test/Transforms/LoopInterchange/large-nested-6d.ll
new file mode 100644
index 0000000000000..a9ada019b0c30
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/large-nested-6d.ll
@@ -0,0 +1,543 @@
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S
+; RUN: FileCheck --input-file=%t %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+
+; The IR test case below is a full and representative example of a more complex
+; loop nest structure where we would like to see loops 'j' and 'JL' get
+; interchanged. This is the pseudo-code of the loop nest to get an idea of
+; the structure:
+;
+;      for L=1 to NX
+;       for M=1 to NY
+;        for i=1 to NX
+;         for j=1 to NY
+;          for IL=1 to NX
+;           for JL=1 to NY
+;             GlobD(j,JL,M) + GlobH(j,JL,M) + GlobF(j,JL,M) + GlobJ(j,JL,M)
+;           End
+;          End
+;         End
+;        End
+;        // Stmt 2
+;        // Stmt 3
+;        // Stmt 4
+;      End
+;     End
+;
+; In the IR below, basic block JL.body is part of the loop that we would like
+; to see interchanged. There are 4 loads and 1 store that are
+; unit-strided over 'j', so making 'j' loop the innermost is preferable here.
+
+; CHECK:       --- !Missed
+; CHECK-NEXT:  Pass:            loop-interchange
+; CHECK-NEXT:  Name:            UnsupportedLoopNestDepth
+; CHECK-NEXT:  Function:        test
+; CHECK-NEXT:  Args:
+; CHECK-NEXT:    - String:          'Unsupported depth of loop nest, the supported range is ['
+; CHECK-NEXT:    - String:          '2'
+; CHECK-NEXT:    - String:          ', '
+; CHECK-NEXT:    - String:          '10'
+; CHECK-NEXT:    - String:          "].\n"
+; CHECK-NEXT:  ...
+; CHECK-NEXT:  --- !Analysis
+; CHECK-NEXT:  Pass:            loop-interchange
+; CHECK-NEXT:  Name:            Dependence
+; CHECK-NEXT:  Function:        test
+; CHECK-NEXT:  Args:
+; CHECK-NEXT:    - String:          Computed dependence info, invoking the transform.
+; CHECK-NEXT:  ...
+; CHECK-NEXT:  --- !Missed
+; CHECK-NEXT:  Pass:            loop-interchange
+; CHECK-NEXT:  Name:            Dependence
+; CHECK-NEXT:  Function:        test
+; CHECK-NEXT:  Args:
+; CHECK-NEXT:    - String:          Cannot interchange loops due to dependences.
+; CHECK-NEXT:  ...
+; CHECK-NEXT:  --- !Missed
+; CHECK-NEXT:  Pass:            loop-interchange
+; CHECK-NEXT:  Name:            UnsupportedLoopNestDepth
+; CHECK-NEXT:  Function:        test
+; CHECK-NEXT:  Args:
+; CHECK-NEXT:    - String:          'Unsupported depth of loop nest, the supported range is ['
+; CHECK-NEXT:    - String:          '2'
+; CHECK-NEXT:    - String:          ', '
+; CHECK-NEXT:    - String:          '10'
+; CHECK-NEXT:    - String:          "].\n"
+; CHECK-NEXT:  ...
+; CHECK-NEXT:  --- !Analysis
+; CHECK-NEXT:  Pass:            loop-interchange
+; CHECK-NEXT:  Name:            Dependence
+; CHECK-NEXT:  Function:        test
+; CHECK-NEXT:  Args:
+; CHECK-NEXT:    - String:          Computed dependence info, invoking the transform.
+; CHECK-NEXT:  ...
+; CHECK-NEXT:  --- !Missed
+; CHECK-NEXT:  Pass:            loop-interchange
+; CHECK-NEXT:  Name:            NotTightlyNested
+; CHECK-NEXT:  Function:        test
+; CHECK-NEXT:  Args:
+; CHECK-NEXT:    - String:          Cannot interchange loops because they are not tightly nested.
+; CHECK-NEXT:  ...
+; CHECK-NEXT:  --- !Missed
+; CHECK-NEXT:  Pass:            loop-interchange
+; CHECK-NEXT:  Name:            Dependence
+; CHECK-NEXT:  Function:        test
+; CHECK-NEXT:  Args:
+; CHECK-NEXT:    - String:          Cannot interchange loops due to dependences.
+; CHECK-NEXT:  ...
+; CHECK-NEXT:  --- !Analysis
+; CHECK-NEXT:  Pass:            loop-interchange
+; CHECK-NEXT:  Name:            Dependence
+; CHECK-NEXT:  Function:        test
+; CHECK-NEXT:  Args:
+; CHECK-NEXT:    - String:          Computed dependence info, invoking the transform.
+; CHECK-NEXT:  ...
+; CHECK-NEXT:  --- !Missed
+; CHECK-NEXT:  Pass:            loop-interchange
+; CHECK-NEXT:  Name:            Dependence
+; CHECK-NEXT:  Function:        test
+; CHECK-NEXT:  Args:
+; CHECK-NEXT:    - String:          All loops have dependencies in all directions.
+; CHECK-NEXT:  ...
+
+@GlobC = local_unnamed_addr global [54 x [54 x [54 x double]]] zeroinitializer  ; GlobC/G/E/I are loaded in IL.header
+@GlobD = local_unnamed_addr global [54 x [54 x [54 x double]]] zeroinitializer  ; GlobD/H/F/J are loaded in JL.body
+@GlobE = local_unnamed_addr global [54 x [54 x [54 x double]]] zeroinitializer
+@GlobF = local_unnamed_addr global [54 x [54 x [54 x double]]] zeroinitializer
+@GlobG = local_unnamed_addr global [54 x [54 x [54 x double]]] zeroinitializer
+@GlobH = local_unnamed_addr global [54 x [54 x [54 x double]]] zeroinitializer
+@GlobI = local_unnamed_addr global [54 x [54 x [54 x double]]] zeroinitializer
+@GlobJ = local_unnamed_addr global [54 x [54 x [54 x double]]] zeroinitializer
+@GlobK = local_unnamed_addr global [1000 x [1000 x double]] zeroinitializer  ; stored in ._crit_edge279.us, loaded in .preheader249.us
+@GlobL = local_unnamed_addr global [1000 x [1000 x double]] zeroinitializer  ; stored in JL.body, read back in blocks 57 and 70
+@GlobM = local_unnamed_addr global [2500 x double] zeroinitializer  ; zeroed in .preheader258.us.preheader, accumulated in block 70
+
+define void @test(ptr noalias readonly captures(none) %0, ptr noalias readonly captures(none) %1, ptr noalias captures(none) %2, ptr noalias captures(none) %3, ptr noalias readonly captures(none) %4, ptr noalias readonly captures(none) %5, ptr noalias readonly captures(none) %6, ptr noalias readonly captures(none) %7, ptr noalias readonly captures(none) %8, ptr noalias readonly captures(none) %9) {  ; reproducer: 6-deep loop nest L/M/i/j/IL/JL plus the reduction loops around it
+  %11 = alloca [2500 x double], align 8  ; stack scratch: written inside the M loop, read in .preheader254
+  %12 = load i32, ptr %4, align 4
+  %13 = tail call i32 @llvm.smax.i32(i32 %12, i32 0)
+  %14 = zext nneg i32 %13 to i64  ; max(%12, 0); only used as a multiplier in .preheader
+  %15 = load i32, ptr %9, align 4
+  %.not = icmp eq i32 %15, 1
+  br i1 %.not, label %171, label %16  ; %15 == 1 -> block 171, otherwise the loop-nest path in block 16
+
+16:
+  %17 = load i32, ptr %7, align 4
+  %18 = sext i32 %17 to i64  ; exit bound of the L, i and IL loops
+  %19 = icmp sgt i32 %17, 0
+  br i1 %19, label %.lr.ph286, label %._crit_edge287  ; skip the nest when %17 <= 0
+
+.lr.ph286:
+  %20 = load i32, ptr %8, align 4
+  %21 = sext i32 %20 to i64  ; exit bound of the M, j and JL loops; also a stride multiplier
+  %22 = icmp sgt i32 %20, 0
+  br i1 %22, label %preheader.L, label %._crit_edge287  ; skip the nest when %20 <= 0
+
+preheader.L:
+  %23 = load i32, ptr %5, align 4
+  %24 = tail call i32 @llvm.smax.i32(i32 %23, i32 0)
+  %25 = zext nneg i32 %24 to i64  ; max(%23, 0): trip count of the count-down reduction loops below
+  %26 = load i32, ptr %6, align 4
+  %27 = sext i32 %26 to i64
+  %28 = getelementptr double, ptr %1, i64 %27  ; %1 advanced by %27 doubles
+  %.not241270.us = icmp slt i32 %23, 1
+  %29 = shl nuw nsw i64 %25, 3  ; %25 * 8: byte length passed to the GlobM memset
+  %30 = add nuw nsw i64 %25, 2
+  %31 = icmp sgt i32 %23, 0
+  %.neg = sext i1 %31 to i64  ; -1 when %23 > 0, else 0
+  %32 = add nsw i64 %30, %.neg  ; %25 + 1 when %23 > 0; exit bound compared against incremented counters
+  br label %L.header
+
+L.header:  ; outermost loop of the nest, %L counts up from 1
+  %L = phi i64 [ %L.next, %L.latch ], [ 1, %preheader.L ]
+  %33 = mul nuw nsw i64 %L, 2916  ; 2916 = 54 * 54
+  %34 = add nsw i64 %33, -2971  ; 2971 = 2916 + 54 + 1; consumed by i.header
+  %35 = add nsw i64 %L, -1
+  %36 = mul nsw i64 %35, %21  ; (L - 1) * %21
+  br label %M.header
+
+exit.i:  ; reached from i.latch once the i loop is done
+  br i1 %.not241270.us, label %._crit_edge275.us.thread, label %.preheader258.us.preheader
+
+.lr.ph274.us:  ; reduction: sums %28[k] * GlobM[k] while %37 counts down from %25
+  %37 = phi i64 [ %48, %.lr.ph274.us ], [ %25, %.preheader260.us ]
+  %38 = phi double [ %46, %.lr.ph274.us ], [ 0.000000e+00, %.preheader260.us ]
+  %39 = phi i64 [ %47, %.lr.ph274.us ], [ 1, %.preheader260.us ]
+  %40 = add nsw i64 %39, -1
+  %41 = getelementptr double, ptr %28, i64 %40
+  %42 = load double, ptr %41, align 8
+  %43 = getelementptr double, ptr @GlobM, i64 %40
+  %44 = load double, ptr %43, align 8
+  %45 = fmul fast double %44, %42
+  %46 = fadd fast double %45, %38
+  %47 = add nuw nsw i64 %39, 1
+  %48 = add nsw i64 %37, -1
+  %.not242.us = icmp eq i64 %48, 0
+  br i1 %.not242.us, label %.lr.ph278.us.preheader, label %.lr.ph274.us
+
+.lr.ph278.us.preheader:
+  %.lcssa = phi double [ %46, %.lr.ph274.us ]
+  %49 = add nsw i64 %M, %36
+  %50 = getelementptr double, ptr %11, i64 %49
+  %51 = getelementptr i8, ptr %50, i64 -8
+  store double %.lcssa, ptr %51, align 8  ; scratch[%M + %36 - 1] = reduction result
+  %52 = getelementptr double, ptr @GlobK, i64 %49
+  %53 = getelementptr i8, ptr %52, i64 -8  ; GlobK base for the stores in ._crit_edge279.us
+  br label %.lr.ph278.us
+
+latch.M.loopexit:
+  br label %latch.M
+
+latch.M:
+  %M.next = add nuw nsw i64 %M, 1
+  %exitcond335.not = icmp eq i64 %M, %21  ; compares the pre-increment %M
+  br i1 %exitcond335.not, label %L.latch, label %M.header
+
+.lr.ph278.us:  ; outer loop over %54, one 8000-byte GlobL row per iteration
+  %54 = phi i64 [ %133, %._crit_edge279.us ], [ 1, %.lr.ph278.us.preheader ]
+  %55 = add nsw i64 %54, -1
+  %.idx244.us = mul nuw nsw i64 %55, 8000  ; 8000 bytes = 1000 doubles
+  %56 = getelementptr i8, ptr @GlobL, i64 %.idx244.us
+  br label %57
+
+57:  ; reduction: sums %56[k] * %28[k] while %58 counts down from %25
+  %58 = phi i64 [ %25, %.lr.ph278.us ], [ %69, %57 ]
+  %59 = phi double [ 0.000000e+00, %.lr.ph278.us ], [ %67, %57 ]
+  %60 = phi i64 [ 1, %.lr.ph278.us ], [ %68, %57 ]
+  %61 = add nsw i64 %60, -1
+  %62 = getelementptr double, ptr %56, i64 %61
+  %63 = load double, ptr %62, align 8
+  %64 = getelementptr double, ptr %28, i64 %61
+  %65 = load double, ptr %64, align 8
+  %66 = fmul fast double %65, %63
+  %67 = fadd fast double %66, %59
+  %68 = add nuw nsw i64 %60, 1
+  %69 = add nsw i64 %58, -1
+  %.not243.us = icmp eq i64 %69, 0
+  br i1 %.not243.us, label %._crit_edge279.us, label %57
+
+70:  ; GlobM[k] += %86 * %84[k] while %71 counts down from %25
+  %71 = phi i64 [ %25, %.preheader258.us ], [ %81, %70 ]
+  %72 = phi i64 [ 1, %.preheader258.us ], [ %80, %70 ]
+  %73 = add nsw i64 %72, -1
+  %74 = getelementptr double, ptr @GlobM, i64 %73
+  %75 = load double, ptr %74, align 8
+  %76 = getelementptr double, ptr %84, i64 %73
+  %77 = load double, ptr %76, align 8
+  %78 = fmul fast double %86, %77
+  %79 = fadd fast double %78, %75
+  store double %79, ptr %74, align 8
+  %80 = add nuw nsw i64 %72, 1
+  %81 = add nsw i64 %71, -1
+  %.not245.us = icmp eq i64 %81, 0
+  br i1 %.not245.us, label %._crit_edge.us, label %70
+
+.preheader258.us:  ; outer loop over %82; selects the GlobL row and the %28 scalar for block 70
+  %82 = phi i64 [ %128, %._crit_edge.us ], [ 1, %.preheader258.us.preheader ]
+  %83 = add nsw i64 %82, -1
+  %.idx246.us = mul nuw nsw i64 %83, 8000
+  %84 = getelementptr i8, ptr @GlobL, i64 %.idx246.us
+  %85 = getelementptr double, ptr %28, i64 %83
+  %86 = load double, ptr %85, align 8
+  br label %70
+
+.preheader260.us:
+  br label %.lr.ph274.us
+
+._crit_edge275.us.thread:  ; taken when %23 < 1: the reductions are skipped and 0.0 is stored instead
+  %87 = getelementptr double, ptr %11, i64 %M
+  %88 = getelementptr double, ptr %87, i64 %36
+  %89 = getelementptr i8, ptr %88, i64 -8
+  store double 0.000000e+00, ptr %89, align 8
+  br label %latch.M
+
+.preheader258.us.preheader:
+  call void @llvm.memset.p0.i64(ptr nonnull align 16 @GlobM, i8 0, i64 %29, i1 false)  ; zero the first %29 bytes of GlobM
+  br label %.preheader258.us
+
+M.header:  ; second loop level, %M counts up from 1
+  %M = phi i64 [ 1, %L.header ], [ %M.next, %latch.M ]
+  %90 = mul nuw nsw i64 %M, 2916
+  %91 = add nsw i64 %90, -2971  ; M * 2916 - 2971; consumed by j.header
+  br label %i.header
+
+i.header:  ; third loop level, %i counts up from 1
+  %i = phi i64 [ %i.next, %i.latch ], [ 1, %M.header ]
+  %92 = add nsw i64 %34, %i  ; index part that depends on L and i
+  %93 = add nsw i64 %i, -1
+  %94 = mul nsw i64 %93, %21
+  %invariant.gep = getelementptr double, ptr @GlobL, i64 %94  ; GlobL + (i - 1) * %21 doubles
+  br label %j.header
+
+j.header:  ; fourth loop level, %j counts up from 1
+  %j = phi i64 [ %j.next, %j.latch ], [ 1, %i.header ]
+  %95 = add nsw i64 %91, %j  ; index part that depends on M and j
+  %gep358 = getelementptr double, ptr %invariant.gep, i64 %j  ; store base: advances by one double per j
+  br label %IL.header
+
+IL.header:  ; fifth loop level; the four loads below use only L, i and IL
+  %IL = phi i64 [ %IL.next, %IL.latch ], [ 1, %j.header ]
+  %96 = mul nuw nsw i64 %IL, 54
+  %97 = add nsw i64 %92, %96  ; flat element index %92 + 54 * IL
+  %98 = getelementptr double, ptr @GlobC, i64 %97
+  %99 = load double, ptr %98, align 8
+  %100 = getelementptr double, ptr @GlobG, i64 %97
+  %101 = load double, ptr %100, align 8
+  %102 = getelementptr double, ptr @GlobE, i64 %97
+  %103 = load double, ptr %102, align 8
+  %104 = getelementptr double, ptr @GlobI, i64 %97
+  %105 = load double, ptr %104, align 8
+  %106 = add nsw i64 %IL, -1
+  %107 = mul nsw i64 %106, %21  ; (IL - 1) * %21
+  br label %JL.body
+
+JL.body:  ; innermost loop: four loads indexed by M, j and JL, one store into GlobL
+  %JL = phi i64 [ %JL.next, %JL.body ], [ 1, %IL.header ]
+  %109 = mul nuw nsw i64 %JL, 54
+  %110 = add nsw i64 %95, %109  ; flat element index %95 + 54 * JL
+  %111 = getelementptr double, ptr @GlobD, i64 %110
+  %112 = load double, ptr %111, align 8
+  %113 = fmul fast double %112, %99
+  %114 = getelementptr double, ptr @GlobH, i64 %110
+  %115 = load double, ptr %114, align 8
+  %116 = fmul fast double %115, %101
+  %117 = fadd fast double %116, %113
+  %118 = getelementptr double, ptr @GlobF, i64 %110
+  %119 = load double, ptr %118, align 8
+  %120 = fmul fast double %119, %103
+  %121 = fadd fast double %117, %120
+  %122 = getelementptr double, ptr @GlobJ, i64 %110
+  %123 = load double, ptr %122, align 8
+  %124 = fmul fast double %123, %105
+  %125 = fadd fast double %121, %124
+  %126 = add nsw i64 %JL, %107
+  %.idx247.us.us.us.us.us.us = mul nsw i64 %126, 8000  ; byte offset: 8000 bytes per unit of JL
+  %gep.us.us.us.us.us.us = getelementptr i8, ptr %gep358, i64 %.idx247.us.us.us.us.us.us
+  %127 = getelementptr i8, ptr %gep.us.us.us.us.us.us, i64 -8008  ; 8008 = 8000 + 8
+  store double %125, ptr %127, align 8
+  %JL.next = add nuw nsw i64 %JL, 1
+  %exitcond.not = icmp eq i64 %JL, %21
+  br i1 %exitcond.not, label %IL.latch, label %JL.body
+
+IL.latch:
+  %IL.next = add nuw nsw i64 %IL, 1
+  %exitcond320.not = icmp eq i64 %IL, %18
+  br i1 %exitcond320.not, label %j.latch, label %IL.header
+
+j.latch:
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond324.not = icmp eq i64 %j, %21
+  br i1 %exitcond324.not, label %i.latch, label %j.header
+
+i.latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond328.not = icmp eq i64 %i, %18
+  br i1 %exitcond328.not, label %exit.i, label %i.header
+
+._crit_edge.us:  ; latch of the %82 loop started in .preheader258.us
+  %128 = add nuw nsw i64 %82, 1
+  %exitcond329.not = icmp eq i64 %128, %32
+  br i1 %exitcond329.not, label %.preheader260.us, label %.preheader258.us
+
+._crit_edge279.us:  ; latch of the %54 loop: adds GlobM[%55] to the block-57 sum and stores it via the GlobK base %53
+  %.lcssa360 = phi double [ %67, %57 ]
+  %129 = getelementptr double, ptr @GlobM, i64 %55
+  %130 = load double, ptr %129, align 8
+  %131 = fadd fast double %130, %.lcssa360
+  %132 = getelementptr i8, ptr %53, i64 %.idx244.us
+  store double %131, ptr %132, align 8
+  %133 = add nuw nsw i64 %54, 1
+  %exitcond331.not = icmp eq i64 %133, %32
+  br i1 %exitcond331.not, label %latch.M.loopexit, label %.lr.ph278.us
+
+L.latch:
+  %L.next = add nuw nsw i64 %L, 1
+  %exitcond339.not = icmp eq i64 %L, %18
+  br i1 %exitcond339.not, label %exit.L, label %L.header
+
+exit.L:
+  br label %._crit_edge287
+
+._crit_edge287:  ; join point after the nest, also reached directly when a guard skipped it
+  %134 = load i32, ptr %6, align 4
+  %135 = load i32, ptr %5, align 4
+  %136 = tail call i32 @llvm.smax.i32(i32 %135, i32 0)
+  %137 = zext nneg i32 %136 to i64
+  %138 = sext i32 %134 to i64
+  %139 = getelementptr double, ptr %2, i64 %138
+  %140 = shl nuw nsw i64 %137, 3  ; %137 * 8 bytes
+  %.not236 = icmp slt i32 %135, 1
+  %141 = select i1 %.not236, i64 1, i64 %140  ; request 1 byte when %135 < 1
+  %142 = tail call ptr @malloc(i64 %141)
+  br i1 %.not236, label %._crit_edge294, label %.preheader254.preheader
+
+.preheader254.preheader:
+  call void @llvm.memset.p0.i64(ptr align 8 %142, i8 0, i64 %140, i1 false)  ; zero the malloc'd buffer
+  br label %.preheader254
+
+.preheader254:  ; outer loop over %143: picks an 8000-byte row of %0 and a scalar from the scratch buffer
+  %143 = phi i64 [ %160, %._crit_edge ], [ 1, %.preheader254.preheader ]
+  %144 = add nsw i64 %143, -1
+  %.idx240 = mul nuw nsw i64 %144, 8000
+  %145 = getelementptr i8, ptr %0, i64 %.idx240
+  %146 = getelementptr double, ptr %11, i64 %144
+  %147 = load double, ptr %146, align 8
+  br label %148
+
+.preheader253:
+  br label %.lr.ph293
+
+148:  ; %142[k] += %147 * %145[k] while %149 counts down from %137
+  %149 = phi i64 [ %137, %.preheader254 ], [ %159, %148 ]
+  %150 = phi i64 [ 1, %.preheader254 ], [ %158, %148 ]
+  %151 = add nsw i64 %150, -1
+  %152 = getelementptr double, ptr %142, i64 %151
+  %153 = load double, ptr %152, align 8
+  %154 = getelementptr double, ptr %145, i64 %151
+  %155 = load double, ptr %154, align 8
+  %156 = fmul fast double %147, %155
+  %157 = fadd fast double %156, %153
+  store double %157, ptr %152, align 8
+  %158 = add nuw nsw i64 %150, 1
+  %159 = add nsw i64 %149, -1
+  %.not239 = icmp eq i64 %159, 0
+  br i1 %.not239, label %._crit_edge, label %148
+
+._crit_edge:
+  %160 = add nuw nsw i64 %143, 1
+  %exitcond341.not = icmp eq i64 %143, %137
+  br i1 %exitcond341.not, label %.preheader253, label %.preheader254
+
+.lr.ph293:  ; %139[k] -= %142[k] while %161 counts down from %137
+  %161 = phi i64 [ %170, %.lr.ph293 ], [ %137, %.preheader253 ]
+  %162 = phi i64 [ %169, %.lr.ph293 ], [ 1, %.preheader253 ]
+  %163 = add nsw i64 %162, -1
+  %164 = getelementptr double, ptr %139, i64 %163
+  %165 = getelementptr double, ptr %142, i64 %163
+  %166 = load double, ptr %165, align 8
+  %167 = load double, ptr %164, align 8
+  %168 = fsub fast double %167, %166
+  store double %168, ptr %164, align 8
+  %169 = add nuw nsw i64 %162, 1
+  %170 = add nsw i64 %161, -1
+  %.not238 = icmp eq i64 %170, 0
+  br i1 %.not238, label %._crit_edge294.loopexit359, label %.lr.ph293
+
+171:  ; path taken when %15 == 1; does not enter the 6-deep nest
+  %172 = load i32, ptr %6, align 4
+  %173 = load i32, ptr %5, align 4
+  %174 = tail call i32 @llvm.smax.i32(i32 %173, i32 0)
+  %175 = zext nneg i32 %174 to i64
+  %176 = shl nuw nsw i64 %175, 3
+  %177 = mul i64 %176, %175  ; %175 * %175 * 8 bytes
+  %178 = tail call i64 @llvm.smax.i64(i64 %177, i64 1)  ; request at least 1 byte
+  %179 = tail call ptr @malloc(i64 %178)
+  %.not311 = icmp slt i32 %173, 1
+  br i1 %.not311, label %._crit_edge294, label %.preheader250.us.preheader
+
+.preheader250.us.preheader:
+  %180 = mul nuw nsw i64 %175, %175
+  %181 = shl i64 %180, 3
+  call void @llvm.memset.p0.i64(ptr align 8 %179, i8 0, i64 %181, i1 false)  ; zero the malloc'd buffer
+  br label %.preheader250.us
+
+.preheader250.us:  ; outermost of three loops; %182 counts up from 1
+  %182 = phi i64 [ %203, %._crit_edge301.split.us ], [ 1, %.preheader250.us.preheader ]
+  %183 = add nsw i64 %182, -1
+  %.idx.us = mul nuw nsw i64 %183, 8000
+  %184 = getelementptr i8, ptr %0, i64 %.idx.us
+  %invariant.gep.us = getelementptr double, ptr @GlobK, i64 %183
+  br label %.preheader249.us
+
+185:  ; %200[k] += %201 * %184[k] while %186 counts down from %175
+  %186 = phi i64 [ %175, %.preheader249.us ], [ %196, %185 ]
+  %187 = phi i64 [ 1, %.preheader249.us ], [ %195, %185 ]
+  %188 = add nsw i64 %187, -1
+  %189 = getelementptr double, ptr %200, i64 %188
+  %190 = load double, ptr %189, align 8
+  %191 = getelementptr double, ptr %184, i64 %188
+  %192 = load double, ptr %191, align 8
+  %193 = fmul fast double %201, %192
+  %194 = fadd fast double %193, %190
+  store double %194, ptr %189, align 8
+  %195 = add nuw nsw i64 %187, 1
+  %196 = add nsw i64 %186, -1
+  %.not233.us = icmp eq i64 %196, 0
+  br i1 %.not233.us, label %._crit_edge300.us, label %185
+
+.preheader249.us:  ; middle loop over %197: picks the %179 row and the GlobK scalar for block 185
+  %197 = phi i64 [ 1, %.preheader250.us ], [ %202, %._crit_edge300.us ]
+  %198 = add nsw i64 %197, -1
+  %199 = mul nuw nsw i64 %198, %175
+  %200 = getelementptr double, ptr %179, i64 %199
+  %.idx234.us = mul nuw nsw i64 %198, 8000
+  %gep.us = getelementptr i8, ptr %invariant.gep.us, i64 %.idx234.us
+  %201 = load double, ptr %gep.us, align 8
+  br label %185
+
+._crit_edge300.us:
+  %202 = add nuw nsw i64 %197, 1
+  %exitcond344.not = icmp eq i64 %197, %175
+  br i1 %exitcond344.not, label %._crit_edge301.split.us, label %.preheader249.us
+
+._crit_edge301.split.us:
+  %203 = add nuw nsw i64 %182, 1
+  %exitcond345.not = icmp eq i64 %182, %175
+  br i1 %exitcond345.not, label %.preheader248, label %.preheader250.us
+
+.preheader248:
+  br label %.preheader.lr.ph
+
+.preheader.lr.ph:
+  %204 = sext i32 %172 to i64
+  %invariant.gep306 = getelementptr double, ptr %3, i64 %204
+  br label %.preheader
+
+.preheader:  ; outer loop over %205: picks the %3 destination row and the matching %179 row
+  %205 = phi i64 [ 1, %.preheader.lr.ph ], [ %221, %._crit_edge304 ]
+  %206 = add nsw i64 %205, -1
+  %207 = add nsw i64 %206, %204
+  %208 = mul nsw i64 %207, %14
+  %gep307 = getelementptr double, ptr %invariant.gep306, i64 %208
+  %209 = mul nuw nsw i64 %206, %175
+  %210 = getelementptr double, ptr %179, i64 %209
+  br label %211
+
+211:  ; %gep307[k] -= %210[k] while %212 counts down from %175
+  %212 = phi i64 [ %175, %.preheader ], [ %220, %211 ]
+  %213 = phi i64 [ 1, %.preheader ], [ %219, %211 ]
+  %214 = add nsw i64 %213, -1
+  %gep = getelementptr double, ptr %gep307, i64 %214
+  %215 = getelementptr double, ptr %210, i64 %214
+  %216 = load double, ptr %215, align 8
+  %217 = load double, ptr %gep, align 8
+  %218 = fsub fast double %217, %216
+  store double %218, ptr %gep, align 8
+  %219 = add nuw nsw i64 %213, 1
+  %220 = add nsw i64 %212, -1
+  %.not232 = icmp eq i64 %220, 0
+  br i1 %.not232, label %._crit_edge304, label %211
+
+._crit_edge304:
+  %221 = add nuw nsw i64 %205, 1
+  %exitcond347.not = icmp eq i64 %205, %175
+  br i1 %exitcond347.not, label %._crit_edge294.loopexit, label %.preheader
+
+._crit_edge294.loopexit:
+  br label %._crit_edge294
+
+._crit_edge294.loopexit359:
+  br label %._crit_edge294
+
+._crit_edge294:  ; common exit: frees whichever malloc'd buffer (%142 or %179) this path allocated
+  %.sink = phi ptr [ %142, %._crit_edge287 ], [ %179, %171 ], [ %179, %._crit_edge294.loopexit ], [ %142, %._crit_edge294.loopexit359 ]
+  tail call void @free(ptr %.sink)
+  ret void
+}
+
+declare i64 @llvm.smax.i64(i64, i64)  ; used in block 171 to clamp the malloc size to at least 1
+declare i32 @llvm.smax.i32(i32, i32)  ; used to clamp loaded i32 values to at least 0
+declare void @llvm.memset.p0.i64(ptr writeonly captures(none), i8, i64, i1 immarg)  ; zeroes GlobM and the malloc'd buffers
+declare void @free(ptr allocptr noundef captures(none)) local_unnamed_addr  ; called once, in ._crit_edge294
+declare noalias noundef ptr @malloc(i64 noundef) local_unnamed_addr  ; called in ._crit_edge287 and in block 171