[llvm-branch-commits] [llvm] [mlir] Release/14.x (PR #104042)

Shravan Kumar via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Aug 14 08:36:37 PDT 2024


https://github.com/shravankumar0811 created https://github.com/llvm/llvm-project/pull/104042

None

>From cfd62625658626c24e9549fa5c6e07aadfe2d792 Mon Sep 17 00:00:00 2001
From: Shravan Kumar <shkumar at habana.ai>
Date: Tue, 28 Jun 2022 17:34:18 +0300
Subject: [PATCH 1/8] Adding cfcss pass

---
 llvm/lib/Transforms/CMakeLists.txt       |   1 +
 llvm/lib/Transforms/Cfcss/CMakeLists.txt |  20 +++
 llvm/lib/Transforms/Cfcss/Cfcss.cpp      | 165 ++++++++++++++++++++
 llvm/lib/Transforms/Cfcss/Cfscc.exports  |   0
 tests/CMakeLists.txt                     |   1 +
 tests/cfcss/cfcss.c                      |  10 ++
 tests/cfcss/cfcss.ll                     |  65 ++++++++
 tests/cfcss/command.sh                   |   2 +
 tests/cfcss/out1_cfcss.ll                | 186 +++++++++++++++++++++++
 tests/cfcss/out_cfcss.ll                 | 104 +++++++++++++
 10 files changed, 554 insertions(+)
 create mode 100644 llvm/lib/Transforms/Cfcss/CMakeLists.txt
 create mode 100644 llvm/lib/Transforms/Cfcss/Cfcss.cpp
 create mode 100644 llvm/lib/Transforms/Cfcss/Cfscc.exports
 create mode 100644 tests/CMakeLists.txt
 create mode 100644 tests/cfcss/cfcss.c
 create mode 100644 tests/cfcss/cfcss.ll
 create mode 100644 tests/cfcss/command.sh
 create mode 100644 tests/cfcss/out1_cfcss.ll
 create mode 100644 tests/cfcss/out_cfcss.ll

diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt
index dda5f6de11e326..5ed9ca62265fe8 100644
--- a/llvm/lib/Transforms/CMakeLists.txt
+++ b/llvm/lib/Transforms/CMakeLists.txt
@@ -9,3 +9,4 @@ add_subdirectory(Hello)
 add_subdirectory(ObjCARC)
 add_subdirectory(Coroutines)
 add_subdirectory(CFGuard)
+add_subdirectory(Cfcss)
diff --git a/llvm/lib/Transforms/Cfcss/CMakeLists.txt b/llvm/lib/Transforms/Cfcss/CMakeLists.txt
new file mode 100644
index 00000000000000..4dc70e819e0cf7
--- /dev/null
+++ b/llvm/lib/Transforms/Cfcss/CMakeLists.txt
@@ -0,0 +1,20 @@
+# If we don't need RTTI or EH, there's no reason to export anything
+# from the Cfcss plugin (the export list is the empty file Cfscc.exports).
+if( NOT LLVM_REQUIRES_RTTI )
+  if( NOT LLVM_REQUIRES_EH )
+    set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/Cfscc.exports)
+  endif()
+endif()
+
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_library( LLVMCfcss MODULE BUILDTREE_ONLY
+  Cfcss.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/llvm/lib/Transforms/Cfcss/Cfcss.cpp b/llvm/lib/Transforms/Cfcss/Cfcss.cpp
new file mode 100644
index 00000000000000..5bdcd478b1b9bb
--- /dev/null
+++ b/llvm/lib/Transforms/Cfcss/Cfcss.cpp
@@ -0,0 +1,165 @@
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+#include <string>
+
+using namespace llvm;
+
+namespace {
+// Cfcss - Control Flow Checking by Software Signatures.
+//
+// Every basic block of an instrumented function is given a compile-time
+// signature (1, 2, 3, ... in layout order). At run time the per-function
+// global "G" is updated on entry to each block and compared with that
+// block's signature:
+//   - one predecessor:       G = G ^ dsig
+//   - several predecessors:  G = G ^ dsig ^ D
+// where dsig is the block's signature xor-ed with the signature of its first
+// predecessor, and D is a second global that every predecessor of a
+// multi-predecessor block sets to its own signature xor-ed with that first
+// predecessor's signature. A mismatch branches to "ErrorBlock", which calls
+// __cfcss_error().
+//
+// The pass is a ModulePass for the legacy pass manager; it creates its
+// globals and the error block separately for every function.
+struct Cfcss : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  Cfcss() : ModulePass(ID) {}
+
+  // Instruments every function that has a body, apart from the functions
+  // used on the error path. Returns true when at least one function was
+  // rewritten, so the pass manager knows that the module changed.
+  bool runOnModule(Module &M) override {
+    bool Changed = false;
+
+    for (Function &F : M) {
+      // A declaration has no entry block, so it must be skipped before
+      // anything dereferences F.begin(). The name checks keep the error
+      // handler and the library calls it makes free of instrumentation.
+      if (F.isDeclaration() || F.getName() == "__cfcss_error" ||
+          F.getName() == "printf" || F.getName() == "exit")
+        continue;
+
+      instrumentFunction(M, F);
+      Changed = true;
+    }
+
+    return Changed;
+  }
+
+  // The pass adds globals, stores, compares, branches and blocks, so no
+  // analysis is reported as preserved.
+  void getAnalysisUsage(AnalysisUsage &) const override {}
+
+private:
+  // Adds the signature bookkeeping and the per-block checks to F, which
+  // must have a body. The CFG of F changes: every checked block is split
+  // and "ErrorBlock" is appended.
+  static void instrumentFunction(Module &M, Function &F) {
+    LLVMContext &Ctx = M.getContext();
+    IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHI());
+
+    // G is the run-time signature, D the adjusting value described above.
+    GlobalVariable *GV = new GlobalVariable(
+        M, Builder.getInt32Ty(), false, GlobalValue::InternalLinkage,
+        Builder.getInt32(0), "G");
+    GlobalVariable *Dg = new GlobalVariable(
+        M, Builder.getInt32Ty(), false, GlobalValue::InternalLinkage,
+        Builder.getInt32(0), "D");
+
+    FunctionCallee ErrorFunc =
+        M.getOrInsertFunction("__cfcss_error", Builder.getVoidTy());
+
+    // The blocks are recorded up front: the check loop at the end splits
+    // blocks and must not visit the blocks it creates itself.
+    SmallVector<BasicBlock *> VBasicBlock;
+    DenseMap<BasicBlock *, int> SigMap; // block -> signature
+    DenseMap<BasicBlock *, int> Dsig;   // block -> signature difference
+    int SigCount = 1;
+    for (BasicBlock &BB : F) {
+      SigMap[&BB] = SigCount++;
+      VBasicBlock.push_back(&BB);
+    }
+
+    // G1 = s1: the entry block is the first block, so its signature is 1 and
+    // it needs no check of its own.
+    Builder.CreateStore(Builder.getInt32(1), GV);
+
+    // Compute dsig for every block and emit the stores to D.
+    for (BasicBlock &BB : F) {
+      if (BB.hasNPredecessors(1)) {
+        Dsig[&BB] = SigMap[&BB] ^ SigMap[BB.getSinglePredecessor()];
+      } else if (BB.hasNPredecessorsOrMore(2)) {
+        // The first predecessor is the base one: dsig is taken against it
+        // and every predecessor publishes its distance from it in D.
+        BasicBlock *Pbb = *pred_begin(&BB);
+        Dsig[&BB] = SigMap[&BB] ^ SigMap[Pbb];
+        for (BasicBlock *Pred : predecessors(&BB)) {
+          Builder.SetInsertPoint(Pred->getFirstNonPHI());
+          Builder.CreateStore(Builder.getInt32(SigMap[Pbb] ^ SigMap[Pred]),
+                              Dg);
+        }
+      }
+    }
+
+    // Failure path: report the mismatch, then return a zero value of the
+    // function's own return type (or nothing for a void function) so that
+    // the block is valid IR whatever F returns.
+    BasicBlock *ErrorBlock = BasicBlock::Create(Ctx, "ErrorBlock", &F);
+    Builder.SetInsertPoint(ErrorBlock);
+    Builder.CreateCall(ErrorFunc);
+    if (F.getReturnType()->isVoidTy())
+      Builder.CreateRetVoid();
+    else
+      Builder.CreateRet(Constant::getNullValue(F.getReturnType()));
+
+    // Update G at the top of every block that has predecessors and compare
+    // it with the block's signature.
+    for (BasicBlock *BB : VBasicBlock) {
+      // A block without predecessors (the entry block) is not checked: G
+      // was initialised to its signature above.
+      if (pred_empty(BB))
+        continue;
+
+      Builder.SetInsertPoint(BB->getFirstNonPHI());
+      Value *NewG = Builder.CreateLoad(Builder.getInt32Ty(), GV);
+      if (BB->hasNPredecessors(1)) {
+        // G = G ^ dsig
+        NewG = Builder.CreateXor(NewG, Dsig[BB]);
+      } else {
+        // G = G ^ dsig ^ D
+        Value *Adjust = Builder.CreateLoad(Builder.getInt32Ty(), Dg);
+        NewG = Builder.CreateXor(Builder.CreateXor(NewG, Dsig[BB]), Adjust);
+      }
+      Builder.CreateStore(NewG, GV);
+
+      // Split the block right behind the compare (the builder still points
+      // at the first original instruction) and replace the unconditional
+      // branch that the split leaves behind with the checked branch.
+      Value *Fail = Builder.CreateICmpNE(NewG, Builder.getInt32(SigMap[BB]),
+                                         "failure");
+      BasicBlock *Rest =
+          BB->splitBasicBlock(Builder.GetInsertPoint(), "split");
+      BB->getTerminator()->eraseFromParent();
+      Builder.SetInsertPoint(BB);
+      Builder.CreateCondBr(Fail, ErrorBlock, Rest);
+    }
+  }
+};
+} // namespace
+
+char Cfcss::ID = 0; // definition of the static member declared in Cfcss
+static RegisterPass<Cfcss> Y("cfcss", // pass argument, used as -cfcss with opt
+                             "Cfcss Pass (with getAnalysisUsage implemented)");
\ No newline at end of file
diff --git a/llvm/lib/Transforms/Cfcss/Cfscc.exports b/llvm/lib/Transforms/Cfcss/Cfscc.exports
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 00000000000000..9569308eb4a637
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(cfcss)
diff --git a/tests/cfcss/cfcss.c b/tests/cfcss/cfcss.c
new file mode 100644
index 00000000000000..e607f2d32120ad
--- /dev/null
+++ b/tests/cfcss/cfcss.c
@@ -0,0 +1,10 @@
+#include<stdio.h>
+#include<stdlib.h>
+void __cfcss_error() { /* handler the Cfcss pass calls on a signature mismatch */
+    printf(" Signatures do not match");
+    exit(0); /* ends the program; note that the exit status is 0 */
+}
+int main() { /* test input: a loop gives the function several basic blocks */
+  for (int i = 0; i < 10; i++) /* ten iterations, i = 0..9 */
+    printf(" Value is %d", i);
+}
diff --git a/tests/cfcss/cfcss.ll b/tests/cfcss/cfcss.ll
new file mode 100644
index 00000000000000..143d95b3db3901
--- /dev/null
+++ b/tests/cfcss/cfcss.ll
@@ -0,0 +1,65 @@
+; ModuleID = 'cfcss.c'
+source_filename = "cfcss.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [25 x i8] c" Signatures do not match\00", align 1
+@.str.1 = private unnamed_addr constant [13 x i8] c" Value is %d\00", align 1
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @__cfcss_error() #0 {
+entry:
+  %call = call i32 (i8*, ...) @printf(i8* noundef getelementptr inbounds ([25 x i8], [25 x i8]* @.str, i64 0, i64 0))
+  call void @exit(i32 noundef 0) #3
+  unreachable
+}
+
+declare dso_local i32 @printf(i8* noundef, ...) #1
+
+; Function Attrs: noreturn nounwind
+declare dso_local void @exit(i32 noundef) #2
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %call = call i32 (i8*, ...) @printf(i8* noundef getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i64 0, i64 0), i32 noundef %1)
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %2 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %2, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  %3 = load i32, i32* %retval, align 4
+  ret i32 %3
+}
+
+attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { noreturn nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #3 = { noreturn nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git f28c006a5895fc0e329fe15fead81e37457cb1d1)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
diff --git a/tests/cfcss/command.sh b/tests/cfcss/command.sh
new file mode 100644
index 00000000000000..496ba116a2ebe6
--- /dev/null
+++ b/tests/cfcss/command.sh
@@ -0,0 +1,2 @@
+clang -O0 -S -emit-llvm cfcss.c -o cfcss.ll  # compile the test to unoptimised textual IR
+opt -load ${LLVM_HOME}/build/lib/LLVMCfcss.so -enable-new-pm=0 -cfcss  cfcss.ll -S -o out_cfcss.ll  # run the plugin under the legacy pass manager
diff --git a/tests/cfcss/out1_cfcss.ll b/tests/cfcss/out1_cfcss.ll
new file mode 100644
index 00000000000000..dbde32637c6907
--- /dev/null
+++ b/tests/cfcss/out1_cfcss.ll
@@ -0,0 +1,186 @@
+; ModuleID = 'out_cfcss.ll'
+source_filename = "cfcss.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [25 x i8] c" Signatures do not match\00", align 1
+@.str.1 = private unnamed_addr constant [13 x i8] c" Value is %d\00", align 1
+@G = internal global i32 0
+@D = internal global i32 0
+@G.1 = internal global i32 0
+@D.2 = internal global i32 0
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @__cfcss_error() #0 {
+entry:
+  %call = call i32 (ptr, ...) @printf(ptr noundef @.str)
+  call void @exit(i32 noundef 0) #3
+  unreachable
+}
+
+declare i32 @printf(ptr noundef, ...) #1
+
+; Function Attrs: noreturn nounwind
+declare void @exit(i32 noundef) #2
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local i32 @main() #0 {
+entry:
+  store i32 6, ptr @D.2, align 4
+  store i32 1, ptr @G.1, align 4
+  store i32 5, ptr @D, align 4
+  store i32 1, ptr @G, align 4
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, ptr %retval, align 4
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %split13, %entry
+  %0 = load i32, ptr @G.1, align 4
+  %1 = load i32, ptr @D.2, align 4
+  %2 = xor i32 %0, 5
+  %3 = xor i32 %2, %1
+  store i32 %3, ptr @G.1, align 4
+  %failure2 = icmp ne i32 %3, 2
+  br i1 %failure2, label %ErrorBlock1, label %split3
+
+split3:                                           ; preds = %for.cond
+  store i32 10, ptr @D.2, align 4
+  %4 = load i32, ptr @G, align 4
+  %5 = load i32, ptr @D, align 4
+  %6 = xor i32 %4, 6
+  %7 = xor i32 %6, %5
+  store i32 %7, ptr @G, align 4
+  %failure = icmp ne i32 %7, 2
+  br i1 %failure, label %ErrorBlock, label %split
+
+split:                                            ; preds = %split3
+  %8 = load i32, ptr @G.1, align 4
+  %9 = xor i32 %8, 1
+  store i32 %9, ptr @G.1, align 4
+  %failure4 = icmp ne i32 %9, 3
+  br i1 %failure4, label %ErrorBlock1, label %split5
+
+split5:                                           ; preds = %split
+  %10 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %10, 10
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %split5
+  %11 = load i32, ptr @G.1, align 4
+  %12 = xor i32 %11, 7
+  store i32 %12, ptr @G.1, align 4
+  %failure6 = icmp ne i32 %12, 4
+  br i1 %failure6, label %ErrorBlock1, label %split7
+
+split7:                                           ; preds = %for.body
+  store i32 12, ptr @D.2, align 4
+  %13 = load i32, ptr @G, align 4
+  %14 = xor i32 %13, 1
+  store i32 %14, ptr @G, align 4
+  %failure1 = icmp ne i32 %14, 3
+  br i1 %failure1, label %ErrorBlock, label %split2
+
+split2:                                           ; preds = %split7
+  %15 = load i32, ptr @G.1, align 4
+  %16 = xor i32 %15, 1
+  store i32 %16, ptr @G.1, align 4
+  %failure8 = icmp ne i32 %16, 5
+  br i1 %failure8, label %ErrorBlock1, label %split9
+
+split9:                                           ; preds = %split2
+  %17 = load i32, ptr %i, align 4
+  %call = call i32 (ptr, ...) @printf(ptr noundef @.str.1, i32 noundef %17)
+  br label %for.inc
+
+for.inc:                                          ; preds = %split9
+  %18 = load i32, ptr @G.1, align 4
+  %19 = xor i32 %18, 3
+  store i32 %19, ptr @G.1, align 4
+  %failure10 = icmp ne i32 %19, 6
+  br i1 %failure10, label %ErrorBlock1, label %split11
+
+split11:                                          ; preds = %for.inc
+  store i32 14, ptr @D.2, align 4
+  %20 = load i32, ptr @G, align 4
+  %21 = xor i32 %20, 10
+  store i32 %21, ptr @G, align 4
+  %failure3 = icmp ne i32 %21, 4
+  br i1 %failure3, label %ErrorBlock, label %split4
+
+split4:                                           ; preds = %split11
+  %22 = load i32, ptr @G.1, align 4
+  %23 = xor i32 %22, 1
+  store i32 %23, ptr @G.1, align 4
+  %failure12 = icmp ne i32 %23, 7
+  br i1 %failure12, label %ErrorBlock1, label %split13
+
+split13:                                          ; preds = %split4
+  store i32 0, ptr @D.2, align 4
+  store i32 0, ptr @D, align 4
+  %24 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %24, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond, !llvm.loop !6
+
+for.end:                                          ; preds = %split5
+  %25 = load i32, ptr @G.1, align 4
+  %26 = xor i32 %25, 11
+  store i32 %26, ptr @G.1, align 4
+  %failure14 = icmp ne i32 %26, 8
+  br i1 %failure14, label %ErrorBlock1, label %split15
+
+split15:                                          ; preds = %for.end
+  store i32 0, ptr @D.2, align 4
+  %27 = load i32, ptr @G, align 4
+  %28 = xor i32 %27, 7
+  store i32 %28, ptr @G, align 4
+  %failure5 = icmp ne i32 %28, 5
+  br i1 %failure5, label %ErrorBlock, label %split6
+
+split6:                                           ; preds = %split15
+  %29 = load i32, ptr @G.1, align 4
+  %30 = xor i32 %29, 1
+  store i32 %30, ptr @G.1, align 4
+  %failure16 = icmp ne i32 %30, 9
+  br i1 %failure16, label %ErrorBlock1, label %split17
+
+split17:                                          ; preds = %split6
+  %31 = load i32, ptr %retval, align 4
+  ret i32 %31
+
+ErrorBlock:                                       ; preds = %split15, %split11, %split7, %split3
+  %32 = load i32, ptr @G.1, align 4
+  %33 = load i32, ptr @D.2, align 4
+  %34 = xor i32 %32, 2
+  %35 = xor i32 %34, %33
+  store i32 %35, ptr @G.1, align 4
+  %failure18 = icmp ne i32 %35, 10
+  br i1 %failure18, label %ErrorBlock1, label %split19
+
+split19:                                          ; preds = %ErrorBlock
+  call void @__cfcss_error()
+  ret i32 0
+
+ErrorBlock1:                                      ; preds = %ErrorBlock, %split6, %for.end, %split4, %for.inc, %split2, %for.body, %split, %for.cond
+  call void @__cfcss_error()
+  ret i32 0
+}
+
+attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { noreturn nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #3 = { noreturn nounwind }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{!"clang version 15.0.0 (https://shravan_kumar0826@bitbucket.org/shravan_kumar0826/llvm-project.git 00bb96a3bfe1901661abfdb27177c1ba6c6920c6)"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.mustprogress"}
diff --git a/tests/cfcss/out_cfcss.ll b/tests/cfcss/out_cfcss.ll
new file mode 100644
index 00000000000000..d2ea7e9baa5370
--- /dev/null
+++ b/tests/cfcss/out_cfcss.ll
@@ -0,0 +1,104 @@
+; ModuleID = 'cfcss.ll'
+source_filename = "cfcss.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [25 x i8] c" Signatures do not match\00", align 1
+@.str.1 = private unnamed_addr constant [13 x i8] c" Value is %d\00", align 1
+@G = internal global i32 0
+@D = internal global i32 0
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @__cfcss_error() #0 {
+entry:
+  %call = call i32 (i8*, ...) @printf(i8* noundef getelementptr inbounds ([25 x i8], [25 x i8]* @.str, i64 0, i64 0))
+  call void @exit(i32 noundef 0) #3
+  unreachable
+}
+
+declare dso_local i32 @printf(i8* noundef, ...) #1
+
+; Function Attrs: noreturn nounwind
+declare dso_local void @exit(i32 noundef) #2
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local i32 @main() #0 {
+entry:
+  store i32 5, i32* @D, align 4
+  store i32 1, i32* @G, align 4
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %split4, %entry
+  %0 = load i32, i32* @G, align 4
+  %1 = load i32, i32* @D, align 4
+  %2 = xor i32 %0, 6
+  %3 = xor i32 %2, %1
+  store i32 %3, i32* @G, align 4
+  %failure = icmp ne i32 %3, 2
+  br i1 %failure, label %ErrorBlock, label %split
+
+split:                                            ; preds = %for.cond
+  %4 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %4, 10
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %split
+  %5 = load i32, i32* @G, align 4
+  %6 = xor i32 %5, 1
+  store i32 %6, i32* @G, align 4
+  %failure1 = icmp ne i32 %6, 3
+  br i1 %failure1, label %ErrorBlock, label %split2
+
+split2:                                           ; preds = %for.body
+  %7 = load i32, i32* %i, align 4
+  %call = call i32 (i8*, ...) @printf(i8* noundef getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i64 0, i64 0), i32 noundef %7)
+  br label %for.inc
+
+for.inc:                                          ; preds = %split2
+  %8 = load i32, i32* @G, align 4
+  %9 = xor i32 %8, 7
+  store i32 %9, i32* @G, align 4
+  %failure3 = icmp ne i32 %9, 4
+  br i1 %failure3, label %ErrorBlock, label %split4
+
+split4:                                           ; preds = %for.inc
+  store i32 0, i32* @D, align 4
+  %10 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %10, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %split
+  %11 = load i32, i32* @G, align 4
+  %12 = xor i32 %11, 7
+  store i32 %12, i32* @G, align 4
+  %failure5 = icmp ne i32 %12, 5
+  br i1 %failure5, label %ErrorBlock, label %split6
+
+split6:                                           ; preds = %for.end
+  %13 = load i32, i32* %retval, align 4
+  ret i32 %13
+
+ErrorBlock:                                       ; preds = %for.end, %for.inc, %for.body, %for.cond
+  call void @__cfcss_error()
+  ret i32 0
+}
+
+attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { noreturn nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #3 = { noreturn nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git f28c006a5895fc0e329fe15fead81e37457cb1d1)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}

>From 973cbf86551cd15a9cb85bbb0ad01f2406ccb62d Mon Sep 17 00:00:00 2001
From: Shravan Kumar <shkumar at habana.ai>
Date: Tue, 28 Jun 2022 19:18:42 +0300
Subject: [PATCH 2/8] Adding assignment1 answers

---
 llvm/lib/Transforms/Hello/Hello.cpp | 218 ++++++++++++++++++++++++----
 tests/CMakeLists.txt                |   1 +
 tests/assignment1/1.c               |   6 +
 tests/assignment1/1.ll              |  42 ++++++
 tests/assignment1/2.ll              |  48 ++++++
 tests/assignment1/command.sh        |   2 +
 6 files changed, 286 insertions(+), 31 deletions(-)
 create mode 100644 tests/assignment1/1.c
 create mode 100644 tests/assignment1/1.ll
 create mode 100644 tests/assignment1/2.ll
 create mode 100644 tests/assignment1/command.sh

diff --git a/llvm/lib/Transforms/Hello/Hello.cpp b/llvm/lib/Transforms/Hello/Hello.cpp
index b0adb5401f8912..00c43b44443ae5 100644
--- a/llvm/lib/Transforms/Hello/Hello.cpp
+++ b/llvm/lib/Transforms/Hello/Hello.cpp
@@ -12,7 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/Statistic.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
@@ -22,43 +25,196 @@ using namespace llvm;
 STATISTIC(HelloCounter, "Counts number of functions greeted");
 
 namespace {
-  // Hello - The first implementation, without getAnalysisUsage.
-  struct Hello : public FunctionPass {
-    static char ID; // Pass identification, replacement for typeid
-    Hello() : FunctionPass(ID) {}
-
-    bool runOnFunction(Function &F) override {
-      ++HelloCounter;
-      errs() << "Hello: ";
-      errs().write_escaped(F.getName()) << '\n';
-      return false;
-    }
-  };
-}
+// Hello - The first implementation, without getAnalysisUsage.
+struct Hello : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  Hello() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override {
+    ++HelloCounter; // statistic: number of functions greeted
+    errs() << "Hello: ";
+    errs().write_escaped(F.getName()) << '\n';
+    return false; // nothing in F was changed
+  }
+};
+} // namespace
 
 char Hello::ID = 0;
 static RegisterPass<Hello> X("hello", "Hello World Pass");
 
 namespace {
-  // Hello2 - The second implementation with getAnalysisUsage implemented.
-  struct Hello2 : public FunctionPass {
-    static char ID; // Pass identification, replacement for typeid
-    Hello2() : FunctionPass(ID) {}
-
-    bool runOnFunction(Function &F) override {
-      ++HelloCounter;
-      errs() << "Hello: ";
-      errs().write_escaped(F.getName()) << '\n';
-      return false;
-    }
+// Hello2 - The second implementation with getAnalysisUsage implemented.
+struct Hello2 : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  Hello2() : FunctionPass(ID) {}
 
-    // We don't modify the program, so we preserve all analyses.
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.setPreservesAll();
-    }
-  };
-}
+  bool runOnFunction(Function &F) override {
+    ++HelloCounter; // statistic: number of functions greeted
+    errs() << "Hello: ";
+    errs().write_escaped(F.getName()) << '\n';
+    return false; // nothing in F was changed
+  }
+
+  // We don't modify the program, so we preserve all analyses.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll(); // valid here because runOnFunction only prints
+  }
+};
+} // namespace
 
 char Hello2::ID = 0;
 static RegisterPass<Hello2>
-Y("hello2", "Hello World Pass (with getAnalysisUsage implemented)");
+    Y("hello2", "Hello World Pass (with getAnalysisUsage implemented)");
+
+namespace {
+// MyHello - Prints a few statistics for every function it visits and adds a
+// global "G" that receives a block number at the start of each basic block.
+//
+// The report goes to errs() and consists of five lines:
+//   - the number of instructions in the function,
+//   - the number of basic blocks in the function,
+//   - the basic block with the most instructions,
+//   - the basic block with the most predecessors,
+//   - the basic block with the most successors.
+// Blocks are identified by name. When several blocks reach the same maximum,
+// the one met first while iterating the corresponding DenseMap is reported.
+//
+// For the function max in tests/assignment1/1.ll the report is expected to be
+//   Total no of instruction in a Function: 17
+//   Total no of BBs in a Function: 4
+//   BasicBlock with max instructions: entry->9 instructions
+//   BasicBlock with max predecessors: if.end->2 predecessors
+//   BasicBlock with max successors: entry->2 successors
+// (the numbers describe the IR before the pass inserts its own stores).
+//
+// The pass is registered under the name "myhello" and is written for the
+// legacy pass manager (FunctionPass).
+struct MyHello : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  MyHello() : FunctionPass(ID) {}
+
+  // Scratch state for the function that is currently being processed. It is
+  // reset at the top of runOnFunction, so the numbers reported for one
+  // function never include what an earlier function left behind.
+  int Count = 0;   // instructions in the function
+  int CountBb = 0; // basic blocks in the function
+
+  // Block name -> number of instructions / successors / predecessors. The
+  // key is the block name, so blocks that share a name (for instance blocks
+  // without a name) share one entry. The StringRef keys point at names owned
+  // by the blocks of the current function; clearing the maps on entry also
+  // drops keys that refer to a function processed earlier.
+  llvm::DenseMap<llvm::StringRef, int> CountI;
+  llvm::DenseMap<llvm::StringRef, int> CountSuc;
+  llvm::DenseMap<llvm::StringRef, int> CountPred;
+
+  // Collects the statistics for F, adds the global and one store per basic
+  // block, and prints the report. Returns true because the IR is always
+  // modified.
+  bool runOnFunction(Function &F) override {
+    // Start from a clean slate: the same pass object is handed one function
+    // after another, and the members above would otherwise keep growing.
+    Count = 0;
+    CountBb = 0;
+    CountI.clear();
+    CountSuc.clear();
+    CountPred.clear();
+
+    // Count the instructions of the function.
+    for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+      ++Count;
+    }
+
+    // Count the basic blocks and record the size of each one. This runs
+    // before any store is inserted, so the sizes describe the input IR.
+    for (BasicBlock &BB : F) {
+      ++CountBb;
+      CountI[BB.getName()] = BB.size();
+    }
+
+    // Find the basic block with maximum instructions.
+    int CurrentMax = 0;
+    llvm::StringRef Maax;
+    for (const auto &Entry : CountI) {
+      if (Entry.second > CurrentMax) {
+        Maax = Entry.first;
+        CurrentMax = Entry.second;
+      }
+    }
+
+    // Record the number of predecessors of every block; a block without
+    // predecessors is recorded with 0.
+    for (BasicBlock &BB : F) {
+      CountPred[BB.getName()] = pred_size(&BB);
+    }
+
+    // Find the basic block with maximum predecessors.
+    int CurrentPredMax = 0;
+    llvm::StringRef MaaxPred;
+    for (const auto &Entry : CountPred) {
+      if (Entry.second > CurrentPredMax) {
+        MaaxPred = Entry.first;
+        CurrentPredMax = Entry.second;
+      }
+    }
+
+    // Record the number of successors of every block that has at least one;
+    // a block without successors gets no entry.
+    for (BasicBlock &BB : F) {
+      const unsigned NumSucc = succ_size(&BB);
+      if (NumSucc > 0)
+        CountSuc[BB.getName()] = NumSucc;
+    }
+
+    // Find the basic block with maximum successors.
+    int CurrentSuccMax = 0;
+    llvm::StringRef MaaxSucc;
+    for (const auto &Entry : CountSuc) {
+      if (Entry.second > CurrentSuccMax) {
+        MaaxSucc = Entry.first;
+        CurrentSuccMax = Entry.second;
+      }
+    }
+
+    // Add the global variable with the IRBuilder class. The builder only
+    // needs the context at this point; its insertion point is chosen per
+    // block in the loop below.
+    IRBuilder<> Builder(F.getContext());
+    GlobalVariable *GV = new llvm::GlobalVariable(
+        *F.getParent(), IntegerType::getInt32Ty((F.getContext())), false,
+        llvm::GlobalValue::InternalLinkage, Builder.getInt32(0), "G");
+
+    // Store a sequence number at the start of every block: 0 in the entry
+    // block, then 1, 2, ... in layout order. The store goes to the first
+    // valid insertion point, i.e. behind any PHI nodes, because PHI nodes
+    // have to stay grouped at the top of their block.
+    int Counter = 0;
+    for (BasicBlock &BB : F) {
+      Builder.SetInsertPoint(&BB, BB.getFirstInsertionPt());
+      auto CountVal = APInt(32, Counter);
+      auto *Var = Builder.getInt(CountVal);
+      Builder.CreateStore(Var, GV);
+      Counter++;
+    }
+
+    // Print the report.
+    errs() << "Total no of instruction in a Function: " << Count << "\n";
+    errs() << "Total no of BBs in a Function: " << CountBb << "\n";
+    errs() << "BasicBlock with max instructions: " << Maax << "->" << CurrentMax
+           << " instructions"
+           << "\n";
+    errs() << "BasicBlock with max predecessors: " << MaaxPred << "->"
+           << CurrentPredMax << " predecessors "
+           << "\n";
+    errs() << "BasicBlock with max successors: " << MaaxSucc << "->"
+           << CurrentSuccMax << " successors "
+           << "\n";
+
+    // The function now contains the new stores, so report a change.
+    return true;
+  }
+};
+} // namespace
+
+char MyHello::ID = 0; // definition of the static member declared in MyHello
+static RegisterPass<MyHello> Z("myhello", "Hello World Pass"); // "myhello" is the pass argument
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 9569308eb4a637..588ebd7a8fda17 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(cfcss)
+add_subdirectory(assignment1)
diff --git a/tests/assignment1/1.c b/tests/assignment1/1.c
new file mode 100644
index 00000000000000..a5dc65529c0a2c
--- /dev/null
+++ b/tests/assignment1/1.c
@@ -0,0 +1,6 @@
+int max(int x, int y) { /* Returns the larger of x and y (y when equal). */
+  int k;
+  if (x > y) k=x;
+  else  k = y;
+  return k;
+}
diff --git a/tests/assignment1/1.ll b/tests/assignment1/1.ll
new file mode 100644
index 00000000000000..4ba7cf0aac5aaa
--- /dev/null
+++ b/tests/assignment1/1.ll
@@ -0,0 +1,42 @@
+; ModuleID = '1.c'
+source_filename = "1.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local i32 @max(i32 noundef %x, i32 noundef %y) #0 {
+entry:
+  %x.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  %k = alloca i32, align 4
+  store i32 %x, i32* %x.addr, align 4
+  store i32 %y, i32* %y.addr, align 4
+  %0 = load i32, i32* %x.addr, align 4
+  %1 = load i32, i32* %y.addr, align 4
+  %cmp = icmp sgt i32 %0, %1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %2 = load i32, i32* %x.addr, align 4
+  store i32 %2, i32* %k, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %3 = load i32, i32* %y.addr, align 4
+  store i32 %3, i32* %k, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %4 = load i32, i32* %k, align 4
+  ret i32 %4
+}
+
+attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git cfd62625658626c24e9549fa5c6e07aadfe2d792)"}
diff --git a/tests/assignment1/2.ll b/tests/assignment1/2.ll
new file mode 100644
index 00000000000000..3f559ba706916d
--- /dev/null
+++ b/tests/assignment1/2.ll
@@ -0,0 +1,48 @@
+; ModuleID = '1.ll'
+source_filename = "1.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at G = internal global i32 0
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local i32 @max(i32 noundef %x, i32 noundef %y) #0 {
+entry:
+  store i32 0, i32* @G, align 4
+  %x.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  %k = alloca i32, align 4
+  store i32 %x, i32* %x.addr, align 4
+  store i32 %y, i32* %y.addr, align 4
+  %0 = load i32, i32* %x.addr, align 4
+  %1 = load i32, i32* %y.addr, align 4
+  %cmp = icmp sgt i32 %0, %1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @G, align 4
+  %2 = load i32, i32* %x.addr, align 4
+  store i32 %2, i32* %k, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  store i32 2, i32* @G, align 4
+  %3 = load i32, i32* %y.addr, align 4
+  store i32 %3, i32* %k, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  store i32 3, i32* @G, align 4
+  %4 = load i32, i32* %k, align 4
+  ret i32 %4
+}
+
+attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git cfd62625658626c24e9549fa5c6e07aadfe2d792)"}
diff --git a/tests/assignment1/command.sh b/tests/assignment1/command.sh
new file mode 100644
index 00000000000000..210ea0c932dfba
--- /dev/null
+++ b/tests/assignment1/command.sh
@@ -0,0 +1,2 @@
+clang -O0 -S -emit-llvm 1.c -o 1.ll
+opt -load ${LLVM_HOME}/build/lib/LLVMHello.so -enable-new-pm=0 -myhello  1.ll -S -o 2.ll

>From 8e78085d22f2ac489f95a76f7e2dcfb7d832e9b8 Mon Sep 17 00:00:00 2001
From: Shravan Kumar <shkumar at habana.ai>
Date: Fri, 1 Jul 2022 09:53:46 +0300
Subject: [PATCH 3/8] Adding scev pass

---
 llvm/lib/Transforms/CMakeLists.txt      |   1 +
 llvm/lib/Transforms/Scev/CMakeLists.txt |  20 ++++
 llvm/lib/Transforms/Scev/Scev.cpp       | 110 ++++++++++++++++++++++
 llvm/lib/Transforms/Scev/Scev.exports   |   0
 tests/CMakeLists.txt                    |   1 +
 tests/scev/command.sh                   |   8 ++
 tests/scev/out.ll                       |  83 +++++++++++++++++
 tests/scev/out1.ll                      |  83 +++++++++++++++++
 tests/scev/scev.c                       |  31 +++++++
 tests/scev/scev.ll                      | 116 ++++++++++++++++++++++++
 10 files changed, 453 insertions(+)
 create mode 100644 llvm/lib/Transforms/Scev/CMakeLists.txt
 create mode 100644 llvm/lib/Transforms/Scev/Scev.cpp
 create mode 100644 llvm/lib/Transforms/Scev/Scev.exports
 create mode 100644 tests/scev/command.sh
 create mode 100644 tests/scev/out.ll
 create mode 100644 tests/scev/out1.ll
 create mode 100644 tests/scev/scev.c
 create mode 100644 tests/scev/scev.ll

diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt
index 5ed9ca62265fe8..8ace411e1ca82b 100644
--- a/llvm/lib/Transforms/CMakeLists.txt
+++ b/llvm/lib/Transforms/CMakeLists.txt
@@ -10,3 +10,4 @@ add_subdirectory(ObjCARC)
 add_subdirectory(Coroutines)
 add_subdirectory(CFGuard)
 add_subdirectory(Cfcss)
+add_subdirectory(Scev)
\ No newline at end of file
diff --git a/llvm/lib/Transforms/Scev/CMakeLists.txt b/llvm/lib/Transforms/Scev/CMakeLists.txt
new file mode 100644
index 00000000000000..b5a4d0ea4c7569
--- /dev/null
+++ b/llvm/lib/Transforms/Scev/CMakeLists.txt
@@ -0,0 +1,20 @@
+# If we don't need RTTI or EH, there's no reason to export anything
+# from the Scev plugin.
+if( NOT LLVM_REQUIRES_RTTI )
+  if( NOT LLVM_REQUIRES_EH )
+    set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/Scev.exports)
+  endif()
+endif()
+
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_library( LLVMScev MODULE BUILDTREE_ONLY
+  Scev.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/llvm/lib/Transforms/Scev/Scev.cpp b/llvm/lib/Transforms/Scev/Scev.cpp
new file mode 100644
index 00000000000000..3c7aaaf05c24a8
--- /dev/null
+++ b/llvm/lib/Transforms/Scev/Scev.cpp
@@ -0,0 +1,110 @@
+/*===- Scev.cpp -Creates and Simplifies Recurrences for ‘Expressions involving
+Induction Variables’ Algorithm:
+1. Get ScalarEvolution object.
+2. Use getSCEV for the pointer operands
+3. Take the scev pointer base
+4. Subtract scev with scev pointer base to get the SCEVAddRecExpr(DiffVal).
+eg:{8,+,16}<nuw><nsw><%for.cond>
+5. This SCEVAddRecExpr will contain the required indices and Extract it. eg : 8
+6. Store the index and corresponding Store instruction in StoreInsts map.
+7. Sorting the Offset vector values.
+8. Get the BB of store instruction and using that get the terminator
+instruction.
+9. Move all store instructions one by one before terminator instruction.
+===-------------------------------------------------------------------------------------------===*/
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "hello"
+
+namespace {
+// Scev - Legacy function pass that reorders GEP-addressed stores by the
+// constant start offset of their ScalarEvolution address recurrence.
+struct Scev : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  Scev() : FunctionPass(ID) {}
+
+  // Records each store whose address is a GEP that ScalarEvolution models as
+  // {constant,+,step} from its pointer base, then moves those stores, in
+  // ascending offset order, before the terminator of the block holding the
+  // lowest-offset store. Returns true iff any store was moved.
+  bool runOnFunction(Function &F) override {
+    auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    const SCEV *FirstBase = nullptr; // Pointer base of the first GEP store.
+    SmallVector<std::pair<int64_t, Instruction *>> Stores; // (offset, store)
+
+    for (BasicBlock &BB : F) {
+      for (Instruction &I : BB) {
+        auto *Store = dyn_cast<StoreInst>(&I);
+        if (!Store)
+          continue;
+        auto *Gep = dyn_cast<GetElementPtrInst>(Store->getPointerOperand());
+        if (!Gep)
+          continue;
+        const SCEV *Addr = SE.getSCEV(Gep);
+        const SCEV *Base = SE.getPointerBase(Addr);
+        if (!FirstBase)
+          FirstBase = Base;
+        if (Base != FirstBase) {
+          // Only this store is skipped; the scan continues. DEBUG_TYPE in this
+          // file is "hello", so the message shows under -debug-only=hello.
+          LLVM_DEBUG(dbgs()
+                     << "\nBasePointers are not same, stopping the pass");
+          continue;
+        }
+        // Offset of the address from its base, e.g. {8,+,16}<%for.cond>.
+        const auto *AddRec =
+            dyn_cast<SCEVAddRecExpr>(SE.getMinusSCEV(Addr, Base));
+        if (!AddRec)
+          continue;
+        // Stores with a non-constant start have no sort key and are skipped.
+        const auto *Start = dyn_cast<SCEVConstant>(AddRec->getStart());
+        if (!Start)
+          continue;
+        Stores.emplace_back(Start->getValue()->getSExtValue(), &I);
+      }
+    }
+
+    // Nothing matched: bail out before Stores.front() is evaluated.
+    if (Stores.empty())
+      return false;
+
+    // stable_sort keeps program order among stores that share an offset.
+    std::stable_sort(
+        Stores.begin(), Stores.end(),
+        [](const auto &A, const auto &B) { return A.first < B.first; });
+
+    // NOTE(review): every recorded store is moved into this one block, past
+    // any loads/calls in between -- confirm inputs are single-block bodies.
+    Instruction *LastInst = Stores.front().second->getParent()->getTerminator();
+    for (const auto &Entry : Stores)
+      Entry.second->moveBefore(LastInst);
+    return true;
+  }
+
+  // The pass moves instructions, so only the CFG is declared preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+  }
+};
+} // namespace
+
+char Scev::ID = 0; // Out-of-line definition of the pass's static ID member.
+static RegisterPass<Scev> // Registers the pass under the name "scev".
+    X("scev", "Scev Implementation Pass (with getAnalysisUsage implemented)");
\ No newline at end of file
diff --git a/llvm/lib/Transforms/Scev/Scev.exports b/llvm/lib/Transforms/Scev/Scev.exports
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 588ebd7a8fda17..0ab0d081f586b9 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(cfcss)
 add_subdirectory(assignment1)
+add_subdirectory(scev)
diff --git a/tests/scev/command.sh b/tests/scev/command.sh
new file mode 100644
index 00000000000000..7eca24760d208a
--- /dev/null
+++ b/tests/scev/command.sh
@@ -0,0 +1,8 @@
+# first command is to emit ir for test case
+clang -S -emit-llvm scev.c -Xclang -disable-O0-optnone
+
+#second command is to clean up ir so that scev can understand it
+opt -mem2reg -loop-simplify -instcombine -instnamer -indvars scev.ll -S -o out.ll
+
+#Third command will run scev
+opt -load  ${LLVM_HOME}/build/lib/LLVMScev.so -scev out.ll -enable-new-pm=0 -S -o out1.ll
\ No newline at end of file
diff --git a/tests/scev/out.ll b/tests/scev/out.ll
new file mode 100644
index 00000000000000..d172d379a14edf
--- /dev/null
+++ b/tests/scev/out.ll
@@ -0,0 +1,83 @@
+; ModuleID = 'scev.ll'
+source_filename = "scev.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at Z = dso_local global i32 5, align 4
+ at .str = private unnamed_addr constant [3 x i8] c"%d\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @main() #0 {
+entry:
+  %A = alloca [10 x i32], align 16
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %cmp = icmp ult i64 %indvars.iv, 5
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = add nuw nsw i64 %indvars.iv, 10
+  %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %indvars.iv
+  %1 = trunc i64 %0 to i32
+  store i32 %1, i32* %arrayidx, align 16
+  %2 = add nuw nsw i64 %indvars.iv, 11
+  %3 = or i64 %indvars.iv, 1
+  %arrayidx5 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %3
+  %4 = trunc i64 %2 to i32
+  store i32 %4, i32* %arrayidx5, align 4
+  %5 = add nuw nsw i64 %indvars.iv, 12
+  %6 = or i64 %indvars.iv, 2
+  %arrayidx10 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %6
+  %7 = trunc i64 %5 to i32
+  store i32 %7, i32* %arrayidx10, align 8
+  %8 = add nuw nsw i64 %indvars.iv, 13
+  %9 = or i64 %indvars.iv, 3
+  %arrayidx15 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %9
+  %10 = trunc i64 %8 to i32
+  store i32 %10, i32* %arrayidx15, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  br label %for.cond17
+
+for.cond17:                                       ; preds = %for.inc22, %for.end
+  %indvars.iv9 = phi i64 [ %indvars.iv.next10, %for.inc22 ], [ 0, %for.end ]
+  %exitcond = icmp ne i64 %indvars.iv9, 5
+  br i1 %exitcond, label %for.body19, label %for.end23
+
+for.body19:                                       ; preds = %for.cond17
+  %arrayidx21 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %indvars.iv9
+  %i = load i32, i32* %arrayidx21, align 4
+  %call = call i32 (i8*, ...) @printf(i8* noundef nonnull dereferenceable(1) getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i64 0, i64 0), i32 noundef %i) #2
+  br label %for.inc22
+
+for.inc22:                                        ; preds = %for.body19
+  %indvars.iv.next10 = add nuw nsw i64 %indvars.iv9, 1
+  br label %for.cond17, !llvm.loop !6
+
+for.end23:                                        ; preds = %for.cond17
+  ret i32 0
+}
+
+declare dso_local i32 @printf(i8* noundef, ...) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 973cbf86551cd15a9cb85bbb0ad01f2406ccb62d)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/scev/out1.ll b/tests/scev/out1.ll
new file mode 100644
index 00000000000000..7afff529e3a892
--- /dev/null
+++ b/tests/scev/out1.ll
@@ -0,0 +1,83 @@
+; ModuleID = 'out.ll'
+source_filename = "scev.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at Z = dso_local global i32 5, align 4
+ at .str = private unnamed_addr constant [3 x i8] c"%d\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @main() #0 {
+entry:
+  %A = alloca [10 x i32], align 16
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %cmp = icmp ult i64 %indvars.iv, 5
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = add nuw nsw i64 %indvars.iv, 10
+  %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %indvars.iv
+  %1 = trunc i64 %0 to i32
+  %2 = add nuw nsw i64 %indvars.iv, 11
+  %3 = or i64 %indvars.iv, 1
+  %arrayidx5 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %3
+  %4 = trunc i64 %2 to i32
+  %5 = add nuw nsw i64 %indvars.iv, 12
+  %6 = or i64 %indvars.iv, 2
+  %arrayidx10 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %6
+  %7 = trunc i64 %5 to i32
+  %8 = add nuw nsw i64 %indvars.iv, 13
+  %9 = or i64 %indvars.iv, 3
+  %arrayidx15 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %9
+  %10 = trunc i64 %8 to i32
+  store i32 %1, i32* %arrayidx, align 16
+  store i32 %4, i32* %arrayidx5, align 4
+  store i32 %7, i32* %arrayidx10, align 8
+  store i32 %10, i32* %arrayidx15, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  br label %for.cond17
+
+for.cond17:                                       ; preds = %for.inc22, %for.end
+  %indvars.iv9 = phi i64 [ %indvars.iv.next10, %for.inc22 ], [ 0, %for.end ]
+  %exitcond = icmp ne i64 %indvars.iv9, 5
+  br i1 %exitcond, label %for.body19, label %for.end23
+
+for.body19:                                       ; preds = %for.cond17
+  %arrayidx21 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %indvars.iv9
+  %i = load i32, i32* %arrayidx21, align 4
+  %call = call i32 (i8*, ...) @printf(i8* noundef nonnull dereferenceable(1) getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i64 0, i64 0), i32 noundef %i) #2
+  br label %for.inc22
+
+for.inc22:                                        ; preds = %for.body19
+  %indvars.iv.next10 = add nuw nsw i64 %indvars.iv9, 1
+  br label %for.cond17, !llvm.loop !6
+
+for.end23:                                        ; preds = %for.cond17
+  ret i32 0
+}
+
+declare dso_local i32 @printf(i8* noundef, ...) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 973cbf86551cd15a9cb85bbb0ad01f2406ccb62d)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/scev/scev.c b/tests/scev/scev.c
new file mode 100644
index 00000000000000..2b9fb9aba8b2df
--- /dev/null
+++ b/tests/scev/scev.c
@@ -0,0 +1,31 @@
+#include<stdio.h>
+#include<stdlib.h>
+int Z=5;
+// void print(int *A,int *B)
+// {
+//   printf("A=%d, B=%d", *A,*B);
+// }
+
+int main() // Test input: four stores to A per loop iteration, then prints A[0..N-1].
+{
+
+ int C = 10,A[10],N=5,B[12];
+ for(int I = 0; I < N; I += 4) { // Step of 4; the body writes A[I]..A[I+3].
+  // A[I+2] = C+I+2;
+  // A[I+1] = C+I+1;
+
+
+  // A[I+3] = C+I+3;
+  // A[I] = C+I;
+  A[I] = C+I;
+  A[I+1] = C+I+1;
+  A[I+2] = C+I+2;
+  A[I+3] = C+I+3;
+
+}
+for (int i=0; i<N;i++) { // Prints the first N elements with no separator.
+  printf("%d", A[i] );
+}
+// print(A,B);
+return 0;
+}
diff --git a/tests/scev/scev.ll b/tests/scev/scev.ll
new file mode 100644
index 00000000000000..2054e81e975ddb
--- /dev/null
+++ b/tests/scev/scev.ll
@@ -0,0 +1,116 @@
+; ModuleID = 'scev.c'
+source_filename = "scev.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at Z = dso_local global i32 5, align 4
+ at .str = private unnamed_addr constant [3 x i8] c"%d\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  %C = alloca i32, align 4
+  %A = alloca [10 x i32], align 16
+  %N = alloca i32, align 4
+  %B = alloca [12 x i32], align 16
+  %I = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  store i32 10, i32* %C, align 4
+  store i32 5, i32* %N, align 4
+  store i32 0, i32* %I, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %I, align 4
+  %1 = load i32, i32* %N, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, i32* %C, align 4
+  %3 = load i32, i32* %I, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32, i32* %I, align 4
+  %idxprom = sext i32 %4 to i64
+  %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4
+  %5 = load i32, i32* %C, align 4
+  %6 = load i32, i32* %I, align 4
+  %add1 = add nsw i32 %5, %6
+  %add2 = add nsw i32 %add1, 1
+  %7 = load i32, i32* %I, align 4
+  %add3 = add nsw i32 %7, 1
+  %idxprom4 = sext i32 %add3 to i64
+  %arrayidx5 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom4
+  store i32 %add2, i32* %arrayidx5, align 4
+  %8 = load i32, i32* %C, align 4
+  %9 = load i32, i32* %I, align 4
+  %add6 = add nsw i32 %8, %9
+  %add7 = add nsw i32 %add6, 2
+  %10 = load i32, i32* %I, align 4
+  %add8 = add nsw i32 %10, 2
+  %idxprom9 = sext i32 %add8 to i64
+  %arrayidx10 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom9
+  store i32 %add7, i32* %arrayidx10, align 4
+  %11 = load i32, i32* %C, align 4
+  %12 = load i32, i32* %I, align 4
+  %add11 = add nsw i32 %11, %12
+  %add12 = add nsw i32 %add11, 3
+  %13 = load i32, i32* %I, align 4
+  %add13 = add nsw i32 %13, 3
+  %idxprom14 = sext i32 %add13 to i64
+  %arrayidx15 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom14
+  store i32 %add12, i32* %arrayidx15, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %14 = load i32, i32* %I, align 4
+  %add16 = add nsw i32 %14, 4
+  store i32 %add16, i32* %I, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  store i32 0, i32* %i, align 4
+  br label %for.cond17
+
+for.cond17:                                       ; preds = %for.inc22, %for.end
+  %15 = load i32, i32* %i, align 4
+  %16 = load i32, i32* %N, align 4
+  %cmp18 = icmp slt i32 %15, %16
+  br i1 %cmp18, label %for.body19, label %for.end23
+
+for.body19:                                       ; preds = %for.cond17
+  %17 = load i32, i32* %i, align 4
+  %idxprom20 = sext i32 %17 to i64
+  %arrayidx21 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom20
+  %18 = load i32, i32* %arrayidx21, align 4
+  %call = call i32 (i8*, ...) @printf(i8* noundef getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i64 0, i64 0), i32 noundef %18)
+  br label %for.inc22
+
+for.inc22:                                        ; preds = %for.body19
+  %19 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %19, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond17, !llvm.loop !6
+
+for.end23:                                        ; preds = %for.cond17
+  ret i32 0
+}
+
+declare dso_local i32 @printf(i8* noundef, ...) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 973cbf86551cd15a9cb85bbb0ad01f2406ccb62d)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}

>From 7f049514ee22563de5f8817412efd6d7d83109cf Mon Sep 17 00:00:00 2001
From: Shravan Kumar <shkumar at habana.ai>
Date: Fri, 1 Jul 2022 09:53:46 +0300
Subject: [PATCH 4/8] Adding scev pass

---
 llvm/lib/Transforms/Scev/Scev.cpp |  71 ++++++++++------
 tests/scev/out.ll                 |  40 ++++-----
 tests/scev/out1.ll                |  36 ++++-----
 tests/scev/scev.c                 |  14 ++--
 tests/scev/scev.ll                |  40 ++++-----
 tests/scev/scev1.c                |  19 +++++
 tests/scev/scev1.ll               | 120 +++++++++++++++++++++++++++
 tests/scev/scev1_1.ll             |  86 ++++++++++++++++++++
 tests/scev/scev1_2.ll             |  86 ++++++++++++++++++++
 tests/scev/scev_negative.c        |  19 +++++
 tests/scev/scev_negative.ll       | 130 ++++++++++++++++++++++++++++++
 tests/scev/scev_negative_out.ll   |  92 +++++++++++++++++++++
 tests/scev/scev_negative_out1.ll  |  92 +++++++++++++++++++++
 13 files changed, 754 insertions(+), 91 deletions(-)
 create mode 100644 tests/scev/scev1.c
 create mode 100644 tests/scev/scev1.ll
 create mode 100644 tests/scev/scev1_1.ll
 create mode 100644 tests/scev/scev1_2.ll
 create mode 100644 tests/scev/scev_negative.c
 create mode 100644 tests/scev/scev_negative.ll
 create mode 100644 tests/scev/scev_negative_out.ll
 create mode 100644 tests/scev/scev_negative_out1.ll

diff --git a/llvm/lib/Transforms/Scev/Scev.cpp b/llvm/lib/Transforms/Scev/Scev.cpp
index 3c7aaaf05c24a8..7f004a6a971df6 100644
--- a/llvm/lib/Transforms/Scev/Scev.cpp
+++ b/llvm/lib/Transforms/Scev/Scev.cpp
@@ -1,8 +1,8 @@
 /*===- Scev.cpp -Creates and Simplifies Recurrences for ‘Expressions involving
 Induction Variables’ Algorithm:
 1. Get ScalarEvolution object.
-2. Use getSCEV for the pointer operands
-3. Take the scev pointer base
+2. Use getSCEV for the pointer operands.
+3. Take the scev pointer base.
 4. Subtract scev with scev pointer base to get the SCEVAddRecExpr(DiffVal).
 eg:{8,+,16}<nuw><nsw><%for.cond>
 5. This SCEVAddRecExpr will contain the required indices and Extract it. eg : 8
@@ -10,10 +10,11 @@ eg:{8,+,16}<nuw><nsw><%for.cond>
 7. Sorting the Offset vector values.
 8. Get the BB of store instruction and using that get the terminator
 instruction.
-9. Move all store instructions one by one before terminator instruction.
+9. Move all store instructions and corresponding operands one by one before terminator instruction.
 ===-------------------------------------------------------------------------------------------===*/
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -44,35 +45,40 @@ struct Scev : public FunctionPass {
     SCEV *ScevVal, *BasePtr, *DiffVal, *GetEle, *TempPtr = nullptr;
     SmallVector<int> OffSet;
     llvm::DenseMap<int, Instruction *> StoreInsts;
+    //llvm::DenseMap<SCEV *, SmallVector <Instruction *> > BasePtrMap;
+    Instruction *Inst;
 
     int Value = 0;
 
     // Store the index and corresponding Store instruction in StoreInsts map
     for (BasicBlock &BB : F) {
       for (Instruction &I : BB) {
-        if (auto *Store = dyn_cast<StoreInst>(&I)) {
-          if (auto *Gep =
-                  dyn_cast<GetElementPtrInst>(Store->getPointerOperand())) {
-            ScevVal = const_cast<SCEV *>(SE.getSCEV(Gep));
-            if ((BasePtr = const_cast<SCEV *>(SE.getPointerBase(ScevVal)))) {
-              if (TempPtr == nullptr)
-                TempPtr = BasePtr;
-              else if (TempPtr != BasePtr) {
-                LLVM_DEBUG(dbgs()
-                           << "\nBasePointers are not same, stopping the pass");
-                continue;
-              }
-              DiffVal = const_cast<SCEV *>(SE.getMinusSCEV(ScevVal, BasePtr));
-              // Get the index of scev
-              if (SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(DiffVal)) {
-                if ((GetEle = const_cast<SCEV *>(AddRec->getStart()))) {
-                  if (SCEVConstant *BConst = dyn_cast<SCEVConstant>(GetEle)) {
-                    ConstantInt *CI = BConst->getValue();
-                    Value = CI->getSExtValue();
-                  }
-                  OffSet.push_back(Value);
-                  StoreInsts[Value] = &I;
+        if (!isa<StoreInst>(I)) {
+          continue;
+        }
+        auto *Store = dyn_cast<StoreInst>(&I);
+        if (auto *Gep =
+                dyn_cast<GetElementPtrInst>(Store->getPointerOperand())) {
+          ScevVal = const_cast<SCEV *>(SE.getSCEV(Gep));
+          if ((BasePtr = const_cast<SCEV *>(SE.getPointerBase(ScevVal)))) {
+            // BasePtrMap[BasePtr].push_back(Store);
+            if (TempPtr == nullptr)
+              TempPtr = BasePtr;
+            else if (TempPtr != BasePtr) {
+              LLVM_DEBUG(dbgs()
+                         << "\nBasePointers are not same, stopping the pass");
+              continue;
+            }
+            DiffVal = const_cast<SCEV *>(SE.getMinusSCEV(ScevVal, BasePtr));
+            // Get the index of scev
+            if (SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(DiffVal)) {
+              if ((GetEle = const_cast<SCEV *>(AddRec->getStart()))) {
+                if (SCEVConstant *BConst = dyn_cast<SCEVConstant>(GetEle)) {
+                  ConstantInt *CI = BConst->getValue();
+                  Value = CI->getSExtValue();
                 }
+                OffSet.push_back(Value);
+                StoreInsts[Value] = &I;
               }
             }
           }
@@ -80,6 +86,14 @@ struct Scev : public FunctionPass {
       }
     }
 
+    // If the vector is already sorted, there is no use in continuing; stop the pass.
+    if (std::is_sorted(OffSet.begin(), OffSet.end())) {
+      LLVM_DEBUG(
+          dbgs()
+          << "\nScave values are already in sorted order.Exiting the pass");
+          return true;
+    }
+
     // Sorting the Offset vector values
     std::sort(OffSet.begin(), OffSet.end());
 
@@ -88,10 +102,15 @@ struct Scev : public FunctionPass {
     BasicBlock *StoreInstBB = StoreInsts[OffSet[0]]->getParent();
     Instruction *LastInst = StoreInstBB->getTerminator();
 
-    // Move all store instructions one by one before terminator instruction
+    // Move all store instructions and corresponding operands one by one before terminator instruction
     if (OffSet.size() != 0) {
       for (auto V = OffSet.begin(), E = OffSet.end(); V != E; V = V + 1) {
         StoreInsts[*V]->moveBefore(LastInst);
+        for (Use &U : StoreInsts[*V]->operands()) {
+          llvm::Value *Val = U.get();
+          Inst = dyn_cast<Instruction>(Val);
+          Inst->moveBefore(StoreInsts[*V]);
+        }
       }
     }
     return false;
diff --git a/tests/scev/out.ll b/tests/scev/out.ll
index d172d379a14edf..99bce0e775e908 100644
--- a/tests/scev/out.ll
+++ b/tests/scev/out.ll
@@ -18,25 +18,25 @@ for.cond:                                         ; preds = %for.inc, %entry
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
-  %0 = add nuw nsw i64 %indvars.iv, 10
-  %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %indvars.iv
-  %1 = trunc i64 %0 to i32
-  store i32 %1, i32* %arrayidx, align 16
-  %2 = add nuw nsw i64 %indvars.iv, 11
-  %3 = or i64 %indvars.iv, 1
-  %arrayidx5 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %3
-  %4 = trunc i64 %2 to i32
-  store i32 %4, i32* %arrayidx5, align 4
-  %5 = add nuw nsw i64 %indvars.iv, 12
-  %6 = or i64 %indvars.iv, 2
-  %arrayidx10 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %6
-  %7 = trunc i64 %5 to i32
-  store i32 %7, i32* %arrayidx10, align 8
-  %8 = add nuw nsw i64 %indvars.iv, 13
-  %9 = or i64 %indvars.iv, 3
-  %arrayidx15 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %9
-  %10 = trunc i64 %8 to i32
-  store i32 %10, i32* %arrayidx15, align 4
+  %0 = add nuw nsw i64 %indvars.iv, 12
+  %1 = or i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %1
+  %2 = trunc i64 %0 to i32
+  store i32 %2, i32* %arrayidx, align 8
+  %3 = add nuw nsw i64 %indvars.iv, 11
+  %4 = or i64 %indvars.iv, 1
+  %arrayidx7 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %4
+  %5 = trunc i64 %3 to i32
+  store i32 %5, i32* %arrayidx7, align 4
+  %6 = add nuw nsw i64 %indvars.iv, 13
+  %7 = or i64 %indvars.iv, 3
+  %arrayidx12 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %7
+  %8 = trunc i64 %6 to i32
+  store i32 %8, i32* %arrayidx12, align 4
+  %9 = add nuw nsw i64 %indvars.iv, 10
+  %arrayidx15 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %indvars.iv
+  %10 = trunc i64 %9 to i32
+  store i32 %10, i32* %arrayidx15, align 16
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body
@@ -77,7 +77,7 @@ attributes #2 = { nounwind }
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 7, !"uwtable", i32 1}
 !2 = !{i32 7, !"frame-pointer", i32 2}
-!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 973cbf86551cd15a9cb85bbb0ad01f2406ccb62d)"}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 8e78085d22f2ac489f95a76f7e2dcfb7d832e9b8)"}
 !4 = distinct !{!4, !5}
 !5 = !{!"llvm.loop.mustprogress"}
 !6 = distinct !{!6, !5}
diff --git a/tests/scev/out1.ll b/tests/scev/out1.ll
index 7afff529e3a892..c4e29e27ee4d43 100644
--- a/tests/scev/out1.ll
+++ b/tests/scev/out1.ll
@@ -18,25 +18,25 @@ for.cond:                                         ; preds = %for.inc, %entry
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
-  %0 = add nuw nsw i64 %indvars.iv, 10
-  %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %indvars.iv
-  %1 = trunc i64 %0 to i32
+  %0 = add nuw nsw i64 %indvars.iv, 12
+  %1 = or i64 %indvars.iv, 2
   %2 = add nuw nsw i64 %indvars.iv, 11
   %3 = or i64 %indvars.iv, 1
-  %arrayidx5 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %3
-  %4 = trunc i64 %2 to i32
-  %5 = add nuw nsw i64 %indvars.iv, 12
-  %6 = or i64 %indvars.iv, 2
-  %arrayidx10 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %6
-  %7 = trunc i64 %5 to i32
-  %8 = add nuw nsw i64 %indvars.iv, 13
-  %9 = or i64 %indvars.iv, 3
-  %arrayidx15 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %9
-  %10 = trunc i64 %8 to i32
-  store i32 %1, i32* %arrayidx, align 16
-  store i32 %4, i32* %arrayidx5, align 4
-  store i32 %7, i32* %arrayidx10, align 8
-  store i32 %10, i32* %arrayidx15, align 4
+  %4 = add nuw nsw i64 %indvars.iv, 13
+  %5 = or i64 %indvars.iv, 3
+  %6 = add nuw nsw i64 %indvars.iv, 10
+  %7 = trunc i64 %6 to i32
+  %arrayidx15 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %indvars.iv
+  store i32 %7, i32* %arrayidx15, align 16
+  %8 = trunc i64 %2 to i32
+  %arrayidx7 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %3
+  store i32 %8, i32* %arrayidx7, align 4
+  %9 = trunc i64 %0 to i32
+  %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %1
+  store i32 %9, i32* %arrayidx, align 8
+  %10 = trunc i64 %4 to i32
+  %arrayidx12 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %5
+  store i32 %10, i32* %arrayidx12, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body
@@ -77,7 +77,7 @@ attributes #2 = { nounwind }
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 7, !"uwtable", i32 1}
 !2 = !{i32 7, !"frame-pointer", i32 2}
-!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 973cbf86551cd15a9cb85bbb0ad01f2406ccb62d)"}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 8e78085d22f2ac489f95a76f7e2dcfb7d832e9b8)"}
 !4 = distinct !{!4, !5}
 !5 = !{!"llvm.loop.mustprogress"}
 !6 = distinct !{!6, !5}
diff --git a/tests/scev/scev.c b/tests/scev/scev.c
index 2b9fb9aba8b2df..27247470576806 100644
--- a/tests/scev/scev.c
+++ b/tests/scev/scev.c
@@ -11,16 +11,16 @@ int main()
 
  int C = 10,A[10],N=5,B[12];
  for(int I = 0; I < N; I += 4) {
-  // A[I+2] = C+I+2;
-  // A[I+1] = C+I+1;
+  A[I+2] = C+I+2;
+  A[I+1] = C+I+1;
 
 
-  // A[I+3] = C+I+3;
-  // A[I] = C+I;
-  A[I] = C+I;
-  A[I+1] = C+I+1;
-  A[I+2] = C+I+2;
   A[I+3] = C+I+3;
+  A[I] = C+I;
+  // A[I] = C+I;
+  // A[I+1] = C+I+1;
+  // A[I+2] = C+I+2;
+  // A[I+3] = C+I+3;
 
 }
 for (int i=0; i<N;i++) {
diff --git a/tests/scev/scev.ll b/tests/scev/scev.ll
index 2054e81e975ddb..24b364067e5e5f 100644
--- a/tests/scev/scev.ll
+++ b/tests/scev/scev.ll
@@ -32,37 +32,37 @@ for.body:                                         ; preds = %for.cond
   %2 = load i32, i32* %C, align 4
   %3 = load i32, i32* %I, align 4
   %add = add nsw i32 %2, %3
+  %add1 = add nsw i32 %add, 2
   %4 = load i32, i32* %I, align 4
-  %idxprom = sext i32 %4 to i64
+  %add2 = add nsw i32 %4, 2
+  %idxprom = sext i32 %add2 to i64
   %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom
-  store i32 %add, i32* %arrayidx, align 4
+  store i32 %add1, i32* %arrayidx, align 4
   %5 = load i32, i32* %C, align 4
   %6 = load i32, i32* %I, align 4
-  %add1 = add nsw i32 %5, %6
-  %add2 = add nsw i32 %add1, 1
+  %add3 = add nsw i32 %5, %6
+  %add4 = add nsw i32 %add3, 1
   %7 = load i32, i32* %I, align 4
-  %add3 = add nsw i32 %7, 1
-  %idxprom4 = sext i32 %add3 to i64
-  %arrayidx5 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom4
-  store i32 %add2, i32* %arrayidx5, align 4
+  %add5 = add nsw i32 %7, 1
+  %idxprom6 = sext i32 %add5 to i64
+  %arrayidx7 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom6
+  store i32 %add4, i32* %arrayidx7, align 4
   %8 = load i32, i32* %C, align 4
   %9 = load i32, i32* %I, align 4
-  %add6 = add nsw i32 %8, %9
-  %add7 = add nsw i32 %add6, 2
+  %add8 = add nsw i32 %8, %9
+  %add9 = add nsw i32 %add8, 3
   %10 = load i32, i32* %I, align 4
-  %add8 = add nsw i32 %10, 2
-  %idxprom9 = sext i32 %add8 to i64
-  %arrayidx10 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom9
-  store i32 %add7, i32* %arrayidx10, align 4
+  %add10 = add nsw i32 %10, 3
+  %idxprom11 = sext i32 %add10 to i64
+  %arrayidx12 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom11
+  store i32 %add9, i32* %arrayidx12, align 4
   %11 = load i32, i32* %C, align 4
   %12 = load i32, i32* %I, align 4
-  %add11 = add nsw i32 %11, %12
-  %add12 = add nsw i32 %add11, 3
+  %add13 = add nsw i32 %11, %12
   %13 = load i32, i32* %I, align 4
-  %add13 = add nsw i32 %13, 3
-  %idxprom14 = sext i32 %add13 to i64
+  %idxprom14 = sext i32 %13 to i64
   %arrayidx15 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom14
-  store i32 %add12, i32* %arrayidx15, align 4
+  store i32 %add13, i32* %arrayidx15, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body
@@ -110,7 +110,7 @@ attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protect
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 7, !"uwtable", i32 1}
 !2 = !{i32 7, !"frame-pointer", i32 2}
-!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 973cbf86551cd15a9cb85bbb0ad01f2406ccb62d)"}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 8e78085d22f2ac489f95a76f7e2dcfb7d832e9b8)"}
 !4 = distinct !{!4, !5}
 !5 = !{!"llvm.loop.mustprogress"}
 !6 = distinct !{!6, !5}
diff --git a/tests/scev/scev1.c b/tests/scev/scev1.c
new file mode 100644
index 00000000000000..68210a4cc2d8fd
--- /dev/null
+++ b/tests/scev/scev1.c
@@ -0,0 +1,19 @@
+#include <stdio.h>
+#include <stdlib.h>
+int Z = 5;
+
+int main() {
+  int A[100], B[100];
+  int N = 20;
+  int C = 10;
+  for (int I = 0; I < N; I += 4) {
+    A[I + 2] = C + I + 2;
+    B[I + 1] = C + I + 1;
+    A[I + 3] = C + I + 3;
+    A[I] = C + I;
+  }
+  for (int I = 0; I < N; I++) {
+    printf("%d%d", A[I], B[I]);
+  }
+  return 0;
+}
diff --git a/tests/scev/scev1.ll b/tests/scev/scev1.ll
new file mode 100644
index 00000000000000..1bcf5ba88bb206
--- /dev/null
+++ b/tests/scev/scev1.ll
@@ -0,0 +1,120 @@
+; ModuleID = 'scev1.c'
+source_filename = "scev1.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at Z = dso_local global i32 5, align 4
+ at .str = private unnamed_addr constant [5 x i8] c"%d%d\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  %A = alloca [100 x i32], align 16
+  %B = alloca [100 x i32], align 16
+  %N = alloca i32, align 4
+  %C = alloca i32, align 4
+  %I = alloca i32, align 4
+  %I17 = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  store i32 20, i32* %N, align 4
+  store i32 10, i32* %C, align 4
+  store i32 0, i32* %I, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %I, align 4
+  %1 = load i32, i32* %N, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, i32* %C, align 4
+  %3 = load i32, i32* %I, align 4
+  %add = add nsw i32 %2, %3
+  %add1 = add nsw i32 %add, 2
+  %4 = load i32, i32* %I, align 4
+  %add2 = add nsw i32 %4, 2
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %idxprom
+  store i32 %add1, i32* %arrayidx, align 4
+  %5 = load i32, i32* %C, align 4
+  %6 = load i32, i32* %I, align 4
+  %add3 = add nsw i32 %5, %6
+  %add4 = add nsw i32 %add3, 1
+  %7 = load i32, i32* %I, align 4
+  %add5 = add nsw i32 %7, 1
+  %idxprom6 = sext i32 %add5 to i64
+  %arrayidx7 = getelementptr inbounds [100 x i32], [100 x i32]* %B, i64 0, i64 %idxprom6
+  store i32 %add4, i32* %arrayidx7, align 4
+  %8 = load i32, i32* %C, align 4
+  %9 = load i32, i32* %I, align 4
+  %add8 = add nsw i32 %8, %9
+  %add9 = add nsw i32 %add8, 3
+  %10 = load i32, i32* %I, align 4
+  %add10 = add nsw i32 %10, 3
+  %idxprom11 = sext i32 %add10 to i64
+  %arrayidx12 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %idxprom11
+  store i32 %add9, i32* %arrayidx12, align 4
+  %11 = load i32, i32* %C, align 4
+  %12 = load i32, i32* %I, align 4
+  %add13 = add nsw i32 %11, %12
+  %13 = load i32, i32* %I, align 4
+  %idxprom14 = sext i32 %13 to i64
+  %arrayidx15 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %idxprom14
+  store i32 %add13, i32* %arrayidx15, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %14 = load i32, i32* %I, align 4
+  %add16 = add nsw i32 %14, 4
+  store i32 %add16, i32* %I, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  store i32 0, i32* %I17, align 4
+  br label %for.cond18
+
+for.cond18:                                       ; preds = %for.inc25, %for.end
+  %15 = load i32, i32* %I17, align 4
+  %16 = load i32, i32* %N, align 4
+  %cmp19 = icmp slt i32 %15, %16
+  br i1 %cmp19, label %for.body20, label %for.end26
+
+for.body20:                                       ; preds = %for.cond18
+  %17 = load i32, i32* %I17, align 4
+  %idxprom21 = sext i32 %17 to i64
+  %arrayidx22 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %idxprom21
+  %18 = load i32, i32* %arrayidx22, align 4
+  %19 = load i32, i32* %I17, align 4
+  %idxprom23 = sext i32 %19 to i64
+  %arrayidx24 = getelementptr inbounds [100 x i32], [100 x i32]* %B, i64 0, i64 %idxprom23
+  %20 = load i32, i32* %arrayidx24, align 4
+  %call = call i32 (i8*, ...) @printf(i8* noundef getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i32 noundef %18, i32 noundef %20)
+  br label %for.inc25
+
+for.inc25:                                        ; preds = %for.body20
+  %21 = load i32, i32* %I17, align 4
+  %inc = add nsw i32 %21, 1
+  store i32 %inc, i32* %I17, align 4
+  br label %for.cond18, !llvm.loop !6
+
+for.end26:                                        ; preds = %for.cond18
+  ret i32 0
+}
+
+declare dso_local i32 @printf(i8* noundef, ...) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 973cbf86551cd15a9cb85bbb0ad01f2406ccb62d)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/scev/scev1_1.ll b/tests/scev/scev1_1.ll
new file mode 100644
index 00000000000000..1f3db930596144
--- /dev/null
+++ b/tests/scev/scev1_1.ll
@@ -0,0 +1,86 @@
+; ModuleID = 'scev1.ll'
+source_filename = "scev1.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at Z = dso_local global i32 5, align 4
+ at .str = private unnamed_addr constant [5 x i8] c"%d%d\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @main() #0 {
+entry:
+  %A = alloca [100 x i32], align 16
+  %B = alloca [100 x i32], align 16
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %cmp = icmp ult i64 %indvars.iv, 20
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = add nuw nsw i64 %indvars.iv, 12
+  %1 = or i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %1
+  %2 = trunc i64 %0 to i32
+  store i32 %2, i32* %arrayidx, align 8
+  %3 = add nuw nsw i64 %indvars.iv, 11
+  %4 = or i64 %indvars.iv, 1
+  %arrayidx7 = getelementptr inbounds [100 x i32], [100 x i32]* %B, i64 0, i64 %4
+  %5 = trunc i64 %3 to i32
+  store i32 %5, i32* %arrayidx7, align 4
+  %6 = add nuw nsw i64 %indvars.iv, 13
+  %7 = or i64 %indvars.iv, 3
+  %arrayidx12 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %7
+  %8 = trunc i64 %6 to i32
+  store i32 %8, i32* %arrayidx12, align 4
+  %9 = add nuw nsw i64 %indvars.iv, 10
+  %arrayidx15 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %indvars.iv
+  %10 = trunc i64 %9 to i32
+  store i32 %10, i32* %arrayidx15, align 16
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  br label %for.cond18
+
+for.cond18:                                       ; preds = %for.inc25, %for.end
+  %indvars.iv11 = phi i64 [ %indvars.iv.next12, %for.inc25 ], [ 0, %for.end ]
+  %exitcond = icmp ne i64 %indvars.iv11, 20
+  br i1 %exitcond, label %for.body20, label %for.end26
+
+for.body20:                                       ; preds = %for.cond18
+  %arrayidx22 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %indvars.iv11
+  %i = load i32, i32* %arrayidx22, align 4
+  %arrayidx24 = getelementptr inbounds [100 x i32], [100 x i32]* %B, i64 0, i64 %indvars.iv11
+  %i1 = load i32, i32* %arrayidx24, align 4
+  %call = call i32 (i8*, ...) @printf(i8* noundef nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i32 noundef %i, i32 noundef %i1) #2
+  br label %for.inc25
+
+for.inc25:                                        ; preds = %for.body20
+  %indvars.iv.next12 = add nuw nsw i64 %indvars.iv11, 1
+  br label %for.cond18, !llvm.loop !6
+
+for.end26:                                        ; preds = %for.cond18
+  ret i32 0
+}
+
+declare dso_local i32 @printf(i8* noundef, ...) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 973cbf86551cd15a9cb85bbb0ad01f2406ccb62d)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/scev/scev1_2.ll b/tests/scev/scev1_2.ll
new file mode 100644
index 00000000000000..b60e791c1ca765
--- /dev/null
+++ b/tests/scev/scev1_2.ll
@@ -0,0 +1,86 @@
+; ModuleID = 'scev1_1.ll'
+source_filename = "scev1.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at Z = dso_local global i32 5, align 4
+ at .str = private unnamed_addr constant [5 x i8] c"%d%d\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @main() #0 {
+entry:
+  %A = alloca [100 x i32], align 16
+  %B = alloca [100 x i32], align 16
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %cmp = icmp ult i64 %indvars.iv, 20
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = add nuw nsw i64 %indvars.iv, 12
+  %1 = or i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %1
+  %2 = trunc i64 %0 to i32
+  %3 = add nuw nsw i64 %indvars.iv, 11
+  %4 = or i64 %indvars.iv, 1
+  %arrayidx7 = getelementptr inbounds [100 x i32], [100 x i32]* %B, i64 0, i64 %4
+  %5 = trunc i64 %3 to i32
+  store i32 %5, i32* %arrayidx7, align 4
+  %6 = add nuw nsw i64 %indvars.iv, 13
+  %7 = or i64 %indvars.iv, 3
+  %arrayidx12 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %7
+  %8 = trunc i64 %6 to i32
+  %9 = add nuw nsw i64 %indvars.iv, 10
+  %arrayidx15 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %indvars.iv
+  %10 = trunc i64 %9 to i32
+  store i32 %10, i32* %arrayidx15, align 16
+  store i32 %2, i32* %arrayidx, align 8
+  store i32 %8, i32* %arrayidx12, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  br label %for.cond18
+
+for.cond18:                                       ; preds = %for.inc25, %for.end
+  %indvars.iv11 = phi i64 [ %indvars.iv.next12, %for.inc25 ], [ 0, %for.end ]
+  %exitcond = icmp ne i64 %indvars.iv11, 20
+  br i1 %exitcond, label %for.body20, label %for.end26
+
+for.body20:                                       ; preds = %for.cond18
+  %arrayidx22 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 0, i64 %indvars.iv11
+  %i = load i32, i32* %arrayidx22, align 4
+  %arrayidx24 = getelementptr inbounds [100 x i32], [100 x i32]* %B, i64 0, i64 %indvars.iv11
+  %i1 = load i32, i32* %arrayidx24, align 4
+  %call = call i32 (i8*, ...) @printf(i8* noundef nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i32 noundef %i, i32 noundef %i1) #2
+  br label %for.inc25
+
+for.inc25:                                        ; preds = %for.body20
+  %indvars.iv.next12 = add nuw nsw i64 %indvars.iv11, 1
+  br label %for.cond18, !llvm.loop !6
+
+for.end26:                                        ; preds = %for.cond18
+  ret i32 0
+}
+
+declare dso_local i32 @printf(i8* noundef, ...) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 973cbf86551cd15a9cb85bbb0ad01f2406ccb62d)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/scev/scev_negative.c b/tests/scev/scev_negative.c
new file mode 100644
index 00000000000000..b91f1f3f6b00ef
--- /dev/null
+++ b/tests/scev/scev_negative.c
@@ -0,0 +1,19 @@
+#include <stdio.h>
+#include <stdlib.h>
+int Z = 5;
+
+int main() {
+
+  int C = 10, A[10], N = 5, B[12], E[12], D[12];
+  for (int I = 0; I < N; I += 4) {
+    D[I + 2] = C + I + 2;
+    B[I + 1] = C + I + 1;
+    E[I + 3] = C + I + 3;
+    A[I] = C + I;
+  }
+  for (int i = 0; i < N; i++) {
+    printf("%d%d%d%d", A[i],B[i],E[i],D[i]);
+  }
+  // print(A,B);
+  return 0;
+}
diff --git a/tests/scev/scev_negative.ll b/tests/scev/scev_negative.ll
new file mode 100644
index 00000000000000..6af7bac6d26cc0
--- /dev/null
+++ b/tests/scev/scev_negative.ll
@@ -0,0 +1,130 @@
+; ModuleID = 'scev_negative.c'
+source_filename = "scev_negative.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at Z = dso_local global i32 5, align 4
+ at .str = private unnamed_addr constant [9 x i8] c"%d%d%d%d\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  %C = alloca i32, align 4
+  %A = alloca [10 x i32], align 16
+  %N = alloca i32, align 4
+  %B = alloca [12 x i32], align 16
+  %E = alloca [12 x i32], align 16
+  %D = alloca [12 x i32], align 16
+  %I = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  store i32 10, i32* %C, align 4
+  store i32 5, i32* %N, align 4
+  store i32 0, i32* %I, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %I, align 4
+  %1 = load i32, i32* %N, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, i32* %C, align 4
+  %3 = load i32, i32* %I, align 4
+  %add = add nsw i32 %2, %3
+  %add1 = add nsw i32 %add, 2
+  %4 = load i32, i32* %I, align 4
+  %add2 = add nsw i32 %4, 2
+  %idxprom = sext i32 %add2 to i64
+  %arrayidx = getelementptr inbounds [12 x i32], [12 x i32]* %D, i64 0, i64 %idxprom
+  store i32 %add1, i32* %arrayidx, align 4
+  %5 = load i32, i32* %C, align 4
+  %6 = load i32, i32* %I, align 4
+  %add3 = add nsw i32 %5, %6
+  %add4 = add nsw i32 %add3, 1
+  %7 = load i32, i32* %I, align 4
+  %add5 = add nsw i32 %7, 1
+  %idxprom6 = sext i32 %add5 to i64
+  %arrayidx7 = getelementptr inbounds [12 x i32], [12 x i32]* %B, i64 0, i64 %idxprom6
+  store i32 %add4, i32* %arrayidx7, align 4
+  %8 = load i32, i32* %C, align 4
+  %9 = load i32, i32* %I, align 4
+  %add8 = add nsw i32 %8, %9
+  %add9 = add nsw i32 %add8, 3
+  %10 = load i32, i32* %I, align 4
+  %add10 = add nsw i32 %10, 3
+  %idxprom11 = sext i32 %add10 to i64
+  %arrayidx12 = getelementptr inbounds [12 x i32], [12 x i32]* %E, i64 0, i64 %idxprom11
+  store i32 %add9, i32* %arrayidx12, align 4
+  %11 = load i32, i32* %C, align 4
+  %12 = load i32, i32* %I, align 4
+  %add13 = add nsw i32 %11, %12
+  %13 = load i32, i32* %I, align 4
+  %idxprom14 = sext i32 %13 to i64
+  %arrayidx15 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom14
+  store i32 %add13, i32* %arrayidx15, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %14 = load i32, i32* %I, align 4
+  %add16 = add nsw i32 %14, 4
+  store i32 %add16, i32* %I, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  store i32 0, i32* %i, align 4
+  br label %for.cond17
+
+for.cond17:                                       ; preds = %for.inc28, %for.end
+  %15 = load i32, i32* %i, align 4
+  %16 = load i32, i32* %N, align 4
+  %cmp18 = icmp slt i32 %15, %16
+  br i1 %cmp18, label %for.body19, label %for.end29
+
+for.body19:                                       ; preds = %for.cond17
+  %17 = load i32, i32* %i, align 4
+  %idxprom20 = sext i32 %17 to i64
+  %arrayidx21 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %idxprom20
+  %18 = load i32, i32* %arrayidx21, align 4
+  %19 = load i32, i32* %i, align 4
+  %idxprom22 = sext i32 %19 to i64
+  %arrayidx23 = getelementptr inbounds [12 x i32], [12 x i32]* %B, i64 0, i64 %idxprom22
+  %20 = load i32, i32* %arrayidx23, align 4
+  %21 = load i32, i32* %i, align 4
+  %idxprom24 = sext i32 %21 to i64
+  %arrayidx25 = getelementptr inbounds [12 x i32], [12 x i32]* %E, i64 0, i64 %idxprom24
+  %22 = load i32, i32* %arrayidx25, align 4
+  %23 = load i32, i32* %i, align 4
+  %idxprom26 = sext i32 %23 to i64
+  %arrayidx27 = getelementptr inbounds [12 x i32], [12 x i32]* %D, i64 0, i64 %idxprom26
+  %24 = load i32, i32* %arrayidx27, align 4
+  %call = call i32 (i8*, ...) @printf(i8* noundef getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i64 0, i64 0), i32 noundef %18, i32 noundef %20, i32 noundef %22, i32 noundef %24)
+  br label %for.inc28
+
+for.inc28:                                        ; preds = %for.body19
+  %25 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %25, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond17, !llvm.loop !6
+
+for.end29:                                        ; preds = %for.cond17
+  ret i32 0
+}
+
+declare dso_local i32 @printf(i8* noundef, ...) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 8e78085d22f2ac489f95a76f7e2dcfb7d832e9b8)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/scev/scev_negative_out.ll b/tests/scev/scev_negative_out.ll
new file mode 100644
index 00000000000000..83b1de26729d5a
--- /dev/null
+++ b/tests/scev/scev_negative_out.ll
@@ -0,0 +1,92 @@
+; ModuleID = 'scev_negative.ll'
+source_filename = "scev_negative.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at Z = dso_local global i32 5, align 4
+ at .str = private unnamed_addr constant [9 x i8] c"%d%d%d%d\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @main() #0 {
+entry:
+  %A = alloca [10 x i32], align 16
+  %B = alloca [12 x i32], align 16
+  %E = alloca [12 x i32], align 16
+  %D = alloca [12 x i32], align 16
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %cmp = icmp ult i64 %indvars.iv, 5
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = add nuw nsw i64 %indvars.iv, 12
+  %1 = or i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds [12 x i32], [12 x i32]* %D, i64 0, i64 %1
+  %2 = trunc i64 %0 to i32
+  store i32 %2, i32* %arrayidx, align 8
+  %3 = add nuw nsw i64 %indvars.iv, 11
+  %4 = or i64 %indvars.iv, 1
+  %arrayidx7 = getelementptr inbounds [12 x i32], [12 x i32]* %B, i64 0, i64 %4
+  %5 = trunc i64 %3 to i32
+  store i32 %5, i32* %arrayidx7, align 4
+  %6 = add nuw nsw i64 %indvars.iv, 13
+  %7 = or i64 %indvars.iv, 3
+  %arrayidx12 = getelementptr inbounds [12 x i32], [12 x i32]* %E, i64 0, i64 %7
+  %8 = trunc i64 %6 to i32
+  store i32 %8, i32* %arrayidx12, align 4
+  %9 = add nuw nsw i64 %indvars.iv, 10
+  %arrayidx15 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %indvars.iv
+  %10 = trunc i64 %9 to i32
+  store i32 %10, i32* %arrayidx15, align 16
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  br label %for.cond17
+
+for.cond17:                                       ; preds = %for.inc28, %for.end
+  %indvars.iv12 = phi i64 [ %indvars.iv.next13, %for.inc28 ], [ 0, %for.end ]
+  %exitcond = icmp ne i64 %indvars.iv12, 5
+  br i1 %exitcond, label %for.body19, label %for.end29
+
+for.body19:                                       ; preds = %for.cond17
+  %arrayidx21 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %indvars.iv12
+  %i = load i32, i32* %arrayidx21, align 4
+  %arrayidx23 = getelementptr inbounds [12 x i32], [12 x i32]* %B, i64 0, i64 %indvars.iv12
+  %i1 = load i32, i32* %arrayidx23, align 4
+  %arrayidx25 = getelementptr inbounds [12 x i32], [12 x i32]* %E, i64 0, i64 %indvars.iv12
+  %i2 = load i32, i32* %arrayidx25, align 4
+  %arrayidx27 = getelementptr inbounds [12 x i32], [12 x i32]* %D, i64 0, i64 %indvars.iv12
+  %i3 = load i32, i32* %arrayidx27, align 4
+  %call = call i32 (i8*, ...) @printf(i8* noundef nonnull dereferenceable(1) getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i64 0, i64 0), i32 noundef %i, i32 noundef %i1, i32 noundef %i2, i32 noundef %i3) #2
+  br label %for.inc28
+
+for.inc28:                                        ; preds = %for.body19
+  %indvars.iv.next13 = add nuw nsw i64 %indvars.iv12, 1
+  br label %for.cond17, !llvm.loop !6
+
+for.end29:                                        ; preds = %for.cond17
+  ret i32 0
+}
+
+declare dso_local i32 @printf(i8* noundef, ...) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 8e78085d22f2ac489f95a76f7e2dcfb7d832e9b8)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/scev/scev_negative_out1.ll b/tests/scev/scev_negative_out1.ll
new file mode 100644
index 00000000000000..21214d9a6b89a4
--- /dev/null
+++ b/tests/scev/scev_negative_out1.ll
@@ -0,0 +1,92 @@
+; ModuleID = 'scev_negative_out.ll'
+source_filename = "scev_negative.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at Z = dso_local global i32 5, align 4
+ at .str = private unnamed_addr constant [9 x i8] c"%d%d%d%d\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @main() #0 {
+entry:
+  %A = alloca [10 x i32], align 16
+  %B = alloca [12 x i32], align 16
+  %E = alloca [12 x i32], align 16
+  %D = alloca [12 x i32], align 16
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %cmp = icmp ult i64 %indvars.iv, 5
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = add nuw nsw i64 %indvars.iv, 12
+  %1 = or i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds [12 x i32], [12 x i32]* %D, i64 0, i64 %1
+  %2 = trunc i64 %0 to i32
+  store i32 %2, i32* %arrayidx, align 8
+  %3 = add nuw nsw i64 %indvars.iv, 11
+  %4 = or i64 %indvars.iv, 1
+  %arrayidx7 = getelementptr inbounds [12 x i32], [12 x i32]* %B, i64 0, i64 %4
+  %5 = trunc i64 %3 to i32
+  store i32 %5, i32* %arrayidx7, align 4
+  %6 = add nuw nsw i64 %indvars.iv, 13
+  %7 = or i64 %indvars.iv, 3
+  %arrayidx12 = getelementptr inbounds [12 x i32], [12 x i32]* %E, i64 0, i64 %7
+  %8 = trunc i64 %6 to i32
+  store i32 %8, i32* %arrayidx12, align 4
+  %9 = add nuw nsw i64 %indvars.iv, 10
+  %arrayidx15 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %indvars.iv
+  %10 = trunc i64 %9 to i32
+  store i32 %10, i32* %arrayidx15, align 16
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  br label %for.cond17
+
+for.cond17:                                       ; preds = %for.inc28, %for.end
+  %indvars.iv12 = phi i64 [ %indvars.iv.next13, %for.inc28 ], [ 0, %for.end ]
+  %exitcond = icmp ne i64 %indvars.iv12, 5
+  br i1 %exitcond, label %for.body19, label %for.end29
+
+for.body19:                                       ; preds = %for.cond17
+  %arrayidx21 = getelementptr inbounds [10 x i32], [10 x i32]* %A, i64 0, i64 %indvars.iv12
+  %i = load i32, i32* %arrayidx21, align 4
+  %arrayidx23 = getelementptr inbounds [12 x i32], [12 x i32]* %B, i64 0, i64 %indvars.iv12
+  %i1 = load i32, i32* %arrayidx23, align 4
+  %arrayidx25 = getelementptr inbounds [12 x i32], [12 x i32]* %E, i64 0, i64 %indvars.iv12
+  %i2 = load i32, i32* %arrayidx25, align 4
+  %arrayidx27 = getelementptr inbounds [12 x i32], [12 x i32]* %D, i64 0, i64 %indvars.iv12
+  %i3 = load i32, i32* %arrayidx27, align 4
+  %call = call i32 (i8*, ...) @printf(i8* noundef nonnull dereferenceable(1) getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i64 0, i64 0), i32 noundef %i, i32 noundef %i1, i32 noundef %i2, i32 noundef %i3) #2
+  br label %for.inc28
+
+for.inc28:                                        ; preds = %for.body19
+  %indvars.iv.next13 = add nuw nsw i64 %indvars.iv12, 1
+  br label %for.cond17, !llvm.loop !6
+
+for.end29:                                        ; preds = %for.cond17
+  ret i32 0
+}
+
+declare dso_local i32 @printf(i8* noundef, ...) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 8e78085d22f2ac489f95a76f7e2dcfb7d832e9b8)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}

>From 47ee914ea16086c1958b93540ed2351bcdae7cdb Mon Sep 17 00:00:00 2001
From: Shravan Kumar <shkumar at habana.ai>
Date: Thu, 7 Jul 2022 10:09:06 +0300
Subject: [PATCH 5/8] Adding Loop Fusion pass

---
 llvm/lib/Transforms/CMakeLists.txt            |   3 +-
 llvm/lib/Transforms/LoopFusion/CMakeLists.txt |  20 ++
 llvm/lib/Transforms/LoopFusion/LoopFusion.cpp | 206 ++++++++++++++++++
 .../Transforms/LoopFusion/LoopFusion.exports  |   0
 tests/CMakeLists.txt                          |   1 +
 tests/loop_fuse/.init.dot                     |  16 ++
 tests/loop_fuse/command.sh                    |  11 +
 tests/loop_fuse/loop_fuse.c                   |  10 +
 tests/loop_fuse/loop_fuse.ll                  | 103 +++++++++
 tests/loop_fuse/loop_fuse_out.ll              |  75 +++++++
 tests/loop_fuse/loop_fuse_out1.ll             |  61 ++++++
 tests/loop_fuse/negative_loop_fuse.c          |  10 +
 tests/loop_fuse/negative_loop_fuse.ll         | 103 +++++++++
 tests/loop_fuse/negative_loop_fuse_out.ll     |  76 +++++++
 tests/loop_fuse/negative_loop_fuse_out1.ll    |  76 +++++++
 15 files changed, 770 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/Transforms/LoopFusion/CMakeLists.txt
 create mode 100644 llvm/lib/Transforms/LoopFusion/LoopFusion.cpp
 create mode 100644 llvm/lib/Transforms/LoopFusion/LoopFusion.exports
 create mode 100644 tests/loop_fuse/.init.dot
 create mode 100644 tests/loop_fuse/command.sh
 create mode 100644 tests/loop_fuse/loop_fuse.c
 create mode 100644 tests/loop_fuse/loop_fuse.ll
 create mode 100644 tests/loop_fuse/loop_fuse_out.ll
 create mode 100644 tests/loop_fuse/loop_fuse_out1.ll
 create mode 100644 tests/loop_fuse/negative_loop_fuse.c
 create mode 100644 tests/loop_fuse/negative_loop_fuse.ll
 create mode 100644 tests/loop_fuse/negative_loop_fuse_out.ll
 create mode 100644 tests/loop_fuse/negative_loop_fuse_out1.ll

diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt
index 8ace411e1ca82b..6b165fd71dfcfb 100644
--- a/llvm/lib/Transforms/CMakeLists.txt
+++ b/llvm/lib/Transforms/CMakeLists.txt
@@ -10,4 +10,5 @@ add_subdirectory(ObjCARC)
 add_subdirectory(Coroutines)
 add_subdirectory(CFGuard)
 add_subdirectory(Cfcss)
-add_subdirectory(Scev)
\ No newline at end of file
+add_subdirectory(Scev)
+add_subdirectory(LoopFusion)
diff --git a/llvm/lib/Transforms/LoopFusion/CMakeLists.txt b/llvm/lib/Transforms/LoopFusion/CMakeLists.txt
new file mode 100644
index 00000000000000..6c0edac49c6ec2
--- /dev/null
+++ b/llvm/lib/Transforms/LoopFusion/CMakeLists.txt
@@ -0,0 +1,20 @@
+# If we don't need RTTI or EH, there's no reason to export anything
+# from the LoopFusion plugin.
+if( NOT LLVM_REQUIRES_RTTI )
+  if( NOT LLVM_REQUIRES_EH )
+    set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/LoopFusion.exports)
+  endif()
+endif()
+
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_library( LLVMLoopFusion MODULE BUILDTREE_ONLY
+LoopFusion.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/llvm/lib/Transforms/LoopFusion/LoopFusion.cpp b/llvm/lib/Transforms/LoopFusion/LoopFusion.cpp
new file mode 100644
index 00000000000000..f82d6e597d858b
--- /dev/null
+++ b/llvm/lib/Transforms/LoopFusion/LoopFusion.cpp
@@ -0,0 +1,206 @@
+/*===- LoopFusion.cpp -
+  This file implements a loop fusion pass for the LLVM compiler.
+Two loops that are adjacent and have the same condition and increment with
+respect to the loop variable may be fused, i.e., their bodies may be executed
+one after the other within a single loop. The decision to fuse the loops is
+based on the legality and profitability of the fusion. Fusion should not be
+performed if the resulting code has an anti-dependency or if the execution time
+of the program increases. Algorithm:
+1. Check whether the two loops can be fused.
+2. Replace the uses of the 2nd loop's induction variable with the 1st loop's.
+3. Combine the bodies of loop1 and loop2.
+4. Set the successor of the 1st loop's header to the exit block of the 2nd loop.
+5. Delete the unwanted basic blocks of the 2nd loop.
+===-------------------------------------------------------------------------------------------===*/
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "loopfusion"
+
+namespace {
+// LoopFusion - Function pass that tries to fuse two adjacent top-level loops.
+struct LoopFusion : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  LoopFusion() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override {
+    // Collect the top-level loops of F.
+    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    SmallVector<Loop *> LoopVector(LI->begin(), LI->end());
+
+    // Fusion needs two loops; indexing below would be out of bounds otherwise.
+    if (LoopVector.size() < 2)
+      return false;
+
+    // Index 1 is passed as the first loop (presumably LoopInfo lists it last).
+    if (!fuseCheck(LoopVector[1], LoopVector[0]))
+      return false;
+    fuseBody(LoopVector[1], LoopVector[0], F);
+
+    return true; // Report that the IR may have been modified.
+  }
+
+  void fuseBody(Loop *Loop1, Loop *Loop2, Function &F) {
+    // Rewires Loop1 so that each iteration also runs Loop2's body, then
+    // removes the blocks that became unreachable. Does nothing when either
+    // loop lacks one of the pieces looked up below.
+    BasicBlock *Body1 = getBody(Loop1);
+    BasicBlock *Body2 = getBody(Loop2);
+    BasicBlock *Header1 = getHeader(Loop1);
+    BasicBlock *Latch1 = getLoopLatch(Loop1);
+    BasicBlock *Exit2 = getLoopExit(Loop2);
+
+    PHINode *Phi1 = Loop1->getCanonicalInductionVariable();
+    PHINode *Phi2 = Loop2->getCanonicalInductionVariable();
+
+    // Bail out before touching the IR if any piece of the expected loop shape
+    // is missing; every pointer below is used unconditionally.
+    if (!Body1 || !Body2 || !Header1 || !Latch1 || !Exit2 || !Phi1 || !Phi2)
+      return;
+
+    auto *BodyBr1 = dyn_cast<BranchInst>(Body1->getTerminator());
+    auto *BodyBr2 = dyn_cast<BranchInst>(Body2->getTerminator());
+    auto *HeaderBr1 = dyn_cast<BranchInst>(Header1->getTerminator());
+    if (!BodyBr1 || !BodyBr2 || !HeaderBr1 || !HeaderBr1->isConditional())
+      return;
+
+    // Replace the use of induction variable of 2nd loop with that of 1st loop.
+    Phi2->replaceAllUsesWith(Phi1);
+
+    // Chain the bodies: Body1 -> Body2 -> Latch1.
+    BodyBr1->setSuccessor(0, Body2);
+    BodyBr2->setSuccessor(0, Latch1);
+    // Leave the fused loop through the second loop's exit block.
+    HeaderBr1->setSuccessor(1, Exit2);
+
+    // Function to remove un-wanted basic blocks.
+    EliminateUnreachableBlocks(F);
+  }
+
+  // Returns the first block of L that is neither its header nor a latch.
+  BasicBlock *getBody(Loop *L) {
+    BasicBlock *const Header = L->getHeader();
+    for (BasicBlock *Candidate : L->getBlocks()) {
+      if (Candidate == Header || L->isLoopLatch(Candidate))
+        continue; // Neither the header nor a latch counts as the body.
+      return Candidate;
+    }
+    return nullptr; // The loop has no other block.
+  }
+
+  // Returns the header block of L (thin wrapper over Loop::getHeader()).
+  BasicBlock *getHeader(Loop *L) { return L->getHeader(); }
+
+  // Returns the first block of L that is a latch, or null if there is none.
+  BasicBlock *getLoopLatch(Loop *L) {
+    for (BasicBlock *Candidate : L->getBlocks()) {
+      if (!L->isLoopLatch(Candidate))
+        continue; // Keep scanning until a latch is found.
+      return Candidate;
+    }
+    return nullptr; // No block of the loop is a latch.
+  }
+
+  // Returns the result of Loop::getExitBlock() for L.
+  BasicBlock *getLoopExit(Loop *L) { return L->getExitBlock(); }
+
+  bool adjacent(Loop *Loop1, Loop *Loop2) {
+    // True when Loop1's exit block is also Loop2's preheader.
+    BasicBlock *Bb1 = Loop1->getExitBlock();
+    BasicBlock *Bb2 = Loop2->getLoopPreheader();
+
+    // Either lookup may yield null. Test that before anything else so that
+    // no null pointer is dereferenced and two null results are not mistaken
+    // for a match.
+    if (Bb1 == nullptr || Bb2 == nullptr) {
+      llvm::errs() << "NULL Pointer encountered\n";
+      return false;
+    }
+
+    // Distinct blocks are never treated as adjacent, not even when the exit
+    // block only branches to the preheader.
+    // NOTE(review): confirm whether that case was meant to be accepted; it
+    // has always been rejected.
+    return Bb1 == Bb2;
+  }
+
+  // Checks that two loops are adjacent and share start and limit values.
+  bool fuseCheck(Loop *L1, Loop *L2) {
+    if (!adjacent(L1, L2)) {
+      llvm::errs() << "The two loops are not adjacent.CANNOT fuse\n";
+      return false;
+    }
+
+    // Check if the start integer is same. ConstantInts are uniqued, so equal
+    // pointers mean equal type and value; a non-constant start never matches.
+    if (!startConstant(*L1) || startConstant(*L1) != startConstant(*L2)) {
+      llvm::errs() << "The loop check starting value is not same.CANNOT fuse\n";
+      return false;
+    }
+
+    // Check if the limit integer is same.
+    if (limitValue(L1) != limitValue(L2)) {
+      llvm::errs() << "The loop check limiting value is not same.CANNOT fuse\n";
+      return false;
+    }
+    return true;
+  }
+
+  // Constant start of the first header PHI, or null when it has none.
+  ConstantInt *startConstant(Loop &LoopV) {
+    for (auto &IndVar : LoopV.getHeader()->phis())
+      return dyn_cast<ConstantInt>(IndVar.getOperand(1)); // Assumed entry value.
+    return nullptr;
+  }
+
+  int startValue(Loop &LoopV, ScalarEvolution &SE) {
+    ConstantInt *Start = startConstant(LoopV);
+    return Start ? Start->getSExtValue() : 0; // 0 when not a constant.
+  }
+
+  // Returns the loop limit, read from the header's first non-PHI instruction.
+  Value *limitValue(Loop *LoopV) {
+    Value *end = nullptr; // Stays null when no non-PHI operand is found.
+    for (Use &U : LoopV->getHeader()->getFirstNonPHI()->operands()) {
+      auto *I = dyn_cast<Instruction>(U.get());
+      if (I && isa<PHINode>(I))
+        continue; // PHI operands are skipped.
+      unsigned N = I ? I->getNumOperands() : 0; // 0 for constants/arguments.
+      end = N ? I->getOperand(N - 1) : U.get(); // Those are the limit itself.
+    }
+    return end;
+  }
+
+  // Requests ScalarEvolution and LoopInfo. The pass rewires branches and
+  // deletes blocks, so it must not declare any analysis preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+  }
+};
+} // namespace
+
+char LoopFusion::ID = 0; // Definition of the static pass ID declared above.
+static RegisterPass<LoopFusion>
+    X("loopfusion", // Name used to select the pass on the opt command line.
+      "LoopFusion Implementation Pass (with getAnalysisUsage implemented)");
\ No newline at end of file
diff --git a/llvm/lib/Transforms/LoopFusion/LoopFusion.exports b/llvm/lib/Transforms/LoopFusion/LoopFusion.exports
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 0ab0d081f586b9..2321ce7a8e9ce7 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_subdirectory(cfcss)
 add_subdirectory(assignment1)
 add_subdirectory(scev)
+add_subdirectory(loop_fuse)
diff --git a/tests/loop_fuse/.init.dot b/tests/loop_fuse/.init.dot
new file mode 100644
index 00000000000000..cecd6d2978d441
--- /dev/null
+++ b/tests/loop_fuse/.init.dot
@@ -0,0 +1,16 @@
+digraph "CFG for 'init' function" {
+	label="CFG for 'init' function";
+
+	Node0x558a60718b60 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{entry:\l  %smax = call i32 @llvm.smax.i32(i32 %n, i32 0)\l  %wide.trip.count = zext i32 %smax to i64\l  br label %for.cond\l}"];
+	Node0x558a60718b60 -> Node0x558a607194d0;
+	Node0x558a607194d0 [shape=record,color="#b70d28ff", style=filled, fillcolor="#b70d2870",label="{for.cond:                                         \l  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]\l  %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count\l  br i1 %exitcond, label %for.body, label %for.end16\l|{<s0>T|<s1>F}}"];
+	Node0x558a607194d0:s0 -> Node0x558a60719900;
+	Node0x558a607194d0:s1 -> Node0x558a607199e0;
+	Node0x558a60719900 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.body:                                         \l  %0 = shl nuw nsw i64 %indvars.iv, 1\l  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv\l  %1 = trunc i64 %0 to i32\l  store i32 %1, i32* %arrayidx, align 4\l  %2 = trunc i64 %indvars.iv to i32\l  %mul = mul nsw i32 %2, %2\l  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv\l  store i32 %mul, i32* %arrayidx2, align 4\l  br label %for.body6\l}"];
+	Node0x558a60719900 -> Node0x558a6071a270;
+	Node0x558a60719680 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.inc:                                          \l  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l  br label %for.cond, !llvm.loop !4\l}"];
+	Node0x558a60719680 -> Node0x558a607194d0;
+	Node0x558a6071a270 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.body6:                                        \l  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv\l  %i = load i32, i32* %arrayidx8, align 4\l  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv\l  %i1 = load i32, i32* %arrayidx10, align 4\l  %add11 = add nsw i32 %i, %i1\l  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv\l  store i32 %add11, i32* %arrayidx13, align 4\l  br label %for.inc\l}"];
+	Node0x558a6071a270 -> Node0x558a60719680;
+	Node0x558a607199e0 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{for.end16:                                        \l  ret void\l}"];
+}
diff --git a/tests/loop_fuse/command.sh b/tests/loop_fuse/command.sh
new file mode 100644
index 00000000000000..aac1e1e99a0ee5
--- /dev/null
+++ b/tests/loop_fuse/command.sh
@@ -0,0 +1,11 @@
+# first command is to emit ir for test case
+clang -S -emit-llvm loop_fuse.c -Xclang -disable-O0-optnone
+
+# Second command cleans up the IR so that the loopfusion pass can analyze it
+opt -mem2reg -loop-simplify -instcombine -instnamer -indvars loop_fuse.ll -S -o loop_fuse_out.ll
+
+#Third command will run loopfusion
+opt -load  ${LLVM_HOME}/build/lib/LLVMLoopFusion.so -loopfusion loop_fuse_out.ll -enable-new-pm=0 -S -o loop_fuse_out1.ll
+
+#To create cfg
+opt -analyze -dot-cfg -enable-new-pm=0 loop_fuse_out1.ll
\ No newline at end of file
diff --git a/tests/loop_fuse/loop_fuse.c b/tests/loop_fuse/loop_fuse.c
new file mode 100644
index 00000000000000..e0189dd76ec488
--- /dev/null
+++ b/tests/loop_fuse/loop_fuse.c
@@ -0,0 +1,10 @@
+void init(int *a, int *b, int *c, int n) {
+  for (int i = 0; i < n; i++) {
+    c[i] = i + i;
+    b[i] = i * i;
+  }
+
+  for (int i = 0; i < n; i++) {
+    a[i] = b[i] + c[i];
+  }
+}
\ No newline at end of file
diff --git a/tests/loop_fuse/loop_fuse.ll b/tests/loop_fuse/loop_fuse.ll
new file mode 100644
index 00000000000000..5bdecfd993a5e7
--- /dev/null
+++ b/tests/loop_fuse/loop_fuse.ll
@@ -0,0 +1,103 @@
+; ModuleID = 'loop_fuse.c'
+source_filename = "loop_fuse.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init(i32* noundef %a, i32* noundef %b, i32* noundef %c, i32 noundef %n) #0 {
+entry:
+  %a.addr = alloca i32*, align 8
+  %b.addr = alloca i32*, align 8
+  %c.addr = alloca i32*, align 8
+  %n.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  %i3 = alloca i32, align 4
+  store i32* %a, i32** %a.addr, align 8
+  store i32* %b, i32** %b.addr, align 8
+  store i32* %c, i32** %c.addr, align 8
+  store i32 %n, i32* %n.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %1 = load i32, i32* %n.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, i32* %i, align 4
+  %3 = load i32, i32* %i, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32*, i32** %c.addr, align 8
+  %5 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %5 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %4, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4
+  %6 = load i32, i32* %i, align 4
+  %7 = load i32, i32* %i, align 4
+  %mul = mul nsw i32 %6, %7
+  %8 = load i32*, i32** %b.addr, align 8
+  %9 = load i32, i32* %i, align 4
+  %idxprom1 = sext i32 %9 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %8, i64 %idxprom1
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %10 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %10, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  store i32 0, i32* %i3, align 4
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc14, %for.end
+  %11 = load i32, i32* %i3, align 4
+  %12 = load i32, i32* %n.addr, align 4
+  %cmp5 = icmp slt i32 %11, %12
+  br i1 %cmp5, label %for.body6, label %for.end16
+
+for.body6:                                        ; preds = %for.cond4
+  %13 = load i32*, i32** %b.addr, align 8
+  %14 = load i32, i32* %i3, align 4
+  %idxprom7 = sext i32 %14 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32* %13, i64 %idxprom7
+  %15 = load i32, i32* %arrayidx8, align 4
+  %16 = load i32*, i32** %c.addr, align 8
+  %17 = load i32, i32* %i3, align 4
+  %idxprom9 = sext i32 %17 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32* %16, i64 %idxprom9
+  %18 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %15, %18
+  %19 = load i32*, i32** %a.addr, align 8
+  %20 = load i32, i32* %i3, align 4
+  %idxprom12 = sext i32 %20 to i64
+  %arrayidx13 = getelementptr inbounds i32, i32* %19, i64 %idxprom12
+  store i32 %add11, i32* %arrayidx13, align 4
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6
+  %21 = load i32, i32* %i3, align 4
+  %inc15 = add nsw i32 %21, 1
+  store i32 %inc15, i32* %i3, align 4
+  br label %for.cond4, !llvm.loop !6
+
+for.end16:                                        ; preds = %for.cond4
+  ret void
+}
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 7f049514ee22563de5f8817412efd6d7d83109cf)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/loop_fuse/loop_fuse_out.ll b/tests/loop_fuse/loop_fuse_out.ll
new file mode 100644
index 00000000000000..5f5cde757646c6
--- /dev/null
+++ b/tests/loop_fuse/loop_fuse_out.ll
@@ -0,0 +1,75 @@
+; ModuleID = 'loop_fuse.ll'
+source_filename = "loop_fuse.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init(i32* noundef %a, i32* noundef %b, i32* noundef %c, i32 noundef %n) #0 {
+entry:
+  %smax = call i32 @llvm.smax.i32(i32 %n, i32 0)
+  %wide.trip.count = zext i32 %smax to i64
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = shl nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %1 = trunc i64 %0 to i32
+  store i32 %1, i32* %arrayidx, align 4
+  %2 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %2, %2
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  %wide.trip.count7 = zext i32 %smax to i64
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc14, %for.end
+  %indvars.iv4 = phi i64 [ %indvars.iv.next5, %for.inc14 ], [ 0, %for.end ]
+  %exitcond8 = icmp ne i64 %indvars.iv4, %wide.trip.count7
+  br i1 %exitcond8, label %for.body6, label %for.end16
+
+for.body6:                                        ; preds = %for.cond4
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv4
+  %i = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv4
+  %i1 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %i, %i1
+  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv4
+  store i32 %add11, i32* %arrayidx13, align 4
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6
+  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+  br label %for.cond4, !llvm.loop !6
+
+for.end16:                                        ; preds = %for.cond4
+  ret void
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.smax.i32(i32, i32) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 7f049514ee22563de5f8817412efd6d7d83109cf)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/loop_fuse/loop_fuse_out1.ll b/tests/loop_fuse/loop_fuse_out1.ll
new file mode 100644
index 00000000000000..bd15857dfeb13a
--- /dev/null
+++ b/tests/loop_fuse/loop_fuse_out1.ll
@@ -0,0 +1,61 @@
+; ModuleID = 'loop_fuse_out.ll'
+source_filename = "loop_fuse.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init(i32* noundef %a, i32* noundef %b, i32* noundef %c, i32 noundef %n) #0 {
+entry:
+  %smax = call i32 @llvm.smax.i32(i32 %n, i32 0)
+  %wide.trip.count = zext i32 %smax to i64
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end16
+
+for.body:                                         ; preds = %for.cond
+  %0 = shl nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %1 = trunc i64 %0 to i32
+  store i32 %1, i32* %arrayidx, align 4
+  %2 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %2, %2
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.body6
+
+for.inc:                                          ; preds = %for.body6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond, !llvm.loop !4
+
+for.body6:                                        ; preds = %for.body
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %i = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %i1 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %i, %i1
+  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %add11, i32* %arrayidx13, align 4
+  br label %for.inc
+
+for.end16:                                        ; preds = %for.cond
+  ret void
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.smax.i32(i32, i32) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 7f049514ee22563de5f8817412efd6d7d83109cf)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
diff --git a/tests/loop_fuse/negative_loop_fuse.c b/tests/loop_fuse/negative_loop_fuse.c
new file mode 100644
index 00000000000000..ca7a77bd3b3731
--- /dev/null
+++ b/tests/loop_fuse/negative_loop_fuse.c
@@ -0,0 +1,10 @@
+void init(int *a, int *b, int *c, int n) {
+  for (int i = 3; i < n; i++) {
+    c[i] = i + i;
+    b[i] = i * i;
+  }
+
+  for (int i = 5; i < n; i++) {
+    a[i] = b[i] + c[i];
+  }
+}
\ No newline at end of file
diff --git a/tests/loop_fuse/negative_loop_fuse.ll b/tests/loop_fuse/negative_loop_fuse.ll
new file mode 100644
index 00000000000000..a5646e46627d58
--- /dev/null
+++ b/tests/loop_fuse/negative_loop_fuse.ll
@@ -0,0 +1,103 @@
+; ModuleID = 'negative_loop_fuse.c'
+source_filename = "negative_loop_fuse.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init(i32* noundef %a, i32* noundef %b, i32* noundef %c, i32 noundef %n) #0 {
+entry:
+  %a.addr = alloca i32*, align 8
+  %b.addr = alloca i32*, align 8
+  %c.addr = alloca i32*, align 8
+  %n.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  %i3 = alloca i32, align 4
+  store i32* %a, i32** %a.addr, align 8
+  store i32* %b, i32** %b.addr, align 8
+  store i32* %c, i32** %c.addr, align 8
+  store i32 %n, i32* %n.addr, align 4
+  store i32 3, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %1 = load i32, i32* %n.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, i32* %i, align 4
+  %3 = load i32, i32* %i, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32*, i32** %c.addr, align 8
+  %5 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %5 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %4, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4
+  %6 = load i32, i32* %i, align 4
+  %7 = load i32, i32* %i, align 4
+  %mul = mul nsw i32 %6, %7
+  %8 = load i32*, i32** %b.addr, align 8
+  %9 = load i32, i32* %i, align 4
+  %idxprom1 = sext i32 %9 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %8, i64 %idxprom1
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %10 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %10, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  store i32 5, i32* %i3, align 4
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc14, %for.end
+  %11 = load i32, i32* %i3, align 4
+  %12 = load i32, i32* %n.addr, align 4
+  %cmp5 = icmp slt i32 %11, %12
+  br i1 %cmp5, label %for.body6, label %for.end16
+
+for.body6:                                        ; preds = %for.cond4
+  %13 = load i32*, i32** %b.addr, align 8
+  %14 = load i32, i32* %i3, align 4
+  %idxprom7 = sext i32 %14 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32* %13, i64 %idxprom7
+  %15 = load i32, i32* %arrayidx8, align 4
+  %16 = load i32*, i32** %c.addr, align 8
+  %17 = load i32, i32* %i3, align 4
+  %idxprom9 = sext i32 %17 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32* %16, i64 %idxprom9
+  %18 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %15, %18
+  %19 = load i32*, i32** %a.addr, align 8
+  %20 = load i32, i32* %i3, align 4
+  %idxprom12 = sext i32 %20 to i64
+  %arrayidx13 = getelementptr inbounds i32, i32* %19, i64 %idxprom12
+  store i32 %add11, i32* %arrayidx13, align 4
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6
+  %21 = load i32, i32* %i3, align 4
+  %inc15 = add nsw i32 %21, 1
+  store i32 %inc15, i32* %i3, align 4
+  br label %for.cond4, !llvm.loop !6
+
+for.end16:                                        ; preds = %for.cond4
+  ret void
+}
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 7f049514ee22563de5f8817412efd6d7d83109cf)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/loop_fuse/negative_loop_fuse_out.ll b/tests/loop_fuse/negative_loop_fuse_out.ll
new file mode 100644
index 00000000000000..100883a6e9d56e
--- /dev/null
+++ b/tests/loop_fuse/negative_loop_fuse_out.ll
@@ -0,0 +1,76 @@
+; ModuleID = 'negative_loop_fuse.ll'
+source_filename = "negative_loop_fuse.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init(i32* noundef %a, i32* noundef %b, i32* noundef %c, i32 noundef %n) #0 {
+entry:
+  %smax = call i32 @llvm.smax.i32(i32 %n, i32 3)
+  %wide.trip.count = zext i32 %smax to i64
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 3, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = shl nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %1 = trunc i64 %0 to i32
+  store i32 %1, i32* %arrayidx, align 4
+  %2 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %2, %2
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  %smax7 = call i32 @llvm.smax.i32(i32 %n, i32 5)
+  %wide.trip.count8 = zext i32 %smax7 to i64
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc14, %for.end
+  %indvars.iv4 = phi i64 [ %indvars.iv.next5, %for.inc14 ], [ 5, %for.end ]
+  %exitcond9 = icmp ne i64 %indvars.iv4, %wide.trip.count8
+  br i1 %exitcond9, label %for.body6, label %for.end16
+
+for.body6:                                        ; preds = %for.cond4
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv4
+  %i = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv4
+  %i1 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %i, %i1
+  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv4
+  store i32 %add11, i32* %arrayidx13, align 4
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6
+  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+  br label %for.cond4, !llvm.loop !6
+
+for.end16:                                        ; preds = %for.cond4
+  ret void
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.smax.i32(i32, i32) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 7f049514ee22563de5f8817412efd6d7d83109cf)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/loop_fuse/negative_loop_fuse_out1.ll b/tests/loop_fuse/negative_loop_fuse_out1.ll
new file mode 100644
index 00000000000000..1eabb624ad1dc4
--- /dev/null
+++ b/tests/loop_fuse/negative_loop_fuse_out1.ll
@@ -0,0 +1,76 @@
+; ModuleID = 'negative_loop_fuse_out.ll'
+source_filename = "negative_loop_fuse.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init(i32* noundef %a, i32* noundef %b, i32* noundef %c, i32 noundef %n) #0 {
+entry:
+  %smax = call i32 @llvm.smax.i32(i32 %n, i32 3)
+  %wide.trip.count = zext i32 %smax to i64
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 3, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = shl nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %1 = trunc i64 %0 to i32
+  store i32 %1, i32* %arrayidx, align 4
+  %2 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %2, %2
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  %smax7 = call i32 @llvm.smax.i32(i32 %n, i32 5)
+  %wide.trip.count8 = zext i32 %smax7 to i64
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc14, %for.end
+  %indvars.iv4 = phi i64 [ %indvars.iv.next5, %for.inc14 ], [ 5, %for.end ]
+  %exitcond9 = icmp ne i64 %indvars.iv4, %wide.trip.count8
+  br i1 %exitcond9, label %for.body6, label %for.end16
+
+for.body6:                                        ; preds = %for.cond4
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv4
+  %i = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv4
+  %i1 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %i, %i1
+  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv4
+  store i32 %add11, i32* %arrayidx13, align 4
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6
+  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+  br label %for.cond4, !llvm.loop !6
+
+for.end16:                                        ; preds = %for.cond4
+  ret void
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.smax.i32(i32, i32) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 7f049514ee22563de5f8817412efd6d7d83109cf)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}

>From 777a30c4f97db458e06e228b49d8d5f0e4127197 Mon Sep 17 00:00:00 2001
From: Shravan Kumar <shkumar at habana.ai>
Date: Thu, 7 Jul 2022 10:09:06 +0300
Subject: [PATCH 6/8] Adding Loop Fusion pass

---
 llvm/lib/Transforms/LoopFusion/LoopFusion.cpp | 124 ++++++++++--------
 tests/loop_fuse/.init.dot                     |  42 ++++--
 tests/loop_fuse/loop_fuse.c                   |   8 ++
 tests/loop_fuse/loop_fuse.ll                  | 112 +++++++++++++---
 tests/loop_fuse/loop_fuse_out.ll              |  64 +++++++--
 tests/loop_fuse/loop_fuse_out1.ll             |  67 ++++++++--
 tests/loop_fuse/negative_loop_fuse.ll         |   2 +-
 tests/loop_fuse/negative_loop_fuse_out.ll     |   2 +-
 tests/loop_fuse/negative_loop_fuse_out1.ll    |   2 +-
 9 files changed, 315 insertions(+), 108 deletions(-)

diff --git a/llvm/lib/Transforms/LoopFusion/LoopFusion.cpp b/llvm/lib/Transforms/LoopFusion/LoopFusion.cpp
index f82d6e597d858b..d3890e8c00009b 100644
--- a/llvm/lib/Transforms/LoopFusion/LoopFusion.cpp
+++ b/llvm/lib/Transforms/LoopFusion/LoopFusion.cpp
@@ -52,26 +52,39 @@ struct LoopFusion : public FunctionPass {
       LoopVector.push_back(L);
     }
 
-    // Function to perform basic checks on the two loops
-    if (fuseCheck(LoopVector[1], LoopVector[0]))
-      // Function to perform fusing on the two loops
-      fuseBody(LoopVector[1], LoopVector[0], F);
+    int LoopCount = LoopVector.size();
+    if (LoopCount < 2) {
+      llvm::errs() << "The program contains less no of loops to fuse\n";
+      return false;
+    }
+
+    // Check whether each combination of loops is fusable.
+    for (int i = 0; i < LoopCount; i++) {
+      for (int j = i + 1; j < LoopCount; j++) {
+        // Function to perform basic checks on the two loops
+        if (fuseCheck(LoopVector[j], LoopVector[i]))
+          // Function to perform fusing on the two loops
+          fuseBody(LoopVector[j], LoopVector[i], F);
+        break;
+      }
+    }
 
     return false;
   }
 
   void fuseBody(Loop *Loop1, Loop *Loop2, Function &F) {
-    BasicBlock *Body1 = nullptr;
-    BasicBlock *Body2 = nullptr;
     BasicBlock *Header1 = nullptr;
     BasicBlock *Latch1 = nullptr;
     BasicBlock *Exit2 = nullptr;
 
-    Body1 = getBody(Loop1);
-    Body2 = getBody(Loop2);
-    Header1 = getHeader(Loop1);
-    Latch1 = getLoopLatch(Loop1);
-    Exit2 = getLoopExit(Loop2);
+    BasicBlock *Body1 = getBody(Loop1);
+    BasicBlock *Body2 = getBody(Loop2);
+    Header1 = Loop1->getHeader();
+    Latch1 = Loop1->getLoopLatch();
+    Exit2 = Loop2->getExitBlock();
+
+    assert(Body1 && Body2 && Header1 && Latch1 && Exit2 &&
+           "NULL Pointer encountered\n");
 
     PHINode *Phi1 = Loop1->getCanonicalInductionVariable();
     PHINode *Phi2 = Loop2->getCanonicalInductionVariable();
@@ -80,15 +93,21 @@ struct LoopFusion : public FunctionPass {
     Phi2->replaceAllUsesWith(Phi1);
 
     for (BasicBlock &BB : F) {
+
+      if (isa<ReturnInst>(BB.getTerminator()))
+        continue;
+      // Get the branch instruction of every block.
       BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator());
+      // Set the successor of the first body block to the second loop's body.
       if (&BB == Body1) {
         BI->setSuccessor(0, Body2);
       }
-
+      // Set the successor of the second body block to the first loop's latch.
       if (&BB == Body2) {
         BI->setSuccessor(0, Latch1);
       }
-
+      // Set the successor of the first header block to the exit of the second
+      // loop, as it contains the return instruction.
       if (&BB == Header1) {
         BI->setSuccessor(1, Exit2);
       }
@@ -99,31 +118,16 @@ struct LoopFusion : public FunctionPass {
 
   // Function to get Loop Body Blocks.
   BasicBlock *getBody(Loop *L) {
+    BasicBlock *NullBB = nullptr;
     for (BasicBlock *BB : L->getBlocks()) {
       BasicBlock *HeaderBlock = L->getHeader();
       if ((HeaderBlock != BB) && !(L->isLoopLatch(BB))) {
         return BB;
       }
     }
-    return {};
+    return NullBB;
   }
 
-  // Function to get Loop Header Blocks.
-  BasicBlock *getHeader(Loop *L) { return L->getHeader(); }
-
-  // Function to get Loop Latch Blocks.
-  BasicBlock *getLoopLatch(Loop *L) {
-    for (BasicBlock *BB : L->getBlocks()) {
-      if (L->isLoopLatch(BB)) {
-        return BB;
-      }
-    }
-    return {};
-  }
-
-  // Function to get Loop exit blocks.
-  BasicBlock *getLoopExit(Loop *L) { return L->getExitBlock(); }
-
   bool adjacent(Loop *Loop1, Loop *Loop2) {
 
     BasicBlock *Bb1 = Loop1->getExitBlock();
@@ -131,14 +135,6 @@ struct LoopFusion : public FunctionPass {
 
     //  If exit block and preHeader are not same.
     if (Bb1 != Bb2) {
-      if (Bb1->size() != 1)
-        return false;
-      if (Bb1->getTerminator()->getSuccessor(0) != Bb2)
-        return false;
-      if (Bb1 == nullptr || Bb2 == nullptr) {
-        llvm::errs() << "NULL Pointer encountered\n";
-        return false;
-      }
       return false;
     }
     return true;
@@ -147,48 +143,68 @@ struct LoopFusion : public FunctionPass {
   // Helper function to check and fuse two loops.
   bool fuseCheck(Loop *L1, Loop *L2) {
 
-    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
     // Check if the two loops are adjacent.
     if (!adjacent(L1, L2)) {
-      llvm::errs() << "The two loops are not adjacent.CANNOT fuse\n";
+      llvm::errs() << "The two loops  " << L1->getName() << " and "
+                   << L2->getName() << "  are not adjacent. CANNOT fuse.\n";
       return false;
     }
 
     // Check if the start integer is same.
-    if (startValue(*L1, *SE) != startValue(*L2, *SE)) {
-      llvm::errs() << "The loop check starting value is not same.CANNOT fuse\n";
+    if (startValue(*L1) != startValue(*L2)) {
+      llvm::errs() << "The loop check starting values of 2 loops "
+                   << L1->getName() << " and " << L2->getName()
+                   << "  are not same. CANNOT fuse.\n";
       return false;
     }
 
     // Check if the limit integer is same.
     if (limitValue(L1) != limitValue(L2)) {
-      llvm::errs() << "The loop check limiting value is not same.CANNOT fuse\n";
+      llvm::errs() << "The loop check limiting value of 2 loops "
+                   << L1->getName() << " and " << L2->getName()
+                   << "  are not same. CANNOT fuse.\n";
       return false;
     }
+    llvm::errs() << "The two loops " << *L1 << " and  " << *L2
+                 << " are being fused.\n";
     return true;
   }
 
   // Check if the start value is same.
-  int startValue(Loop &LoopV, ScalarEvolution &SE) {
+  Value *startValue(Loop &LoopV) {
     for (auto &IndVar : LoopV.getHeader()->phis()) {
       Value *V = IndVar.getOperand(1);
-      auto startValue = dyn_cast<ConstantInt>(V);
-      return startValue->getSExtValue();
+      return V;
     }
-    return {};
+    return nullptr;
   }
 
   // Check if the limit value is same.
   Value *limitValue(Loop *LoopV) {
-    Value *end;
-    for (Use &U : LoopV->getHeader()->getFirstNonPHI()->operands()) {
-      if (!dyn_cast<PHINode>(U.get())) {
-        Instruction *I = dyn_cast<Instruction>(U.get());
-        for (Use &U : I->operands())
-          end = U.get();
+    Value *End, *ContEnd;
+
+    BasicBlock *BB = LoopV->getHeader();
+    for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+      // Check whether the instruction is a compare.
+      if (isa<ICmpInst>(I)) {
+        ContEnd = I->getOperand(1);
+        // Check whether the end value is a constant.
+        if (dyn_cast<Constant>(ContEnd)) {
+          return ContEnd;
+        } else {
+          for (Use &U : LoopV->getHeader()->getFirstNonPHI()->operands()) {
+            if (!dyn_cast<PHINode>(U.get())) {
+              Instruction *I = dyn_cast<Instruction>(U.get());
+              for (Use &U : I->operands())
+                End = U.get();
+              return End;
+            }
+          }
+        }
       }
     }
-    return end;
+
+    return nullptr;
   }
 
   // We don't modify the program, so we preserve all analyses.
diff --git a/tests/loop_fuse/.init.dot b/tests/loop_fuse/.init.dot
index cecd6d2978d441..7850606709b6c5 100644
--- a/tests/loop_fuse/.init.dot
+++ b/tests/loop_fuse/.init.dot
@@ -1,16 +1,34 @@
 digraph "CFG for 'init' function" {
 	label="CFG for 'init' function";
 
-	Node0x558a60718b60 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{entry:\l  %smax = call i32 @llvm.smax.i32(i32 %n, i32 0)\l  %wide.trip.count = zext i32 %smax to i64\l  br label %for.cond\l}"];
-	Node0x558a60718b60 -> Node0x558a607194d0;
-	Node0x558a607194d0 [shape=record,color="#b70d28ff", style=filled, fillcolor="#b70d2870",label="{for.cond:                                         \l  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]\l  %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count\l  br i1 %exitcond, label %for.body, label %for.end16\l|{<s0>T|<s1>F}}"];
-	Node0x558a607194d0:s0 -> Node0x558a60719900;
-	Node0x558a607194d0:s1 -> Node0x558a607199e0;
-	Node0x558a60719900 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.body:                                         \l  %0 = shl nuw nsw i64 %indvars.iv, 1\l  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv\l  %1 = trunc i64 %0 to i32\l  store i32 %1, i32* %arrayidx, align 4\l  %2 = trunc i64 %indvars.iv to i32\l  %mul = mul nsw i32 %2, %2\l  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv\l  store i32 %mul, i32* %arrayidx2, align 4\l  br label %for.body6\l}"];
-	Node0x558a60719900 -> Node0x558a6071a270;
-	Node0x558a60719680 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.inc:                                          \l  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l  br label %for.cond, !llvm.loop !4\l}"];
-	Node0x558a60719680 -> Node0x558a607194d0;
-	Node0x558a6071a270 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.body6:                                        \l  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv\l  %i = load i32, i32* %arrayidx8, align 4\l  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv\l  %i1 = load i32, i32* %arrayidx10, align 4\l  %add11 = add nsw i32 %i, %i1\l  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv\l  store i32 %add11, i32* %arrayidx13, align 4\l  br label %for.inc\l}"];
-	Node0x558a6071a270 -> Node0x558a60719680;
-	Node0x558a607199e0 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{for.end16:                                        \l  ret void\l}"];
+	Node0x560bdf31b3a0 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{entry:\l  %smax = call i32 @llvm.smax.i32(i32 %n, i32 0)\l  %wide.trip.count = zext i32 %smax to i64\l  br label %for.cond\l}"];
+	Node0x560bdf31b3a0 -> Node0x560bdf31bd10;
+	Node0x560bdf31bd10 [shape=record,color="#b70d28ff", style=filled, fillcolor="#b70d2870",label="{for.cond:                                         \l  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]\l  %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count\l  br i1 %exitcond, label %for.body, label %for.end\l|{<s0>T|<s1>F}}"];
+	Node0x560bdf31bd10:s0 -> Node0x560bdf31c140;
+	Node0x560bdf31bd10:s1 -> Node0x560bdf31c220;
+	Node0x560bdf31c140 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.body:                                         \l  %0 = shl nuw nsw i64 %indvars.iv, 1\l  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv\l  %1 = trunc i64 %0 to i32\l  store i32 %1, i32* %arrayidx, align 4\l  %2 = trunc i64 %indvars.iv to i32\l  %mul = mul nsw i32 %2, %2\l  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv\l  store i32 %mul, i32* %arrayidx2, align 4\l  br label %for.inc\l}"];
+	Node0x560bdf31c140 -> Node0x560bdf31bec0;
+	Node0x560bdf31bec0 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.inc:                                          \l  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l  br label %for.cond, !llvm.loop !4\l}"];
+	Node0x560bdf31bec0 -> Node0x560bdf31bd10;
+	Node0x560bdf31c220 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{for.end:                                          \l  br label %for.cond4\l}"];
+	Node0x560bdf31c220 -> Node0x560bdf31d730;
+	Node0x560bdf31d730 [shape=record,color="#b70d28ff", style=filled, fillcolor="#b70d2870",label="{for.cond4:                                        \l  %indvars.iv8 = phi i64 [ %indvars.iv.next9, %for.inc14 ], [ 0, %for.end ]\l  %exitcond11 = icmp ne i64 %indvars.iv8, 10\l  br i1 %exitcond11, label %for.body6, label %for.end30\l|{<s0>T|<s1>F}}"];
+	Node0x560bdf31d730:s0 -> Node0x560bdf31da60;
+	Node0x560bdf31d730:s1 -> Node0x560bdf31dae0;
+	Node0x560bdf31da60 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.body6:                                        \l  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv8\l  %i = load i32, i32* %arrayidx8, align 4\l  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv8\l  %i1 = load i32, i32* %arrayidx10, align 4\l  %add11 = add nsw i32 %i, %i1\l  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv8\l  store i32 %add11, i32* %arrayidx13, align 4\l  br label %for.body20\l}"];
+	Node0x560bdf31da60 -> Node0x560bdf31e460;
+	Node0x560bdf31d8a0 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.inc14:                                        \l  %indvars.iv.next9 = add nuw nsw i64 %indvars.iv8, 1\l  br label %for.cond4, !llvm.loop !6\l}"];
+	Node0x560bdf31d8a0 -> Node0x560bdf31d730;
+	Node0x560bdf31e460 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.body20:                                       \l  %arrayidx22 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv8\l  %i2 = load i32, i32* %arrayidx22, align 4\l  %arrayidx24 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv8\l  %i3 = load i32, i32* %arrayidx24, align 4\l  %add25 = add nsw i32 %i2, %i3\l  %arrayidx27 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv8\l  store i32 %add25, i32* %arrayidx27, align 4\l  br label %for.inc14\l}"];
+	Node0x560bdf31e460 -> Node0x560bdf31d8a0;
+	Node0x560bdf31dae0 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{for.end30:                                        \l  %wide.trip.count19 = zext i32 %smax to i64\l  br label %for.cond32\l}"];
+	Node0x560bdf31dae0 -> Node0x560bdf31ed40;
+	Node0x560bdf31ed40 [shape=record,color="#b70d28ff", style=filled, fillcolor="#b70d2870",label="{for.cond32:                                       \l  %indvars.iv16 = phi i64 [ %indvars.iv.next17, %for.inc42 ], [ 0, %for.end30 ]\l  %exitcond20 = icmp ne i64 %indvars.iv16, %wide.trip.count19\l  br i1 %exitcond20, label %for.body34, label %for.end44\l|{<s0>T|<s1>F}}"];
+	Node0x560bdf31ed40:s0 -> Node0x560bdf31f070;
+	Node0x560bdf31ed40:s1 -> Node0x560bdf31f0c0;
+	Node0x560bdf31f070 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.body34:                                       \l  %arrayidx36 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv16\l  %i4 = load i32, i32* %arrayidx36, align 4\l  %arrayidx38 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv16\l  %i5 = load i32, i32* %arrayidx38, align 4\l  %add39 = add nsw i32 %i4, %i5\l  %arrayidx41 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv16\l  store i32 %add39, i32* %arrayidx41, align 4\l  br label %for.inc42\l}"];
+	Node0x560bdf31f070 -> Node0x560bdf31eeb0;
+	Node0x560bdf31eeb0 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.inc42:                                        \l  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1\l  br label %for.cond32, !llvm.loop !7\l}"];
+	Node0x560bdf31eeb0 -> Node0x560bdf31ed40;
+	Node0x560bdf31f0c0 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{for.end44:                                        \l  ret void\l}"];
 }
diff --git a/tests/loop_fuse/loop_fuse.c b/tests/loop_fuse/loop_fuse.c
index e0189dd76ec488..fc29abd9cdd2a3 100644
--- a/tests/loop_fuse/loop_fuse.c
+++ b/tests/loop_fuse/loop_fuse.c
@@ -4,6 +4,14 @@ void init(int *a, int *b, int *c, int n) {
     b[i] = i * i;
   }
 
+  for (int i = 0; i < 10; i++) {
+    a[i] = b[i] + c[i];
+  }
+
+  for (int i = 0; i < 10; i++) {
+    a[i] = b[i] + c[i];
+  }
+
   for (int i = 0; i < n; i++) {
     a[i] = b[i] + c[i];
   }
diff --git a/tests/loop_fuse/loop_fuse.ll b/tests/loop_fuse/loop_fuse.ll
index 5bdecfd993a5e7..50535e86c160b3 100644
--- a/tests/loop_fuse/loop_fuse.ll
+++ b/tests/loop_fuse/loop_fuse.ll
@@ -12,6 +12,8 @@ entry:
   %n.addr = alloca i32, align 4
   %i = alloca i32, align 4
   %i3 = alloca i32, align 4
+  %i17 = alloca i32, align 4
+  %i31 = alloca i32, align 4
   store i32* %a, i32** %a.addr, align 8
   store i32* %b, i32** %b.addr, align 8
   store i32* %c, i32** %c.addr, align 8
@@ -56,36 +58,104 @@ for.end:                                          ; preds = %for.cond
 
 for.cond4:                                        ; preds = %for.inc14, %for.end
   %11 = load i32, i32* %i3, align 4
-  %12 = load i32, i32* %n.addr, align 4
-  %cmp5 = icmp slt i32 %11, %12
+  %cmp5 = icmp slt i32 %11, 10
   br i1 %cmp5, label %for.body6, label %for.end16
 
 for.body6:                                        ; preds = %for.cond4
-  %13 = load i32*, i32** %b.addr, align 8
-  %14 = load i32, i32* %i3, align 4
-  %idxprom7 = sext i32 %14 to i64
-  %arrayidx8 = getelementptr inbounds i32, i32* %13, i64 %idxprom7
-  %15 = load i32, i32* %arrayidx8, align 4
-  %16 = load i32*, i32** %c.addr, align 8
-  %17 = load i32, i32* %i3, align 4
-  %idxprom9 = sext i32 %17 to i64
-  %arrayidx10 = getelementptr inbounds i32, i32* %16, i64 %idxprom9
-  %18 = load i32, i32* %arrayidx10, align 4
-  %add11 = add nsw i32 %15, %18
-  %19 = load i32*, i32** %a.addr, align 8
-  %20 = load i32, i32* %i3, align 4
-  %idxprom12 = sext i32 %20 to i64
-  %arrayidx13 = getelementptr inbounds i32, i32* %19, i64 %idxprom12
+  %12 = load i32*, i32** %b.addr, align 8
+  %13 = load i32, i32* %i3, align 4
+  %idxprom7 = sext i32 %13 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32* %12, i64 %idxprom7
+  %14 = load i32, i32* %arrayidx8, align 4
+  %15 = load i32*, i32** %c.addr, align 8
+  %16 = load i32, i32* %i3, align 4
+  %idxprom9 = sext i32 %16 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32* %15, i64 %idxprom9
+  %17 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %14, %17
+  %18 = load i32*, i32** %a.addr, align 8
+  %19 = load i32, i32* %i3, align 4
+  %idxprom12 = sext i32 %19 to i64
+  %arrayidx13 = getelementptr inbounds i32, i32* %18, i64 %idxprom12
   store i32 %add11, i32* %arrayidx13, align 4
   br label %for.inc14
 
 for.inc14:                                        ; preds = %for.body6
-  %21 = load i32, i32* %i3, align 4
-  %inc15 = add nsw i32 %21, 1
+  %20 = load i32, i32* %i3, align 4
+  %inc15 = add nsw i32 %20, 1
   store i32 %inc15, i32* %i3, align 4
   br label %for.cond4, !llvm.loop !6
 
 for.end16:                                        ; preds = %for.cond4
+  store i32 0, i32* %i17, align 4
+  br label %for.cond18
+
+for.cond18:                                       ; preds = %for.inc28, %for.end16
+  %21 = load i32, i32* %i17, align 4
+  %cmp19 = icmp slt i32 %21, 10
+  br i1 %cmp19, label %for.body20, label %for.end30
+
+for.body20:                                       ; preds = %for.cond18
+  %22 = load i32*, i32** %b.addr, align 8
+  %23 = load i32, i32* %i17, align 4
+  %idxprom21 = sext i32 %23 to i64
+  %arrayidx22 = getelementptr inbounds i32, i32* %22, i64 %idxprom21
+  %24 = load i32, i32* %arrayidx22, align 4
+  %25 = load i32*, i32** %c.addr, align 8
+  %26 = load i32, i32* %i17, align 4
+  %idxprom23 = sext i32 %26 to i64
+  %arrayidx24 = getelementptr inbounds i32, i32* %25, i64 %idxprom23
+  %27 = load i32, i32* %arrayidx24, align 4
+  %add25 = add nsw i32 %24, %27
+  %28 = load i32*, i32** %a.addr, align 8
+  %29 = load i32, i32* %i17, align 4
+  %idxprom26 = sext i32 %29 to i64
+  %arrayidx27 = getelementptr inbounds i32, i32* %28, i64 %idxprom26
+  store i32 %add25, i32* %arrayidx27, align 4
+  br label %for.inc28
+
+for.inc28:                                        ; preds = %for.body20
+  %30 = load i32, i32* %i17, align 4
+  %inc29 = add nsw i32 %30, 1
+  store i32 %inc29, i32* %i17, align 4
+  br label %for.cond18, !llvm.loop !7
+
+for.end30:                                        ; preds = %for.cond18
+  store i32 0, i32* %i31, align 4
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.inc42, %for.end30
+  %31 = load i32, i32* %i31, align 4
+  %32 = load i32, i32* %n.addr, align 4
+  %cmp33 = icmp slt i32 %31, %32
+  br i1 %cmp33, label %for.body34, label %for.end44
+
+for.body34:                                       ; preds = %for.cond32
+  %33 = load i32*, i32** %b.addr, align 8
+  %34 = load i32, i32* %i31, align 4
+  %idxprom35 = sext i32 %34 to i64
+  %arrayidx36 = getelementptr inbounds i32, i32* %33, i64 %idxprom35
+  %35 = load i32, i32* %arrayidx36, align 4
+  %36 = load i32*, i32** %c.addr, align 8
+  %37 = load i32, i32* %i31, align 4
+  %idxprom37 = sext i32 %37 to i64
+  %arrayidx38 = getelementptr inbounds i32, i32* %36, i64 %idxprom37
+  %38 = load i32, i32* %arrayidx38, align 4
+  %add39 = add nsw i32 %35, %38
+  %39 = load i32*, i32** %a.addr, align 8
+  %40 = load i32, i32* %i31, align 4
+  %idxprom40 = sext i32 %40 to i64
+  %arrayidx41 = getelementptr inbounds i32, i32* %39, i64 %idxprom40
+  store i32 %add39, i32* %arrayidx41, align 4
+  br label %for.inc42
+
+for.inc42:                                        ; preds = %for.body34
+  %41 = load i32, i32* %i31, align 4
+  %inc43 = add nsw i32 %41, 1
+  store i32 %inc43, i32* %i31, align 4
+  br label %for.cond32, !llvm.loop !8
+
+for.end44:                                        ; preds = %for.cond32
   ret void
 }
 
@@ -97,7 +167,9 @@ attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vec
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 7, !"uwtable", i32 1}
 !2 = !{i32 7, !"frame-pointer", i32 2}
-!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 7f049514ee22563de5f8817412efd6d7d83109cf)"}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 47ee914ea16086c1958b93540ed2351bcdae7cdb)"}
 !4 = distinct !{!4, !5}
 !5 = !{!"llvm.loop.mustprogress"}
 !6 = distinct !{!6, !5}
+!7 = distinct !{!7, !5}
+!8 = distinct !{!8, !5}
diff --git a/tests/loop_fuse/loop_fuse_out.ll b/tests/loop_fuse/loop_fuse_out.ll
index 5f5cde757646c6..82f6059af1e8df 100644
--- a/tests/loop_fuse/loop_fuse_out.ll
+++ b/tests/loop_fuse/loop_fuse_out.ll
@@ -31,29 +31,73 @@ for.inc:                                          ; preds = %for.body
   br label %for.cond, !llvm.loop !4
 
 for.end:                                          ; preds = %for.cond
-  %wide.trip.count7 = zext i32 %smax to i64
   br label %for.cond4
 
 for.cond4:                                        ; preds = %for.inc14, %for.end
-  %indvars.iv4 = phi i64 [ %indvars.iv.next5, %for.inc14 ], [ 0, %for.end ]
-  %exitcond8 = icmp ne i64 %indvars.iv4, %wide.trip.count7
-  br i1 %exitcond8, label %for.body6, label %for.end16
+  %indvars.iv8 = phi i64 [ %indvars.iv.next9, %for.inc14 ], [ 0, %for.end ]
+  %exitcond11 = icmp ne i64 %indvars.iv8, 10
+  br i1 %exitcond11, label %for.body6, label %for.end16
 
 for.body6:                                        ; preds = %for.cond4
-  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv4
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv8
   %i = load i32, i32* %arrayidx8, align 4
-  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv4
+  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv8
   %i1 = load i32, i32* %arrayidx10, align 4
   %add11 = add nsw i32 %i, %i1
-  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv4
+  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv8
   store i32 %add11, i32* %arrayidx13, align 4
   br label %for.inc14
 
 for.inc14:                                        ; preds = %for.body6
-  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+  %indvars.iv.next9 = add nuw nsw i64 %indvars.iv8, 1
   br label %for.cond4, !llvm.loop !6
 
 for.end16:                                        ; preds = %for.cond4
+  br label %for.cond18
+
+for.cond18:                                       ; preds = %for.inc28, %for.end16
+  %indvars.iv12 = phi i64 [ %indvars.iv.next13, %for.inc28 ], [ 0, %for.end16 ]
+  %exitcond15 = icmp ne i64 %indvars.iv12, 10
+  br i1 %exitcond15, label %for.body20, label %for.end30
+
+for.body20:                                       ; preds = %for.cond18
+  %arrayidx22 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv12
+  %i2 = load i32, i32* %arrayidx22, align 4
+  %arrayidx24 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv12
+  %i3 = load i32, i32* %arrayidx24, align 4
+  %add25 = add nsw i32 %i2, %i3
+  %arrayidx27 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv12
+  store i32 %add25, i32* %arrayidx27, align 4
+  br label %for.inc28
+
+for.inc28:                                        ; preds = %for.body20
+  %indvars.iv.next13 = add nuw nsw i64 %indvars.iv12, 1
+  br label %for.cond18, !llvm.loop !7
+
+for.end30:                                        ; preds = %for.cond18
+  %wide.trip.count19 = zext i32 %smax to i64
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.inc42, %for.end30
+  %indvars.iv16 = phi i64 [ %indvars.iv.next17, %for.inc42 ], [ 0, %for.end30 ]
+  %exitcond20 = icmp ne i64 %indvars.iv16, %wide.trip.count19
+  br i1 %exitcond20, label %for.body34, label %for.end44
+
+for.body34:                                       ; preds = %for.cond32
+  %arrayidx36 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv16
+  %i4 = load i32, i32* %arrayidx36, align 4
+  %arrayidx38 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv16
+  %i5 = load i32, i32* %arrayidx38, align 4
+  %add39 = add nsw i32 %i4, %i5
+  %arrayidx41 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv16
+  store i32 %add39, i32* %arrayidx41, align 4
+  br label %for.inc42
+
+for.inc42:                                        ; preds = %for.body34
+  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
+  br label %for.cond32, !llvm.loop !8
+
+for.end44:                                        ; preds = %for.cond32
   ret void
 }
 
@@ -69,7 +113,9 @@ attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 7, !"uwtable", i32 1}
 !2 = !{i32 7, !"frame-pointer", i32 2}
-!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 7f049514ee22563de5f8817412efd6d7d83109cf)"}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 47ee914ea16086c1958b93540ed2351bcdae7cdb)"}
 !4 = distinct !{!4, !5}
 !5 = !{!"llvm.loop.mustprogress"}
 !6 = distinct !{!6, !5}
+!7 = distinct !{!7, !5}
+!8 = distinct !{!8, !5}
diff --git a/tests/loop_fuse/loop_fuse_out1.ll b/tests/loop_fuse/loop_fuse_out1.ll
index bd15857dfeb13a..330dd11d463ab2 100644
--- a/tests/loop_fuse/loop_fuse_out1.ll
+++ b/tests/loop_fuse/loop_fuse_out1.ll
@@ -13,7 +13,7 @@ entry:
 for.cond:                                         ; preds = %for.inc, %entry
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
   %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count
-  br i1 %exitcond, label %for.body, label %for.end16
+  br i1 %exitcond, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
   %0 = shl nuw nsw i64 %indvars.iv, 1
@@ -24,23 +24,68 @@ for.body:                                         ; preds = %for.cond
   %mul = mul nsw i32 %2, %2
   %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
   store i32 %mul, i32* %arrayidx2, align 4
-  br label %for.body6
+  br label %for.inc
 
-for.inc:                                          ; preds = %for.body6
+for.inc:                                          ; preds = %for.body
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   br label %for.cond, !llvm.loop !4
 
-for.body6:                                        ; preds = %for.body
-  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+for.end:                                          ; preds = %for.cond
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc14, %for.end
+  %indvars.iv8 = phi i64 [ %indvars.iv.next9, %for.inc14 ], [ 0, %for.end ]
+  %exitcond11 = icmp ne i64 %indvars.iv8, 10
+  br i1 %exitcond11, label %for.body6, label %for.end30
+
+for.body6:                                        ; preds = %for.cond4
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv8
   %i = load i32, i32* %arrayidx8, align 4
-  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv8
   %i1 = load i32, i32* %arrayidx10, align 4
   %add11 = add nsw i32 %i, %i1
-  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv8
   store i32 %add11, i32* %arrayidx13, align 4
-  br label %for.inc
+  br label %for.body20
+
+for.inc14:                                        ; preds = %for.body20
+  %indvars.iv.next9 = add nuw nsw i64 %indvars.iv8, 1
+  br label %for.cond4, !llvm.loop !6
+
+for.body20:                                       ; preds = %for.body6
+  %arrayidx22 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv8
+  %i2 = load i32, i32* %arrayidx22, align 4
+  %arrayidx24 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv8
+  %i3 = load i32, i32* %arrayidx24, align 4
+  %add25 = add nsw i32 %i2, %i3
+  %arrayidx27 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv8
+  store i32 %add25, i32* %arrayidx27, align 4
+  br label %for.inc14
+
+for.end30:                                        ; preds = %for.cond4
+  %wide.trip.count19 = zext i32 %smax to i64
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.inc42, %for.end30
+  %indvars.iv16 = phi i64 [ %indvars.iv.next17, %for.inc42 ], [ 0, %for.end30 ]
+  %exitcond20 = icmp ne i64 %indvars.iv16, %wide.trip.count19
+  br i1 %exitcond20, label %for.body34, label %for.end44
+
+for.body34:                                       ; preds = %for.cond32
+  %arrayidx36 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv16
+  %i4 = load i32, i32* %arrayidx36, align 4
+  %arrayidx38 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv16
+  %i5 = load i32, i32* %arrayidx38, align 4
+  %add39 = add nsw i32 %i4, %i5
+  %arrayidx41 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv16
+  store i32 %add39, i32* %arrayidx41, align 4
+  br label %for.inc42
+
+for.inc42:                                        ; preds = %for.body34
+  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
+  br label %for.cond32, !llvm.loop !7
 
-for.end16:                                        ; preds = %for.cond
+for.end44:                                        ; preds = %for.cond32
   ret void
 }
 
@@ -56,6 +101,8 @@ attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 7, !"uwtable", i32 1}
 !2 = !{i32 7, !"frame-pointer", i32 2}
-!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 7f049514ee22563de5f8817412efd6d7d83109cf)"}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 47ee914ea16086c1958b93540ed2351bcdae7cdb)"}
 !4 = distinct !{!4, !5}
 !5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
+!7 = distinct !{!7, !5}
diff --git a/tests/loop_fuse/negative_loop_fuse.ll b/tests/loop_fuse/negative_loop_fuse.ll
index a5646e46627d58..cf38ff8b6ff8e9 100644
--- a/tests/loop_fuse/negative_loop_fuse.ll
+++ b/tests/loop_fuse/negative_loop_fuse.ll
@@ -97,7 +97,7 @@ attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vec
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 7, !"uwtable", i32 1}
 !2 = !{i32 7, !"frame-pointer", i32 2}
-!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 7f049514ee22563de5f8817412efd6d7d83109cf)"}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 47ee914ea16086c1958b93540ed2351bcdae7cdb)"}
 !4 = distinct !{!4, !5}
 !5 = !{!"llvm.loop.mustprogress"}
 !6 = distinct !{!6, !5}
diff --git a/tests/loop_fuse/negative_loop_fuse_out.ll b/tests/loop_fuse/negative_loop_fuse_out.ll
index 100883a6e9d56e..b3fdb2fa9918a1 100644
--- a/tests/loop_fuse/negative_loop_fuse_out.ll
+++ b/tests/loop_fuse/negative_loop_fuse_out.ll
@@ -70,7 +70,7 @@ attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 7, !"uwtable", i32 1}
 !2 = !{i32 7, !"frame-pointer", i32 2}
-!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 7f049514ee22563de5f8817412efd6d7d83109cf)"}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 47ee914ea16086c1958b93540ed2351bcdae7cdb)"}
 !4 = distinct !{!4, !5}
 !5 = !{!"llvm.loop.mustprogress"}
 !6 = distinct !{!6, !5}
diff --git a/tests/loop_fuse/negative_loop_fuse_out1.ll b/tests/loop_fuse/negative_loop_fuse_out1.ll
index 1eabb624ad1dc4..a63151ac6cda1d 100644
--- a/tests/loop_fuse/negative_loop_fuse_out1.ll
+++ b/tests/loop_fuse/negative_loop_fuse_out1.ll
@@ -70,7 +70,7 @@ attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 7, !"uwtable", i32 1}
 !2 = !{i32 7, !"frame-pointer", i32 2}
-!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 7f049514ee22563de5f8817412efd6d7d83109cf)"}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 47ee914ea16086c1958b93540ed2351bcdae7cdb)"}
 !4 = distinct !{!4, !5}
 !5 = !{!"llvm.loop.mustprogress"}
 !6 = distinct !{!6, !5}

>From 307d24efdf6607d8699747788f03bf0bbf1f3e0a Mon Sep 17 00:00:00 2001
From: Shravan Kumar <87087331+shravankumar0811 at users.noreply.github.com>
Date: Wed, 13 Jul 2022 22:03:01 +0530
Subject: [PATCH 7/8] Adding Loop Fusion pass (#3)

* Adding Loop Fusion pass

* Adding Loop Fusion pass

Co-authored-by: Shravan Kumar <shkumar at habana.ai>
---
 llvm/lib/Transforms/CMakeLists.txt            |   3 +-
 llvm/lib/Transforms/LoopFusion/CMakeLists.txt |  20 ++
 llvm/lib/Transforms/LoopFusion/LoopFusion.cpp | 222 ++++++++++++++++++
 .../Transforms/LoopFusion/LoopFusion.exports  |   0
 tests/CMakeLists.txt                          |   1 +
 tests/loop_fuse/.init.dot                     |  34 +++
 tests/loop_fuse/command.sh                    |  11 +
 tests/loop_fuse/loop_fuse.c                   |  18 ++
 tests/loop_fuse/loop_fuse.ll                  | 175 ++++++++++++++
 tests/loop_fuse/loop_fuse_out.ll              | 121 ++++++++++
 tests/loop_fuse/loop_fuse_out1.ll             | 108 +++++++++
 tests/loop_fuse/negative_loop_fuse.c          |  10 +
 tests/loop_fuse/negative_loop_fuse.ll         | 103 ++++++++
 tests/loop_fuse/negative_loop_fuse_out.ll     |  76 ++++++
 tests/loop_fuse/negative_loop_fuse_out1.ll    |  76 ++++++
 15 files changed, 977 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/Transforms/LoopFusion/CMakeLists.txt
 create mode 100644 llvm/lib/Transforms/LoopFusion/LoopFusion.cpp
 create mode 100644 llvm/lib/Transforms/LoopFusion/LoopFusion.exports
 create mode 100644 tests/loop_fuse/.init.dot
 create mode 100644 tests/loop_fuse/command.sh
 create mode 100644 tests/loop_fuse/loop_fuse.c
 create mode 100644 tests/loop_fuse/loop_fuse.ll
 create mode 100644 tests/loop_fuse/loop_fuse_out.ll
 create mode 100644 tests/loop_fuse/loop_fuse_out1.ll
 create mode 100644 tests/loop_fuse/negative_loop_fuse.c
 create mode 100644 tests/loop_fuse/negative_loop_fuse.ll
 create mode 100644 tests/loop_fuse/negative_loop_fuse_out.ll
 create mode 100644 tests/loop_fuse/negative_loop_fuse_out1.ll

diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt
index 8ace411e1ca82b..6b165fd71dfcfb 100644
--- a/llvm/lib/Transforms/CMakeLists.txt
+++ b/llvm/lib/Transforms/CMakeLists.txt
@@ -10,4 +10,5 @@ add_subdirectory(ObjCARC)
 add_subdirectory(Coroutines)
 add_subdirectory(CFGuard)
 add_subdirectory(Cfcss)
-add_subdirectory(Scev)
\ No newline at end of file
+add_subdirectory(Scev)
+add_subdirectory(LoopFusion)
diff --git a/llvm/lib/Transforms/LoopFusion/CMakeLists.txt b/llvm/lib/Transforms/LoopFusion/CMakeLists.txt
new file mode 100644
index 00000000000000..6c0edac49c6ec2
--- /dev/null
+++ b/llvm/lib/Transforms/LoopFusion/CMakeLists.txt
@@ -0,0 +1,20 @@
+# If we don't need RTTI or EH, there's no reason to export anything
+# from the LoopFusion plugin.
+if( NOT LLVM_REQUIRES_RTTI )
+  if( NOT LLVM_REQUIRES_EH )
+    set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/LoopFusion.exports)
+  endif()
+endif()
+
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_library( LLVMLoopFusion MODULE BUILDTREE_ONLY
+LoopFusion.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/llvm/lib/Transforms/LoopFusion/LoopFusion.cpp b/llvm/lib/Transforms/LoopFusion/LoopFusion.cpp
new file mode 100644
index 00000000000000..d3890e8c00009b
--- /dev/null
+++ b/llvm/lib/Transforms/LoopFusion/LoopFusion.cpp
@@ -0,0 +1,222 @@
+/*===- LoopFusion.cpp -
+  This program is the implementation of a pass for loop fusion in LLVM compiler.
+Two loops, which are adjacent and have the same condition and increments with
+respect to the loop variable may be fused, i.e., their bodies may be executed one
+after the other within a single loop. The decision to fuse the loops is taken
+based on the legality and profitability of the fusion. It should not be
+performed if the resulting code has anti-dependency or if the execution time of
+the program increases. Algorithm:
+1. Check whether the 2 loops can be fused.
+2. Replace the use of induction variable of 2nd loop with that of 1st loop.
+3. Combine the bodies of loop1 and loop2.
+4. Set the successor of 1st loop’s header to exit block of 2nd loop.
+5. Delete the unwanted basic blocks of 2nd loop.
+===-------------------------------------------------------------------------------------------===*/
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "hello"
+
+namespace {
+// LoopFusion - Legacy FunctionPass that fuses adjacent loops; implements getAnalysisUsage.
+struct LoopFusion : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  LoopFusion() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override {
+
+    SmallVector<Loop *> LoopVector;
+    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+    for (auto *L : *LI) {
+      LoopVector.push_back(L);
+    }
+
+    int LoopCount = LoopVector.size();
+    if (LoopCount < 2) {
+      llvm::errs() << "The program contains less no of loops to fuse\n";
+      return false;
+    }
+
+    // Check for each combinations of loops are fusable
+    for (int i = 0; i < LoopCount; i++) {
+      for (int j = i + 1; j < LoopCount; j++) {
+        // Function to perform basic checks on the two loops
+        if (fuseCheck(LoopVector[j], LoopVector[i]))
+          // Function to perform fusing on the two loops
+          fuseBody(LoopVector[j], LoopVector[i], F);
+        break;
+      }
+    }
+
+    return false;
+  }
+
+  void fuseBody(Loop *Loop1, Loop *Loop2, Function &F) {
+    BasicBlock *Header1 = nullptr;
+    BasicBlock *Latch1 = nullptr;
+    BasicBlock *Exit2 = nullptr;
+
+    BasicBlock *Body1 = getBody(Loop1);
+    BasicBlock *Body2 = getBody(Loop2);
+    Header1 = Loop1->getHeader();
+    Latch1 = Loop1->getLoopLatch();
+    Exit2 = Loop2->getExitBlock();
+
+    assert(Body1 && Body2 && Header1 && Latch1 && Exit2 &&
+           "NULL Pointer encountered\n");
+
+    PHINode *Phi1 = Loop1->getCanonicalInductionVariable();
+    PHINode *Phi2 = Loop2->getCanonicalInductionVariable();
+
+    // Replace the use of induction variable of 2nd loop with that of 1st loop.
+    Phi2->replaceAllUsesWith(Phi1);
+
+    for (BasicBlock &BB : F) {
+
+      if (isa<ReturnInst>(BB.getTerminator()))
+        continue;
+      // Get the branch Instruction every block.
+      BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator());
+      // Set the successor of first Body block to Body of the second block.
+      if (&BB == Body1) {
+        BI->setSuccessor(0, Body2);
+      }
+      // Set the successor of second body block to Latch of the first block.
+      if (&BB == Body2) {
+        BI->setSuccessor(0, Latch1);
+      }
+      // Set the successor of first header block to exit of the second as its
+      // contains return insn.
+      if (&BB == Header1) {
+        BI->setSuccessor(1, Exit2);
+      }
+    }
+    // Function to remove un-wanted basic blocks.
+    EliminateUnreachableBlocks(F);
+  }
+
+  // Function to get Loop Body Blocks.
+  BasicBlock *getBody(Loop *L) {
+    BasicBlock *NullBB = nullptr;
+    for (BasicBlock *BB : L->getBlocks()) {
+      BasicBlock *HeaderBlock = L->getHeader();
+      if ((HeaderBlock != BB) && !(L->isLoopLatch(BB))) {
+        return BB;
+      }
+    }
+    return NullBB;
+  }
+
+  bool adjacent(Loop *Loop1, Loop *Loop2) {
+
+    BasicBlock *Bb1 = Loop1->getExitBlock();
+    BasicBlock *Bb2 = Loop2->getLoopPreheader();
+
+    //  If exit block and preHeader are not same.
+    if (Bb1 != Bb2) {
+      return false;
+    }
+    return true;
+  }
+
+  // Helper function to check whether two loops can be fused (adjacency, start value, limit value).
+  bool fuseCheck(Loop *L1, Loop *L2) {
+
+    // Check if the two loops are adjacent.
+    if (!adjacent(L1, L2)) {
+      llvm::errs() << "The two loops  " << L1->getName() << " and "
+                   << L2->getName() << "  are not adjacent. CANNOT fuse.\n";
+      return false;
+    }
+
+    // Check if the start integer is same.
+    if (startValue(*L1) != startValue(*L2)) {
+      llvm::errs() << "The loop check starting values of 2 loops "
+                   << L1->getName() << " and " << L2->getName()
+                   << "  are not same. CANNOT fuse.\n";
+      return false;
+    }
+
+    // Check if the limit integer is same.
+    if (limitValue(L1) != limitValue(L2)) {
+      llvm::errs() << "The loop check limiting value of 2 loops "
+                   << L1->getName() << " and " << L2->getName()
+                   << "  are not same. CANNOT fuse.\n";
+      return false;
+    }
+    llvm::errs() << "The two loops " << *L1 << " and  " << *L2
+                 << " are being fused.\n";
+    return true;
+  }
+
+  // Return operand 1 of the first PHI node in the loop header, or nullptr if the header has no PHI.
+  Value *startValue(Loop &LoopV) {
+    for (auto &IndVar : LoopV.getHeader()->phis()) {
+      Value *V = IndVar.getOperand(1);
+      return V;
+    }
+    return nullptr;
+  }
+
+  // Find the loop's limit value from the integer compare in its header (nullptr if none is found).
+  Value *limitValue(Loop *LoopV) {
+    Value *End, *ContEnd;
+
+    BasicBlock *BB = LoopV->getHeader();
+    for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+      // Check instruction is compare
+      if (isa<ICmpInst>(I)) {
+        ContEnd = I->getOperand(1);
+        // Check end value is constant
+        if (dyn_cast<Constant>(ContEnd)) {
+          return ContEnd;
+        } else {
+          for (Use &U : LoopV->getHeader()->getFirstNonPHI()->operands()) {
+            if (!dyn_cast<PHINode>(U.get())) {
+              Instruction *I = dyn_cast<Instruction>(U.get());
+              for (Use &U : I->operands())
+                End = U.get();
+              return End;
+            }
+          }
+        }
+      }
+    }
+
+    return nullptr;
+  }
+
+  // NOTE(review): fuseBody() rewrites the CFG, yet all analyses are declared preserved here — confirm this is intended.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+  }
+};
+} // namespace
+
+char LoopFusion::ID = 0;
+static RegisterPass<LoopFusion>
+    X("loopfusion",
+      "LoopFusion Implementation Pass (with getAnalysisUsage implemented)");
\ No newline at end of file
diff --git a/llvm/lib/Transforms/LoopFusion/LoopFusion.exports b/llvm/lib/Transforms/LoopFusion/LoopFusion.exports
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 0ab0d081f586b9..2321ce7a8e9ce7 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_subdirectory(cfcss)
 add_subdirectory(assignment1)
 add_subdirectory(scev)
+add_subdirectory(loop_fuse)
diff --git a/tests/loop_fuse/.init.dot b/tests/loop_fuse/.init.dot
new file mode 100644
index 00000000000000..7850606709b6c5
--- /dev/null
+++ b/tests/loop_fuse/.init.dot
@@ -0,0 +1,34 @@
+digraph "CFG for 'init' function" {
+	label="CFG for 'init' function";
+
+	Node0x560bdf31b3a0 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{entry:\l  %smax = call i32 @llvm.smax.i32(i32 %n, i32 0)\l  %wide.trip.count = zext i32 %smax to i64\l  br label %for.cond\l}"];
+	Node0x560bdf31b3a0 -> Node0x560bdf31bd10;
+	Node0x560bdf31bd10 [shape=record,color="#b70d28ff", style=filled, fillcolor="#b70d2870",label="{for.cond:                                         \l  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]\l  %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count\l  br i1 %exitcond, label %for.body, label %for.end\l|{<s0>T|<s1>F}}"];
+	Node0x560bdf31bd10:s0 -> Node0x560bdf31c140;
+	Node0x560bdf31bd10:s1 -> Node0x560bdf31c220;
+	Node0x560bdf31c140 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.body:                                         \l  %0 = shl nuw nsw i64 %indvars.iv, 1\l  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv\l  %1 = trunc i64 %0 to i32\l  store i32 %1, i32* %arrayidx, align 4\l  %2 = trunc i64 %indvars.iv to i32\l  %mul = mul nsw i32 %2, %2\l  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv\l  store i32 %mul, i32* %arrayidx2, align 4\l  br label %for.inc\l}"];
+	Node0x560bdf31c140 -> Node0x560bdf31bec0;
+	Node0x560bdf31bec0 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.inc:                                          \l  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l  br label %for.cond, !llvm.loop !4\l}"];
+	Node0x560bdf31bec0 -> Node0x560bdf31bd10;
+	Node0x560bdf31c220 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{for.end:                                          \l  br label %for.cond4\l}"];
+	Node0x560bdf31c220 -> Node0x560bdf31d730;
+	Node0x560bdf31d730 [shape=record,color="#b70d28ff", style=filled, fillcolor="#b70d2870",label="{for.cond4:                                        \l  %indvars.iv8 = phi i64 [ %indvars.iv.next9, %for.inc14 ], [ 0, %for.end ]\l  %exitcond11 = icmp ne i64 %indvars.iv8, 10\l  br i1 %exitcond11, label %for.body6, label %for.end30\l|{<s0>T|<s1>F}}"];
+	Node0x560bdf31d730:s0 -> Node0x560bdf31da60;
+	Node0x560bdf31d730:s1 -> Node0x560bdf31dae0;
+	Node0x560bdf31da60 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.body6:                                        \l  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv8\l  %i = load i32, i32* %arrayidx8, align 4\l  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv8\l  %i1 = load i32, i32* %arrayidx10, align 4\l  %add11 = add nsw i32 %i, %i1\l  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv8\l  store i32 %add11, i32* %arrayidx13, align 4\l  br label %for.body20\l}"];
+	Node0x560bdf31da60 -> Node0x560bdf31e460;
+	Node0x560bdf31d8a0 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.inc14:                                        \l  %indvars.iv.next9 = add nuw nsw i64 %indvars.iv8, 1\l  br label %for.cond4, !llvm.loop !6\l}"];
+	Node0x560bdf31d8a0 -> Node0x560bdf31d730;
+	Node0x560bdf31e460 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.body20:                                       \l  %arrayidx22 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv8\l  %i2 = load i32, i32* %arrayidx22, align 4\l  %arrayidx24 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv8\l  %i3 = load i32, i32* %arrayidx24, align 4\l  %add25 = add nsw i32 %i2, %i3\l  %arrayidx27 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv8\l  store i32 %add25, i32* %arrayidx27, align 4\l  br label %for.inc14\l}"];
+	Node0x560bdf31e460 -> Node0x560bdf31d8a0;
+	Node0x560bdf31dae0 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{for.end30:                                        \l  %wide.trip.count19 = zext i32 %smax to i64\l  br label %for.cond32\l}"];
+	Node0x560bdf31dae0 -> Node0x560bdf31ed40;
+	Node0x560bdf31ed40 [shape=record,color="#b70d28ff", style=filled, fillcolor="#b70d2870",label="{for.cond32:                                       \l  %indvars.iv16 = phi i64 [ %indvars.iv.next17, %for.inc42 ], [ 0, %for.end30 ]\l  %exitcond20 = icmp ne i64 %indvars.iv16, %wide.trip.count19\l  br i1 %exitcond20, label %for.body34, label %for.end44\l|{<s0>T|<s1>F}}"];
+	Node0x560bdf31ed40:s0 -> Node0x560bdf31f070;
+	Node0x560bdf31ed40:s1 -> Node0x560bdf31f0c0;
+	Node0x560bdf31f070 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.body34:                                       \l  %arrayidx36 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv16\l  %i4 = load i32, i32* %arrayidx36, align 4\l  %arrayidx38 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv16\l  %i5 = load i32, i32* %arrayidx38, align 4\l  %add39 = add nsw i32 %i4, %i5\l  %arrayidx41 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv16\l  store i32 %add39, i32* %arrayidx41, align 4\l  br label %for.inc42\l}"];
+	Node0x560bdf31f070 -> Node0x560bdf31eeb0;
+	Node0x560bdf31eeb0 [shape=record,color="#b70d28ff", style=filled, fillcolor="#bb1b2c70",label="{for.inc42:                                        \l  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1\l  br label %for.cond32, !llvm.loop !7\l}"];
+	Node0x560bdf31eeb0 -> Node0x560bdf31ed40;
+	Node0x560bdf31f0c0 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#b9d0f970",label="{for.end44:                                        \l  ret void\l}"];
+}
diff --git a/tests/loop_fuse/command.sh b/tests/loop_fuse/command.sh
new file mode 100644
index 00000000000000..aac1e1e99a0ee5
--- /dev/null
+++ b/tests/loop_fuse/command.sh
@@ -0,0 +1,11 @@
+# First command: emit LLVM IR for the test case.
+clang -S -emit-llvm loop_fuse.c -Xclang -disable-O0-optnone
+
+# Second command: clean up the IR so that SCEV can analyze it.
+opt -mem2reg -loop-simplify -instcombine -instnamer -indvars loop_fuse.ll -S -o loop_fuse_out.ll
+
+# Third command: run the loopfusion pass.
+opt -load  ${LLVM_HOME}/build/lib/LLVMLoopFusion.so -loopfusion loop_fuse_out.ll -enable-new-pm=0 -S -o loop_fuse_out1.ll
+
+# Fourth command: emit the CFG as a dot file.
+opt -analyze -dot-cfg -enable-new-pm=0 loop_fuse_out1.ll
\ No newline at end of file
diff --git a/tests/loop_fuse/loop_fuse.c b/tests/loop_fuse/loop_fuse.c
new file mode 100644
index 00000000000000..fc29abd9cdd2a3
--- /dev/null
+++ b/tests/loop_fuse/loop_fuse.c
@@ -0,0 +1,18 @@
+void init(int *a, int *b, int *c, int n) { // four consecutive loops
+  for (int i = 0; i < n; i++) { // loop 1: runs while i < n, fills c and b
+    c[i] = i + i;
+    b[i] = i * i;
+  }
+
+  for (int i = 0; i < 10; i++) { // loop 2: constant bound of 10
+    a[i] = b[i] + c[i];
+  }
+
+  for (int i = 0; i < 10; i++) { // loop 3: same bound and body as loop 2
+    a[i] = b[i] + c[i];
+  }
+
+  for (int i = 0; i < n; i++) { // loop 4: same body, bound is n again
+    a[i] = b[i] + c[i];
+  }
+}
\ No newline at end of file
diff --git a/tests/loop_fuse/loop_fuse.ll b/tests/loop_fuse/loop_fuse.ll
new file mode 100644
index 00000000000000..50535e86c160b3
--- /dev/null
+++ b/tests/loop_fuse/loop_fuse.ll
@@ -0,0 +1,175 @@
+; ModuleID = 'loop_fuse.c'
+source_filename = "loop_fuse.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init(i32* noundef %a, i32* noundef %b, i32* noundef %c, i32 noundef %n) #0 {
+entry:
+  %a.addr = alloca i32*, align 8
+  %b.addr = alloca i32*, align 8
+  %c.addr = alloca i32*, align 8
+  %n.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  %i3 = alloca i32, align 4
+  %i17 = alloca i32, align 4
+  %i31 = alloca i32, align 4
+  store i32* %a, i32** %a.addr, align 8
+  store i32* %b, i32** %b.addr, align 8
+  store i32* %c, i32** %c.addr, align 8
+  store i32 %n, i32* %n.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %1 = load i32, i32* %n.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, i32* %i, align 4
+  %3 = load i32, i32* %i, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32*, i32** %c.addr, align 8
+  %5 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %5 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %4, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4
+  %6 = load i32, i32* %i, align 4
+  %7 = load i32, i32* %i, align 4
+  %mul = mul nsw i32 %6, %7
+  %8 = load i32*, i32** %b.addr, align 8
+  %9 = load i32, i32* %i, align 4
+  %idxprom1 = sext i32 %9 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %8, i64 %idxprom1
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %10 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %10, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  store i32 0, i32* %i3, align 4
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc14, %for.end
+  %11 = load i32, i32* %i3, align 4
+  %cmp5 = icmp slt i32 %11, 10
+  br i1 %cmp5, label %for.body6, label %for.end16
+
+for.body6:                                        ; preds = %for.cond4
+  %12 = load i32*, i32** %b.addr, align 8
+  %13 = load i32, i32* %i3, align 4
+  %idxprom7 = sext i32 %13 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32* %12, i64 %idxprom7
+  %14 = load i32, i32* %arrayidx8, align 4
+  %15 = load i32*, i32** %c.addr, align 8
+  %16 = load i32, i32* %i3, align 4
+  %idxprom9 = sext i32 %16 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32* %15, i64 %idxprom9
+  %17 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %14, %17
+  %18 = load i32*, i32** %a.addr, align 8
+  %19 = load i32, i32* %i3, align 4
+  %idxprom12 = sext i32 %19 to i64
+  %arrayidx13 = getelementptr inbounds i32, i32* %18, i64 %idxprom12
+  store i32 %add11, i32* %arrayidx13, align 4
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6
+  %20 = load i32, i32* %i3, align 4
+  %inc15 = add nsw i32 %20, 1
+  store i32 %inc15, i32* %i3, align 4
+  br label %for.cond4, !llvm.loop !6
+
+for.end16:                                        ; preds = %for.cond4
+  store i32 0, i32* %i17, align 4
+  br label %for.cond18
+
+for.cond18:                                       ; preds = %for.inc28, %for.end16
+  %21 = load i32, i32* %i17, align 4
+  %cmp19 = icmp slt i32 %21, 10
+  br i1 %cmp19, label %for.body20, label %for.end30
+
+for.body20:                                       ; preds = %for.cond18
+  %22 = load i32*, i32** %b.addr, align 8
+  %23 = load i32, i32* %i17, align 4
+  %idxprom21 = sext i32 %23 to i64
+  %arrayidx22 = getelementptr inbounds i32, i32* %22, i64 %idxprom21
+  %24 = load i32, i32* %arrayidx22, align 4
+  %25 = load i32*, i32** %c.addr, align 8
+  %26 = load i32, i32* %i17, align 4
+  %idxprom23 = sext i32 %26 to i64
+  %arrayidx24 = getelementptr inbounds i32, i32* %25, i64 %idxprom23
+  %27 = load i32, i32* %arrayidx24, align 4
+  %add25 = add nsw i32 %24, %27
+  %28 = load i32*, i32** %a.addr, align 8
+  %29 = load i32, i32* %i17, align 4
+  %idxprom26 = sext i32 %29 to i64
+  %arrayidx27 = getelementptr inbounds i32, i32* %28, i64 %idxprom26
+  store i32 %add25, i32* %arrayidx27, align 4
+  br label %for.inc28
+
+for.inc28:                                        ; preds = %for.body20
+  %30 = load i32, i32* %i17, align 4
+  %inc29 = add nsw i32 %30, 1
+  store i32 %inc29, i32* %i17, align 4
+  br label %for.cond18, !llvm.loop !7
+
+for.end30:                                        ; preds = %for.cond18
+  store i32 0, i32* %i31, align 4
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.inc42, %for.end30
+  %31 = load i32, i32* %i31, align 4
+  %32 = load i32, i32* %n.addr, align 4
+  %cmp33 = icmp slt i32 %31, %32
+  br i1 %cmp33, label %for.body34, label %for.end44
+
+for.body34:                                       ; preds = %for.cond32
+  %33 = load i32*, i32** %b.addr, align 8
+  %34 = load i32, i32* %i31, align 4
+  %idxprom35 = sext i32 %34 to i64
+  %arrayidx36 = getelementptr inbounds i32, i32* %33, i64 %idxprom35
+  %35 = load i32, i32* %arrayidx36, align 4
+  %36 = load i32*, i32** %c.addr, align 8
+  %37 = load i32, i32* %i31, align 4
+  %idxprom37 = sext i32 %37 to i64
+  %arrayidx38 = getelementptr inbounds i32, i32* %36, i64 %idxprom37
+  %38 = load i32, i32* %arrayidx38, align 4
+  %add39 = add nsw i32 %35, %38
+  %39 = load i32*, i32** %a.addr, align 8
+  %40 = load i32, i32* %i31, align 4
+  %idxprom40 = sext i32 %40 to i64
+  %arrayidx41 = getelementptr inbounds i32, i32* %39, i64 %idxprom40
+  store i32 %add39, i32* %arrayidx41, align 4
+  br label %for.inc42
+
+for.inc42:                                        ; preds = %for.body34
+  %41 = load i32, i32* %i31, align 4
+  %inc43 = add nsw i32 %41, 1
+  store i32 %inc43, i32* %i31, align 4
+  br label %for.cond32, !llvm.loop !8
+
+for.end44:                                        ; preds = %for.cond32
+  ret void
+}
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 47ee914ea16086c1958b93540ed2351bcdae7cdb)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
+!7 = distinct !{!7, !5}
+!8 = distinct !{!8, !5}
diff --git a/tests/loop_fuse/loop_fuse_out.ll b/tests/loop_fuse/loop_fuse_out.ll
new file mode 100644
index 00000000000000..82f6059af1e8df
--- /dev/null
+++ b/tests/loop_fuse/loop_fuse_out.ll
@@ -0,0 +1,121 @@
+; ModuleID = 'loop_fuse.ll'
+source_filename = "loop_fuse.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init(i32* noundef %a, i32* noundef %b, i32* noundef %c, i32 noundef %n) #0 {
+entry:
+  %smax = call i32 @llvm.smax.i32(i32 %n, i32 0)
+  %wide.trip.count = zext i32 %smax to i64
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = shl nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %1 = trunc i64 %0 to i32
+  store i32 %1, i32* %arrayidx, align 4
+  %2 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %2, %2
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc14, %for.end
+  %indvars.iv8 = phi i64 [ %indvars.iv.next9, %for.inc14 ], [ 0, %for.end ]
+  %exitcond11 = icmp ne i64 %indvars.iv8, 10
+  br i1 %exitcond11, label %for.body6, label %for.end16
+
+for.body6:                                        ; preds = %for.cond4
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv8
+  %i = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv8
+  %i1 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %i, %i1
+  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv8
+  store i32 %add11, i32* %arrayidx13, align 4
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6
+  %indvars.iv.next9 = add nuw nsw i64 %indvars.iv8, 1
+  br label %for.cond4, !llvm.loop !6
+
+for.end16:                                        ; preds = %for.cond4
+  br label %for.cond18
+
+for.cond18:                                       ; preds = %for.inc28, %for.end16
+  %indvars.iv12 = phi i64 [ %indvars.iv.next13, %for.inc28 ], [ 0, %for.end16 ]
+  %exitcond15 = icmp ne i64 %indvars.iv12, 10
+  br i1 %exitcond15, label %for.body20, label %for.end30
+
+for.body20:                                       ; preds = %for.cond18
+  %arrayidx22 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv12
+  %i2 = load i32, i32* %arrayidx22, align 4
+  %arrayidx24 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv12
+  %i3 = load i32, i32* %arrayidx24, align 4
+  %add25 = add nsw i32 %i2, %i3
+  %arrayidx27 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv12
+  store i32 %add25, i32* %arrayidx27, align 4
+  br label %for.inc28
+
+for.inc28:                                        ; preds = %for.body20
+  %indvars.iv.next13 = add nuw nsw i64 %indvars.iv12, 1
+  br label %for.cond18, !llvm.loop !7
+
+for.end30:                                        ; preds = %for.cond18
+  %wide.trip.count19 = zext i32 %smax to i64
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.inc42, %for.end30
+  %indvars.iv16 = phi i64 [ %indvars.iv.next17, %for.inc42 ], [ 0, %for.end30 ]
+  %exitcond20 = icmp ne i64 %indvars.iv16, %wide.trip.count19
+  br i1 %exitcond20, label %for.body34, label %for.end44
+
+for.body34:                                       ; preds = %for.cond32
+  %arrayidx36 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv16
+  %i4 = load i32, i32* %arrayidx36, align 4
+  %arrayidx38 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv16
+  %i5 = load i32, i32* %arrayidx38, align 4
+  %add39 = add nsw i32 %i4, %i5
+  %arrayidx41 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv16
+  store i32 %add39, i32* %arrayidx41, align 4
+  br label %for.inc42
+
+for.inc42:                                        ; preds = %for.body34
+  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
+  br label %for.cond32, !llvm.loop !8
+
+for.end44:                                        ; preds = %for.cond32
+  ret void
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.smax.i32(i32, i32) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 47ee914ea16086c1958b93540ed2351bcdae7cdb)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
+!7 = distinct !{!7, !5}
+!8 = distinct !{!8, !5}
diff --git a/tests/loop_fuse/loop_fuse_out1.ll b/tests/loop_fuse/loop_fuse_out1.ll
new file mode 100644
index 00000000000000..330dd11d463ab2
--- /dev/null
+++ b/tests/loop_fuse/loop_fuse_out1.ll
@@ -0,0 +1,108 @@
+; ModuleID = 'loop_fuse_out.ll'
+source_filename = "loop_fuse.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init(i32* noundef %a, i32* noundef %b, i32* noundef %c, i32 noundef %n) #0 {
+entry:
+  %smax = call i32 @llvm.smax.i32(i32 %n, i32 0)
+  %wide.trip.count = zext i32 %smax to i64
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = shl nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %1 = trunc i64 %0 to i32
+  store i32 %1, i32* %arrayidx, align 4
+  %2 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %2, %2
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc14, %for.end
+  %indvars.iv8 = phi i64 [ %indvars.iv.next9, %for.inc14 ], [ 0, %for.end ]
+  %exitcond11 = icmp ne i64 %indvars.iv8, 10
+  br i1 %exitcond11, label %for.body6, label %for.end30
+
+for.body6:                                        ; preds = %for.cond4
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv8
+  %i = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv8
+  %i1 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %i, %i1
+  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv8
+  store i32 %add11, i32* %arrayidx13, align 4
+  br label %for.body20
+
+for.inc14:                                        ; preds = %for.body20
+  %indvars.iv.next9 = add nuw nsw i64 %indvars.iv8, 1
+  br label %for.cond4, !llvm.loop !6
+
+for.body20:                                       ; preds = %for.body6
+  %arrayidx22 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv8
+  %i2 = load i32, i32* %arrayidx22, align 4
+  %arrayidx24 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv8
+  %i3 = load i32, i32* %arrayidx24, align 4
+  %add25 = add nsw i32 %i2, %i3
+  %arrayidx27 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv8
+  store i32 %add25, i32* %arrayidx27, align 4
+  br label %for.inc14
+
+for.end30:                                        ; preds = %for.cond4
+  %wide.trip.count19 = zext i32 %smax to i64
+  br label %for.cond32
+
+for.cond32:                                       ; preds = %for.inc42, %for.end30
+  %indvars.iv16 = phi i64 [ %indvars.iv.next17, %for.inc42 ], [ 0, %for.end30 ]
+  %exitcond20 = icmp ne i64 %indvars.iv16, %wide.trip.count19
+  br i1 %exitcond20, label %for.body34, label %for.end44
+
+for.body34:                                       ; preds = %for.cond32
+  %arrayidx36 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv16
+  %i4 = load i32, i32* %arrayidx36, align 4
+  %arrayidx38 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv16
+  %i5 = load i32, i32* %arrayidx38, align 4
+  %add39 = add nsw i32 %i4, %i5
+  %arrayidx41 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv16
+  store i32 %add39, i32* %arrayidx41, align 4
+  br label %for.inc42
+
+for.inc42:                                        ; preds = %for.body34
+  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
+  br label %for.cond32, !llvm.loop !7
+
+for.end44:                                        ; preds = %for.cond32
+  ret void
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.smax.i32(i32, i32) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 47ee914ea16086c1958b93540ed2351bcdae7cdb)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
+!7 = distinct !{!7, !5}
diff --git a/tests/loop_fuse/negative_loop_fuse.c b/tests/loop_fuse/negative_loop_fuse.c
new file mode 100644
index 00000000000000..ca7a77bd3b3731
--- /dev/null
+++ b/tests/loop_fuse/negative_loop_fuse.c
@@ -0,0 +1,10 @@
+void init(int *a, int *b, int *c, int n) { // two loops with different starts
+  for (int i = 3; i < n; i++) { // loop 1: starts at 3, fills c and b
+    c[i] = i + i;
+    b[i] = i * i;
+  }
+
+  for (int i = 5; i < n; i++) { // loop 2: starts at 5, reads b and c
+    a[i] = b[i] + c[i];
+  }
+}
\ No newline at end of file
diff --git a/tests/loop_fuse/negative_loop_fuse.ll b/tests/loop_fuse/negative_loop_fuse.ll
new file mode 100644
index 00000000000000..cf38ff8b6ff8e9
--- /dev/null
+++ b/tests/loop_fuse/negative_loop_fuse.ll
@@ -0,0 +1,103 @@
+; ModuleID = 'negative_loop_fuse.c'
+source_filename = "negative_loop_fuse.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init(i32* noundef %a, i32* noundef %b, i32* noundef %c, i32 noundef %n) #0 {
+entry:
+  %a.addr = alloca i32*, align 8
+  %b.addr = alloca i32*, align 8
+  %c.addr = alloca i32*, align 8
+  %n.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  %i3 = alloca i32, align 4
+  store i32* %a, i32** %a.addr, align 8
+  store i32* %b, i32** %b.addr, align 8
+  store i32* %c, i32** %c.addr, align 8
+  store i32 %n, i32* %n.addr, align 4
+  store i32 3, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %1 = load i32, i32* %n.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, i32* %i, align 4
+  %3 = load i32, i32* %i, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32*, i32** %c.addr, align 8
+  %5 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %5 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %4, i64 %idxprom
+  store i32 %add, i32* %arrayidx, align 4
+  %6 = load i32, i32* %i, align 4
+  %7 = load i32, i32* %i, align 4
+  %mul = mul nsw i32 %6, %7
+  %8 = load i32*, i32** %b.addr, align 8
+  %9 = load i32, i32* %i, align 4
+  %idxprom1 = sext i32 %9 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %8, i64 %idxprom1
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %10 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %10, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  store i32 5, i32* %i3, align 4
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc14, %for.end
+  %11 = load i32, i32* %i3, align 4
+  %12 = load i32, i32* %n.addr, align 4
+  %cmp5 = icmp slt i32 %11, %12
+  br i1 %cmp5, label %for.body6, label %for.end16
+
+for.body6:                                        ; preds = %for.cond4
+  %13 = load i32*, i32** %b.addr, align 8
+  %14 = load i32, i32* %i3, align 4
+  %idxprom7 = sext i32 %14 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32* %13, i64 %idxprom7
+  %15 = load i32, i32* %arrayidx8, align 4
+  %16 = load i32*, i32** %c.addr, align 8
+  %17 = load i32, i32* %i3, align 4
+  %idxprom9 = sext i32 %17 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32* %16, i64 %idxprom9
+  %18 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %15, %18
+  %19 = load i32*, i32** %a.addr, align 8
+  %20 = load i32, i32* %i3, align 4
+  %idxprom12 = sext i32 %20 to i64
+  %arrayidx13 = getelementptr inbounds i32, i32* %19, i64 %idxprom12
+  store i32 %add11, i32* %arrayidx13, align 4
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6
+  %21 = load i32, i32* %i3, align 4
+  %inc15 = add nsw i32 %21, 1
+  store i32 %inc15, i32* %i3, align 4
+  br label %for.cond4, !llvm.loop !6
+
+for.end16:                                        ; preds = %for.cond4
+  ret void
+}
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 47ee914ea16086c1958b93540ed2351bcdae7cdb)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/loop_fuse/negative_loop_fuse_out.ll b/tests/loop_fuse/negative_loop_fuse_out.ll
new file mode 100644
index 00000000000000..b3fdb2fa9918a1
--- /dev/null
+++ b/tests/loop_fuse/negative_loop_fuse_out.ll
@@ -0,0 +1,76 @@
+; ModuleID = 'negative_loop_fuse.ll'
+source_filename = "negative_loop_fuse.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init(i32* noundef %a, i32* noundef %b, i32* noundef %c, i32 noundef %n) #0 {
+entry:
+  %smax = call i32 @llvm.smax.i32(i32 %n, i32 3)
+  %wide.trip.count = zext i32 %smax to i64
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 3, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = shl nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %1 = trunc i64 %0 to i32
+  store i32 %1, i32* %arrayidx, align 4
+  %2 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %2, %2
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  %smax7 = call i32 @llvm.smax.i32(i32 %n, i32 5)
+  %wide.trip.count8 = zext i32 %smax7 to i64
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc14, %for.end
+  %indvars.iv4 = phi i64 [ %indvars.iv.next5, %for.inc14 ], [ 5, %for.end ]
+  %exitcond9 = icmp ne i64 %indvars.iv4, %wide.trip.count8
+  br i1 %exitcond9, label %for.body6, label %for.end16
+
+for.body6:                                        ; preds = %for.cond4
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv4
+  %i = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv4
+  %i1 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %i, %i1
+  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv4
+  store i32 %add11, i32* %arrayidx13, align 4
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6
+  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+  br label %for.cond4, !llvm.loop !6
+
+for.end16:                                        ; preds = %for.cond4
+  ret void
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.smax.i32(i32, i32) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 47ee914ea16086c1958b93540ed2351bcdae7cdb)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}
diff --git a/tests/loop_fuse/negative_loop_fuse_out1.ll b/tests/loop_fuse/negative_loop_fuse_out1.ll
new file mode 100644
index 00000000000000..a63151ac6cda1d
--- /dev/null
+++ b/tests/loop_fuse/negative_loop_fuse_out1.ll
@@ -0,0 +1,76 @@
+; ModuleID = 'negative_loop_fuse_out.ll'
+source_filename = "negative_loop_fuse.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @init(i32* noundef %a, i32* noundef %b, i32* noundef %c, i32 noundef %n) #0 {
+entry:
+  %smax = call i32 @llvm.smax.i32(i32 %n, i32 3)
+  %wide.trip.count = zext i32 %smax to i64
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 3, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %0 = shl nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %1 = trunc i64 %0 to i32
+  store i32 %1, i32* %arrayidx, align 4
+  %2 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %2, %2
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  %smax7 = call i32 @llvm.smax.i32(i32 %n, i32 5)
+  %wide.trip.count8 = zext i32 %smax7 to i64
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc14, %for.end
+  %indvars.iv4 = phi i64 [ %indvars.iv.next5, %for.inc14 ], [ 5, %for.end ]
+  %exitcond9 = icmp ne i64 %indvars.iv4, %wide.trip.count8
+  br i1 %exitcond9, label %for.body6, label %for.end16
+
+for.body6:                                        ; preds = %for.cond4
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv4
+  %i = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv4
+  %i1 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %i, %i1
+  %arrayidx13 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv4
+  store i32 %add11, i32* %arrayidx13, align 4
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6
+  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+  br label %for.cond4, !llvm.loop !6
+
+for.end16:                                        ; preds = %for.cond4
+  ret void
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.smax.i32(i32, i32) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 14.0.6 (https://github.com/shravankumar0811/llvm-project.git 47ee914ea16086c1958b93540ed2351bcdae7cdb)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = distinct !{!6, !5}

>From 7000d87c7b48e8d6007ddf31724945a34e34e417 Mon Sep 17 00:00:00 2001
From: Shravan Kumar <shkumar at habana.ai>
Date: Thu, 7 Jul 2022 10:09:06 +0300
Subject: [PATCH 8/8] Adding Loop Fusion pass

---
 mlir/examples/toy/Ch2/include/toy/Ops.td | 19 ++++++++++++++++++
 mlir/examples/toy/Ch2/mlir/Dialect.cpp   | 25 ++++++++++++++++++++++++
 mlir/examples/toy/Ch2/mlir/MLIRGen.cpp   | 11 +++++++++++
 mlir/test/Examples/Toy/Ch2/codegen.toy   |  2 +-
 tests/CMakeLists.txt                     |  1 +
 tests/toy/codegen.mlir                   | 16 +++++++++++++++
 tests/toy/command.sh                     |  2 ++
 7 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 tests/toy/codegen.mlir
 create mode 100644 tests/toy/command.sh

diff --git a/mlir/examples/toy/Ch2/include/toy/Ops.td b/mlir/examples/toy/Ch2/include/toy/Ops.td
index eaec24c3ae5bb7..80f82778580621 100644
--- a/mlir/examples/toy/Ch2/include/toy/Ops.td
+++ b/mlir/examples/toy/Ch2/include/toy/Ops.td
@@ -246,4 +246,23 @@ def TransposeOp : Toy_Op<"transpose"> {
   let verifier = [{ return ::verify(*this); }];
 }
 
+def MatmulOp : Toy_Op<"matmul"> {
+  let summary = "matmul operation";
+
+  let arguments = (ins F64Tensor:$a , F64Tensor:$b );
+  let results = (outs F64Tensor);
+
+  let assemblyFormat = [{
+    `(` $a `:` type($a) `,` $b `:` type($b) `)` attr-dict `to` type(results)
+  }];
+
+  // Allow building a MatmulOp from its two input operands.
+  let builders = [
+    OpBuilder<(ins "Value":$a, "Value":$b )>
+  ];
+
+  // Invoke a static verify method to verify this matmul operation.
+  let verifier = [{ return ::verify(*this); }];
+}
+
 #endif // TOY_OPS
diff --git a/mlir/examples/toy/Ch2/mlir/Dialect.cpp b/mlir/examples/toy/Ch2/mlir/Dialect.cpp
index 278c857ea46816..3baa930740b39e 100644
--- a/mlir/examples/toy/Ch2/mlir/Dialect.cpp
+++ b/mlir/examples/toy/Ch2/mlir/Dialect.cpp
@@ -248,6 +248,31 @@ static mlir::LogicalResult verify(TransposeOp op) {
   return mlir::success();
 }
 
+//===----------------------------------------------------------------------===//
+// MatmulOp
+
+void MatmulOp::build(mlir::OpBuilder &builder, mlir::OperationState &state,
+                     mlir::Value value1, mlir::Value value2) {
+  // The result is typed as an unranked f64 tensor; no shape is derived here.
+  auto resultType = UnrankedTensorType::get(builder.getF64Type());
+  state.addOperands({value1, value2});
+  state.addTypes(resultType);
+}
+
+/// Verify that ranked operands (MxK, KxN) yield an MxN result.
+static mlir::LogicalResult verify(MatmulOp op) {
+  auto lhsType = op.getOperand(0).getType().dyn_cast<RankedTensorType>();
+  auto rhsType = op.getOperand(1).getType().dyn_cast<RankedTensorType>();
+  auto resultType = op.getType().dyn_cast<RankedTensorType>();
+  if (!lhsType || !rhsType || !resultType)
+    return mlir::success(); // Unranked types carry no shape to check.
+  auto lhs = lhsType.getShape(), rhs = rhsType.getShape();
+  auto out = resultType.getShape();
+  if (lhs.size() == 2 && rhs.size() == 2 && out.size() == 2 &&
+      lhs[1] == rhs[0] && out[0] == lhs[0] && out[1] == rhs[1])
+    return mlir::success();
+  return op.emitError() << "expected result shape to be a matmul of the input";
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd op method definitions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/examples/toy/Ch2/mlir/MLIRGen.cpp b/mlir/examples/toy/Ch2/mlir/MLIRGen.cpp
index b7b573672fc74b..f95f7185abab6f 100644
--- a/mlir/examples/toy/Ch2/mlir/MLIRGen.cpp
+++ b/mlir/examples/toy/Ch2/mlir/MLIRGen.cpp
@@ -323,6 +323,17 @@ class MLIRGenImpl {
       return builder.create<TransposeOp>(location, operands[0]);
     }
 
+    // `matmul` is a builtin with its own operation, so emission is direct;
+    // it requires exactly two operands.
+    if (callee == "matmul") {
+      if (call.getArgs().size() != 2) {
+        emitError(location, "MLIR codegen encountered an error: toy.matmul "
+                            "expects exactly two arguments");
+        return nullptr;
+      }
+      return builder.create<MatmulOp>(location, operands[0], operands[1]);
+    }
+
     // Otherwise this is a call to a user-defined function. Calls to
     // user-defined functions are mapped to a custom call that takes the callee
     // name as an attribute.
diff --git a/mlir/test/Examples/Toy/Ch2/codegen.toy b/mlir/test/Examples/Toy/Ch2/codegen.toy
index ea1708e6fee18d..6ee9b181c5710d 100644
--- a/mlir/test/Examples/Toy/Ch2/codegen.toy
+++ b/mlir/test/Examples/Toy/Ch2/codegen.toy
@@ -2,7 +2,7 @@
 
 # User defined generic function that operates on unknown shaped arguments
 def multiply_transpose(a, b) {
-  return transpose(a) * transpose(b);
+  return matmul(a,b);
 }
 
 def main() {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 2321ce7a8e9ce7..dd44aafff57d7e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -2,3 +2,4 @@ add_subdirectory(cfcss)
 add_subdirectory(assignment1)
 add_subdirectory(scev)
 add_subdirectory(loop_fuse)
+add_subdirectory(toy)
diff --git a/tests/toy/codegen.mlir b/tests/toy/codegen.mlir
new file mode 100644
index 00000000000000..58045981ac1742
--- /dev/null
+++ b/tests/toy/codegen.mlir
@@ -0,0 +1,16 @@
+module {
+  func @multiply_transpose(%arg0: tensor<*xf64> loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":4:1), %arg1: tensor<*xf64> loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":4:1)) -> tensor<*xf64> {
+    %0 = toy.matmul(%arg0 : tensor<*xf64>, %arg1 : tensor<*xf64>) to tensor<*xf64> loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":5:10)
+    toy.return %0 : tensor<*xf64> loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":5:3)
+  } loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":4:1)
+  func @main() {
+    %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":9:17)
+    %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":9:3)
+    %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":10:17)
+    %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":10:3)
+    %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":11:11)
+    %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":12:11)
+    toy.print %5 : tensor<*xf64> loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":13:3)
+    toy.return loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":8:1)
+  } loc("/home/shkumar/LLVM/llvm-project/mlir/test/Examples/Toy/Ch2/codegen.toy":8:1)
+} loc(unknown)
diff --git a/tests/toy/command.sh b/tests/toy/command.sh
new file mode 100644
index 00000000000000..109563b08ae60d
--- /dev/null
+++ b/tests/toy/command.sh
@@ -0,0 +1,2 @@
+toyc-ch2 "$(dirname "$0")/../../mlir/test/Examples/Toy/Ch2/codegen.toy" -emit=mlir -mlir-print-debuginfo 2> codegen.mlir  # input resolved relative to this script, not a developer's home directory
+toyc-ch2 codegen.mlir -emit=mlir  # re-parse the emitted MLIR as a round-trip check



More information about the llvm-branch-commits mailing list