[llvm] [llvm][ARM]Add widen global arrays pass (PR #107120)

Nashe Mncube via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 11 04:27:30 PDT 2024


https://github.com/nasherm updated https://github.com/llvm/llvm-project/pull/107120

>From cc8bf21cbdda1b8ae6338602dc06b8ab139bb168 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Tue, 3 Sep 2024 16:09:43 +0100
Subject: [PATCH 01/12] [llvm][ARM]Add ARM widen strings pass

- Pass optimizes memcpys by padding out destinations and sources to a
  full word, so that the ARM backend generates full-word loads instead
  of loading a single byte (ldrb) and/or half word (ldrh). It only pads
  the destination when it is a stack-allocated, constant-size array, and
  the source when it is a constant string. The heuristic that decides
  whether to pad is very basic and could be improved to allow more cases
  to be padded.
- Pass works at the midend level

Change-Id: I1c6371f0962e7ad3c166602b800d041ac1cc7b04
---
 .../llvm/Transforms/Scalar/ARMWidenStrings.h  |  30 +++
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 llvm/lib/Passes/PassRegistry.def              |   1 +
 .../lib/Transforms/Scalar/ARMWidenStrings.cpp | 227 ++++++++++++++++++
 llvm/lib/Transforms/Scalar/CMakeLists.txt     |   1 +
 .../ARMWidenStrings/arm-widen-strings-1.ll    |  25 ++
 .../ARMWidenStrings/arm-widen-strings-2.ll    |  22 ++
 .../arm-widen-strings-lengths-dont-match.ll   |  28 +++
 .../arm-widen-strings-more-than-64-bytes.ll   |  29 +++
 .../arm-widen-strings-ptrtoint.ll             |  42 ++++
 .../arm-widen-strings-struct-test.ll          |  52 ++++
 .../arm-widen-strings-volatile.ll             |  29 +++
 12 files changed, 487 insertions(+)
 create mode 100755 llvm/include/llvm/Transforms/Scalar/ARMWidenStrings.h
 create mode 100644 llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp
 create mode 100644 llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll
 create mode 100644 llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll
 create mode 100644 llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll
 create mode 100644 llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll
 create mode 100644 llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll
 create mode 100644 llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll
 create mode 100644 llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-volatile.ll

diff --git a/llvm/include/llvm/Transforms/Scalar/ARMWidenStrings.h b/llvm/include/llvm/Transforms/Scalar/ARMWidenStrings.h
new file mode 100755
index 00000000000000..3bda666660144a
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/ARMWidenStrings.h
@@ -0,0 +1,30 @@
+//===- ARMWidenStrings.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface for the ArmWidenStrings pass
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
+#define LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Module;
+
+class ARMWidenStringsPass : public PassInfoMixin<ARMWidenStringsPass> {
+public:
+  ARMWidenStringsPass() = default;
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
\ No newline at end of file
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 1df1449fce597c..6b989231cb9861 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -207,6 +207,7 @@
 #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
 #include "llvm/Transforms/ObjCARC.h"
 #include "llvm/Transforms/Scalar/ADCE.h"
+#include "llvm/Transforms/Scalar/ARMWidenStrings.h"
 #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
 #include "llvm/Transforms/Scalar/AnnotationRemarks.h"
 #include "llvm/Transforms/Scalar/BDCE.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index d6067089c6b5c1..55566f43e5435d 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -489,6 +489,7 @@ FUNCTION_PASS("view-dom-only", DomOnlyViewer())
 FUNCTION_PASS("view-post-dom", PostDomViewer())
 FUNCTION_PASS("view-post-dom-only", PostDomOnlyViewer())
 FUNCTION_PASS("wasm-eh-prepare", WasmEHPreparePass())
+FUNCTION_PASS("arm-widen-strings", ARMWidenStringsPass())
 #undef FUNCTION_PASS
 
 #ifndef FUNCTION_PASS_WITH_PARAMS
diff --git a/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp b/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp
new file mode 100644
index 00000000000000..dd06c2a7ea10d1
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp
@@ -0,0 +1,227 @@
+// ARMWidenStrings.cpp - Widen strings to word boundaries to speed up
+// programs that use simple strcpy's with constant strings as source
+// and stack allocated array for destination.
+
+#define DEBUG_TYPE "arm-widen-strings"
+
+#include "llvm/Transforms/Scalar/ARMWidenStrings.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+cl::opt<bool> DisableARMWidenStrings("disable-arm-widen-strings",
+                                     cl::init(false));
+
+namespace {
+
+class ARMWidenStrings {
+public:
+  /*
+  Max number of bytes that memcpy allows for lowering to load/stores before it
+  uses library function (__aeabi_memcpy).  This is the same value returned by
+  ARMSubtarget::getMaxInlineSizeThreshold which I would have called in place of
+  the constant int but can't get access to the subtarget info class from the
+  midend.
+  */
+  const unsigned int MemcpyInliningLimit = 64;
+
+  bool run(Function &F);
+};
+
+static bool IsCharArray(Type *t) {
+  const unsigned int CHAR_BIT_SIZE = 8;
+  return t && t->isArrayTy() && t->getArrayElementType()->isIntegerTy() &&
+         t->getArrayElementType()->getIntegerBitWidth() == CHAR_BIT_SIZE;
+}
+
+bool ARMWidenStrings::run(Function &F) {
+  if (DisableARMWidenStrings) {
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Running ARMWidenStrings on module " << F.getName()
+                    << "\n");
+
+  for (Function::iterator b = F.begin(); b != F.end(); ++b) {
+    for (BasicBlock::iterator i = b->begin(); i != b->end(); ++i) {
+      CallInst *CI = dyn_cast<CallInst>(i);
+      if (!CI) {
+        continue;
+      }
+
+      Function *CallMemcpy = CI->getCalledFunction();
+      // find out if the current call instruction is a call to llvm memcpy
+      // intrinsics
+      if (CallMemcpy == NULL || !CallMemcpy->isIntrinsic() ||
+          CallMemcpy->getIntrinsicID() != Intrinsic::memcpy) {
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "Found call to strcpy/memcpy:\n" << *CI << "\n");
+
+      auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
+      auto *SourceVar = dyn_cast<GlobalVariable>(CI->getArgOperand(1));
+      auto *BytesToCopy = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+      auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
+
+      if (!BytesToCopy) {
+        LLVM_DEBUG(dbgs() << "Number of bytes to copy is null\n");
+        continue;
+      }
+
+      uint64_t NumBytesToCopy = BytesToCopy->getZExtValue();
+
+      if (!Alloca) {
+        LLVM_DEBUG(dbgs() << "Destination isn't a Alloca\n");
+        continue;
+      }
+
+      if (!SourceVar) {
+        LLVM_DEBUG(dbgs() << "Source isn't a global constant variable\n");
+        continue;
+      }
+
+      if (!IsVolatile || IsVolatile->isOne()) {
+        LLVM_DEBUG(
+            dbgs() << "Not widening strings for this memcpy because it's "
+                      "a volatile operations\n");
+        continue;
+      }
+
+      if (NumBytesToCopy % 4 == 0) {
+        LLVM_DEBUG(dbgs() << "Bytes to copy in strcpy/memcpy is already word "
+                             "aligned so nothing to do here.\n");
+        continue;
+      }
+
+      if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
+          !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr()) {
+        LLVM_DEBUG(dbgs() << "Source is not constant global, thus it's "
+                             "mutable therefore it's not safe to pad\n");
+        continue;
+      }
+
+      ConstantDataArray *SourceDataArray =
+          dyn_cast<ConstantDataArray>(SourceVar->getInitializer());
+      if (!SourceDataArray || !IsCharArray(SourceDataArray->getType())) {
+        LLVM_DEBUG(dbgs() << "Source isn't a constant data array\n");
+        continue;
+      }
+
+      if (!Alloca->isStaticAlloca()) {
+        LLVM_DEBUG(dbgs() << "Destination allocation isn't a static "
+                             "constant which is locally allocated in this "
+                             "function, so skipping.\n");
+        continue;
+      }
+
+      // Make sure destination is definitley a char array.
+      if (!IsCharArray(Alloca->getAllocatedType())) {
+        LLVM_DEBUG(dbgs() << "Destination doesn't look like a constant char (8 "
+                             "bits) array\n");
+        continue;
+      }
+      LLVM_DEBUG(dbgs() << "With Alloca: " << *Alloca << "\n");
+
+      uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
+      uint64_t SZSize = SourceDataArray->getType()->getNumElements();
+
+      // For safety purposes lets add a constraint and only padd when
+      // num bytes to copy == destination array size == source string
+      // which is a constant
+      LLVM_DEBUG(dbgs() << "Number of bytes to copy is: " << NumBytesToCopy
+                        << "\n");
+      LLVM_DEBUG(dbgs() << "Size of destination array is: " << DZSize << "\n");
+      LLVM_DEBUG(dbgs() << "Size of source array is: " << SZSize << "\n");
+      if (NumBytesToCopy != DZSize || DZSize != SZSize) {
+        LLVM_DEBUG(dbgs() << "Size of number of bytes to copy, destination "
+                             "array and source string don't match, so "
+                             "skipping\n");
+        continue;
+      }
+      LLVM_DEBUG(dbgs() << "Going to widen.\n");
+      unsigned int NumBytesToPad = 4 - (NumBytesToCopy % 4);
+      LLVM_DEBUG(dbgs() << "Number of bytes to pad by is " << NumBytesToPad
+                        << "\n");
+      unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
+
+      if (TotalBytes > MemcpyInliningLimit) {
+        LLVM_DEBUG(
+            dbgs() << "Not going to pad because total number of bytes is "
+                   << TotalBytes
+                   << "  which be greater than the inlining "
+                      "limit for memcpy which is "
+                   << MemcpyInliningLimit << "\n");
+        continue;
+      }
+
+      // update destination char array to be word aligned (memcpy(X,...,...))
+      IRBuilder<> BuildAlloca(Alloca);
+      AllocaInst *NewAlloca = cast<AllocaInst>(BuildAlloca.CreateAlloca(
+          ArrayType::get(Alloca->getAllocatedType()->getArrayElementType(),
+                         NumBytesToCopy + NumBytesToPad)));
+      NewAlloca->takeName(Alloca);
+      NewAlloca->setAlignment(Alloca->getAlign());
+      Alloca->replaceAllUsesWith(NewAlloca);
+
+      LLVM_DEBUG(dbgs() << "Updating users of destination stack object to use "
+                        << "new size\n");
+
+      // update source to be word aligned (memcpy(...,X,...))
+      // create replacement string with padded null bytes.
+      StringRef Data = SourceDataArray->getRawDataValues();
+      std::vector<uint8_t> StrData(Data.begin(), Data.end());
+      for (unsigned int p = 0; p < NumBytesToPad; p++)
+        StrData.push_back('\0');
+      auto Arr = ArrayRef(StrData.data(), TotalBytes);
+
+      // create new padded version of global variable string.
+      Constant *SourceReplace = ConstantDataArray::get(F.getContext(), Arr);
+      GlobalVariable *NewGV = new GlobalVariable(
+          *F.getParent(), SourceReplace->getType(), true,
+          SourceVar->getLinkage(), SourceReplace, SourceReplace->getName());
+
+      // copy any other attributes from original global variable string
+      // e.g. unamed_addr
+      NewGV->copyAttributesFrom(SourceVar);
+      NewGV->takeName(SourceVar);
+
+      // replace intrinsic source.
+      CI->setArgOperand(1, NewGV);
+
+      // Update number of bytes to copy (memcpy(...,...,X))
+      CI->setArgOperand(2,
+                        ConstantInt::get(BytesToCopy->getType(), TotalBytes));
+      LLVM_DEBUG(dbgs() << "Padded dest/source and increased number of bytes:\n"
+                        << *CI << "\n"
+                        << *NewAlloca << "\n");
+    }
+  }
+  return true;
+}
+
+} // end of anonymous namespace
+
+PreservedAnalyses ARMWidenStringsPass::run(Function &F,
+                                           FunctionAnalysisManager &AM) {
+  if (!ARMWidenStrings().run(F))
+    return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 939a1457239567..a9607e4ebc6583 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_component_library(LLVMScalarOpts
   ADCE.cpp
   AlignmentFromAssumptions.cpp
   AnnotationRemarks.cpp
+  ARMWidenStrings.cpp
   BDCE.cpp
   CallSiteSplitting.cpp
   ConstantHoisting.cpp
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll
new file mode 100644
index 00000000000000..e11cf372c36a6e
--- /dev/null
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>,arm-widen-strings" -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O0>" -S | FileCheck %s --check-prefix=TURNED-OFF
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK: [12 x i8]
+; TURNED-OFF-NOT: [12 x i8]
+ at .str = private unnamed_addr constant [10 x i8] c"123456789\00", align 1
+
+; Function Attrs: nounwind
+define hidden void @foo() #0 {
+entry:
+; CHECK: %something = alloca [12 x i8]
+; TURNED-OFF-NOT: %something = alloca [12 x i8]
+  %something = alloca [10 x i8], align 1
+  %arraydecay = getelementptr inbounds [10 x i8], ptr %something, i32 0, i32 0
+; CHECK: @llvm.memcpy.p0.p0.i32
+  %call = call ptr @strcpy(ptr %arraydecay, ptr @.str)
+  %arraydecay1 = getelementptr inbounds [10 x i8], ptr %something, i32 0, i32 0
+  %call2 = call i32 @bar(ptr %arraydecay1)
+  ret void
+}
+
+declare ptr @strcpy(ptr, ptr) #1
+
+declare i32 @bar(...) #1
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll
new file mode 100644
index 00000000000000..2df8108f445fe1
--- /dev/null
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll
@@ -0,0 +1,22 @@
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>,arm-widen-strings" -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK: [64 x i8]
+ at .str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1
+
+; Function Attrs: nounwind
+define hidden void @foo() #0 {
+entry:
+; CHECK: %something = alloca [64 x i8]
+  %something = alloca [62 x i8], align 1
+  %arraydecay = getelementptr inbounds [62 x i8], ptr %something, i32 0, i32 0
+; CHECK: @llvm.memcpy.p0.p0.i32
+  %call = call ptr @strcpy(ptr %arraydecay, ptr @.str)
+  %arraydecay1 = getelementptr inbounds [62 x i8], ptr %something, i32 0, i32 0
+  %call2 = call i32 @bar(ptr %arraydecay1)
+  ret void
+}
+
+declare ptr @strcpy(ptr, ptr) #1
+
+declare i32 @bar(...) #1
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll
new file mode 100644
index 00000000000000..a0c1e213298167
--- /dev/null
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll
@@ -0,0 +1,28 @@
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=arm-widen-strings -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-arm-none-eabi"
+
+; CHECK: [17 x i8]
+ at .str = private unnamed_addr constant [17 x i8] c"aaaaaaaaaaaaaaaa\00", align 1
+
+; Function Attrs: nounwind
+define hidden void @foo() local_unnamed_addr #0 {
+entry:
+  %something = alloca [20 x i8], align 1
+  call void @llvm.lifetime.start(i64 20, ptr nonnull %something) #3
+  call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %something, ptr align 1 @.str, i32 17, i1 false)
+  %call2 = call i32 @bar(ptr nonnull %something) #3
+  call void @llvm.lifetime.end(i64 20, ptr nonnull %something) #3
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, ptr nocapture) #1
+
+declare i32 @bar(...) local_unnamed_addr #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, ptr nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll
new file mode 100644
index 00000000000000..67cb99023c5328
--- /dev/null
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=arm-widen-strings -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-arm-none-eabi"
+
+; CHECK: [65 x i8]
+; CHECK-NOT: [68 x i8]
+ at .str = private unnamed_addr constant [65 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzz\00", align 1
+
+; Function Attrs: nounwind
+define hidden void @foo() local_unnamed_addr #0 {
+entry:
+  %something = alloca [65 x i8], align 1
+  call void @llvm.lifetime.start(i64 65, ptr nonnull %something) #3
+  call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %something, ptr align 1 @.str, i32 65, i1 false)
+  %call2 = call i32 @bar(ptr nonnull %something) #3
+  call void @llvm.lifetime.end(i64 65, ptr nonnull %something) #3
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, ptr nocapture) #1
+
+declare i32 @bar(...) local_unnamed_addr #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, ptr nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll
new file mode 100644
index 00000000000000..3f02c02ad845b2
--- /dev/null
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=arm-widen-strings -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK: [48 x i8]
+ at f.string1 = private unnamed_addr constant [45 x i8] c"The quick brown dog jumps over the lazy fox.\00", align 1
+
+; Function Attrs: nounwind
+define hidden i32 @f() {
+entry:
+  %string1 = alloca [45 x i8], align 1
+  %pos = alloca i32, align 4
+  %token = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0i8(i64 45, ptr %string1)
+  call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %string1, ptr align 1 @f.string1, i32 45, i1 false)
+  call void @llvm.lifetime.start.p0i8(i64 4, ptr %pos)
+  call void @llvm.lifetime.start.p0i8(i64 4, ptr %token)
+  %call = call ptr @strchr(ptr %string1, i32 101)
+  store ptr %call, ptr %token, align 4
+  %0 = load ptr, ptr %token, align 4
+  %sub.ptr.lhs.cast = ptrtoint ptr %0 to i32
+  %sub.ptr.rhs.cast = ptrtoint ptr %string1 to i32
+  %sub.ptr.sub = sub i32 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  %add = add nsw i32 %sub.ptr.sub, 1
+  store i32 %add, ptr %pos, align 4
+  %1 = load i32, ptr %pos, align 4
+  call void @llvm.lifetime.end.p0i8(i64 4, ptr %token)
+  call void @llvm.lifetime.end.p0i8(i64 4, ptr %pos)
+  call void @llvm.lifetime.end.p0i8(i64 45, ptr %string1)
+  ret i32 %1
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, ptr nocapture)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1)
+
+; Function Attrs: nounwind
+declare ptr @strchr(ptr, i32)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, ptr nocapture)
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll
new file mode 100644
index 00000000000000..937bfaecd8e3e9
--- /dev/null
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=arm-widen-strings -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-arm-none-eabi"
+
+%struct.P = type { i32, [13 x i8] }
+
+; CHECK-NOT: [16 x i8]
+ at .str = private unnamed_addr constant [13 x i8] c"hello world\0A\00", align 1
+ at .str.1 = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1
+ at __ARM_use_no_argv = global i32 1, section ".ARM.use_no_argv", align 4
+ at llvm.used = appending global [1 x ptr] [ptr @__ARM_use_no_argv], section "llvm.metadata"
+
+; Function Attrs: nounwind
+define hidden i32 @main() local_unnamed_addr #0 {
+entry:
+  %p = alloca %struct.P, align 4
+  call void @llvm.lifetime.start(i64 20, ptr nonnull %p) #2
+  store i32 10, ptr %p, align 4, !tbaa !3
+  %arraydecay = getelementptr inbounds %struct.P, ptr %p, i32 0, i32 1, i32 0
+  call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %arraydecay, ptr align 1 @.str, i32 13, i1 false)
+  %puts = call i32 @puts(ptr %arraydecay)
+  call void @llvm.lifetime.end(i64 20, ptr nonnull %p) #2
+  ret i32 0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, ptr nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, ptr nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
+
+; Function Attrs: nounwind
+declare i32 @puts(ptr nocapture readonly) #2
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m0" "target-features"="+strict-align" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"Component: ARM Compiler 6 devbuild Tool: armclang [devbuild]"}
+!3 = !{!4, !5, i64 0}
+!4 = !{!"P", !5, i64 0, !6, i64 4}
+!5 = !{!"int", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-volatile.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-volatile.ll
new file mode 100644
index 00000000000000..6cbd823a18c367
--- /dev/null
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-volatile.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=arm-widen-strings -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-arm-none-eabi"
+
+; CHECK-NOT: [64 x i8]
+ at .str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1
+
+; Function Attrs: nounwind
+define hidden void @foo() local_unnamed_addr #0 {
+entry:
+  %something = alloca [62 x i8], align 1
+  %0 = getelementptr inbounds [62 x i8], ptr %something, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 62, ptr nonnull %0) #3
+  call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %0, ptr align 1 @.str, i32 62, i1 true)
+  %call2 = call i32 @bar(ptr nonnull %0) #3
+  call void @llvm.lifetime.end(i64 62, ptr nonnull %0) #3
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, ptr nocapture) #1
+
+declare i32 @bar(...) local_unnamed_addr #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, ptr nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1

>From 3f2fea46fdb265ac83d913bf21ea7443798a87d3 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Wed, 11 Sep 2024 16:54:42 +0100
Subject: [PATCH 02/12] Responding to review comments

Change-Id: I492ea4e5b6f589e5d877eeb6be31f7ab4720be9b
---
 .../lib/Transforms/Scalar/ARMWidenStrings.cpp | 61 +++++--------------
 1 file changed, 15 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp b/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp
index dd06c2a7ea10d1..1439e8af04292e 100644
--- a/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp
+++ b/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp
@@ -1,6 +1,16 @@
-// ARMWidenStrings.cpp - Widen strings to word boundaries to speed up
-// programs that use simple strcpy's with constant strings as source
-// and stack allocated array for destination.
+//===- ARMWidenStrings.cpp - Widen strings to ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Widen strings to word boundaries to speed up  programs that use simple
+// strcpy's with constant strings as source and stack allocated array for
+// destination.
+//
+//===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "arm-widen-strings"
 
@@ -25,8 +35,7 @@
 
 using namespace llvm;
 
-cl::opt<bool> DisableARMWidenStrings("disable-arm-widen-strings",
-                                     cl::init(false));
+cl::opt<bool> DisableARMWidenStrings("disable-arm-widen-strings");
 
 namespace {
 
@@ -73,71 +82,53 @@ bool ARMWidenStrings::run(Function &F) {
         continue;
       }
 
-      LLVM_DEBUG(dbgs() << "Found call to strcpy/memcpy:\n" << *CI << "\n");
-
       auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
       auto *SourceVar = dyn_cast<GlobalVariable>(CI->getArgOperand(1));
       auto *BytesToCopy = dyn_cast<ConstantInt>(CI->getArgOperand(2));
       auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
 
       if (!BytesToCopy) {
-        LLVM_DEBUG(dbgs() << "Number of bytes to copy is null\n");
         continue;
       }
 
       uint64_t NumBytesToCopy = BytesToCopy->getZExtValue();
 
       if (!Alloca) {
-        LLVM_DEBUG(dbgs() << "Destination isn't a Alloca\n");
         continue;
       }
 
+      // Source isn't a global constant variable
       if (!SourceVar) {
-        LLVM_DEBUG(dbgs() << "Source isn't a global constant variable\n");
         continue;
       }
 
       if (!IsVolatile || IsVolatile->isOne()) {
-        LLVM_DEBUG(
-            dbgs() << "Not widening strings for this memcpy because it's "
-                      "a volatile operations\n");
         continue;
       }
 
       if (NumBytesToCopy % 4 == 0) {
-        LLVM_DEBUG(dbgs() << "Bytes to copy in strcpy/memcpy is already word "
-                             "aligned so nothing to do here.\n");
         continue;
       }
 
       if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
           !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr()) {
-        LLVM_DEBUG(dbgs() << "Source is not constant global, thus it's "
-                             "mutable therefore it's not safe to pad\n");
         continue;
       }
 
       ConstantDataArray *SourceDataArray =
           dyn_cast<ConstantDataArray>(SourceVar->getInitializer());
       if (!SourceDataArray || !IsCharArray(SourceDataArray->getType())) {
-        LLVM_DEBUG(dbgs() << "Source isn't a constant data array\n");
         continue;
       }
 
       if (!Alloca->isStaticAlloca()) {
-        LLVM_DEBUG(dbgs() << "Destination allocation isn't a static "
-                             "constant which is locally allocated in this "
-                             "function, so skipping.\n");
         continue;
       }
 
       // Make sure destination is definitley a char array.
       if (!IsCharArray(Alloca->getAllocatedType())) {
-        LLVM_DEBUG(dbgs() << "Destination doesn't look like a constant char (8 "
-                             "bits) array\n");
         continue;
       }
-      LLVM_DEBUG(dbgs() << "With Alloca: " << *Alloca << "\n");
 
       uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
       uint64_t SZSize = SourceDataArray->getType()->getNumElements();
@@ -145,29 +136,13 @@ bool ARMWidenStrings::run(Function &F) {
       // For safety purposes lets add a constraint and only padd when
       // num bytes to copy == destination array size == source string
       // which is a constant
-      LLVM_DEBUG(dbgs() << "Number of bytes to copy is: " << NumBytesToCopy
-                        << "\n");
-      LLVM_DEBUG(dbgs() << "Size of destination array is: " << DZSize << "\n");
-      LLVM_DEBUG(dbgs() << "Size of source array is: " << SZSize << "\n");
       if (NumBytesToCopy != DZSize || DZSize != SZSize) {
-        LLVM_DEBUG(dbgs() << "Size of number of bytes to copy, destination "
-                             "array and source string don't match, so "
-                             "skipping\n");
         continue;
       }
-      LLVM_DEBUG(dbgs() << "Going to widen.\n");
       unsigned int NumBytesToPad = 4 - (NumBytesToCopy % 4);
-      LLVM_DEBUG(dbgs() << "Number of bytes to pad by is " << NumBytesToPad
-                        << "\n");
       unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
 
       if (TotalBytes > MemcpyInliningLimit) {
-        LLVM_DEBUG(
-            dbgs() << "Not going to pad because total number of bytes is "
-                   << TotalBytes
-                   << "  which be greater than the inlining "
-                      "limit for memcpy which is "
-                   << MemcpyInliningLimit << "\n");
         continue;
       }
 
@@ -180,9 +155,6 @@ bool ARMWidenStrings::run(Function &F) {
       NewAlloca->setAlignment(Alloca->getAlign());
       Alloca->replaceAllUsesWith(NewAlloca);
 
-      LLVM_DEBUG(dbgs() << "Updating users of destination stack object to use "
-                        << "new size\n");
-
       // update source to be word aligned (memcpy(...,X,...))
       // create replacement string with padded null bytes.
       StringRef Data = SourceDataArray->getRawDataValues();
@@ -208,9 +180,6 @@ bool ARMWidenStrings::run(Function &F) {
       // Update number of bytes to copy (memcpy(...,...,X))
       CI->setArgOperand(2,
                         ConstantInt::get(BytesToCopy->getType(), TotalBytes));
-      LLVM_DEBUG(dbgs() << "Padded dest/source and increased number of bytes:\n"
-                        << *CI << "\n"
-                        << *NewAlloca << "\n");
     }
   }
   return true;

>From 3b0405bfe44c2cdab939a58a60896268b122e0fa Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Fri, 13 Sep 2024 12:24:32 +0100
Subject: [PATCH 03/12] Make ARMWidenStrings target independent

Change-Id: Ic6ed9a549e39020e8c04b38bc21ba8162b4ebfd9
---
 .../llvm/Analysis/TargetTransformInfo.h       |   8 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   2 +
 .../llvm/Transforms/Scalar/ARMWidenStrings.h  |  30 ---
 llvm/lib/Analysis/TargetTransformInfo.cpp     |   4 +
 llvm/lib/Passes/PassBuilder.cpp               |   1 -
 llvm/lib/Passes/PassRegistry.def              |   1 -
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |   6 +
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h  |   2 +
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         | 149 +++++++++++++
 .../lib/Transforms/Scalar/ARMWidenStrings.cpp | 196 ------------------
 llvm/lib/Transforms/Scalar/CMakeLists.txt     |   1 -
 .../ARMWidenStrings/arm-widen-strings-1.ll    |   2 +-
 .../ARMWidenStrings/arm-widen-strings-2.ll    |   2 +-
 .../arm-widen-strings-lengths-dont-match.ll   |   2 +-
 .../arm-widen-strings-more-than-64-bytes.ll   |   2 +-
 .../arm-widen-strings-ptrtoint.ll             |   2 +-
 .../arm-widen-strings-struct-test.ll          |   2 +-
 .../arm-widen-strings-volatile.ll             |   2 +-
 18 files changed, 178 insertions(+), 236 deletions(-)
 delete mode 100755 llvm/include/llvm/Transforms/Scalar/ARMWidenStrings.h
 delete mode 100644 llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index b2124c6106198e..2acdd561f61ce0 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1786,6 +1786,9 @@ class TargetTransformInfo {
   /// \return The maximum number of function arguments the target supports.
   unsigned getMaxNumArgs() const;
 
+  /// \return true if global strings should be padded to an alignment boundary
+  bool useWidenGlobalStrings() const;
+
   /// @}
 
 private:
@@ -2179,6 +2182,7 @@ class TargetTransformInfo::Concept {
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
   virtual unsigned getMaxNumArgs() const = 0;
+  virtual bool useWidenGlobalStrings() const = 0;
 };
 
 template <typename T>
@@ -2952,6 +2956,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getMaxNumArgs() const override {
     return Impl.getMaxNumArgs();
   }
+
+  bool useWidenGlobalStrings() const override {
+    return Impl.useWidenGlobalStrings();
+  }
 };
 
 template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 90eef93a2a54d5..ac899608be0efd 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -973,6 +973,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
+  bool useWidenGlobalStrings() const { return false; }
+
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
   // In case of a vector it returns the min required size for one element.
diff --git a/llvm/include/llvm/Transforms/Scalar/ARMWidenStrings.h b/llvm/include/llvm/Transforms/Scalar/ARMWidenStrings.h
deleted file mode 100755
index 3bda666660144a..00000000000000
--- a/llvm/include/llvm/Transforms/Scalar/ARMWidenStrings.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===- ARMWidenStrings.h --------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides the interface for the ArmWidenStrings pass
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
-#define LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
-
-#include "llvm/IR/PassManager.h"
-
-namespace llvm {
-
-class Module;
-
-class ARMWidenStringsPass : public PassInfoMixin<ARMWidenStringsPass> {
-public:
-  ARMWidenStringsPass() = default;
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &);
-};
-
-} // end namespace llvm
-
-#endif // LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
\ No newline at end of file
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2c26493bd3f1ca..e06d7bbb119dab 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1348,6 +1348,10 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
+bool TargetTransformInfo::useWidenGlobalStrings() const {
+  return TTIImpl->useWidenGlobalStrings();
+}
+
 TargetTransformInfo::Concept::~Concept() = default;
 
 TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 6b989231cb9861..1df1449fce597c 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -207,7 +207,6 @@
 #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
 #include "llvm/Transforms/ObjCARC.h"
 #include "llvm/Transforms/Scalar/ADCE.h"
-#include "llvm/Transforms/Scalar/ARMWidenStrings.h"
 #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
 #include "llvm/Transforms/Scalar/AnnotationRemarks.h"
 #include "llvm/Transforms/Scalar/BDCE.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 55566f43e5435d..d6067089c6b5c1 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -489,7 +489,6 @@ FUNCTION_PASS("view-dom-only", DomOnlyViewer())
 FUNCTION_PASS("view-post-dom", PostDomViewer())
 FUNCTION_PASS("view-post-dom-only", PostDomOnlyViewer())
 FUNCTION_PASS("wasm-eh-prepare", WasmEHPreparePass())
-FUNCTION_PASS("arm-widen-strings", ARMWidenStringsPass())
 #undef FUNCTION_PASS
 
 #ifndef FUNCTION_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 912569a8fec118..7bc91e2935f3dc 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -56,6 +56,10 @@ static cl::opt<bool>
     AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                   cl::desc("Enable the generation of WLS loops"));
 
+static cl::opt<bool> UseWidenGlobalStrings(
+    "widen-global-strings", cl::Hidden, cl::init(true),
+    cl::desc("Enable the widening of global strings to alignment boundaries"));
+
 extern cl::opt<TailPredication::Mode> EnableTailPredication;
 
 extern cl::opt<bool> EnableMaskedGatherScatters;
@@ -2644,3 +2648,5 @@ bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
     return ST->hasARMOps();
   }
 }
+
+bool ARMTTIImpl::useWidenGlobalStrings() const { return UseWidenGlobalStrings; }
\ No newline at end of file
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index bea088065172e0..29b9d8a35eb5eb 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -334,6 +334,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
 
   bool hasArmWideBranch(bool Thumb) const;
 
+  bool useWidenGlobalStrings() const;
+
   /// @}
 };
 
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index aae4926e027ff4..84c1585fede11c 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -92,6 +92,8 @@ STATISTIC(NumInternalFunc, "Number of internal functions");
 STATISTIC(NumColdCC, "Number of functions marked coldcc");
 STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs");
 STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed");
+STATISTIC(NumGlobalStringsPadded,
+          "Number of global strings padded to alignment boundary");
 
 static cl::opt<bool>
     EnableColdCCStressTest("enable-coldcc-stress-test",
@@ -2029,6 +2031,145 @@ OptimizeFunctions(Module &M,
   return Changed;
 }
 
+static bool IsCharArray(Type *t) {
+  const unsigned int CHAR_BIT_SIZE = 8;
+  return t && t->isArrayTy() && t->getArrayElementType()->isIntegerTy() &&
+         t->getArrayElementType()->getIntegerBitWidth() == CHAR_BIT_SIZE;
+}
+
+static bool
+tryWidenGlobalStrings(Function &F,
+                      function_ref<TargetTransformInfo &(Function &)> GetTTI) {
+  bool changed = false;
+
+  for (Function::iterator b = F.begin(); b != F.end(); ++b) {
+    for (BasicBlock::iterator i = b->begin(); i != b->end(); ++i) {
+      CallInst *CI = dyn_cast<CallInst>(i);
+      if (!CI) {
+        continue;
+      }
+
+      TargetTransformInfo &TTI = GetTTI(F);
+
+      Function *CallMemcpy = CI->getCalledFunction();
+      // find out if the current call instruction is a call to llvm memcpy
+      // intrinsics
+      if (CallMemcpy == NULL || !CallMemcpy->isIntrinsic() ||
+          CallMemcpy->getIntrinsicID() != Intrinsic::memcpy) {
+        continue;
+      }
+
+      auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
+      auto *SourceVar = dyn_cast<GlobalVariable>(CI->getArgOperand(1));
+      auto *BytesToCopy = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+      auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
+
+      if (!BytesToCopy) {
+        continue;
+      }
+
+      uint64_t NumBytesToCopy = BytesToCopy->getZExtValue();
+
+      if (!Alloca) {
+        continue;
+      }
+
+      // Source isn't a global constant variable
+      if (!SourceVar) {
+        continue;
+      }
+
+      if (!IsVolatile || IsVolatile->isOne()) {
+        continue;
+      }
+
+      if (NumBytesToCopy % 4 == 0) {
+        continue;
+      }
+
+      if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
+          !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr()) {
+        continue;
+      }
+
+      ConstantDataArray *SourceDataArray =
+          dyn_cast<ConstantDataArray>(SourceVar->getInitializer());
+      if (!SourceDataArray || !IsCharArray(SourceDataArray->getType())) {
+        continue;
+      }
+
+      if (!Alloca->isStaticAlloca()) {
+        continue;
+      }
+
+      // Make sure destination is definitley a char array.
+      if (!IsCharArray(Alloca->getAllocatedType())) {
+        continue;
+      }
+
+      uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
+      uint64_t SZSize = SourceDataArray->getType()->getNumElements();
+
+      // For safety purposes lets add a constraint and only padd when
+      // num bytes to copy == destination array size == source string
+      // which is a constant
+      if (NumBytesToCopy != DZSize || DZSize != SZSize) {
+        continue;
+      }
+      unsigned int NumBytesToPad = 4 - (NumBytesToCopy % 4);
+      unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
+
+      /*
+      Max number of bytes that memcpy allows for lowering to load/stores before
+      it uses library function (__aeabi_memcpy).
+      */
+      unsigned MaxMemIntrinsicSize =
+          TTI.getMaxMemIntrinsicInlineSizeThreshold();
+      if (TotalBytes > MaxMemIntrinsicSize) {
+        continue;
+      }
+
+      // update destination char array to be word aligned (memcpy(X,...,...))
+      IRBuilder<> BuildAlloca(Alloca);
+      AllocaInst *NewAlloca = cast<AllocaInst>(BuildAlloca.CreateAlloca(
+          ArrayType::get(Alloca->getAllocatedType()->getArrayElementType(),
+                         NumBytesToCopy + NumBytesToPad)));
+      NewAlloca->takeName(Alloca);
+      NewAlloca->setAlignment(Alloca->getAlign());
+      Alloca->replaceAllUsesWith(NewAlloca);
+
+      // update source to be word aligned (memcpy(...,X,...))
+      // create replacement string with padded null bytes.
+      StringRef Data = SourceDataArray->getRawDataValues();
+      std::vector<uint8_t> StrData(Data.begin(), Data.end());
+      for (unsigned int p = 0; p < NumBytesToPad; p++)
+        StrData.push_back('\0');
+      auto Arr = ArrayRef(StrData.data(), TotalBytes);
+
+      // create new padded version of global variable string.
+      Constant *SourceReplace = ConstantDataArray::get(F.getContext(), Arr);
+      GlobalVariable *NewGV = new GlobalVariable(
+          *F.getParent(), SourceReplace->getType(), true,
+          SourceVar->getLinkage(), SourceReplace, SourceReplace->getName());
+
+      // copy any other attributes from original global variable string
+      // e.g. unamed_addr
+      NewGV->copyAttributesFrom(SourceVar);
+      NewGV->takeName(SourceVar);
+
+      // replace intrinsic source.
+      CI->setArgOperand(1, NewGV);
+
+      // Update number of bytes to copy (memcpy(...,...,X))
+      CI->setArgOperand(2,
+                        ConstantInt::get(BytesToCopy->getType(), TotalBytes));
+      NumGlobalStringsPadded++;
+      changed |= true;
+    }
+  }
+  return changed;
+}
+
 static bool
 OptimizeGlobalVars(Module &M,
                    function_ref<TargetTransformInfo &(Function &)> GetTTI,
@@ -2058,6 +2199,14 @@ OptimizeGlobalVars(Module &M,
       continue;
     }
 
+    // Pad global strings if allowed
+    for (Function &F : llvm::make_early_inc_range(M)) {
+      TargetTransformInfo &TTI = GetTTI(F);
+      if (TTI.useWidenGlobalStrings()) {
+        Changed |= tryWidenGlobalStrings(F, GetTTI);
+      }
+    }
+
     Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree);
   }
   return Changed;
diff --git a/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp b/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp
deleted file mode 100644
index 1439e8af04292e..00000000000000
--- a/llvm/lib/Transforms/Scalar/ARMWidenStrings.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-//===- ARMWidenStrings.cpp - Widen strings to ---------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Widen strings to word boundaries to speed up  programs that use simple
-// strcpy's with constant strings as source and stack allocated array for
-// destination.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "arm-widen-strings"
-
-#include "llvm/Transforms/Scalar/ARMWidenStrings.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/TargetParser/Triple.h"
-#include "llvm/Transforms/Scalar.h"
-
-using namespace llvm;
-
-cl::opt<bool> DisableARMWidenStrings("disable-arm-widen-strings");
-
-namespace {
-
-class ARMWidenStrings {
-public:
-  /*
-  Max number of bytes that memcpy allows for lowering to load/stores before it
-  uses library function (__aeabi_memcpy).  This is the same value returned by
-  ARMSubtarget::getMaxInlineSizeThreshold which I would have called in place of
-  the constant int but can't get access to the subtarget info class from the
-  midend.
-  */
-  const unsigned int MemcpyInliningLimit = 64;
-
-  bool run(Function &F);
-};
-
-static bool IsCharArray(Type *t) {
-  const unsigned int CHAR_BIT_SIZE = 8;
-  return t && t->isArrayTy() && t->getArrayElementType()->isIntegerTy() &&
-         t->getArrayElementType()->getIntegerBitWidth() == CHAR_BIT_SIZE;
-}
-
-bool ARMWidenStrings::run(Function &F) {
-  if (DisableARMWidenStrings) {
-    return false;
-  }
-
-  LLVM_DEBUG(dbgs() << "Running ARMWidenStrings on module " << F.getName()
-                    << "\n");
-
-  for (Function::iterator b = F.begin(); b != F.end(); ++b) {
-    for (BasicBlock::iterator i = b->begin(); i != b->end(); ++i) {
-      CallInst *CI = dyn_cast<CallInst>(i);
-      if (!CI) {
-        continue;
-      }
-
-      Function *CallMemcpy = CI->getCalledFunction();
-      // find out if the current call instruction is a call to llvm memcpy
-      // intrinsics
-      if (CallMemcpy == NULL || !CallMemcpy->isIntrinsic() ||
-          CallMemcpy->getIntrinsicID() != Intrinsic::memcpy) {
-        continue;
-      }
-
-      auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
-      auto *SourceVar = dyn_cast<GlobalVariable>(CI->getArgOperand(1));
-      auto *BytesToCopy = dyn_cast<ConstantInt>(CI->getArgOperand(2));
-      auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
-
-      if (!BytesToCopy) {
-        continue;
-      }
-
-      uint64_t NumBytesToCopy = BytesToCopy->getZExtValue();
-
-      if (!Alloca) {
-        continue;
-      }
-
-      // Source isn't a global constant variable
-      if (!SourceVar) {
-        continue;
-      }
-
-      if (!IsVolatile || IsVolatile->isOne()) {
-        continue;
-      }
-
-      if (NumBytesToCopy % 4 == 0) {
-        continue;
-      }
-
-      if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
-          !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr()) {
-        continue;
-      }
-
-      ConstantDataArray *SourceDataArray =
-          dyn_cast<ConstantDataArray>(SourceVar->getInitializer());
-      if (!SourceDataArray || !IsCharArray(SourceDataArray->getType())) {
-        continue;
-      }
-
-      if (!Alloca->isStaticAlloca()) {
-        continue;
-      }
-
-      // Make sure destination is definitley a char array.
-      if (!IsCharArray(Alloca->getAllocatedType())) {
-        continue;
-      }
-
-      uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
-      uint64_t SZSize = SourceDataArray->getType()->getNumElements();
-
-      // For safety purposes lets add a constraint and only padd when
-      // num bytes to copy == destination array size == source string
-      // which is a constant
-      if (NumBytesToCopy != DZSize || DZSize != SZSize) {
-        continue;
-      }
-      unsigned int NumBytesToPad = 4 - (NumBytesToCopy % 4);
-      unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
-
-      if (TotalBytes > MemcpyInliningLimit) {
-        continue;
-      }
-
-      // update destination char array to be word aligned (memcpy(X,...,...))
-      IRBuilder<> BuildAlloca(Alloca);
-      AllocaInst *NewAlloca = cast<AllocaInst>(BuildAlloca.CreateAlloca(
-          ArrayType::get(Alloca->getAllocatedType()->getArrayElementType(),
-                         NumBytesToCopy + NumBytesToPad)));
-      NewAlloca->takeName(Alloca);
-      NewAlloca->setAlignment(Alloca->getAlign());
-      Alloca->replaceAllUsesWith(NewAlloca);
-
-      // update source to be word aligned (memcpy(...,X,...))
-      // create replacement string with padded null bytes.
-      StringRef Data = SourceDataArray->getRawDataValues();
-      std::vector<uint8_t> StrData(Data.begin(), Data.end());
-      for (unsigned int p = 0; p < NumBytesToPad; p++)
-        StrData.push_back('\0');
-      auto Arr = ArrayRef(StrData.data(), TotalBytes);
-
-      // create new padded version of global variable string.
-      Constant *SourceReplace = ConstantDataArray::get(F.getContext(), Arr);
-      GlobalVariable *NewGV = new GlobalVariable(
-          *F.getParent(), SourceReplace->getType(), true,
-          SourceVar->getLinkage(), SourceReplace, SourceReplace->getName());
-
-      // copy any other attributes from original global variable string
-      // e.g. unamed_addr
-      NewGV->copyAttributesFrom(SourceVar);
-      NewGV->takeName(SourceVar);
-
-      // replace intrinsic source.
-      CI->setArgOperand(1, NewGV);
-
-      // Update number of bytes to copy (memcpy(...,...,X))
-      CI->setArgOperand(2,
-                        ConstantInt::get(BytesToCopy->getType(), TotalBytes));
-    }
-  }
-  return true;
-}
-
-} // end of anonymous namespace
-
-PreservedAnalyses ARMWidenStringsPass::run(Function &F,
-                                           FunctionAnalysisManager &AM) {
-  if (!ARMWidenStrings().run(F))
-    return PreservedAnalyses::all();
-
-  return PreservedAnalyses::none();
-}
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index a9607e4ebc6583..939a1457239567 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -2,7 +2,6 @@ add_llvm_component_library(LLVMScalarOpts
   ADCE.cpp
   AlignmentFromAssumptions.cpp
   AnnotationRemarks.cpp
-  ARMWidenStrings.cpp
   BDCE.cpp
   CallSiteSplitting.cpp
   ConstantHoisting.cpp
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll
index e11cf372c36a6e..6a8adf1af57a49 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>,arm-widen-strings" -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>,globalopt" -S | FileCheck %s
 ; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O0>" -S | FileCheck %s --check-prefix=TURNED-OFF
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll
index 2df8108f445fe1..46bc715b8f7501 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>,arm-widen-strings" -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>,globalopt" -S | FileCheck %s
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
 ; CHECK: [64 x i8]
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll
index a0c1e213298167..d5545cb9d6b88d 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=arm-widen-strings -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-arm-none-eabi"
 
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll
index 67cb99023c5328..de11c4a899c8d6 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=arm-widen-strings -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-arm-none-eabi"
 
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll
index 3f02c02ad845b2..1ec13eb72a6e29 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=arm-widen-strings -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
 ; CHECK: [48 x i8]
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll
index 937bfaecd8e3e9..7e9ddf7b1a8798 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=arm-widen-strings -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-arm-none-eabi"
 
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-volatile.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-volatile.ll
index 6cbd823a18c367..24e9131b11907b 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-volatile.ll
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-volatile.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=arm-widen-strings -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-arm-none-eabi"
 

>From d00f0d113276f40d1df160f72f59102abe8d0b35 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Wed, 18 Sep 2024 11:44:41 +0100
Subject: [PATCH 04/12] Review comments

Update the patch so that, when attempting to widen a global
string, we only check whether the variable is used as an
operand of a call to the memcpy intrinsic.

Change-Id: I088403636c2ed0acc231af77b399b1b95f1abbc2
---
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         | 226 ++++++++----------
 .../ARMWidenStrings/arm-widen-strings-1.ll    |  20 +-
 .../ARMWidenStrings/arm-widen-strings-2.ll    |  19 +-
 3 files changed, 122 insertions(+), 143 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 84c1585fede11c..07c0567c9b2111 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2031,143 +2031,127 @@ OptimizeFunctions(Module &M,
   return Changed;
 }
 
-static bool IsCharArray(Type *t) {
+static bool IsCharArray(Type *T) {
   const unsigned int CHAR_BIT_SIZE = 8;
-  return t && t->isArrayTy() && t->getArrayElementType()->isIntegerTy() &&
-         t->getArrayElementType()->getIntegerBitWidth() == CHAR_BIT_SIZE;
+  return T && T->isArrayTy() && T->getArrayElementType()->isIntegerTy() &&
+         T->getArrayElementType()->getIntegerBitWidth() == CHAR_BIT_SIZE;
 }
 
 static bool
-tryWidenGlobalStrings(Function &F,
-                      function_ref<TargetTransformInfo &(Function &)> GetTTI) {
-  bool changed = false;
-
-  for (Function::iterator b = F.begin(); b != F.end(); ++b) {
-    for (BasicBlock::iterator i = b->begin(); i != b->end(); ++i) {
-      CallInst *CI = dyn_cast<CallInst>(i);
-      if (!CI) {
-        continue;
-      }
+tryWidenGlobalString(CallInst *CI, GlobalVariable *SourceVar,
+                     function_ref<TargetTransformInfo &(Function &)> GetTTI,
+                     function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
 
-      TargetTransformInfo &TTI = GetTTI(F);
+  auto *F = CI->getCalledFunction();
+  auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
+  auto *BytesToCopy = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
 
-      Function *CallMemcpy = CI->getCalledFunction();
-      // find out if the current call instruction is a call to llvm memcpy
-      // intrinsics
-      if (CallMemcpy == NULL || !CallMemcpy->isIntrinsic() ||
-          CallMemcpy->getIntrinsicID() != Intrinsic::memcpy) {
-        continue;
-      }
+  if (!BytesToCopy)
+    return false;
 
-      auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
-      auto *SourceVar = dyn_cast<GlobalVariable>(CI->getArgOperand(1));
-      auto *BytesToCopy = dyn_cast<ConstantInt>(CI->getArgOperand(2));
-      auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
+  uint64_t NumBytesToCopy = BytesToCopy->getZExtValue();
 
-      if (!BytesToCopy) {
-        continue;
-      }
+  if (!Alloca)
+    return false;
 
-      uint64_t NumBytesToCopy = BytesToCopy->getZExtValue();
+  if (!IsVolatile || IsVolatile->isOne())
+    return false;
 
-      if (!Alloca) {
-        continue;
-      }
+  if (NumBytesToCopy % 4 == 0)
+    return false;
 
-      // Source isn't a global constant variable
-      if (!SourceVar) {
-        continue;
-      }
+  if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
+      !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr())
+    return false;
 
-      if (!IsVolatile || IsVolatile->isOne()) {
-        continue;
-      }
+  ConstantDataArray *SourceDataArray =
+      dyn_cast<ConstantDataArray>(SourceVar->getInitializer());
+  if (!SourceDataArray || !IsCharArray(SourceDataArray->getType()))
+    return false;
 
-      if (NumBytesToCopy % 4 == 0) {
-        continue;
-      }
+  if (!Alloca->isStaticAlloca())
+    return false;
 
-      if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
-          !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr()) {
-        continue;
-      }
+  // Make sure destination is definitley a char array.
+  if (!IsCharArray(Alloca->getAllocatedType()))
+    return false;
 
-      ConstantDataArray *SourceDataArray =
-          dyn_cast<ConstantDataArray>(SourceVar->getInitializer());
-      if (!SourceDataArray || !IsCharArray(SourceDataArray->getType())) {
-        continue;
-      }
+  uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
+  uint64_t SZSize = SourceDataArray->getType()->getNumElements();
 
-      if (!Alloca->isStaticAlloca()) {
-        continue;
-      }
+  // For safety purposes lets add a constraint and only padd when
+  // num bytes to copy == destination array size == source string
+  // which is a constant
+  if (NumBytesToCopy != DZSize || DZSize != SZSize)
+    return false;
 
-      // Make sure destination is definitley a char array.
-      if (!IsCharArray(Alloca->getAllocatedType())) {
-        continue;
-      }
+  unsigned int NumBytesToPad = 4 - (NumBytesToCopy % 4);
+  unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
 
-      uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
-      uint64_t SZSize = SourceDataArray->getType()->getNumElements();
+  // Max number of bytes that memcpy allows for lowering to load/stores before
+  // it uses library function (__aeabi_memcpy).
+  TargetTransformInfo &TTI = GetTTI(*F);
+  unsigned MaxMemIntrinsicSize = TTI.getMaxMemIntrinsicInlineSizeThreshold();
+  if (TotalBytes > MaxMemIntrinsicSize)
+    return false;
 
-      // For safety purposes lets add a constraint and only padd when
-      // num bytes to copy == destination array size == source string
-      // which is a constant
-      if (NumBytesToCopy != DZSize || DZSize != SZSize) {
-        continue;
-      }
-      unsigned int NumBytesToPad = 4 - (NumBytesToCopy % 4);
-      unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
-
-      /*
-      Max number of bytes that memcpy allows for lowering to load/stores before
-      it uses library function (__aeabi_memcpy).
-      */
-      unsigned MaxMemIntrinsicSize =
-          TTI.getMaxMemIntrinsicInlineSizeThreshold();
-      if (TotalBytes > MaxMemIntrinsicSize) {
-        continue;
-      }
+  // Update destination char array to be word aligned (memcpy(X,...,...))
+  IRBuilder<> BuildAlloca(Alloca);
+  AllocaInst *NewAlloca = cast<AllocaInst>(BuildAlloca.CreateAlloca(
+      ArrayType::get(Alloca->getAllocatedType()->getArrayElementType(),
+                     NumBytesToCopy + NumBytesToPad)));
+  NewAlloca->takeName(Alloca);
+  NewAlloca->setAlignment(Alloca->getAlign());
+  Alloca->replaceAllUsesWith(NewAlloca);
+
+  // Update source to be word aligned (memcpy(...,X,...))
+  // create replacement string with padded null bytes.
+  StringRef Data = SourceDataArray->getRawDataValues();
+  std::vector<uint8_t> StrData(Data.begin(), Data.end());
+  for (unsigned int p = 0; p < NumBytesToPad; p++)
+    StrData.push_back('\0');
+  auto Arr = ArrayRef(StrData.data(), TotalBytes);
+
+  // Create new padded version of global variable string.
+  Constant *SourceReplace = ConstantDataArray::get(F->getContext(), Arr);
+  GlobalVariable *NewGV = new GlobalVariable(
+      *(F->getParent()), SourceReplace->getType(), true,
+      SourceVar->getLinkage(), SourceReplace, SourceReplace->getName());
 
-      // update destination char array to be word aligned (memcpy(X,...,...))
-      IRBuilder<> BuildAlloca(Alloca);
-      AllocaInst *NewAlloca = cast<AllocaInst>(BuildAlloca.CreateAlloca(
-          ArrayType::get(Alloca->getAllocatedType()->getArrayElementType(),
-                         NumBytesToCopy + NumBytesToPad)));
-      NewAlloca->takeName(Alloca);
-      NewAlloca->setAlignment(Alloca->getAlign());
-      Alloca->replaceAllUsesWith(NewAlloca);
-
-      // update source to be word aligned (memcpy(...,X,...))
-      // create replacement string with padded null bytes.
-      StringRef Data = SourceDataArray->getRawDataValues();
-      std::vector<uint8_t> StrData(Data.begin(), Data.end());
-      for (unsigned int p = 0; p < NumBytesToPad; p++)
-        StrData.push_back('\0');
-      auto Arr = ArrayRef(StrData.data(), TotalBytes);
-
-      // create new padded version of global variable string.
-      Constant *SourceReplace = ConstantDataArray::get(F.getContext(), Arr);
-      GlobalVariable *NewGV = new GlobalVariable(
-          *F.getParent(), SourceReplace->getType(), true,
-          SourceVar->getLinkage(), SourceReplace, SourceReplace->getName());
-
-      // copy any other attributes from original global variable string
-      // e.g. unamed_addr
-      NewGV->copyAttributesFrom(SourceVar);
-      NewGV->takeName(SourceVar);
-
-      // replace intrinsic source.
-      CI->setArgOperand(1, NewGV);
-
-      // Update number of bytes to copy (memcpy(...,...,X))
-      CI->setArgOperand(2,
-                        ConstantInt::get(BytesToCopy->getType(), TotalBytes));
-      NumGlobalStringsPadded++;
-      changed |= true;
-    }
+  // Copy any other attributes from original global variable string
+  // e.g. unamed_addr
+  NewGV->copyAttributesFrom(SourceVar);
+  NewGV->takeName(SourceVar);
+
+  // Replace intrinsic source.
+  CI->setArgOperand(1, NewGV);
+
+  // Update number of bytes to copy (memcpy(...,...,X))
+  CI->setArgOperand(2, ConstantInt::get(BytesToCopy->getType(), TotalBytes));
+  NumGlobalStringsPadded++;
+  return true;
+}
+
+static bool tryWidenGlobalStringsUsedByMemcpy(
+    GlobalVariable *GV, function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+    function_ref<TargetTransformInfo &(Function &)> GetTTI) {
+  for (auto *User : GV->users()) {
+    CallInst *CI = dyn_cast<CallInst>(User);
+    if (!CI)
+      continue;
+
+    Function *F = CI->getCalledFunction();
+    if (!F || !F->isIntrinsic() || F->getIntrinsicID() != Intrinsic::memcpy)
+      continue;
+
+    TargetTransformInfo &TTI = GetTTI(*F);
+    if (!TTI.useWidenGlobalStrings())
+      return false;
+
+    return tryWidenGlobalString(CI, GV, GetTTI, GetTLI);
   }
-  return changed;
+  return false;
 }
 
 static bool
@@ -2199,13 +2183,9 @@ OptimizeGlobalVars(Module &M,
       continue;
     }
 
-    // Pad global strings if allowed
-    for (Function &F : llvm::make_early_inc_range(M)) {
-      TargetTransformInfo &TTI = GetTTI(F);
-      if (TTI.useWidenGlobalStrings()) {
-        Changed |= tryWidenGlobalStrings(F, GetTTI);
-      }
-    }
+    // For global variable strings called in a memcpy
+    // we try to pad to nearest valid alignment boundary
+    Changed |= tryWidenGlobalStringsUsedByMemcpy(&GV, GetTLI, GetTTI);
 
     Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree);
   }
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll
index 6a8adf1af57a49..432a5728315e6c 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll
@@ -1,25 +1,25 @@
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>,globalopt" -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
 ; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O0>" -S | FileCheck %s --check-prefix=TURNED-OFF
+
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
 ; CHECK: [12 x i8]
 ; TURNED-OFF-NOT: [12 x i8]
 @.str = private unnamed_addr constant [10 x i8] c"123456789\00", align 1
 
-; Function Attrs: nounwind
-define hidden void @foo() #0 {
+define hidden void @foo() local_unnamed_addr {
 entry:
 ; CHECK: %something = alloca [12 x i8]
 ; TURNED-OFF-NOT: %something = alloca [12 x i8]
   %something = alloca [10 x i8], align 1
-  %arraydecay = getelementptr inbounds [10 x i8], ptr %something, i32 0, i32 0
-; CHECK: @llvm.memcpy.p0.p0.i32
-  %call = call ptr @strcpy(ptr %arraydecay, ptr @.str)
-  %arraydecay1 = getelementptr inbounds [10 x i8], ptr %something, i32 0, i32 0
-  %call2 = call i32 @bar(ptr %arraydecay1)
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 10, i1 false)
+  %call2 = call i32 @bar(ptr nonnull %something)
   ret void
 }
 
-declare ptr @strcpy(ptr, ptr) #1
+declare i32 @bar(...) local_unnamed_addr
+
+; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) #0
 
-declare i32 @bar(...) #1
+attributes #0 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) }
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll
index 46bc715b8f7501..ecbe93411e4eb7 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll
+++ b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll
@@ -4,19 +4,18 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 ; CHECK: [64 x i8]
 @.str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1
 
-; Function Attrs: nounwind
-define hidden void @foo() #0 {
+define hidden void @foo() local_unnamed_addr {
 entry:
-; CHECK: %something = alloca [64 x i8]
+  ; CHECK: %something = alloca [64 x i8]
   %something = alloca [62 x i8], align 1
-  %arraydecay = getelementptr inbounds [62 x i8], ptr %something, i32 0, i32 0
-; CHECK: @llvm.memcpy.p0.p0.i32
-  %call = call ptr @strcpy(ptr %arraydecay, ptr @.str)
-  %arraydecay1 = getelementptr inbounds [62 x i8], ptr %something, i32 0, i32 0
-  %call2 = call i32 @bar(ptr %arraydecay1)
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(62) %something, ptr noundef nonnull align 1 dereferenceable(62) @.str, i32 62, i1 false)
+  %call2 = call i32 @bar(ptr nonnull %something)
   ret void
 }
 
-declare ptr @strcpy(ptr, ptr) #1
+declare i32 @bar(...) local_unnamed_addr
 
-declare i32 @bar(...) #1
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }

>From 2a9597fda6c499c034eea7b46b7fe42efcd3c095 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Fri, 27 Sep 2024 12:47:01 +0100
Subject: [PATCH 05/12] Review comments

---
 .../llvm/Analysis/TargetTransformInfo.h       | 11 ++--
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  2 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  4 +-
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 18 ++++-
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h  |  2 +-
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         | 66 +++++++------------
 .../ARMWidenStrings/arm-widen-strings-1.ll    | 25 -------
 .../arm-widen-strings-ptrtoint.ll             | 42 ------------
 .../arm-widen-strings-struct-test.ll          | 52 ---------------
 .../GlobalOpt/ARM/arm-widen-strings-1.ll      | 33 ++++++++++
 .../ARM}/arm-widen-strings-2.ll               | 14 ++--
 .../arm-widen-strings-lengths-dont-match.ll   | 21 +++---
 .../arm-widen-strings-more-than-64-bytes.ll   | 20 +++---
 .../ARM/arm-widen-strings-ptrtoint.ll         | 59 +++++++++++++++++
 .../ARM/arm-widen-strings-struct-test.ll      | 48 ++++++++++++++
 .../ARM}/arm-widen-strings-volatile.ll        | 21 +++---
 16 files changed, 232 insertions(+), 206 deletions(-)
 delete mode 100644 llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll
 delete mode 100644 llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll
 delete mode 100644 llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll
 create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll
 rename llvm/test/Transforms/{ARMWidenStrings => GlobalOpt/ARM}/arm-widen-strings-2.ll (57%)
 rename llvm/test/Transforms/{ARMWidenStrings => GlobalOpt/ARM}/arm-widen-strings-lengths-dont-match.ll (57%)
 rename llvm/test/Transforms/{ARMWidenStrings => GlobalOpt/ARM}/arm-widen-strings-more-than-64-bytes.ll (59%)
 create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll
 create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll
 rename llvm/test/Transforms/{ARMWidenStrings => GlobalOpt/ARM}/arm-widen-strings-volatile.ll (57%)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 2acdd561f61ce0..766b648cd7e36d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1786,8 +1786,9 @@ class TargetTransformInfo {
   /// \return The maximum number of function arguments the target supports.
   unsigned getMaxNumArgs() const;
 
-  /// \return true if global strings should be padded to an alignment boundary
-  bool useWidenGlobalStrings() const;
+  /// \return For an array of given Size, return alignment boundary to
+  /// pad to. Default is no padding.
+  unsigned getNumBytesToPad(unsigned Size) const;
 
   /// @}
 
@@ -2182,7 +2183,7 @@ class TargetTransformInfo::Concept {
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
   virtual unsigned getMaxNumArgs() const = 0;
-  virtual bool useWidenGlobalStrings() const = 0;
+  virtual unsigned getNumBytesToPad(unsigned Size) const = 0;
 };
 
 template <typename T>
@@ -2957,8 +2958,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.getMaxNumArgs();
   }
 
-  bool useWidenGlobalStrings() const override {
-    return Impl.useWidenGlobalStrings();
+  unsigned getNumBytesToPad(unsigned Size) const override {
+    return Impl.getNumBytesToPad(Size);
   }
 };
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index ac899608be0efd..aa26ad1088a0e5 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -973,7 +973,7 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
-  bool useWidenGlobalStrings() const { return false; }
+  unsigned getNumBytesToPad(unsigned Size) const { return 0; }
 
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index e06d7bbb119dab..c51f75e7e1f526 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1348,8 +1348,8 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
-bool TargetTransformInfo::useWidenGlobalStrings() const {
-  return TTIImpl->useWidenGlobalStrings();
+unsigned TargetTransformInfo::getNumBytesToPad(unsigned Size) const {
+  return TTIImpl->getNumBytesToPad(Size);
 }
 
 TargetTransformInfo::Concept::~Concept() = default;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 7bc91e2935f3dc..2942cf378cf2b8 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2649,4 +2649,20 @@ bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
   }
 }
 
-bool ARMTTIImpl::useWidenGlobalStrings() const { return UseWidenGlobalStrings; }
\ No newline at end of file
+unsigned ARMTTIImpl::getNumBytesToPad(unsigned Size) const {
+  // We pad to 4 byte boundaries;
+  if (Size % 4 == 0)
+    return 0;
+
+  unsigned NumBytesToPad = 4 - (Size % 4);
+  unsigned NewSize = Size + NumBytesToPad;
+
+  // Max number of bytes that memcpy allows for lowering to load/stores before
+  // it uses library function (__aeabi_memcpy).
+  unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
+
+  if (NewSize > MaxMemIntrinsicSize)
+    return 0;
+
+  return NumBytesToPad;
+}
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 29b9d8a35eb5eb..4f23c6a00601d3 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -334,7 +334,7 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
 
   bool hasArmWideBranch(bool Thumb) const;
 
-  bool useWidenGlobalStrings() const;
+  unsigned getNumBytesToPad(unsigned Size) const;
 
   /// @}
 };
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 07c0567c9b2111..b1ce4f995d0aa2 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2037,70 +2037,44 @@ static bool IsCharArray(Type *T) {
          T->getArrayElementType()->getIntegerBitWidth() == CHAR_BIT_SIZE;
 }
 
-static bool
-tryWidenGlobalString(CallInst *CI, GlobalVariable *SourceVar,
-                     function_ref<TargetTransformInfo &(Function &)> GetTTI,
-                     function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
-
+static bool tryWidenGlobalString(CallInst *CI, GlobalVariable *SourceVar,
+                                 unsigned NumBytesToPad,
+                                 unsigned NumBytesToCopy,
+                                 ConstantInt *BytesToCopyOp) {
   auto *F = CI->getCalledFunction();
   auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
-  auto *BytesToCopy = dyn_cast<ConstantInt>(CI->getArgOperand(2));
   auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
 
-  if (!BytesToCopy)
-    return false;
-
-  uint64_t NumBytesToCopy = BytesToCopy->getZExtValue();
-
-  if (!Alloca)
-    return false;
-
-  if (!IsVolatile || IsVolatile->isOne())
-    return false;
-
-  if (NumBytesToCopy % 4 == 0)
+  if (!Alloca || !IsVolatile || IsVolatile->isOne())
     return false;
 
   if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
       !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr())
     return false;
 
+  if (!Alloca->isStaticAlloca() || !IsCharArray(Alloca->getAllocatedType()))
+    return false;
+
   ConstantDataArray *SourceDataArray =
       dyn_cast<ConstantDataArray>(SourceVar->getInitializer());
   if (!SourceDataArray || !IsCharArray(SourceDataArray->getType()))
     return false;
 
-  if (!Alloca->isStaticAlloca())
-    return false;
-
-  // Make sure destination is definitley a char array.
-  if (!IsCharArray(Alloca->getAllocatedType()))
-    return false;
-
   uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
   uint64_t SZSize = SourceDataArray->getType()->getNumElements();
 
-  // For safety purposes lets add a constraint and only padd when
+  // For safety purposes lets add a constraint and only pad when
   // num bytes to copy == destination array size == source string
   // which is a constant
   if (NumBytesToCopy != DZSize || DZSize != SZSize)
     return false;
 
-  unsigned int NumBytesToPad = 4 - (NumBytesToCopy % 4);
   unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
 
-  // Max number of bytes that memcpy allows for lowering to load/stores before
-  // it uses library function (__aeabi_memcpy).
-  TargetTransformInfo &TTI = GetTTI(*F);
-  unsigned MaxMemIntrinsicSize = TTI.getMaxMemIntrinsicInlineSizeThreshold();
-  if (TotalBytes > MaxMemIntrinsicSize)
-    return false;
-
   // Update destination char array to be word aligned (memcpy(X,...,...))
   IRBuilder<> BuildAlloca(Alloca);
-  AllocaInst *NewAlloca = cast<AllocaInst>(BuildAlloca.CreateAlloca(
-      ArrayType::get(Alloca->getAllocatedType()->getArrayElementType(),
-                     NumBytesToCopy + NumBytesToPad)));
+  AllocaInst *NewAlloca = BuildAlloca.CreateAlloca(ArrayType::get(
+      Alloca->getAllocatedType()->getArrayElementType(), TotalBytes));
   NewAlloca->takeName(Alloca);
   NewAlloca->setAlignment(Alloca->getAlign());
   Alloca->replaceAllUsesWith(NewAlloca);
@@ -2128,13 +2102,13 @@ tryWidenGlobalString(CallInst *CI, GlobalVariable *SourceVar,
   CI->setArgOperand(1, NewGV);
 
   // Update number of bytes to copy (memcpy(...,...,X))
-  CI->setArgOperand(2, ConstantInt::get(BytesToCopy->getType(), TotalBytes));
+  CI->setArgOperand(2, ConstantInt::get(BytesToCopyOp->getType(), TotalBytes));
   NumGlobalStringsPadded++;
   return true;
 }
 
 static bool tryWidenGlobalStringsUsedByMemcpy(
-    GlobalVariable *GV, function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+    GlobalVariable *GV,
     function_ref<TargetTransformInfo &(Function &)> GetTTI) {
   for (auto *User : GV->users()) {
     CallInst *CI = dyn_cast<CallInst>(User);
@@ -2146,10 +2120,16 @@ static bool tryWidenGlobalStringsUsedByMemcpy(
       continue;
 
     TargetTransformInfo &TTI = GetTTI(*F);
-    if (!TTI.useWidenGlobalStrings())
-      return false;
+    auto *BytesToCopyOp = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+    if (!BytesToCopyOp)
+      continue;
+
+    unsigned NumBytesToCopy = BytesToCopyOp->getZExtValue();
+    unsigned NumBytesToPad = TTI.getNumBytesToPad(NumBytesToCopy);
 
-    return tryWidenGlobalString(CI, GV, GetTTI, GetTLI);
+    if (NumBytesToPad)
+      return tryWidenGlobalString(CI, GV, NumBytesToPad, NumBytesToCopy,
+                                  BytesToCopyOp);
   }
   return false;
 }
@@ -2185,7 +2165,7 @@ OptimizeGlobalVars(Module &M,
 
     // For global variable strings called in a memcpy
     // we try to pad to nearest valid alignment boundary
-    Changed |= tryWidenGlobalStringsUsedByMemcpy(&GV, GetTLI, GetTTI);
+    Changed |= tryWidenGlobalStringsUsedByMemcpy(&GV, GetTTI);
 
     Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree);
   }
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll
deleted file mode 100644
index 432a5728315e6c..00000000000000
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-1.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O0>" -S | FileCheck %s --check-prefix=TURNED-OFF
-
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-
-; CHECK: [12 x i8]
-; TURNED-OFF-NOT: [12 x i8]
-@.str = private unnamed_addr constant [10 x i8] c"123456789\00", align 1
-
-define hidden void @foo() local_unnamed_addr {
-entry:
-; CHECK: %something = alloca [12 x i8]
-; TURNED-OFF-NOT: %something = alloca [12 x i8]
-  %something = alloca [10 x i8], align 1
-  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 10, i1 false)
-  %call2 = call i32 @bar(ptr nonnull %something)
-  ret void
-}
-
-declare i32 @bar(...) local_unnamed_addr
-
-; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite)
-declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) #0
-
-attributes #0 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) }
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll
deleted file mode 100644
index 1ec13eb72a6e29..00000000000000
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-ptrtoint.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-
-; CHECK: [48 x i8]
-@f.string1 = private unnamed_addr constant [45 x i8] c"The quick brown dog jumps over the lazy fox.\00", align 1
-
-; Function Attrs: nounwind
-define hidden i32 @f() {
-entry:
-  %string1 = alloca [45 x i8], align 1
-  %pos = alloca i32, align 4
-  %token = alloca ptr, align 4
-  call void @llvm.lifetime.start.p0i8(i64 45, ptr %string1)
-  call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %string1, ptr align 1 @f.string1, i32 45, i1 false)
-  call void @llvm.lifetime.start.p0i8(i64 4, ptr %pos)
-  call void @llvm.lifetime.start.p0i8(i64 4, ptr %token)
-  %call = call ptr @strchr(ptr %string1, i32 101)
-  store ptr %call, ptr %token, align 4
-  %0 = load ptr, ptr %token, align 4
-  %sub.ptr.lhs.cast = ptrtoint ptr %0 to i32
-  %sub.ptr.rhs.cast = ptrtoint ptr %string1 to i32
-  %sub.ptr.sub = sub i32 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
-  %add = add nsw i32 %sub.ptr.sub, 1
-  store i32 %add, ptr %pos, align 4
-  %1 = load i32, ptr %pos, align 4
-  call void @llvm.lifetime.end.p0i8(i64 4, ptr %token)
-  call void @llvm.lifetime.end.p0i8(i64 4, ptr %pos)
-  call void @llvm.lifetime.end.p0i8(i64 45, ptr %string1)
-  ret i32 %1
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0i8(i64, ptr nocapture)
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1)
-
-; Function Attrs: nounwind
-declare ptr @strchr(ptr, i32)
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0i8(i64, ptr nocapture)
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll b/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll
deleted file mode 100644
index 7e9ddf7b1a8798..00000000000000
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-struct-test.ll
+++ /dev/null
@@ -1,52 +0,0 @@
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "thumbv6m-arm-none-eabi"
-
-%struct.P = type { i32, [13 x i8] }
-
-; CHECK-NOT: [16 x i8]
-@.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00", align 1
-@.str.1 = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1
-@__ARM_use_no_argv = global i32 1, section ".ARM.use_no_argv", align 4
-@llvm.used = appending global [1 x ptr] [ptr @__ARM_use_no_argv], section "llvm.metadata"
-
-; Function Attrs: nounwind
-define hidden i32 @main() local_unnamed_addr #0 {
-entry:
-  %p = alloca %struct.P, align 4
-  call void @llvm.lifetime.start(i64 20, ptr nonnull %p) #2
-  store i32 10, ptr %p, align 4, !tbaa !3
-  %arraydecay = getelementptr inbounds %struct.P, ptr %p, i32 0, i32 1, i32 0
-  call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %arraydecay, ptr align 1 @.str, i32 13, i1 false)
-  %puts = call i32 @puts(ptr %arraydecay)
-  call void @llvm.lifetime.end(i64 20, ptr nonnull %p) #2
-  ret i32 0
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, ptr nocapture) #1
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, ptr nocapture) #1
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
-
-; Function Attrs: nounwind
-declare i32 @puts(ptr nocapture readonly) #2
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m0" "target-features"="+strict-align" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind }
-
-!llvm.module.flags = !{!0, !1}
-!llvm.ident = !{!2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 1, !"min_enum_size", i32 4}
-!2 = !{!"Component: ARM Compiler 6 devbuild Tool: armclang [devbuild]"}
-!3 = !{!4, !5, i64 0}
-!4 = !{!"P", !5, i64 0, !6, i64 4}
-!5 = !{!"int", !6, i64 0}
-!6 = !{!"omnipotent char", !7, i64 0}
-!7 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll
new file mode 100644
index 00000000000000..1fd82434de681c
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O0>" -S | FileCheck %s --check-prefix=TURNED-OFF
+
+; CHECK: [12 x i8]
+; TURNED-OFF-NOT: [12 x i8]
+@.str = private unnamed_addr constant [10 x i8] c"123456789\00", align 1
+
+define hidden void @foo() local_unnamed_addr {
+; CHECK-LABEL: define hidden void @foo() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [12 x i8], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca [10 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 12, i1 false)
+; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    ret void
+;
+; TURNED-OFF-LABEL: define hidden void @foo() local_unnamed_addr {
+; TURNED-OFF-NEXT:  [[ENTRY:.*:]]
+; TURNED-OFF-NEXT:    [[SOMETHING:%.*]] = alloca [10 x i8], align 1
+; TURNED-OFF-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 10, i1 false)
+; TURNED-OFF-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
+; TURNED-OFF-NEXT:    ret void
+;
+entry:
+  %something = alloca [10 x i8], align 1
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 10, i1 false)
+  %call2 = call i32 @bar(ptr nonnull %something)
+  ret void
+}
+
+declare i32 @bar(...) local_unnamed_addr
+declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) #0
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll
similarity index 57%
rename from llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll
rename to llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll
index ecbe93411e4eb7..2e5f9eddb3a190 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-2.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll
@@ -1,12 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>,globalopt" -S | FileCheck %s
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
 ; CHECK: [64 x i8]
 @.str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1
 
 define hidden void @foo() local_unnamed_addr {
+; CHECK-LABEL: define hidden void @foo() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [64 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(64) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(64) @.str, i32 64, i1 false)
+; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    ret void
+;
 entry:
-  ; CHECK: %something = alloca [64 x i8]
   %something = alloca [62 x i8], align 1
   call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(62) %something, ptr noundef nonnull align 1 dereferenceable(62) @.str, i32 62, i1 false)
   %call2 = call i32 @bar(ptr nonnull %something)
@@ -14,8 +20,4 @@ entry:
 }
 
 declare i32 @bar(...) local_unnamed_addr
-
-; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
 declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) #0
-
-attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll
similarity index 57%
rename from llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll
rename to llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll
index d5545cb9d6b88d..3f2996fc6d3577 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-lengths-dont-match.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll
@@ -1,12 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "thumbv6m-arm-none-eabi"
-
 ; CHECK: [17 x i8]
 @.str = private unnamed_addr constant [17 x i8] c"aaaaaaaaaaaaaaaa\00", align 1
 
 ; Function Attrs: nounwind
 define hidden void @foo() local_unnamed_addr #0 {
+; CHECK-LABEL: define hidden void @foo() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [20 x i8], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 1 [[SOMETHING]], ptr align 1 @.str, i32 17, i1 false)
+; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %something = alloca [20 x i8], align 1
   call void @llvm.lifetime.start(i64 20, ptr nonnull %something) #3
@@ -16,13 +23,7 @@ entry:
   ret void
 }
 
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, ptr nocapture) #1
-
 declare i32 @bar(...) local_unnamed_addr #2
-
-; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, ptr nocapture) #1
 declare void @llvm.lifetime.end(i64, ptr nocapture) #1
-
-; Function Attrs: argmemonly nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll
similarity index 59%
rename from llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll
rename to llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll
index de11c4a899c8d6..9aa1255b9310fe 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-more-than-64-bytes.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll
@@ -1,6 +1,5 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "thumbv6m-arm-none-eabi"
 
 ; CHECK: [65 x i8]
 ; CHECK-NOT: [68 x i8]
@@ -8,6 +7,15 @@ target triple = "thumbv6m-arm-none-eabi"
 
 ; Function Attrs: nounwind
 define hidden void @foo() local_unnamed_addr #0 {
+; CHECK-LABEL: define hidden void @foo() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [65 x i8], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 65, ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 1 [[SOMETHING]], ptr align 1 @.str, i32 65, i1 false)
+; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 65, ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %something = alloca [65 x i8], align 1
   call void @llvm.lifetime.start(i64 65, ptr nonnull %something) #3
@@ -17,13 +25,7 @@ entry:
   ret void
 }
 
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, ptr nocapture) #1
-
 declare i32 @bar(...) local_unnamed_addr #2
-
-; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, ptr nocapture) #1
 declare void @llvm.lifetime.end(i64, ptr nocapture) #1
-
-; Function Attrs: argmemonly nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll
new file mode 100644
index 00000000000000..c9cb442dd6ea4e
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+
+; CHECK: [48 x i8]
+@f.string1 = private unnamed_addr constant [45 x i8] c"The quick brown dog jumps over the lazy fox.\00", align 1
+
+; Function Attrs: nounwind
+define hidden i32 @f() {
+; CHECK-LABEL: define hidden i32 @f() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[STRING1:%.*]] = alloca [48 x i8], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca [45 x i8], align 1
+; CHECK-NEXT:    [[POS:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TOKEN:%.*]] = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 45, ptr [[STRING1]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[STRING1]], ptr align 1 @f.string1, i32 48, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[POS]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[TOKEN]])
+; CHECK-NEXT:    [[CALL:%.*]] = call ptr @strchr(ptr [[STRING1]], i32 101)
+; CHECK-NEXT:    store ptr [[CALL]], ptr [[TOKEN]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TOKEN]], align 4
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP1]] to i32
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STRING1]] to i32
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i32 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUB_PTR_SUB]], 1
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[POS]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[POS]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[TOKEN]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[POS]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 45, ptr [[STRING1]])
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+entry:
+  %string1 = alloca [45 x i8], align 1
+  %pos = alloca i32, align 4
+  %token = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0i8(i64 45, ptr %string1)
+  call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %string1, ptr align 1 @f.string1, i32 45, i1 false)
+  call void @llvm.lifetime.start.p0i8(i64 4, ptr %pos)
+  call void @llvm.lifetime.start.p0i8(i64 4, ptr %token)
+  %call = call ptr @strchr(ptr %string1, i32 101)
+  store ptr %call, ptr %token, align 4
+  %0 = load ptr, ptr %token, align 4
+  %sub.ptr.lhs.cast = ptrtoint ptr %0 to i32
+  %sub.ptr.rhs.cast = ptrtoint ptr %string1 to i32
+  %sub.ptr.sub = sub i32 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  %add = add nsw i32 %sub.ptr.sub, 1
+  store i32 %add, ptr %pos, align 4
+  %1 = load i32, ptr %pos, align 4
+  call void @llvm.lifetime.end.p0i8(i64 4, ptr %token)
+  call void @llvm.lifetime.end.p0i8(i64 4, ptr %pos)
+  call void @llvm.lifetime.end.p0i8(i64 45, ptr %string1)
+  ret i32 %1
+}
+
+declare ptr @strchr(ptr, i32)
+declare void @llvm.lifetime.start.p0i8(i64, ptr nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, ptr nocapture)
+declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1)
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll
new file mode 100644
index 00000000000000..9503b6c33a2120
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+%struct.P = type { i32, [13 x i8] }
+
+; CHECK-NOT: [16 x i8]
+@.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00", align 1
+
+; Function Attrs: nounwind
+define hidden i32 @main() local_unnamed_addr #0 {
+; CHECK-LABEL: define hidden i32 @main() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[P:%.*]] = alloca [[STRUCT_P:%.*]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr nonnull [[P]])
+; CHECK-NEXT:    store i32 10, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [[STRUCT_P]], ptr [[P]], i32 0, i32 1, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[ARRAYDECAY]], ptr align 1 @.str, i32 13, i1 false)
+; CHECK-NEXT:    [[PUTS:%.*]] = call i32 @puts(ptr [[ARRAYDECAY]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr nonnull [[P]])
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %p = alloca %struct.P, align 4
+  call void @llvm.lifetime.start(i64 20, ptr nonnull %p) #2
+  store i32 10, ptr %p, align 4, !tbaa !1
+  %arraydecay = getelementptr inbounds %struct.P, ptr %p, i32 0, i32 1, i32 0
+  call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %arraydecay, ptr align 1 @.str, i32 13, i1 false)
+  %puts = call i32 @puts(ptr %arraydecay)
+  call void @llvm.lifetime.end(i64 20, ptr nonnull %p) #2
+  ret i32 0
+}
+
+declare i32 @puts(ptr nocapture readonly) #2
+declare void @llvm.lifetime.start(i64, ptr nocapture) #1
+declare void @llvm.lifetime.end(i64, ptr nocapture) #1
+declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
+
+!1 = !{!2, !3, i64 0}
+!2 = !{!"P", !3, i64 0, !4, i64 4}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], i64 0}
+; CHECK: [[META1]] = !{!"P", [[META2]], i64 0, [[META3:![0-9]+]], i64 4}
+; CHECK: [[META2]] = !{!"int", [[META3]], i64 0}
+; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"}
+;.
diff --git a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-volatile.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll
similarity index 57%
rename from llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-volatile.ll
rename to llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll
index 24e9131b11907b..ba7b7d45719bbd 100644
--- a/llvm/test/Transforms/ARMWidenStrings/arm-widen-strings-volatile.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll
@@ -1,12 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "thumbv6m-arm-none-eabi"
 
 ; CHECK-NOT: [64 x i8]
 @.str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1
 
 ; Function Attrs: nounwind
 define hidden void @foo() local_unnamed_addr #0 {
+; CHECK-LABEL: define hidden void @foo() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [62 x i8], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [62 x i8], ptr [[SOMETHING]], i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 62, ptr nonnull [[TMP0]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 1 [[TMP0]], ptr align 1 @.str, i32 62, i1 true)
+; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[TMP0]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 62, ptr nonnull [[TMP0]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %something = alloca [62 x i8], align 1
   %0 = getelementptr inbounds [62 x i8], ptr %something, i32 0, i32 0
@@ -17,13 +26,7 @@ entry:
   ret void
 }
 
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, ptr nocapture) #1
-
 declare i32 @bar(...) local_unnamed_addr #2
-
-; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, ptr nocapture) #1
 declare void @llvm.lifetime.end(i64, ptr nocapture) #1
-
-; Function Attrs: argmemonly nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1

>From 2abbe6d7955a66f56b7e57aaa9a42414211f78bc Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Wed, 2 Oct 2024 12:17:28 +0100
Subject: [PATCH 06/12] Responding to review comments

---
 .../llvm/Analysis/TargetTransformInfo.h       | 10 +--
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  4 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  6 +-
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 10 ++-
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h  |  2 +-
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         | 72 ++++++++++---------
 .../GlobalOpt/ARM/arm-widen-non-byte-array.ll | 22 ++++++
 .../ARM/arm-widen-string-multi-use.ll         | 33 +++++++++
 .../GlobalOpt/ARM/arm-widen-strings-1.ll      | 11 ---
 .../GlobalOpt/ARM/arm-widen-strings-2.ll      |  5 +-
 .../arm-widen-strings-lengths-dont-match.ll   |  3 -
 .../arm-widen-strings-more-than-64-bytes.ll   |  3 -
 .../ARM/arm-widen-strings-ptrtoint.ll         |  4 --
 .../ARM/arm-widen-strings-struct-test.ll      |  3 -
 .../ARM/arm-widen-strings-volatile.ll         |  3 -
 15 files changed, 118 insertions(+), 73 deletions(-)
 create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll
 create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 766b648cd7e36d..88ad3e179e99fa 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1788,7 +1788,7 @@ class TargetTransformInfo {
 
   /// \return For an array of given Size, return alignment boundary to
   /// pad to. Default is no padding.
-  unsigned getNumBytesToPad(unsigned Size) const;
+  unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
 
   /// @}
 
@@ -2183,7 +2183,8 @@ class TargetTransformInfo::Concept {
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
   virtual unsigned getMaxNumArgs() const = 0;
-  virtual unsigned getNumBytesToPad(unsigned Size) const = 0;
+  virtual unsigned getNumBytesToPadGlobalArray(unsigned Size,
+                                               Type *ArrayType) const = 0;
 };
 
 template <typename T>
@@ -2958,8 +2959,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.getMaxNumArgs();
   }
 
-  unsigned getNumBytesToPad(unsigned Size) const override {
-    return Impl.getNumBytesToPad(Size);
+  unsigned getNumBytesToPadGlobalArray(unsigned Size,
+                                       Type *ArrayType) const override {
+    return Impl.getNumBytesToPadGlobalArray(Size, ArrayType);
   }
 };
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index aa26ad1088a0e5..c4ad55406cb236 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -973,7 +973,9 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
-  unsigned getNumBytesToPad(unsigned Size) const { return 0; }
+  unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
+    return 0;
+  }
 
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c51f75e7e1f526..f3fd61ab4211af 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1348,8 +1348,10 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
-unsigned TargetTransformInfo::getNumBytesToPad(unsigned Size) const {
-  return TTIImpl->getNumBytesToPad(Size);
+unsigned
+TargetTransformInfo::getNumBytesToPadGlobalArray(unsigned Size,
+                                                 Type *ArrayType) const {
+  return TTIImpl->getNumBytesToPadGlobalArray(Size, ArrayType);
 }
 
 TargetTransformInfo::Concept::~Concept() = default;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 2942cf378cf2b8..caa9d6cdd1e6de 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2649,8 +2649,14 @@ bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
   }
 }
 
-unsigned ARMTTIImpl::getNumBytesToPad(unsigned Size) const {
-  // We pad to 4 byte boundaries;
+unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
+                                                 Type *ArrayType) const {
+  // Don't modify non-integer array types
+  if (!ArrayType || !ArrayType->isArrayTy() ||
+      !ArrayType->getArrayElementType()->isIntegerTy())
+    return 0;
+
+  // We pad to 4 byte boundaries
   if (Size % 4 == 0)
     return 0;
 
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 4f23c6a00601d3..c5a1fcffa84013 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -334,7 +334,7 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
 
   bool hasArmWideBranch(bool Thumb) const;
 
-  unsigned getNumBytesToPad(unsigned Size) const;
+  unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
 
   /// @}
 };
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index b1ce4f995d0aa2..ee8a0501a26bea 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -92,8 +92,8 @@ STATISTIC(NumInternalFunc, "Number of internal functions");
 STATISTIC(NumColdCC, "Number of functions marked coldcc");
 STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs");
 STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed");
-STATISTIC(NumGlobalStringsPadded,
-          "Number of global strings padded to alignment boundary");
+STATISTIC(NumGlobalArraysPadded,
+          "Number of global arrays padded to alignment boundary");
 
 static cl::opt<bool>
     EnableColdCCStressTest("enable-coldcc-stress-test",
@@ -2031,16 +2031,10 @@ OptimizeFunctions(Module &M,
   return Changed;
 }
 
-static bool IsCharArray(Type *T) {
-  const unsigned int CHAR_BIT_SIZE = 8;
-  return T && T->isArrayTy() && T->getArrayElementType()->isIntegerTy() &&
-         T->getArrayElementType()->getIntegerBitWidth() == CHAR_BIT_SIZE;
-}
-
-static bool tryWidenGlobalString(CallInst *CI, GlobalVariable *SourceVar,
-                                 unsigned NumBytesToPad,
-                                 unsigned NumBytesToCopy,
-                                 ConstantInt *BytesToCopyOp) {
+static bool tryWidenGlobalArray(CallInst *CI, GlobalVariable *SourceVar,
+                                unsigned NumBytesToPad, unsigned NumBytesToCopy,
+                                ConstantInt *BytesToCopyOp,
+                                ConstantDataArray *SourceDataArray) {
   auto *F = CI->getCalledFunction();
   auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
   auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
@@ -2052,48 +2046,51 @@ static bool tryWidenGlobalString(CallInst *CI, GlobalVariable *SourceVar,
       !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr())
     return false;
 
-  if (!Alloca->isStaticAlloca() || !IsCharArray(Alloca->getAllocatedType()))
-    return false;
-
-  ConstantDataArray *SourceDataArray =
-      dyn_cast<ConstantDataArray>(SourceVar->getInitializer());
-  if (!SourceDataArray || !IsCharArray(SourceDataArray->getType()))
+  if (!Alloca->isStaticAlloca())
     return false;
 
   uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
   uint64_t SZSize = SourceDataArray->getType()->getNumElements();
+  unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
+  // Calculate the number of elements to copy, rounding up so that floored
+  // integer division does not return a wrong value, e.g. copying one byte
+  // from an array of i16 would otherwise yield 0 elements instead of 1.
+  unsigned NumElementsToCopy =
+      (NumBytesToCopy + ElementByteWidth - 1) / ElementByteWidth;
 
   // For safety purposes lets add a constraint and only pad when
-  // num bytes to copy == destination array size == source string
-  // which is a constant
-  if (NumBytesToCopy != DZSize || DZSize != SZSize)
+  // NumElementsToCopy == destination array size ==
+  // source string which is a constant
+  if (NumElementsToCopy != DZSize || DZSize != SZSize)
     return false;
 
   unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
+  NumElementsToCopy = (TotalBytes + ElementByteWidth - 1) / ElementByteWidth;
 
-  // Update destination char array to be word aligned (memcpy(X,...,...))
+  // Update destination array to be word aligned (memcpy(X,...,...))
   IRBuilder<> BuildAlloca(Alloca);
   AllocaInst *NewAlloca = BuildAlloca.CreateAlloca(ArrayType::get(
-      Alloca->getAllocatedType()->getArrayElementType(), TotalBytes));
+      Alloca->getAllocatedType()->getArrayElementType(), NumElementsToCopy));
   NewAlloca->takeName(Alloca);
   NewAlloca->setAlignment(Alloca->getAlign());
   Alloca->replaceAllUsesWith(NewAlloca);
+  Alloca->eraseFromParent();
 
   // Update source to be word aligned (memcpy(...,X,...))
-  // create replacement string with padded null bytes.
+  // create replacement with padded null bytes.
   StringRef Data = SourceDataArray->getRawDataValues();
   std::vector<uint8_t> StrData(Data.begin(), Data.end());
   for (unsigned int p = 0; p < NumBytesToPad; p++)
     StrData.push_back('\0');
   auto Arr = ArrayRef(StrData.data(), TotalBytes);
 
-  // Create new padded version of global variable string.
+  // Create new padded version of global variable.
   Constant *SourceReplace = ConstantDataArray::get(F->getContext(), Arr);
   GlobalVariable *NewGV = new GlobalVariable(
       *(F->getParent()), SourceReplace->getType(), true,
       SourceVar->getLinkage(), SourceReplace, SourceReplace->getName());
 
-  // Copy any other attributes from original global variable string
+  // Copy any other attributes from original global variable
   // e.g. unamed_addr
   NewGV->copyAttributesFrom(SourceVar);
   NewGV->takeName(SourceVar);
@@ -2103,11 +2100,11 @@ static bool tryWidenGlobalString(CallInst *CI, GlobalVariable *SourceVar,
 
   // Update number of bytes to copy (memcpy(...,...,X))
   CI->setArgOperand(2, ConstantInt::get(BytesToCopyOp->getType(), TotalBytes));
-  NumGlobalStringsPadded++;
+  NumGlobalArraysPadded++;
   return true;
 }
 
-static bool tryWidenGlobalStringsUsedByMemcpy(
+static bool tryWidenGlobalArraysUsedByMemcpy(
     GlobalVariable *GV,
     function_ref<TargetTransformInfo &(Function &)> GetTTI) {
   for (auto *User : GV->users()) {
@@ -2124,12 +2121,21 @@ static bool tryWidenGlobalStringsUsedByMemcpy(
     if (!BytesToCopyOp)
       continue;
 
+    if (!GV->hasInitializer())
+      continue;
+
+    ConstantDataArray *SourceDataArray =
+        dyn_cast<ConstantDataArray>(GV->getInitializer());
+    if (!SourceDataArray)
+      continue;
+
     unsigned NumBytesToCopy = BytesToCopyOp->getZExtValue();
-    unsigned NumBytesToPad = TTI.getNumBytesToPad(NumBytesToCopy);
+    unsigned NumBytesToPad = TTI.getNumBytesToPadGlobalArray(
+        NumBytesToCopy, SourceDataArray->getType());
 
     if (NumBytesToPad)
-      return tryWidenGlobalString(CI, GV, NumBytesToPad, NumBytesToCopy,
-                                  BytesToCopyOp);
+      return tryWidenGlobalArray(CI, GV, NumBytesToPad, NumBytesToCopy,
+                                 BytesToCopyOp, SourceDataArray);
   }
   return false;
 }
@@ -2163,9 +2169,9 @@ OptimizeGlobalVars(Module &M,
       continue;
     }
 
-    // For global variable strings called in a memcpy
+    // For global variable arrays used by a memcpy
     // we try to pad to nearest valid alignment boundary
-    Changed |= tryWidenGlobalStringsUsedByMemcpy(&GV, GetTTI);
+    Changed |= tryWidenGlobalArraysUsedByMemcpy(&GV, GetTTI);
 
     Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree);
   }
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll
new file mode 100644
index 00000000000000..346612efdda6bc
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+
+@.i16 = private unnamed_addr constant [5 x i16] [i16 1, i16 2, i16 3, i16 4, i16 5] , align 1
+
+define hidden void @memcpy_i16_array() local_unnamed_addr {
+; CHECK-LABEL: define hidden void @memcpy_i16_array() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SOMETHING1:%.*]] = alloca [6 x i16], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 12, i1 false)
+; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %something = alloca [5 x i16], align 1
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 10, i1 false)
+  %call2 = call i32 @bar(ptr nonnull %something)
+  ret void
+}
+
+
+declare i32 @bar(...) local_unnamed_addr
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll
new file mode 100644
index 00000000000000..7a3bc65edfe665
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+
+ at .i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1
+
+define hidden void @memcpy_multiple() local_unnamed_addr {
+; CHECK-LABEL: define hidden void @memcpy_multiple() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [4 x i8], align 1
+; CHECK-NEXT:    [[SOMETHING1:%.*]] = alloca [4 x i8], align 1
+; CHECK-NEXT:    [[SOMETHING2:%.*]] = alloca [4 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @[[GLOB1:[0-9]+]], i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(3) @[[GLOB0:[0-9]+]], i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING2]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
+; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    [[CALL3:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]])
+; CHECK-NEXT:    [[CALL4:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %something = alloca [3 x i8], align 1
+  %something1 = alloca [3 x i8], align 1
+  %something2 = alloca [3 x i8], align 1
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something1, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something2, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
+  %call2 = call i32 @bar(ptr nonnull %something)
+  %call3 = call i32 @bar(ptr nonnull %something1)
+  %call4 = call i32 @bar(ptr nonnull %something2)
+  ret void
+}
+
+declare i32 @bar(...) local_unnamed_addr
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll
index 1fd82434de681c..0cec5bf1b635f8 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll
@@ -1,27 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O0>" -S | FileCheck %s --check-prefix=TURNED-OFF
 
 ; CHECK: [12 x i8]
-; TURNED-OFF-NOT: [12 x i8]
 @.str = private unnamed_addr constant [10 x i8] c"123456789\00", align 1
 
 define hidden void @foo() local_unnamed_addr {
 ; CHECK-LABEL: define hidden void @foo() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [12 x i8], align 1
-; CHECK-NEXT:    [[TMP0:%.*]] = alloca [10 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 12, i1 false)
 ; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
 ; CHECK-NEXT:    ret void
 ;
-; TURNED-OFF-LABEL: define hidden void @foo() local_unnamed_addr {
-; TURNED-OFF-NEXT:  [[ENTRY:.*:]]
-; TURNED-OFF-NEXT:    [[SOMETHING:%.*]] = alloca [10 x i8], align 1
-; TURNED-OFF-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 10, i1 false)
-; TURNED-OFF-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
-; TURNED-OFF-NEXT:    ret void
-;
 entry:
   %something = alloca [10 x i8], align 1
   call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 10, i1 false)
@@ -30,4 +20,3 @@ entry:
 }
 
 declare i32 @bar(...) local_unnamed_addr
-declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) #0
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll
index 2e5f9eddb3a190..5d1d008aafcadd 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>,globalopt" -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
 
 ; CHECK: [64 x i8]
 @.str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1
@@ -8,7 +8,7 @@ define hidden void @foo() local_unnamed_addr {
 ; CHECK-LABEL: define hidden void @foo() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [64 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(64) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(64) @.str, i32 64, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(62) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(62) @.str, i32 64, i1 false)
 ; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
 ; CHECK-NEXT:    ret void
 ;
@@ -20,4 +20,3 @@ entry:
 }
 
 declare i32 @bar(...) local_unnamed_addr
-declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) #0
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll
index 3f2996fc6d3577..f1c8ac260c9ef7 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll
@@ -24,6 +24,3 @@ entry:
 }
 
 declare i32 @bar(...) local_unnamed_addr #2
-declare void @llvm.lifetime.start(i64, ptr nocapture) #1
-declare void @llvm.lifetime.end(i64, ptr nocapture) #1
-declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll
index 9aa1255b9310fe..961653753f54ce 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll
@@ -26,6 +26,3 @@ entry:
 }
 
 declare i32 @bar(...) local_unnamed_addr #2
-declare void @llvm.lifetime.start(i64, ptr nocapture) #1
-declare void @llvm.lifetime.end(i64, ptr nocapture) #1
-declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll
index c9cb442dd6ea4e..e82712ebe22ea1 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll
@@ -9,7 +9,6 @@ define hidden i32 @f() {
 ; CHECK-LABEL: define hidden i32 @f() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[STRING1:%.*]] = alloca [48 x i8], align 1
-; CHECK-NEXT:    [[TMP0:%.*]] = alloca [45 x i8], align 1
 ; CHECK-NEXT:    [[POS:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[TOKEN:%.*]] = alloca ptr, align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 45, ptr [[STRING1]])
@@ -54,6 +53,3 @@ entry:
 }
 
 declare ptr @strchr(ptr, i32)
-declare void @llvm.lifetime.start.p0i8(i64, ptr nocapture)
-declare void @llvm.lifetime.end.p0i8(i64, ptr nocapture)
-declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1)
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll
index 9503b6c33a2120..9cb0c53bf16527 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll
@@ -30,9 +30,6 @@ entry:
 }
 
 declare i32 @puts(ptr nocapture readonly) #2
-declare void @llvm.lifetime.start(i64, ptr nocapture) #1
-declare void @llvm.lifetime.end(i64, ptr nocapture) #1
-declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
 
 !1 = !{!2, !3, i64 0}
 !2 = !{!"P", !3, i64 0, !4, i64 4}
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll
index ba7b7d45719bbd..4d2559579ce1be 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll
@@ -27,6 +27,3 @@ entry:
 }
 
 declare i32 @bar(...) local_unnamed_addr #2
-declare void @llvm.lifetime.start(i64, ptr nocapture) #1
-declare void @llvm.lifetime.end(i64, ptr nocapture) #1
-declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1

>From af3e13d7944d722b78e2dc2bfee856070b9078d8 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Fri, 4 Oct 2024 13:15:59 +0100
Subject: [PATCH 07/12] Review comments: eliminating generation of multiple
 globals

---
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         | 41 +++++++++++--------
 .../GlobalOpt/ARM/arm-widen-non-byte-array.ll | 10 ++---
 .../ARM/arm-widen-string-multi-use.ll         | 14 +++----
 .../GlobalOpt/ARM/arm-widen-strings-1.ll      | 11 +++--
 .../GlobalOpt/ARM/arm-widen-strings-2.ll      | 11 +++--
 .../arm-widen-strings-lengths-dont-match.ll   |  8 ++--
 .../arm-widen-strings-more-than-64-bytes.ll   |  8 ++--
 .../ARM/arm-widen-strings-ptrtoint.ll         |  9 ++--
 .../ARM/arm-widen-strings-struct-test.ll      |  6 +--
 .../ARM/arm-widen-strings-volatile.ll         |  8 ++--
 10 files changed, 65 insertions(+), 61 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index ee8a0501a26bea..ba9f6d1a395c58 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2031,11 +2031,10 @@ OptimizeFunctions(Module &M,
   return Changed;
 }
 
-static bool tryWidenGlobalArray(CallInst *CI, GlobalVariable *SourceVar,
-                                unsigned NumBytesToPad, unsigned NumBytesToCopy,
-                                ConstantInt *BytesToCopyOp,
-                                ConstantDataArray *SourceDataArray) {
-  auto *F = CI->getCalledFunction();
+static bool tryWidenDestArray(Function *F, CallInst *CI,
+                              GlobalVariable *SourceVar, unsigned NumBytesToPad,
+                              unsigned NumBytesToCopy,
+                              ConstantDataArray *SourceDataArray) {
   auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
   auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
 
@@ -2055,8 +2054,7 @@ static bool tryWidenGlobalArray(CallInst *CI, GlobalVariable *SourceVar,
   // Calculate the number of elements to copy while avoiding floored
   // division of integers returning wrong values i.e. copying one byte
   // from an array of i16 would yield 0 elements to copy as supposed to 1.
-  unsigned NumElementsToCopy =
-      (NumBytesToCopy + ElementByteWidth - 1) / ElementByteWidth;
+  unsigned NumElementsToCopy = divideCeil(NumBytesToCopy, ElementByteWidth);
 
   // For safety purposes lets add a constraint and only pad when
   // NumElementsToCopy == destination array size ==
@@ -2065,7 +2063,7 @@ static bool tryWidenGlobalArray(CallInst *CI, GlobalVariable *SourceVar,
     return false;
 
   unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
-  NumElementsToCopy = (TotalBytes + ElementByteWidth - 1) / ElementByteWidth;
+  NumElementsToCopy = divideCeil(TotalBytes, ElementByteWidth);
 
   // Update destination array to be word aligned (memcpy(X,...,...))
   IRBuilder<> BuildAlloca(Alloca);
@@ -2075,14 +2073,21 @@ static bool tryWidenGlobalArray(CallInst *CI, GlobalVariable *SourceVar,
   NewAlloca->setAlignment(Alloca->getAlign());
   Alloca->replaceAllUsesWith(NewAlloca);
   Alloca->eraseFromParent();
+  return true;
+}
 
+static bool widenGlobalArray(Function *F, CallInst *CI,
+                             GlobalVariable *SourceVar, unsigned NumBytesToPad,
+                             unsigned NumBytesToCopy,
+                             ConstantInt *BytesToCopyOp,
+                             ConstantDataArray *SourceDataArray) {
   // Update source to be word aligned (memcpy(...,X,...))
   // create replacement with padded null bytes.
   StringRef Data = SourceDataArray->getRawDataValues();
   std::vector<uint8_t> StrData(Data.begin(), Data.end());
   for (unsigned int p = 0; p < NumBytesToPad; p++)
     StrData.push_back('\0');
-  auto Arr = ArrayRef(StrData.data(), TotalBytes);
+  auto Arr = ArrayRef(StrData.data(), NumBytesToCopy + NumBytesToPad);
 
   // Create new padded version of global variable.
   Constant *SourceReplace = ConstantDataArray::get(F->getContext(), Arr);
@@ -2095,11 +2100,8 @@ static bool tryWidenGlobalArray(CallInst *CI, GlobalVariable *SourceVar,
   NewGV->copyAttributesFrom(SourceVar);
   NewGV->takeName(SourceVar);
 
-  // Replace intrinsic source.
-  CI->setArgOperand(1, NewGV);
-
-  // Update number of bytes to copy (memcpy(...,...,X))
-  CI->setArgOperand(2, ConstantInt::get(BytesToCopyOp->getType(), TotalBytes));
+  CI->setArgOperand(2, ConstantInt::get(BytesToCopyOp->getType(),
+                                        NumBytesToCopy + NumBytesToPad));
   NumGlobalArraysPadded++;
   return true;
 }
@@ -2133,9 +2135,14 @@ static bool tryWidenGlobalArraysUsedByMemcpy(
     unsigned NumBytesToPad = TTI.getNumBytesToPadGlobalArray(
         NumBytesToCopy, SourceDataArray->getType());
 
-    if (NumBytesToPad)
-      return tryWidenGlobalArray(CI, GV, NumBytesToPad, NumBytesToCopy,
-                                 BytesToCopyOp, SourceDataArray);
+    if (NumBytesToPad) {
+      bool DestWidened = tryWidenDestArray(F, CI, GV, NumBytesToPad,
+                                           NumBytesToCopy, SourceDataArray);
+      if (DestWidened) {
+        return widenGlobalArray(F, CI, GV, NumBytesToPad, NumBytesToCopy,
+                                BytesToCopyOp, SourceDataArray);
+      }
+    }
   }
   return false;
 }
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll
index 346612efdda6bc..61e72c89817835 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
 
 @.i16 = private unnamed_addr constant [5 x i16] [i16 1, i16 2, i16 3, i16 4, i16 5] , align 1
 
-define hidden void @memcpy_i16_array() local_unnamed_addr {
-; CHECK-LABEL: define hidden void @memcpy_i16_array() local_unnamed_addr {
+define  void @memcpy_i16_array()  {
+; CHECK-LABEL: define void @memcpy_i16_array() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING1:%.*]] = alloca [6 x i16], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(10) @[[GLOB0:[0-9]+]], i32 12, i1 false)
 ; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]])
 ; CHECK-NEXT:    ret void
 ;
@@ -19,4 +19,4 @@ entry:
 }
 
 
-declare i32 @bar(...) local_unnamed_addr
+declare i32 @bar(...)
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll
index 7a3bc65edfe665..91ffdb58165173 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll
@@ -1,17 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
 
 @.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1
 
-define hidden void @memcpy_multiple() local_unnamed_addr {
-; CHECK-LABEL: define hidden void @memcpy_multiple() local_unnamed_addr {
+define  void @memcpy_multiple()  {
+; CHECK-LABEL: define void @memcpy_multiple() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [4 x i8], align 1
 ; CHECK-NEXT:    [[SOMETHING1:%.*]] = alloca [4 x i8], align 1
 ; CHECK-NEXT:    [[SOMETHING2:%.*]] = alloca [4 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @[[GLOB1:[0-9]+]], i32 4, i1 false)
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(3) @[[GLOB0:[0-9]+]], i32 4, i1 false)
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING2]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @[[GLOB0:[0-9]+]], i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(3) @[[GLOB0]], i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING2]], ptr noundef nonnull align 1 dereferenceable(3) @[[GLOB0]], i32 4, i1 false)
 ; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
 ; CHECK-NEXT:    [[CALL3:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]])
 ; CHECK-NEXT:    [[CALL4:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING2]])
@@ -30,4 +30,4 @@ entry:
   ret void
 }
 
-declare i32 @bar(...) local_unnamed_addr
+declare i32 @bar(...)
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll
index 0cec5bf1b635f8..91cf90a21de913 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll
@@ -1,14 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
 
-; CHECK: [12 x i8]
 @.str = private unnamed_addr constant [10 x i8] c"123456789\00", align 1
 
-define hidden void @foo() local_unnamed_addr {
-; CHECK-LABEL: define hidden void @foo() local_unnamed_addr {
+define  void @foo()  {
+; CHECK-LABEL: define void @foo() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [12 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(10) @[[GLOB0:[0-9]+]], i32 12, i1 false)
 ; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
 ; CHECK-NEXT:    ret void
 ;
@@ -19,4 +18,4 @@ entry:
   ret void
 }
 
-declare i32 @bar(...) local_unnamed_addr
+declare i32 @bar(...)
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll
index 5d1d008aafcadd..30c14af7caf672 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll
@@ -1,14 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
 
-; CHECK: [64 x i8]
 @.str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1
 
-define hidden void @foo() local_unnamed_addr {
-; CHECK-LABEL: define hidden void @foo() local_unnamed_addr {
+define  void @foo()  {
+; CHECK-LABEL: define void @foo() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [64 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(62) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(62) @.str, i32 64, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(62) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(62) @[[GLOB0:[0-9]+]], i32 64, i1 false)
 ; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
 ; CHECK-NEXT:    ret void
 ;
@@ -19,4 +18,4 @@ entry:
   ret void
 }
 
-declare i32 @bar(...) local_unnamed_addr
+declare i32 @bar(...)
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll
index f1c8ac260c9ef7..b8e02c3f996da0 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
 ; CHECK: [17 x i8]
 @.str = private unnamed_addr constant [17 x i8] c"aaaaaaaaaaaaaaaa\00", align 1
 
 ; Function Attrs: nounwind
-define hidden void @foo() local_unnamed_addr #0 {
-; CHECK-LABEL: define hidden void @foo() local_unnamed_addr {
+define  void @foo()   {
+; CHECK-LABEL: define void @foo() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [20 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr nonnull [[SOMETHING]])
@@ -23,4 +23,4 @@ entry:
   ret void
 }
 
-declare i32 @bar(...) local_unnamed_addr #2
+declare i32 @bar(...)  #2
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll
index 961653753f54ce..4ac31aa2f976d5 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
 
 ; CHECK: [65 x i8]
 ; CHECK-NOT: [68 x i8]
 @.str = private unnamed_addr constant [65 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzz\00", align 1
 
 ; Function Attrs: nounwind
-define hidden void @foo() local_unnamed_addr #0 {
-; CHECK-LABEL: define hidden void @foo() local_unnamed_addr {
+define  void @foo()   {
+; CHECK-LABEL: define void @foo() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [65 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 65, ptr nonnull [[SOMETHING]])
@@ -25,4 +25,4 @@ entry:
   ret void
 }
 
-declare i32 @bar(...) local_unnamed_addr #2
+declare i32 @bar(...)  #2
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll
index e82712ebe22ea1..ce29192948a57b 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll
@@ -1,18 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
 
-; CHECK: [48 x i8]
 @f.string1 = private unnamed_addr constant [45 x i8] c"The quick brown dog jumps over the lazy fox.\00", align 1
 
 ; Function Attrs: nounwind
-define hidden i32 @f() {
-; CHECK-LABEL: define hidden i32 @f() local_unnamed_addr {
+define  i32 @f() {
+; CHECK-LABEL: define i32 @f() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[STRING1:%.*]] = alloca [48 x i8], align 1
 ; CHECK-NEXT:    [[POS:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[TOKEN:%.*]] = alloca ptr, align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 45, ptr [[STRING1]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[STRING1]], ptr align 1 @f.string1, i32 48, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[STRING1]], ptr align 1 @[[GLOB0:[0-9]+]], i32 48, i1 false)
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[POS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[TOKEN]])
 ; CHECK-NEXT:    [[CALL:%.*]] = call ptr @strchr(ptr [[STRING1]], i32 101)
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll
index 9cb0c53bf16527..5367572704b145 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
 %struct.P = type { i32, [13 x i8] }
 
 ; CHECK-NOT: [16 x i8]
 @.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00", align 1
 
 ; Function Attrs: nounwind
-define hidden i32 @main() local_unnamed_addr #0 {
-; CHECK-LABEL: define hidden i32 @main() local_unnamed_addr {
+define  i32 @main()   {
+; CHECK-LABEL: define i32 @main() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[P:%.*]] = alloca [[STRUCT_P:%.*]], align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr nonnull [[P]])
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll
index 4d2559579ce1be..b735a778874232 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=globalopt -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
 
 ; CHECK-NOT: [64 x i8]
 @.str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1
 
 ; Function Attrs: nounwind
-define hidden void @foo() local_unnamed_addr #0 {
-; CHECK-LABEL: define hidden void @foo() local_unnamed_addr {
+define  void @foo()   {
+; CHECK-LABEL: define void @foo() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [62 x i8], align 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [62 x i8], ptr [[SOMETHING]], i32 0, i32 0
@@ -26,4 +26,4 @@ entry:
   ret void
 }
 
-declare i32 @bar(...) local_unnamed_addr #2
+declare i32 @bar(...)  #2

>From 99bba5aa2824f04b0572f89c8609306d162b5f1a Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Fri, 4 Oct 2024 21:55:28 +0100
Subject: [PATCH 08/12] Correcting and refactoring elimination of multiple
 globals

---
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         | 113 +++++++++++-------
 .../GlobalOpt/ARM/arm-widen-non-byte-array.ll |   2 +-
 .../ARM/arm-widen-string-multi-use.ll         |  14 +--
 .../GlobalOpt/ARM/arm-widen-strings-1.ll      |   2 +-
 .../GlobalOpt/ARM/arm-widen-strings-2.ll      |   2 +-
 .../ARM/arm-widen-strings-ptrtoint.ll         |   2 +-
 6 files changed, 78 insertions(+), 57 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index ba9f6d1a395c58..21ec83af024118 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2031,41 +2031,38 @@ OptimizeFunctions(Module &M,
   return Changed;
 }
 
-static bool tryWidenDestArray(Function *F, CallInst *CI,
-                              GlobalVariable *SourceVar, unsigned NumBytesToPad,
-                              unsigned NumBytesToCopy,
-                              ConstantDataArray *SourceDataArray) {
+static bool callInstIsMemcpy(CallInst *CI) {
+  if (!CI)
+    return false;
+
+  Function *F = CI->getCalledFunction();
+  if (!F || !F->isIntrinsic() || F->getIntrinsicID() != Intrinsic::memcpy)
+    return false;
+
+  return true;
+}
+
+static bool destArrayCanBeWidened(CallInst *CI) {
   auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
   auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
 
   if (!Alloca || !IsVolatile || IsVolatile->isOne())
     return false;
 
-  if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
-      !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr())
-    return false;
-
   if (!Alloca->isStaticAlloca())
     return false;
 
-  uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
-  uint64_t SZSize = SourceDataArray->getType()->getNumElements();
-  unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
-  // Calculate the number of elements to copy while avoiding floored
-  // division of integers returning wrong values i.e. copying one byte
-  // from an array of i16 would yield 0 elements to copy as supposed to 1.
-  unsigned NumElementsToCopy = divideCeil(NumBytesToCopy, ElementByteWidth);
-
-  // For safety purposes lets add a constraint and only pad when
-  // NumElementsToCopy == destination array size ==
-  // source string which is a constant
-  if (NumElementsToCopy != DZSize || DZSize != SZSize)
-    return false;
+  return true;
+}
 
+static void widenDestArray(CallInst *CI, const unsigned NumBytesToPad,
+                           const unsigned NumBytesToCopy,
+                           ConstantDataArray *SourceDataArray) {
+  unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
   unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
-  NumElementsToCopy = divideCeil(TotalBytes, ElementByteWidth);
-
+  unsigned NumElementsToCopy = divideCeil(TotalBytes, ElementByteWidth);
   // Update destination array to be word aligned (memcpy(X,...,...))
+  auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
   IRBuilder<> BuildAlloca(Alloca);
   AllocaInst *NewAlloca = BuildAlloca.CreateAlloca(ArrayType::get(
       Alloca->getAllocatedType()->getArrayElementType(), NumElementsToCopy));
@@ -2073,14 +2070,17 @@ static bool tryWidenDestArray(Function *F, CallInst *CI,
   NewAlloca->setAlignment(Alloca->getAlign());
   Alloca->replaceAllUsesWith(NewAlloca);
   Alloca->eraseFromParent();
-  return true;
 }
 
-static bool widenGlobalArray(Function *F, CallInst *CI,
-                             GlobalVariable *SourceVar, unsigned NumBytesToPad,
-                             unsigned NumBytesToCopy,
-                             ConstantInt *BytesToCopyOp,
-                             ConstantDataArray *SourceDataArray) {
+static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar,
+                                        const unsigned NumBytesToPad,
+                                        const unsigned NumBytesToCopy,
+                                        ConstantInt *BytesToCopyOp,
+                                        ConstantDataArray *SourceDataArray) {
+  if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
+      !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr())
+    return false;
+
   // Update source to be word aligned (memcpy(...,X,...))
   // create replacement with padded null bytes.
   StringRef Data = SourceDataArray->getRawDataValues();
@@ -2100,8 +2100,20 @@ static bool widenGlobalArray(Function *F, CallInst *CI,
   NewGV->copyAttributesFrom(SourceVar);
   NewGV->takeName(SourceVar);
 
-  CI->setArgOperand(2, ConstantInt::get(BytesToCopyOp->getType(),
-                                        NumBytesToCopy + NumBytesToPad));
+  // Update arguments of remaining uses  that
+  // are memcpys.
+  for (auto *User : SourceVar->users()) {
+    auto *CI = dyn_cast<CallInst>(User);
+    if (!callInstIsMemcpy(CI))
+      continue;
+
+    widenDestArray(CI, NumBytesToPad, NumBytesToCopy, SourceDataArray);
+
+    CI->setArgOperand(2, ConstantInt::get(BytesToCopyOp->getType(),
+                                          NumBytesToCopy + NumBytesToPad));
+  }
+  SourceVar->replaceAllUsesWith(NewGV);
+
   NumGlobalArraysPadded++;
   return true;
 }
@@ -2109,39 +2121,48 @@ static bool widenGlobalArray(Function *F, CallInst *CI,
 static bool tryWidenGlobalArraysUsedByMemcpy(
     GlobalVariable *GV,
     function_ref<TargetTransformInfo &(Function &)> GetTTI) {
+
+  if (!GV->hasInitializer())
+    return false;
+
   for (auto *User : GV->users()) {
     CallInst *CI = dyn_cast<CallInst>(User);
-    if (!CI)
+    if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI))
       continue;
 
     Function *F = CI->getCalledFunction();
-    if (!F || !F->isIntrinsic() || F->getIntrinsicID() != Intrinsic::memcpy)
-      continue;
 
-    TargetTransformInfo &TTI = GetTTI(*F);
     auto *BytesToCopyOp = dyn_cast<ConstantInt>(CI->getArgOperand(2));
     if (!BytesToCopyOp)
       continue;
 
-    if (!GV->hasInitializer())
-      continue;
-
     ConstantDataArray *SourceDataArray =
         dyn_cast<ConstantDataArray>(GV->getInitializer());
     if (!SourceDataArray)
       continue;
 
     unsigned NumBytesToCopy = BytesToCopyOp->getZExtValue();
-    unsigned NumBytesToPad = TTI.getNumBytesToPadGlobalArray(
-        NumBytesToCopy, SourceDataArray->getType());
 
+    auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
+    uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
+    uint64_t SZSize = SourceDataArray->getType()->getNumElements();
+    unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
+    // Calculate the number of elements to copy while avoiding floored
+    // division of integers returning wrong values i.e. copying one byte
+    // from an array of i16 would yield 0 elements to copy as supposed to 1.
+    unsigned NumElementsToCopy = divideCeil(NumBytesToCopy, ElementByteWidth);
+
+    // For safety purposes lets add a constraint and only pad when
+    // NumElementsToCopy == destination array size ==
+    // source which is a constant
+    if (NumElementsToCopy != DZSize || DZSize != SZSize)
+      continue;
+
+    unsigned NumBytesToPad = GetTTI(*F).getNumBytesToPadGlobalArray(
+        NumBytesToCopy, SourceDataArray->getType());
     if (NumBytesToPad) {
-      bool DestWidened = tryWidenDestArray(F, CI, GV, NumBytesToPad,
-                                           NumBytesToCopy, SourceDataArray);
-      if (DestWidened) {
-        return widenGlobalArray(F, CI, GV, NumBytesToPad, NumBytesToCopy,
-                                BytesToCopyOp, SourceDataArray);
-      }
+      return tryWidenGlobalArrayAndDests(F, GV, NumBytesToPad, NumBytesToCopy,
+                                         BytesToCopyOp, SourceDataArray);
     }
   }
   return false;
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll
index 61e72c89817835..c7ca7271fd3d27 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll
@@ -7,7 +7,7 @@ define  void @memcpy_i16_array()  {
 ; CHECK-LABEL: define void @memcpy_i16_array() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING1:%.*]] = alloca [6 x i16], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(10) @[[GLOB0:[0-9]+]], i32 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 12, i1 false)
 ; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll
index 91ffdb58165173..e37925a78d2c3a 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll
@@ -6,15 +6,15 @@
 define  void @memcpy_multiple()  {
 ; CHECK-LABEL: define void @memcpy_multiple() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [4 x i8], align 1
-; CHECK-NEXT:    [[SOMETHING1:%.*]] = alloca [4 x i8], align 1
 ; CHECK-NEXT:    [[SOMETHING2:%.*]] = alloca [4 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @[[GLOB0:[0-9]+]], i32 4, i1 false)
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(3) @[[GLOB0]], i32 4, i1 false)
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING2]], ptr noundef nonnull align 1 dereferenceable(3) @[[GLOB0]], i32 4, i1 false)
-; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    [[SOMETHING1:%.*]] = alloca [4 x i8], align 1
+; CHECK-NEXT:    [[SOMETHING3:%.*]] = alloca [4 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING2]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING3]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
+; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING2]])
 ; CHECK-NEXT:    [[CALL3:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]])
-; CHECK-NEXT:    [[CALL4:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING2]])
+; CHECK-NEXT:    [[CALL4:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING3]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll
index 91cf90a21de913..8ea9e2804370e1 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll
@@ -7,7 +7,7 @@ define  void @foo()  {
 ; CHECK-LABEL: define void @foo() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [12 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(10) @[[GLOB0:[0-9]+]], i32 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 12, i1 false)
 ; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll
index 30c14af7caf672..ad3620b14ea234 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll
@@ -7,7 +7,7 @@ define  void @foo()  {
 ; CHECK-LABEL: define void @foo() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [64 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(62) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(62) @[[GLOB0:[0-9]+]], i32 64, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(62) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(62) @.str, i32 64, i1 false)
 ; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll
index ce29192948a57b..64f57884cd39e1 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll
@@ -11,7 +11,7 @@ define  i32 @f() {
 ; CHECK-NEXT:    [[POS:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[TOKEN:%.*]] = alloca ptr, align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 45, ptr [[STRING1]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[STRING1]], ptr align 1 @[[GLOB0:[0-9]+]], i32 48, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[STRING1]], ptr align 1 @f.string1, i32 48, i1 false)
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[POS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[TOKEN]])
 ; CHECK-NEXT:    [[CALL:%.*]] = call ptr @strchr(ptr [[STRING1]], i32 101)

>From 1af80e456906aeef93e1f029d697891a06b79a7d Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Wed, 9 Oct 2024 17:17:43 +0100
Subject: [PATCH 09/12] Fix bug when copying to global dest

Copying from a global source to a global destination
wasn't handled and caused opt to crash. This case is
now handled, and a new test has been added to cover
it.

Change-Id: Ieb0467797fcee888f6e95e68af4dac9c05d70a4d
---
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         | 91 ++++++++++++-------
 .../GlobalOpt/ARM/arm-widen-global-dest.ll    | 26 ++++++
 2 files changed, 86 insertions(+), 31 deletions(-)
 create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll

diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 21ec83af024118..4cc968a872957c 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2043,9 +2043,13 @@ static bool callInstIsMemcpy(CallInst *CI) {
 }
 
 static bool destArrayCanBeWidened(CallInst *CI) {
+  auto *GV = dyn_cast<GlobalVariable>(CI->getArgOperand(0));
   auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
   auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
 
+  if (!GV || !GV->hasInitializer())
+    return false;
+
   if (!Alloca || !IsVolatile || IsVolatile->isOne())
     return false;
 
@@ -2055,21 +2059,61 @@ static bool destArrayCanBeWidened(CallInst *CI) {
   return true;
 }
 
+static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, Function *F,
+                                           unsigned NumBytesToPad,
+                                           unsigned NumBytesToCopy) {
+  if (!OldVar->hasInitializer())
+    return nullptr;
+
+  ConstantDataArray *DataArray =
+      dyn_cast<ConstantDataArray>(OldVar->getInitializer());
+  if (!DataArray)
+    return nullptr;
+
+  // Update to be word aligned (memcpy(...,X,...))
+  // create replacement with padded null bytes.
+  StringRef Data = DataArray->getRawDataValues();
+  std::vector<uint8_t> StrData(Data.begin(), Data.end());
+  for (unsigned int p = 0; p < NumBytesToPad; p++)
+    StrData.push_back('\0');
+  auto Arr = ArrayRef(StrData.data(), NumBytesToCopy + NumBytesToPad);
+  // Create new padded version of global variable.
+  Constant *SourceReplace = ConstantDataArray::get(F->getContext(), Arr);
+  GlobalVariable *NewGV = new GlobalVariable(
+      *(F->getParent()), SourceReplace->getType(), true, OldVar->getLinkage(),
+      SourceReplace, SourceReplace->getName());
+  // Copy any other attributes from original global variable
+  // e.g. unamed_addr
+  NewGV->copyAttributesFrom(OldVar);
+  NewGV->takeName(OldVar);
+  return NewGV;
+}
+
 static void widenDestArray(CallInst *CI, const unsigned NumBytesToPad,
                            const unsigned NumBytesToCopy,
                            ConstantDataArray *SourceDataArray) {
-  unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
-  unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
-  unsigned NumElementsToCopy = divideCeil(TotalBytes, ElementByteWidth);
-  // Update destination array to be word aligned (memcpy(X,...,...))
+
+  // Dest array can be global or local
+  auto *DestGV = dyn_cast<GlobalVariable>(CI->getArgOperand(0));
   auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
-  IRBuilder<> BuildAlloca(Alloca);
-  AllocaInst *NewAlloca = BuildAlloca.CreateAlloca(ArrayType::get(
-      Alloca->getAllocatedType()->getArrayElementType(), NumElementsToCopy));
-  NewAlloca->takeName(Alloca);
-  NewAlloca->setAlignment(Alloca->getAlign());
-  Alloca->replaceAllUsesWith(NewAlloca);
-  Alloca->eraseFromParent();
+  if (DestGV) {
+    auto *F = CI->getCalledFunction();
+    auto *NewDestGV =
+        widenGlobalVariable(DestGV, F, NumBytesToPad, NumBytesToCopy);
+    DestGV->replaceAllUsesWith(NewDestGV);
+  } else if (Alloca) {
+    unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
+    unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
+    unsigned NumElementsToCopy = divideCeil(TotalBytes, ElementByteWidth);
+    // Update destination array to be word aligned (memcpy(X,...,...))
+    IRBuilder<> BuildAlloca(Alloca);
+    AllocaInst *NewAlloca = BuildAlloca.CreateAlloca(ArrayType::get(
+        Alloca->getAllocatedType()->getArrayElementType(), NumElementsToCopy));
+    NewAlloca->takeName(Alloca);
+    NewAlloca->setAlignment(Alloca->getAlign());
+    Alloca->replaceAllUsesWith(NewAlloca);
+    Alloca->eraseFromParent();
+  }
 }
 
 static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar,
@@ -2081,25 +2125,10 @@ static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar,
       !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr())
     return false;
 
-  // Update source to be word aligned (memcpy(...,X,...))
-  // create replacement with padded null bytes.
-  StringRef Data = SourceDataArray->getRawDataValues();
-  std::vector<uint8_t> StrData(Data.begin(), Data.end());
-  for (unsigned int p = 0; p < NumBytesToPad; p++)
-    StrData.push_back('\0');
-  auto Arr = ArrayRef(StrData.data(), NumBytesToCopy + NumBytesToPad);
-
-  // Create new padded version of global variable.
-  Constant *SourceReplace = ConstantDataArray::get(F->getContext(), Arr);
-  GlobalVariable *NewGV = new GlobalVariable(
-      *(F->getParent()), SourceReplace->getType(), true,
-      SourceVar->getLinkage(), SourceReplace, SourceReplace->getName());
-
-  // Copy any other attributes from original global variable
-  // e.g. unamed_addr
-  NewGV->copyAttributesFrom(SourceVar);
-  NewGV->takeName(SourceVar);
-
+  auto *NewSourceGV =
+      widenGlobalVariable(SourceVar, F, NumBytesToPad, NumBytesToCopy);
+  if (!NewSourceGV)
+    return false;
   // Update arguments of remaining uses  that
   // are memcpys.
   for (auto *User : SourceVar->users()) {
@@ -2112,7 +2141,7 @@ static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar,
     CI->setArgOperand(2, ConstantInt::get(BytesToCopyOp->getType(),
                                           NumBytesToCopy + NumBytesToPad));
   }
-  SourceVar->replaceAllUsesWith(NewGV);
+  SourceVar->replaceAllUsesWith(NewSourceGV);
 
   NumGlobalArraysPadded++;
   return true;
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll
new file mode 100644
index 00000000000000..affa7d620804c0
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
+
+; CHECK: [4 x i8]
+ at .i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1
+; CHECK: [4 x i8]
+ at other = private unnamed_addr global [3 x i8] [i8 1, i8 2, i8 3] , align 1
+
+define  void @memcpy_multiple()  {
+; CHECK-LABEL: define void @memcpy_multiple() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [4 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
+; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %something = alloca [3 x i8], align 1
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
+  %call2 = call i32 @bar(ptr nonnull %something)
+  ret void
+}
+
+declare i32 @bar(...)

>From 5a30a89e88f782e5812b800c2a7b2f1c3d968dd2 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Thu, 10 Oct 2024 12:00:32 +0100
Subject: [PATCH 10/12] Addressing review comments

Change-Id: I029312362f9dd714b2e9bc206cc002883d761b8b
---
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp      |  7 ++++++-
 llvm/lib/Transforms/IPO/GlobalOpt.cpp               | 13 +++++++------
 .../GlobalOpt/ARM/arm-widen-global-dest.ll          |  4 +---
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index caa9d6cdd1e6de..67bc1578ed6d0a 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -56,7 +56,7 @@ static cl::opt<bool>
     AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                   cl::desc("Enable the generation of WLS loops"));
 
-static cl::opt<unsigned> UseWidenGlobalStrings(
+static cl::opt<bool> UseWidenGlobalArrays(
     "widen-global-strings", cl::Hidden, cl::init(true),
     cl::desc("Enable the widening of global strings to alignment boundaries"));
 
@@ -2651,6 +2651,11 @@ bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
 
 unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
                                                  Type *ArrayType) const {
+    if (!UseWidenGlobalArrays){
+        LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
+        return false;
+    }
+
   // Don't modify none integer array types
   if (!ArrayType || !ArrayType->isArrayTy() ||
       !ArrayType->getArrayElementType()->isIntegerTy())
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 4cc968a872957c..197404d12cb2e3 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2043,13 +2043,9 @@ static bool callInstIsMemcpy(CallInst *CI) {
 }
 
 static bool destArrayCanBeWidened(CallInst *CI) {
-  auto *GV = dyn_cast<GlobalVariable>(CI->getArgOperand(0));
   auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
   auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
 
-  if (!GV || !GV->hasInitializer())
-    return false;
-
   if (!Alloca || !IsVolatile || IsVolatile->isOne())
     return false;
 
@@ -2129,12 +2125,17 @@ static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar,
       widenGlobalVariable(SourceVar, F, NumBytesToPad, NumBytesToCopy);
   if (!NewSourceGV)
     return false;
+
   // Update arguments of remaining uses  that
   // are memcpys.
   for (auto *User : SourceVar->users()) {
     auto *CI = dyn_cast<CallInst>(User);
-    if (!callInstIsMemcpy(CI))
-      continue;
+      if (!callInstIsMemcpy(CI))
+          continue;
+
+    if (CI->getArgOperand(1) != SourceVar)
+        continue;
+
 
     widenDestArray(CI, NumBytesToPad, NumBytesToCopy, SourceDataArray);
 
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll
index affa7d620804c0..e1a19a2ab03560 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll
@@ -1,9 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
+; RUN: opt <%s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
 
-; CHECK: [4 x i8]
 @.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1
-; CHECK: [4 x i8]
 @other = private unnamed_addr global [3 x i8] [i8 1, i8 2, i8 3] , align 1
 
 define  void @memcpy_multiple()  {

>From 70116719b04e004cb1fb49ea0fa804ae57f1861c Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Thu, 10 Oct 2024 12:00:35 +0100
Subject: [PATCH 11/12] Addressing review comments

Change-Id: Idc7b14cc785eb88552dd72947eb0df128baa7e90
---
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 8 ++++----
 llvm/lib/Transforms/IPO/GlobalOpt.cpp          | 7 +++----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 67bc1578ed6d0a..8655f95c726f62 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2651,10 +2651,10 @@ bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
 
 unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
                                                  Type *ArrayType) const {
-    if (!UseWidenGlobalArrays){
-        LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
-        return false;
-    }
+  if (!UseWidenGlobalArrays) {
+    LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
+    return false;
+  }
 
   // Don't modify none integer array types
   if (!ArrayType || !ArrayType->isArrayTy() ||
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 197404d12cb2e3..16e60bcc456559 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2130,12 +2130,11 @@ static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar,
   // are memcpys.
   for (auto *User : SourceVar->users()) {
     auto *CI = dyn_cast<CallInst>(User);
-      if (!callInstIsMemcpy(CI))
-          continue;
+    if (!callInstIsMemcpy(CI))
+      continue;
 
     if (CI->getArgOperand(1) != SourceVar)
-        continue;
-
+      continue;
 
     widenDestArray(CI, NumBytesToPad, NumBytesToCopy, SourceDataArray);
 

>From eac5972fcff26c5d4987f25301ddad5309d2bbc8 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Fri, 11 Oct 2024 12:16:34 +0100
Subject: [PATCH 12/12] Review comments

- Removed handling of global variable destinations. We simply
  don't pad these for now.
- Added a check that the destination is an array type, and
  added a test.

Change-Id: Ifc53051952ef69c4af64827402baf7d69cab4824
---
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         | 16 +++-----
 .../GlobalOpt/ARM/arm-widen-dest-non-array.ll | 39 +++++++++++++++++++
 .../GlobalOpt/ARM/arm-widen-global-dest.ll    | 16 +++++---
 3 files changed, 55 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll

diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 16e60bcc456559..80c50d6a4bf598 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2043,8 +2043,8 @@ static bool callInstIsMemcpy(CallInst *CI) {
 }
 
 static bool destArrayCanBeWidened(CallInst *CI) {
-  auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
   auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));
+  auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
 
   if (!Alloca || !IsVolatile || IsVolatile->isOne())
     return false;
@@ -2052,6 +2052,9 @@ static bool destArrayCanBeWidened(CallInst *CI) {
   if (!Alloca->isStaticAlloca())
     return false;
 
+  if (!Alloca->getAllocatedType()->isArrayTy())
+    return false;
+
   return true;
 }
 
@@ -2089,15 +2092,8 @@ static void widenDestArray(CallInst *CI, const unsigned NumBytesToPad,
                            const unsigned NumBytesToCopy,
                            ConstantDataArray *SourceDataArray) {
 
-  // Dest array can be global or local
-  auto *DestGV = dyn_cast<GlobalVariable>(CI->getArgOperand(0));
   auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
-  if (DestGV) {
-    auto *F = CI->getCalledFunction();
-    auto *NewDestGV =
-        widenGlobalVariable(DestGV, F, NumBytesToPad, NumBytesToCopy);
-    DestGV->replaceAllUsesWith(NewDestGV);
-  } else if (Alloca) {
+  if (Alloca) {
     unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
     unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
     unsigned NumElementsToCopy = divideCeil(TotalBytes, ElementByteWidth);
@@ -2130,7 +2126,7 @@ static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar,
   // are memcpys.
   for (auto *User : SourceVar->users()) {
     auto *CI = dyn_cast<CallInst>(User);
-    if (!callInstIsMemcpy(CI))
+    if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI))
       continue;
 
     if (CI->getArgOperand(1) != SourceVar)
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll
new file mode 100644
index 00000000000000..ab04e0a5bc697e
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
+
+ at .i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1
+
+define  void @memcpy_struct()  {
+; CHECK-LABEL: define void @memcpy_struct() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca { i8, i8, i8 }, align 1
+; CHECK-NEXT:    [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %something = alloca {i8, i8, i8}, align 1
+  %call1 = call i32 @bar(ptr nonnull %something)
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
+  ret void
+}
+
+
+ at .i8_multi = private unnamed_addr constant [2 x [3 x i8]] [[3 x i8] [i8 1, i8 2, i8 3], [3 x i8] [i8 4, i8 5, i8 6]] , align 1
+
+define  void @memcpy_array_multidimensional()  {
+; CHECK-LABEL: define void @memcpy_array_multidimensional() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [2 x [3 x i8]], align 1
+; CHECK-NEXT:    [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %something = alloca [2 x [3 x i8]], align 1
+  %call1 = call i32 @bar(ptr nonnull %something)
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false)
+  ret void
+}
+
+declare i32 @bar(...)
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll
index e1a19a2ab03560..f435ffdeed2c8e 100644
--- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll
+++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll
@@ -1,23 +1,27 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt <%s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
 
-@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1
+; CHECK: [3 x i8]
 @other = private unnamed_addr global [3 x i8] [i8 1, i8 2, i8 3] , align 1
+; CHECK: [4 x i8]
+@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1
 
 define  void @memcpy_multiple()  {
 ; CHECK-LABEL: define void @memcpy_multiple() local_unnamed_addr {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SOMETHING:%.*]] = alloca [4 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
 ; CHECK-NEXT:    [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
+; CHECK-NEXT:    [[CALL3:%.*]] = call i32 @bar(ptr nonnull @other)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %something = alloca [3 x i8], align 1
-  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
+  %call1 = call i32 @bar(ptr nonnull %something)
+  %call2 = call i32 @bar(ptr nonnull @other)
   call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
-  %call2 = call i32 @bar(ptr nonnull %something)
+  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
   ret void
 }
 



More information about the llvm-commits mailing list