[clang] [compiler-rt] [llvm] [XRay][RISCV] RISCV support for XRay (PR #117368)

Mon Nov 25 15:09:12 PST 2024

https://github.com/mshockwave updated https://github.com/llvm/llvm-project/pull/117368

>From 599370a06008092f6aa883bf11600d0b66707bc0 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Wed, 20 Nov 2024 14:37:57 -0800
Subject: [PATCH 1/2] [XRay][RISCV] RISCV support for XRay

Add RISC-V support for XRay. The RV64 implementation has been tested in
both QEMU and in our production environment.

Currently this requires D and C extensions, but since both RV64GC and
RVA22/RVA23 are becoming mainstream, I don't think this requirement will
be a big problem.

Based on Ashwin Poduval's previous work:
https://reviews.llvm.org/D117929

Co-authored-by: Ashwin Poduval <ashwin.poduval at gmail.com>
---
 clang/lib/Driver/XRayArgs.cpp                 |   2 +
 .../cmake/Modules/AllSupportedArchDefs.cmake  |   2 +-
 compiler-rt/lib/xray/CMakeLists.txt           |  12 +
 compiler-rt/lib/xray/xray_interface.cpp       |   4 +
 compiler-rt/lib/xray/xray_riscv.cpp           | 296 ++++++++++++++++++
 .../lib/xray/xray_trampoline_riscv32.S        |  83 +++++
 .../lib/xray/xray_trampoline_riscv64.S        |  83 +++++
 .../lib/xray/xray_trampoline_riscv_common.S   |  97 ++++++
 compiler-rt/lib/xray/xray_tsc.h               |   2 +-
 llvm/lib/CodeGen/XRayInstrumentation.cpp      |   7 +-
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp     |  82 +++++
 llvm/lib/Target/RISCV/RISCVSubtarget.h        |   3 +
 llvm/lib/XRay/InstrumentationMap.cpp          |   3 +-
 .../RISCV/xray-attribute-instrumentation.ll   |  24 ++
 14 files changed, 695 insertions(+), 5 deletions(-)
 create mode 100644 compiler-rt/lib/xray/xray_riscv.cpp
 create mode 100644 compiler-rt/lib/xray/xray_trampoline_riscv32.S
 create mode 100644 compiler-rt/lib/xray/xray_trampoline_riscv64.S
 create mode 100644 compiler-rt/lib/xray/xray_trampoline_riscv_common.S
 create mode 100644 llvm/test/CodeGen/RISCV/xray-attribute-instrumentation.ll

diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp
index de5c38ebc3abbd..f8c213334a2b40 100644
--- a/clang/lib/Driver/XRayArgs.cpp
+++ b/clang/lib/Driver/XRayArgs.cpp
@@ -51,6 +51,8 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) {
     case llvm::Triple::mips64:
     case llvm::Triple::mips64el:
     case llvm::Triple::systemz:
+    case llvm::Triple::riscv32:
+    case llvm::Triple::riscv64:
       break;
     default:
       D.Diag(diag::err_drv_unsupported_opt_for_target)
diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
index b29ae179c2b4f4..5a1e8db61023b0 100644
--- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
+++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
@@ -102,7 +102,7 @@ if(APPLE)
 set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM64})
 else()
 set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64} ${MIPS32} ${MIPS64}
-		powerpc64le ${HEXAGON} ${LOONGARCH64})
+               powerpc64le ${HEXAGON} ${LOONGARCH64} ${RISCV32} ${RISCV64})
 endif()
 set(ALL_XRAY_DSO_SUPPORTED_ARCH ${X86_64} ${ARM64})
 set(ALL_SHADOWCALLSTACK_SUPPORTED_ARCH ${ARM64})
diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt
index 7e3f1a0aa616e5..e7f01a2f4f1640 100644
--- a/compiler-rt/lib/xray/CMakeLists.txt
+++ b/compiler-rt/lib/xray/CMakeLists.txt
@@ -96,6 +96,16 @@ set(hexagon_SOURCES
   xray_trampoline_hexagon.S
   )
 
+set(riscv32_SOURCES
+  xray_riscv.cpp
+  xray_trampoline_riscv32.S
+  )
+
+set(riscv64_SOURCES
+  xray_riscv.cpp
+  xray_trampoline_riscv64.S
+  )
+
 set(XRAY_SOURCE_ARCHS
   arm
   armhf
@@ -156,6 +166,8 @@ set(XRAY_ALL_SOURCE_FILES
   ${mips64_SOURCES}
   ${mips64el_SOURCES}
   ${powerpc64le_SOURCES}
+  ${riscv32_SOURCES}
+  ${riscv64_SOURCES}
   ${XRAY_IMPL_HEADERS}
   )
 list(REMOVE_DUPLICATES XRAY_ALL_SOURCE_FILES)
diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp
index b6f0e6762f1681..e66736d9a344e1 100644
--- a/compiler-rt/lib/xray/xray_interface.cpp
+++ b/compiler-rt/lib/xray/xray_interface.cpp
@@ -57,6 +57,10 @@ static const int16_t cSledLength = 64;
 static const int16_t cSledLength = 8;
 #elif defined(__hexagon__)
 static const int16_t cSledLength = 20;
+#elif SANITIZER_RISCV64
+static const int16_t cSledLength = 76;
+#elif defined(__riscv) && (__riscv_xlen == 32)
+static const int16_t cSledLength = 60;
 #else
 #error "Unsupported CPU Architecture"
 #endif /* CPU architecture */
diff --git a/compiler-rt/lib/xray/xray_riscv.cpp b/compiler-rt/lib/xray/xray_riscv.cpp
new file mode 100644
index 00000000000000..89ce9305ef3dbe
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_riscv.cpp
@@ -0,0 +1,296 @@
+//===-- xray_riscv.cpp ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of riscv-specific routines (32- and 64-bit).
+//
+//===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_defs.h"
+#include "xray_interface_internal.h"
+#include <atomic>
+
+namespace __xray {
+
+// Base encodings of the instructions used in runtime patching: the opcode and
+// funct3/funct7 bits only, with every register and immediate field left zero.
+enum PatchOpcodes : uint32_t {
+  PO_ADDI = 0x00000013, // addi rd, rs1, imm
+  PO_ADD = 0x00000033,  // add rd, rs1, rs2
+  PO_SW = 0x00002023,   // sw rs2, imm(rs1)
+  PO_SD = 0x00003023,   // sd rs2, imm(rs1)
+  PO_LUI = 0x00000037,  // lui rd, imm
+  PO_ORI = 0x00006013,  // ori rd, rs1, imm
+  PO_OR = 0x00006033,   // or rd, rs1, rs2
+  PO_SLLI = 0x00001013, // slli rd, rs1, shamt
+  PO_SRLI = 0x00005013, // srli rd, rs1, shamt
+  PO_JALR = 0x00000067, // jalr rd, imm(rs1)
+  PO_LW = 0x00002003,   // lw rd, imm(rs1)
+  PO_LD = 0x00003003,   // ld rd, imm(rs1)
+  PO_J = 0x0000006f,    // jal rd, offset (plain "j offset" when rd is x0)
+  PO_NOP = 0x00000013,  // nop - pseudo-instruction, same as addi x0, x0, 0
+};
+
+// Integer register numbers (the N in xN) of the registers the patched sled
+// touches, named by their ABI mnemonics.
+enum RegNum : uint32_t {
+  RN_R0 = 0x0, // x0 / zero
+  RN_RA = 0x1, // x1 / ra
+  RN_SP = 0x2, // x2 / sp
+  RN_T0 = 0x5, // x5 / t0
+  RN_T1 = 0x6, // x6 / t1
+  RN_T2 = 0x7, // x7 / t2
+  RN_A0 = 0xa, // x10 / a0
+};
+
+// Build an R-type instruction word by OR-ing the register fields into the base
+// encoding in Opcode: rd at bit 7, rs1 at bit 15, rs2 at bit 20.
+static inline uint32_t encodeRTypeInstruction(uint32_t Opcode, uint32_t Rs1,
+                                              uint32_t Rs2, uint32_t Rd) {
+  uint32_t Word = Opcode;
+  Word |= Rd << 7;
+  Word |= Rs1 << 15;
+  Word |= Rs2 << 20;
+  return Word;
+}
+
+// Build an I-type instruction word: rd at bit 7, rs1 at bit 15 and the
+// immediate at bit 20. Bits of Imm above the low 12 are shifted out of the
+// 32-bit word, so a negative offset may be passed as e.g. 0xffe0.
+static inline uint32_t encodeITypeInstruction(uint32_t Opcode, uint32_t Rs1,
+                                              uint32_t Rd, uint32_t Imm) {
+  uint32_t Word = Opcode;
+  Word |= Rd << 7;
+  Word |= Rs1 << 15;
+  Word |= Imm << 20;
+  return Word;
+}
+
+// Build an S-type (store) instruction word.
+//
+// Opcode - base encoding (opcode and funct3).
+// Rs1    - base address register.
+// Rs2    - register whose value is stored.
+// Imm    - 12-bit byte offset; only bits [11:0] are used.
+//
+// The immediate is split across the word: imm[11:5] occupies inst[31:25] and
+// imm[4:0] occupies inst[11:7].
+static inline uint32_t encodeSTypeInstruction(uint32_t Opcode, uint32_t Rs1,
+                                              uint32_t Rs2, uint32_t Imm) {
+  // Imm bit 5 must land on instruction bit 25, i.e. a shift of 20 (not 25).
+  uint32_t imm_msbs = (Imm & 0xfe0) << 20;
+  uint32_t imm_lsbs = (Imm & 0x01f) << 7;
+  return imm_msbs | Rs2 << 20 | Rs1 << 15 | imm_lsbs | Opcode;
+}
+
+// Build a U-type instruction word: rd at bit 7 and the 20-bit immediate in the
+// upper bits starting at bit 12.
+static inline uint32_t encodeUTypeInstruction(uint32_t Opcode, uint32_t Rd,
+                                              uint32_t Imm) {
+  uint32_t Word = Opcode;
+  Word |= Rd << 7;
+  Word |= Imm << 12;
+  return Word;
+}
+
+// Build a J-type (jal) instruction word.
+//
+// Opcode - base encoding.
+// Rd     - link register (x0 for a plain jump).
+// Imm    - signed jump offset in units of 2 bytes, i.e. bit k of Imm is bit
+//          k + 1 of the byte offset; only bits [19:0] are used.
+//
+// The byte offset is scattered over the word as
+//   offset[20]    -> inst[31]
+//   offset[10:1]  -> inst[30:21]
+//   offset[11]    -> inst[20]
+//   offset[19:12] -> inst[19:12]
+static inline uint32_t encodeJTypeInstruction(uint32_t Opcode, uint32_t Rd,
+                                              uint32_t Imm) {
+  uint32_t imm_msb = (Imm & 0x80000) << 12;  // Imm[19]    -> inst[31]
+  uint32_t imm_lsbs = (Imm & 0x003ff) << 21; // Imm[9:0]   -> inst[30:21]
+  uint32_t imm_11 = (Imm & 0x00400) << 10;   // Imm[10]    -> inst[20]
+  uint32_t imm_1912 = (Imm & 0x7f800) << 1;  // Imm[18:11] -> inst[19:12]
+  return imm_msb | imm_lsbs | imm_11 | imm_1912 | Rd << 7 | Opcode;
+}
+
+// Split a value into the pieces a lui/addi pair consumes. lo12() is the low 12
+// bits. hi20() is the value shifted right by 12 after adding 0x800, which
+// compensates for addi sign-extending its 12-bit immediate. The result is
+// narrowed to 32 bits on return.
+#if SANITIZER_RISCV64
+static uint32_t hi20(uint64_t val) {
+  const uint64_t Rounded = val + 0x800;
+  return Rounded >> 12;
+}
+static uint32_t lo12(uint64_t val) {
+  const uint64_t LowBits = val & 0xfff;
+  return LowBits;
+}
+#elif defined(__riscv) && (__riscv_xlen == 32)
+static uint32_t hi20(uint32_t val) {
+  const uint32_t Rounded = val + 0x800;
+  return Rounded >> 12;
+}
+static uint32_t lo12(uint32_t val) {
+  const uint32_t LowBits = val & 0xfff;
+  return LowBits;
+}
+#endif
+
+// Patch or unpatch a single XRay sled.
+//
+// Enable      - true to write the call sequence into the sled, false to put
+//               back the jump over the sled.
+// FuncId      - function id that the patched sled loads into a0.
+// Sled        - sled entry; Sled.address() is the first instruction of the
+//               sled.
+// TracingHook - trampoline that the patched sled calls.
+//
+// Always returns true.
+static inline bool patchSled(const bool Enable, const uint32_t FuncId,
+                             const XRaySledEntry &Sled,
+                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT {
+  // When |Enable| == true,
+  // We replace the following compile-time stub (sled):
+  //
+  // xray_sled_n:
+  //	J .tmpN
+  //	29 or 37 C.NOPs (58 or 74 bytes)
+  //	.tmpN
+  //
+  // With one of the following runtime patches:
+  //
+  // xray_sled_n (32-bit):
+  //    addi sp, sp, -16                                ;create stack frame
+  //    sw ra, 12(sp)                                   ;save return address
+  //    sw t2, 8(sp)                                    ;save register t2
+  //    sw t1, 4(sp)                                    ;save register t1
+  //    sw a0, 0(sp)                                    ;save register a0
+  //    lui t1, %hi(__xray_FunctionEntry/Exit)
+  //    addi t1, t1, %lo(__xray_FunctionEntry/Exit)
+  //    lui a0, %hi(function_id)
+  //    addi a0, a0, %lo(function_id)                   ;pass function id
+  //    jalr t1                                         ;call Tracing hook
+  //    lw a0, 0(sp)                                    ;restore register a0
+  //    lw t1, 4(sp)                                    ;restore register t1
+  //    lw t2, 8(sp)                                    ;restore register t2
+  //    lw ra, 12(sp)                                   ;restore return address
+  //    addi sp, sp, 16                                 ;delete stack frame
+  //
+  // xray_sled_n (64-bit):
+  //    addi sp, sp, -32                                ;create stack frame
+  //    sd ra, 24(sp)                                   ;save return address
+  //    sd t2, 16(sp)                                   ;save register t2
+  //    sd t1, 8(sp)                                    ;save register t1
+  //    sd a0, 0(sp)                                    ;save register a0
+  //    lui t2, %highest(__xray_FunctionEntry/Exit)
+  //    addi t2, t2, %higher(__xray_FunctionEntry/Exit)
+  //    slli t2, t2, 32
+  //    lui t1, %hi(__xray_FunctionEntry/Exit)
+  //    addi t1, t1, %lo(__xray_FunctionEntry/Exit)
+  //    add t1, t2, t1
+  //    lui a0, %hi(function_id)
+  //    addi a0, a0, %lo(function_id)                   ;pass function id
+  //    jalr t1                                         ;call Tracing hook
+  //    ld a0, 0(sp)                                    ;restore register a0
+  //    ld t1, 8(sp)                                    ;restore register t1
+  //    ld t2, 16(sp)                                   ;restore register t2
+  //    ld ra, 24(sp)                                   ;restore return address
+  //    addi sp, sp, 32                                 ;delete stack frame
+  //
+  // Replacement of the first 4-byte instruction should be the last and atomic
+  // operation, so that the user code which reaches the sled concurrently
+  // either jumps over the whole sled, or executes the whole sled when the
+  // latter is ready.
+  //
+  // When |Enable|==false, we set back the first instruction in the sled to be
+  //   J 60 bytes (rv32)
+  //   J 76 bytes (rv64)
+
+  uint32_t *Address = reinterpret_cast<uint32_t *>(Sled.address());
+  if (Enable) {
+    // If the ISA is RISCV 64, the Tracing Hook needs to be typecast to a 64 bit
+    // value
+#if SANITIZER_RISCV64
+    uint32_t LoTracingHookAddr = lo12(reinterpret_cast<uint64_t>(TracingHook));
+    uint32_t HiTracingHookAddr = hi20(reinterpret_cast<uint64_t>(TracingHook));
+    uint32_t HigherTracingHookAddr =
+        lo12((reinterpret_cast<uint64_t>(TracingHook) + 0x80000000) >> 32);
+    uint32_t HighestTracingHookAddr =
+        hi20((reinterpret_cast<uint64_t>(TracingHook) + 0x80000000) >> 32);
+    // We typecast the Tracing Hook to a 32 bit value for RISCV32
+#elif defined(__riscv) && (__riscv_xlen == 32)
+    uint32_t LoTracingHookAddr = lo12(reinterpret_cast<uint32_t>(TracingHook));
+    uint32_t HiTracingHookAddr = hi20(reinterpret_cast<uint32_t>(TracingHook));
+#endif
+    uint32_t LoFunctionID = lo12(FuncId);
+    uint32_t HiFunctionID = hi20(FuncId);
+    // The sled that is patched in for RISCV64 defined below. We need the entire
+    // sleds corresponding to both ISAs to be protected by defines because the
+    // first few instructions are all different, because we store doubles in
+    // case of RV64 and store words for RV32. Subsequently, we have LUI - and in
+    // case of RV64, we need extra instructions from this point on, so we see
+    // differences in addresses to which instructions are stored.
+    //
+    // Address[0] is deliberately written last (see the atomic store below).
+#if SANITIZER_RISCV64
+    Address[1] = encodeSTypeInstruction(PatchOpcodes::PO_SD, RegNum::RN_SP,
+                                        RegNum::RN_RA, 0x18);
+    Address[2] = encodeSTypeInstruction(PatchOpcodes::PO_SD, RegNum::RN_SP,
+                                        RegNum::RN_T2, 0x10);
+    Address[3] = encodeSTypeInstruction(PatchOpcodes::PO_SD, RegNum::RN_SP,
+                                        RegNum::RN_T1, 0x8);
+    Address[4] = encodeSTypeInstruction(PatchOpcodes::PO_SD, RegNum::RN_SP,
+                                        RegNum::RN_A0, 0x0);
+    Address[5] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_T2,
+                                        HighestTracingHookAddr);
+    Address[6] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_T2,
+                                        RegNum::RN_T2, HigherTracingHookAddr);
+    Address[7] = encodeITypeInstruction(PatchOpcodes::PO_SLLI, RegNum::RN_T2,
+                                        RegNum::RN_T2, 0x20);
+    Address[8] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_T1,
+                                        HiTracingHookAddr);
+    Address[9] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_T1,
+                                        RegNum::RN_T1, LoTracingHookAddr);
+    Address[10] = encodeRTypeInstruction(PatchOpcodes::PO_ADD, RegNum::RN_T1,
+                                         RegNum::RN_T2, RegNum::RN_T1);
+    Address[11] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_A0,
+                                         HiFunctionID);
+    Address[12] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_A0,
+                                         RegNum::RN_A0, LoFunctionID);
+    Address[13] = encodeITypeInstruction(PatchOpcodes::PO_JALR, RegNum::RN_T1,
+                                         RegNum::RN_RA, 0x0);
+    Address[14] = encodeITypeInstruction(PatchOpcodes::PO_LD, RegNum::RN_SP,
+                                         RegNum::RN_A0, 0x0);
+    Address[15] = encodeITypeInstruction(PatchOpcodes::PO_LD, RegNum::RN_SP,
+                                         RegNum::RN_T1, 0x8);
+    Address[16] = encodeITypeInstruction(PatchOpcodes::PO_LD, RegNum::RN_SP,
+                                         RegNum::RN_T2, 0x10);
+    Address[17] = encodeITypeInstruction(PatchOpcodes::PO_LD, RegNum::RN_SP,
+                                         RegNum::RN_RA, 0x18);
+    Address[18] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_SP,
+                                         RegNum::RN_SP, 0x20);
+    // 0xffe0 encodes -32: only the low 12 bits survive the shift in the
+    // I-type encoder.
+    uint32_t CreateStackSpace = encodeITypeInstruction(
+        PatchOpcodes::PO_ADDI, RegNum::RN_SP, RegNum::RN_SP, 0xffe0);
+#elif defined(__riscv) && (__riscv_xlen == 32)
+    Address[1] = encodeSTypeInstruction(PatchOpcodes::PO_SW, RegNum::RN_SP,
+                                        RegNum::RN_RA, 0x0c);
+    Address[2] = encodeSTypeInstruction(PatchOpcodes::PO_SW, RegNum::RN_SP,
+                                        RegNum::RN_T2, 0x08);
+    Address[3] = encodeSTypeInstruction(PatchOpcodes::PO_SW, RegNum::RN_SP,
+                                        RegNum::RN_T1, 0x4);
+    Address[4] = encodeSTypeInstruction(PatchOpcodes::PO_SW, RegNum::RN_SP,
+                                        RegNum::RN_A0, 0x0);
+    Address[5] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_T1,
+                                        HiTracingHookAddr);
+    Address[6] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_T1,
+                                        RegNum::RN_T1, LoTracingHookAddr);
+    Address[7] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_A0,
+                                        HiFunctionID);
+    Address[8] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_A0,
+                                        RegNum::RN_A0, LoFunctionID);
+    Address[9] = encodeITypeInstruction(PatchOpcodes::PO_JALR, RegNum::RN_T1,
+                                        RegNum::RN_RA, 0x0);
+    Address[10] = encodeITypeInstruction(PatchOpcodes::PO_LW, RegNum::RN_SP,
+                                         RegNum::RN_A0, 0x0);
+    Address[11] = encodeITypeInstruction(PatchOpcodes::PO_LW, RegNum::RN_SP,
+                                         RegNum::RN_T1, 0x4);
+    Address[12] = encodeITypeInstruction(PatchOpcodes::PO_LW, RegNum::RN_SP,
+                                         RegNum::RN_T2, 0x08);
+    Address[13] = encodeITypeInstruction(PatchOpcodes::PO_LW, RegNum::RN_SP,
+                                         RegNum::RN_RA, 0x0c);
+    Address[14] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_SP,
+                                         RegNum::RN_SP, 0x10);
+    // 0xfff0 encodes -16: only the low 12 bits survive the shift in the
+    // I-type encoder.
+    uint32_t CreateStackSpace = encodeITypeInstruction(
+        PatchOpcodes::PO_ADDI, RegNum::RN_SP, RegNum::RN_SP, 0xfff0);
+#endif
+    // Publish the first instruction last, with release ordering, so the rest
+    // of the sled is written before the jump over it is replaced.
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(Address), CreateStackSpace,
+        std::memory_order_release);
+  } else {
+    uint32_t CreateBranch = encodeJTypeInstruction(
+    // Jump distance is different in both ISAs due to difference in size of
+    // sleds
+#if SANITIZER_RISCV64
+        PatchOpcodes::PO_J, RegNum::RN_R0,
+        0x026); // jump encodes an offset in multiples of 2 bytes. 38*2 = 76
+#elif defined(__riscv) && (__riscv_xlen == 32)
+        PatchOpcodes::PO_J, RegNum::RN_R0,
+        0x01e); // jump encodes an offset in multiples of 2 bytes. 30*2 = 60
+#endif
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(Address), CreateBranch,
+        std::memory_order_release);
+  }
+  return true;
+}
+
+// Patch or unpatch a function-entry sled. Returns the result of patchSled().
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
+                        const XRaySledEntry &Sled,
+                        const XRayTrampolines &Trampolines,
+                        bool LogArgs) XRAY_NEVER_INSTRUMENT {
+  // Argument logging is not supported at this moment, so LogArgs is ignored
+  // and the plain entry trampoline is always used.
+  void (*EntryHook)() = Trampolines.EntryTrampoline;
+  return patchSled(Enable, FuncId, Sled, EntryHook);
+}
+
+// Patch or unpatch a function-exit sled so that it calls the exit trampoline.
+// Returns the result of patchSled().
+bool patchFunctionExit(
+    const bool Enable, const uint32_t FuncId, const XRaySledEntry &Sled,
+    const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT {
+  void (*ExitHook)() = Trampolines.ExitTrampoline;
+  return patchSled(Enable, FuncId, Sled, ExitHook);
+}
+
+// Patch or unpatch a tail-call exit sled so that it calls the tail-exit
+// trampoline. Returns the result of patchSled().
+bool patchFunctionTailExit(
+    const bool Enable, const uint32_t FuncId, const XRaySledEntry &Sled,
+    const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT {
+  void (*TailExitHook)() = Trampolines.TailExitTrampoline;
+  return patchSled(Enable, FuncId, Sled, TailExitHook);
+}
+
+// Custom-event sleds are not patched by this implementation: nothing is
+// written and false is returned unconditionally.
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  return false;
+}
+
+// Typed-event sleds are not patched by this implementation: nothing is
+// written and false is returned unconditionally.
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  return false;
+}
+} // namespace __xray
+
+// Empty definition of the argument-logging entry trampoline symbol.
+// NOTE(review): presumably provided only so the symbol resolves at link time,
+// since patchFunctionEntry never uses argument logging - confirm.
+extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {}
diff --git a/compiler-rt/lib/xray/xray_trampoline_riscv32.S b/compiler-rt/lib/xray/xray_trampoline_riscv32.S
new file mode 100644
index 00000000000000..9916e0321d24fd
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_trampoline_riscv32.S
@@ -0,0 +1,83 @@
+//===-- xray_trampoline_riscv32.s ----------------------------------*- ASM -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the riscv32-specific assembler for the trampolines.
+//
+//===----------------------------------------------------------------------===//
+
+// Save/restore of ra plus the integer and floating-point argument registers
+// around the handler call.
+//
+// Frame layout (112 bytes, so sp stays 16-byte aligned and the fsd/fld slots
+// stay 8-byte aligned):
+//   0..63    fa0-fa7 (8 bytes each)
+//   64..95   a0-a7   (4 bytes each)
+//   96..99   ra
+//   100..111 padding
+.macro SAVE_ARG_REGISTERS
+	// Push argument registers to stack
+	addi	sp, sp, -112
+	.cfi_def_cfa_offset 112
+	sw	ra, 96(sp)
+	// ra is saved at CFA - 16 (112 - 96).
+	.cfi_offset ra, -16
+	sw	a7, 92(sp)
+	sw	a6, 88(sp)
+	sw	a5, 84(sp)
+	sw	a4, 80(sp)
+	sw	a3, 76(sp)
+	sw	a2, 72(sp)
+	sw	a1, 68(sp)
+	sw	a0, 64(sp)
+	fsd	fa7, 56(sp)
+	fsd	fa6, 48(sp)
+	fsd	fa5, 40(sp)
+	fsd	fa4, 32(sp)
+	fsd	fa3, 24(sp)
+	fsd	fa2, 16(sp)
+	fsd	fa1, 8(sp)
+	fsd	fa0, 0(sp)
+.endm
+
+.macro RESTORE_ARG_REGISTERS
+	// Restore argument registers; must mirror SAVE_ARG_REGISTERS.
+	fld	fa0, 0(sp)
+	fld	fa1, 8(sp)
+	fld	fa2, 16(sp)
+	fld	fa3, 24(sp)
+	fld	fa4, 32(sp)
+	fld	fa5, 40(sp)
+	fld	fa6, 48(sp)
+	fld	fa7, 56(sp)
+	lw	a0, 64(sp)
+	lw	a1, 68(sp)
+	lw	a2, 72(sp)
+	lw	a3, 76(sp)
+	lw	a4, 80(sp)
+	lw	a5, 84(sp)
+	lw	a6, 88(sp)
+	lw	a7, 92(sp)
+	lw	ra, 96(sp)
+	addi	sp, sp, 112
+.endm
+
+// Save/restore of ra plus the integer and floating-point return registers
+// around the handler call.
+//
+// Frame layout (32 bytes, so sp stays 16-byte aligned and the fsd/fld slots
+// stay 8-byte aligned):
+//   0..15   fa0-fa1 (8 bytes each)
+//   16..23  a0-a1   (4 bytes each)
+//   24..27  ra
+//   28..31  padding
+.macro SAVE_RET_REGISTERS
+	// Push return registers to stack
+	addi	sp, sp, -32
+	.cfi_def_cfa_offset 32
+	sw	ra, 24(sp)
+	// ra is saved at CFA - 8 (32 - 24).
+	.cfi_offset ra, -8
+	sw	a1, 20(sp)
+	sw	a0, 16(sp)
+	fsd	fa1, 8(sp)
+	fsd	fa0, 0(sp)
+.endm
+
+.macro RESTORE_RET_REGISTERS
+	// Restore return registers; must mirror SAVE_RET_REGISTERS.
+	fld	fa0, 0(sp)
+	fld	fa1, 8(sp)
+	lw	a0, 16(sp)
+	lw	a1, 20(sp)
+	lw	ra, 24(sp)
+	addi	sp, sp, 32
+.endm
+
+#include "xray_trampoline_riscv_common.S"
diff --git a/compiler-rt/lib/xray/xray_trampoline_riscv64.S b/compiler-rt/lib/xray/xray_trampoline_riscv64.S
new file mode 100644
index 00000000000000..102b9881567d9a
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_trampoline_riscv64.S
@@ -0,0 +1,83 @@
+//===-- xray_trampoline_riscv64.s ----------------------------------*- ASM -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the riscv64-specific assembler for the trampolines.
+//
+//===----------------------------------------------------------------------===//
+
+// Save/restore of ra plus the integer and floating-point argument registers
+// around the handler call.
+//
+// Frame layout (144 bytes, so sp stays 16-byte aligned):
+//   0..63    fa0-fa7
+//   64..127  a0-a7
+//   128..135 ra
+//   136..143 padding
+.macro SAVE_ARG_REGISTERS
+	// Push argument registers to stack
+	addi	sp, sp, -144
+	.cfi_def_cfa_offset 144
+	sd	ra, 128(sp)
+	// ra is saved at CFA - 16 (144 - 128).
+	.cfi_offset ra, -16
+	sd	a7, 120(sp)
+	sd	a6, 112(sp)
+	sd	a5, 104(sp)
+	sd	a4, 96(sp)
+	sd	a3, 88(sp)
+	sd	a2, 80(sp)
+	sd	a1, 72(sp)
+	sd	a0, 64(sp)
+	fsd	fa7, 56(sp)
+	fsd	fa6, 48(sp)
+	fsd	fa5, 40(sp)
+	fsd	fa4, 32(sp)
+	fsd	fa3, 24(sp)
+	fsd	fa2, 16(sp)
+	fsd	fa1, 8(sp)
+	fsd	fa0, 0(sp)
+.endm
+
+// Save/restore of ra plus the integer and floating-point return registers.
+//
+// Frame layout (48 bytes, so sp stays 16-byte aligned):
+//   0..15   fa0-fa1
+//   16..31  a0-a1
+//   32..39  ra
+//   40..47  padding
+.macro SAVE_RET_REGISTERS
+	// Push return registers to stack
+	addi	sp, sp, -48
+	.cfi_def_cfa_offset 48
+	sd	ra, 32(sp)
+	// ra is saved at CFA - 16 (48 - 32).
+	.cfi_offset ra, -16
+	sd	a1, 24(sp)
+	sd	a0, 16(sp)
+	fsd	fa1, 8(sp)
+	fsd	fa0, 0(sp)
+.endm
+
+.macro RESTORE_RET_REGISTERS
+	// Restore return registers; must mirror SAVE_RET_REGISTERS.
+	fld	fa0, 0(sp)
+	fld	fa1, 8(sp)
+	ld	a0, 16(sp)
+	ld	a1, 24(sp)
+	ld	ra, 32(sp)
+	addi	sp, sp, 48
+.endm
+
+.macro RESTORE_ARG_REGISTERS
+	// Restore argument registers; must mirror SAVE_ARG_REGISTERS.
+	fld	fa0, 0(sp)
+	fld	fa1, 8(sp)
+	fld	fa2, 16(sp)
+	fld	fa3, 24(sp)
+	fld	fa4, 32(sp)
+	fld	fa5, 40(sp)
+	fld	fa6, 48(sp)
+	fld	fa7, 56(sp)
+	ld	a0, 64(sp)
+	ld	a1, 72(sp)
+	ld	a2, 80(sp)
+	ld	a3, 88(sp)
+	ld	a4, 96(sp)
+	ld	a5, 104(sp)
+	ld	a6, 112(sp)
+	ld	a7, 120(sp)
+	ld	ra, 128(sp)
+	addi	sp, sp, 144
+.endm
+
+#include "xray_trampoline_riscv_common.S"
diff --git a/compiler-rt/lib/xray/xray_trampoline_riscv_common.S b/compiler-rt/lib/xray/xray_trampoline_riscv_common.S
new file mode 100644
index 00000000000000..7ce6fa0dcda31b
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_trampoline_riscv_common.S
@@ -0,0 +1,97 @@
+//===-- xray_trampoline_riscv_common.s --------------------------*- ASM -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the trampolines code shared between riscv32 and riscv64.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../builtins/assembly.h"
+#include "../sanitizer_common/sanitizer_asm.h"
+
+	// Trampoline called from a patched function-entry sled. On entry a0
+	// holds the function id. Saves the argument registers, calls the
+	// installed handler (if any) with a1 = 0, restores and returns.
+	.text
+	.p2align 2
+	.global ASM_SYMBOL(__xray_FunctionEntry)
+	ASM_TYPE_FUNCTION(__xray_FunctionEntry)
+ASM_SYMBOL(__xray_FunctionEntry):
+	CFI_STARTPROC
+	SAVE_ARG_REGISTERS
+
+	// Load the handler function pointer into a2. The pointer is XLEN bits
+	// wide and ld is an RV64-only instruction, so pick the load by XLEN.
+	la	a2, ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)
+#if __riscv_xlen == 64
+	ld	a2, 0(a2)
+#else
+	lw	a2, 0(a2)
+#endif
+
+	// Handler address will be null if it is not set
+	beq	a2, x0, 1f
+
+	// If we reach here, we are tracing an event
+	// a0 already contains function id
+	// a1 = 0 means we are tracing an entry event
+	mv	a1, x0
+	jalr	a2
+
+1:
+	RESTORE_ARG_REGISTERS
+	jr	ra
+	ASM_SIZE(__xray_FunctionEntry)
+	CFI_ENDPROC
+
+	// Trampoline called from a patched function-exit sled. On entry a0
+	// holds the function id. Saves the return-value registers, calls the
+	// installed handler (if any) with a1 = 1, restores and returns.
+	.text
+	.p2align 2
+	.global ASM_SYMBOL(__xray_FunctionExit)
+	ASM_TYPE_FUNCTION(__xray_FunctionExit)
+ASM_SYMBOL(__xray_FunctionExit):
+	CFI_STARTPROC
+	SAVE_RET_REGISTERS
+
+	// Load the handler function pointer into a2. The pointer is XLEN bits
+	// wide and ld is an RV64-only instruction, so pick the load by XLEN.
+	la	a2, ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)
+#if __riscv_xlen == 64
+	ld	a2, 0(a2)
+#else
+	lw	a2, 0(a2)
+#endif
+
+	// Handler address will be null if it is not set
+	beq	a2, x0, 1f
+
+	// If we reach here, we are tracing an event
+	// a0 already contains function id
+	// a1 = 1 means we are tracing an exit event
+	addi	a1, x0, 1
+	jalr	a2
+
+1:
+	RESTORE_RET_REGISTERS
+	jr	ra
+	ASM_SIZE(__xray_FunctionExit)
+	CFI_ENDPROC
+
+	// Trampoline called from a patched tail-call sled. On entry a0 holds
+	// the function id. Saves the argument registers, calls the installed
+	// handler (if any) with a1 = 2, restores and returns.
+	.text
+	.p2align 2
+	.global ASM_SYMBOL(__xray_FunctionTailExit)
+	ASM_TYPE_FUNCTION(__xray_FunctionTailExit)
+ASM_SYMBOL(__xray_FunctionTailExit):
+	CFI_STARTPROC
+	SAVE_ARG_REGISTERS
+
+	// Load the handler function pointer into a2. The pointer is XLEN bits
+	// wide and ld is an RV64-only instruction, so pick the load by XLEN.
+	la	a2, ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)
+#if __riscv_xlen == 64
+	ld	a2, 0(a2)
+#else
+	lw	a2, 0(a2)
+#endif
+
+	// Handler address will be null if it is not set
+	beq	a2, x0, 1f
+
+	// If we reach here, we are tracing an event
+	// a0 already contains function id
+	// a1 = 2 means we are tracing a tail exit event
+	addi	a1, x0, 2
+	jalr	a2
+
+1:
+	RESTORE_ARG_REGISTERS
+	jr	ra
+	ASM_SIZE(__xray_FunctionTailExit)
+	CFI_ENDPROC
diff --git a/compiler-rt/lib/xray/xray_tsc.h b/compiler-rt/lib/xray/xray_tsc.h
index e1cafe1bf11d2d..b62a686d6ce0f2 100644
--- a/compiler-rt/lib/xray/xray_tsc.h
+++ b/compiler-rt/lib/xray/xray_tsc.h
@@ -43,7 +43,7 @@ inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
 #elif defined(__powerpc64__)
 #include "xray_powerpc64.inc"
 #elif defined(__arm__) || defined(__aarch64__) || defined(__mips__) ||         \
-    defined(__hexagon__) || defined(__loongarch_lp64)
+    defined(__hexagon__) || defined(__loongarch_lp64) || defined(__riscv)
 // Emulated TSC.
 // There is no instruction like RDTSCP in user mode on ARM. ARM's CP15 does
 //   not have a constant frequency like TSC on x86(_64), it may go faster
diff --git a/llvm/lib/CodeGen/XRayInstrumentation.cpp b/llvm/lib/CodeGen/XRayInstrumentation.cpp
index 8f718d884cd067..1191d70f77f3e0 100644
--- a/llvm/lib/CodeGen/XRayInstrumentation.cpp
+++ b/llvm/lib/CodeGen/XRayInstrumentation.cpp
@@ -233,10 +233,13 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
     case Triple::ArchType::mips:
     case Triple::ArchType::mipsel:
     case Triple::ArchType::mips64:
-    case Triple::ArchType::mips64el: {
+    case Triple::ArchType::mips64el:
+    case Triple::ArchType::riscv32:
+    case Triple::ArchType::riscv64: {
       // For the architectures which don't have a single return instruction
       InstrumentationOptions op;
-      op.HandleTailcall = false;
+      // RISC-V support patching tail calls.
+      op.HandleTailcall = MF.getTarget().getTargetTriple().isRISCV();
       op.HandleAllReturns = true;
       prependRetWithPatchableExit(MF, TII, op);
       break;
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 0d818bc837fb70..5382751b0d8502 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -113,6 +113,12 @@ class RISCVAsmPrinter : public AsmPrinter {
 
   void emitNTLHint(const MachineInstr *MI);
 
+  // XRay Support
+  void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr *MI);
+  void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr *MI);
+  void LowerPATCHABLE_TAIL_CALL(const MachineInstr *MI);
+  void emitSled(const MachineInstr *MI, SledKind Kind);
+
   bool lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
 };
 }
@@ -316,6 +322,22 @@ void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) {
     return LowerPATCHPOINT(*OutStreamer, SM, *MI);
   case TargetOpcode::STATEPOINT:
     return LowerSTATEPOINT(*OutStreamer, SM, *MI);
+  case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
+    // patchable-function-entry is handled in lowerToMCInst
+    // Therefore, we break out of the switch statement if we encounter it here.
+    const Function &F = MI->getParent()->getParent()->getFunction();
+    if (F.hasFnAttribute("patchable-function-entry"))
+      break;
+
+    LowerPATCHABLE_FUNCTION_ENTER(MI);
+    return;
+  }
+  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
+    LowerPATCHABLE_FUNCTION_EXIT(MI);
+    return;
+  case TargetOpcode::PATCHABLE_TAIL_CALL:
+    LowerPATCHABLE_TAIL_CALL(MI);
+    return;
   }
 
   MCInst OutInst;
@@ -453,11 +475,71 @@ bool RISCVAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   SetupMachineFunction(MF);
   emitFunctionBody();
 
+  // Emit the XRay table
+  emitXRayTable();
+
   if (EmittedOptionArch)
     RTS.emitDirectiveOptionPop();
   return false;
 }
 
+// Lower PATCHABLE_FUNCTION_ENTER by emitting a function-entry XRay sled.
+void RISCVAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr *MI) {
+  const SledKind Kind = SledKind::FUNCTION_ENTER;
+  emitSled(MI, Kind);
+}
+
+// Lower PATCHABLE_FUNCTION_EXIT by emitting a function-exit XRay sled.
+void RISCVAsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr *MI) {
+  const SledKind Kind = SledKind::FUNCTION_EXIT;
+  emitSled(MI, Kind);
+}
+
+// Lower PATCHABLE_TAIL_CALL by emitting a tail-call XRay sled.
+void RISCVAsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr *MI) {
+  const SledKind Kind = SledKind::TAIL_CALL;
+  emitSled(MI, Kind);
+}
+
+// Emit one XRay sled of the given kind at the position of MI and record it
+// for the XRay table.
+void RISCVAsmPrinter::emitSled(const MachineInstr *MI, SledKind Kind) {
+  // The sled consists of a jump followed by the nops it skips:
+  // .Lxray_sled_N
+  //   ALIGN
+  //   J .tmpN
+  //   29 or 37 C.NOP instructions
+  // .tmpN
+
+  // Number of NOPs reserved for the runtime patch. The patched sled is 76
+  // bytes on RV64 and 60 bytes on RV32. With a 4-byte JAL in front that
+  // would take (76 - 4)/2 = 36 and (60 - 4)/2 = 28 NOPs; one more is added
+  // because the jump may end up as a 2-byte C.JAL instead.
+  const bool Is64Bit =
+      MI->getParent()->getParent()->getSubtarget<RISCVSubtarget>().is64Bit();
+  const uint8_t NoopsInSledCount = Is64Bit ? 37 : 29;
+
+  OutStreamer->emitCodeAlignment(Align(4), &getSubtargetInfo());
+  MCSymbol *SledLabel = OutContext.createTempSymbol("xray_sled_", true);
+  OutStreamer->emitLabel(SledLabel);
+  MCSymbol *SledEnd = OutContext.createTempSymbol();
+
+  const MCExpr *SledEndExpr = MCSymbolRefExpr::create(
+      SledEnd, MCSymbolRefExpr::VariantKind::VK_None, OutContext);
+
+  // "J .tmpN": jump over the nop sled to the code that follows it.
+  EmitToStreamer(
+      *OutStreamer,
+      MCInstBuilder(RISCV::JAL).addReg(RISCV::X0).addExpr(SledEndExpr));
+
+  // The NOPs themselves, each spelled as addi x0, x0, 0.
+  for (unsigned Emitted = 0; Emitted != NoopsInSledCount; ++Emitted)
+    EmitToStreamer(*OutStreamer, MCInstBuilder(RISCV::ADDI)
+                                     .addReg(RISCV::X0)
+                                     .addReg(RISCV::X0)
+                                     .addImm(0));
+
+  OutStreamer->emitLabel(SledEnd);
+  recordSled(SledLabel, *MI, Kind, 2);
+}
+
 void RISCVAsmPrinter::emitStartOfAsmFile(Module &M) {
   RISCVTargetStreamer &RTS =
       static_cast<RISCVTargetStreamer &>(*OutStreamer->getTargetStreamer());
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index f2c0a3d85c998a..f8a8e03fd28d16 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -223,6 +223,9 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
     return UserReservedRegister[i];
   }
 
+  // XRay support - require D and C extensions.
+  bool isXRaySupported() const override { return hasStdExtD() && hasStdExtC(); }
+
   // Vector codegen related methods.
   bool hasVInstructions() const { return HasStdExtZve32x; }
   bool hasVInstructionsI64() const { return HasStdExtZve64x; }
diff --git a/llvm/lib/XRay/InstrumentationMap.cpp b/llvm/lib/XRay/InstrumentationMap.cpp
index 800f0a0f47e425..0ebdcd5bac7526 100644
--- a/llvm/lib/XRay/InstrumentationMap.cpp
+++ b/llvm/lib/XRay/InstrumentationMap.cpp
@@ -63,7 +63,8 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
         ObjFile.getBinary()->getArch() == Triple::loongarch64 ||
         ObjFile.getBinary()->getArch() == Triple::ppc64le ||
         ObjFile.getBinary()->getArch() == Triple::arm ||
-        ObjFile.getBinary()->getArch() == Triple::aarch64))
+        ObjFile.getBinary()->getArch() == Triple::aarch64 ||
+        ObjFile.getBinary()->getArch() == Triple::riscv64))
     return make_error<StringError>(
         "File format not supported (only does ELF and Mach-O little endian "
         "64-bit).",
diff --git a/llvm/test/CodeGen/RISCV/xray-attribute-instrumentation.ll b/llvm/test/CodeGen/RISCV/xray-attribute-instrumentation.ll
new file mode 100644
index 00000000000000..a8fc3bff0b0f5b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/xray-attribute-instrumentation.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=riscv32-unknown-linux-gnu -mattr=+d,+c < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=riscv64-unknown-linux-gnu -mattr=+d,+c < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-RISCV64 %s
+
+define i32 @foo() nounwind "function-instrument"="xray-always" {
+; CHECK:                        .p2align 2
+; CHECK-LABEL:                  .Lxray_sled_0:
+; CHECK-NEXT:                   j .Ltmp0
+; CHECK-COUNT-29:               nop
+; CHECK-RISCV64-COUNT-8:        nop
+; CHECK-LABEL:                  .Ltmp0:
+  ret i32 0
+; CHECK:                        .p2align 2
+; CHECK-LABEL:                  .Lxray_sled_1:
+; CHECK-NEXT:                   j .Ltmp1
+; CHECK-COUNT-29:               nop
+; CHECK-RISCV64-COUNT-8:        nop
+; CHECK-LABEL:                  .Ltmp1:
+; CHECK-NEXT:                   ret
+}
+; CHECK:                        .section xray_instr_map,"ao", at progbits,foo
+; CHECK-LABEL:                  .Lxray_sleds_start0:
+; CHECK:                        .Lxray_sled_0-[[TMP:.Ltmp[0-9]+]]
+; CHECK:                        .Lxray_sled_1-[[TMP:.Ltmp[0-9]+]]
+; CHECK-LABEL:                  .Lxray_sleds_end0:

>From d43a31ca80ce513a4db06572406ad6f910aec119 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Mon, 25 Nov 2024 14:08:28 -0800
Subject: [PATCH 2/2] Address review comments

---
 compiler-rt/lib/xray/xray_interface.cpp       |   4 +-
 compiler-rt/lib/xray/xray_riscv.cpp           | 234 ++++++++----------
 .../lib/xray/xray_trampoline_riscv32.S        |  96 +++----
 .../lib/xray/xray_trampoline_riscv64.S        | 104 ++++----
 .../lib/xray/xray_trampoline_riscv_common.S   |   6 +-
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp     |  10 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |  20 ++
 .../RISCV/xray-attribute-instrumentation.ll   |   4 +-
 8 files changed, 238 insertions(+), 240 deletions(-)

diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp
index e66736d9a344e1..637972856ecaa7 100644
--- a/compiler-rt/lib/xray/xray_interface.cpp
+++ b/compiler-rt/lib/xray/xray_interface.cpp
@@ -58,9 +58,9 @@ static const int16_t cSledLength = 8;
 #elif defined(__hexagon__)
 static const int16_t cSledLength = 20;
 #elif SANITIZER_RISCV64
-static const int16_t cSledLength = 76;
+static const int16_t cSledLength = 68;
 #elif defined(__riscv) && (__riscv_xlen == 32)
-static const int16_t cSledLength = 60;
+static const int16_t cSledLength = 52;
 #else
 #error "Unsupported CPU Architecture"
 #endif /* CPU architecture */
diff --git a/compiler-rt/lib/xray/xray_riscv.cpp b/compiler-rt/lib/xray/xray_riscv.cpp
index 89ce9305ef3dbe..644c65029b8c71 100644
--- a/compiler-rt/lib/xray/xray_riscv.cpp
+++ b/compiler-rt/lib/xray/xray_riscv.cpp
@@ -8,7 +8,7 @@
 //
 // This file is a part of XRay, a dynamic runtime instrumentation system.
 //
-// Implementation of riscv-specific routines (32- and 64-bit).
+// Implementation of RISC-V specific routines (32- and 64-bit).
 //
 //===----------------------------------------------------------------------===//
 #include "sanitizer_common/sanitizer_common.h"
@@ -22,28 +22,24 @@ namespace __xray {
 enum PatchOpcodes : uint32_t {
   PO_ADDI = 0x00000013, // addi rd, rs1, imm
   PO_ADD = 0x00000033,  // add rd, rs1, rs2
-  PO_SW = 0x00002023,   // sw rt, base(offset)
-  PO_SD = 0x00003023,   // sd rt, base(offset)
+  PO_SW = 0x00002023,   // sw rs2, imm(rs1)
+  PO_SD = 0x00003023,   // sd rs2, imm(rs1)
   PO_LUI = 0x00000037,  // lui rd, imm
-  PO_ORI = 0x00006013,  // ori rd, rs1, imm
   PO_OR = 0x00006033,   // or rd, rs1, rs2
-  PO_SLLI = 0x00001013, // slli rd, rs, shamt
-  PO_SRLI = 0x00005013, // srli rd, rs, shamt
-  PO_JALR = 0x00000067, // jalr rs
-  PO_LW = 0x00002003,   // lw rd, base(offset)
-  PO_LD = 0x00003003,   // ld rd, base(offset)
-  PO_J = 0x0000006f,    // jal #n_bytes
-  PO_NOP = 0x00000013,  // nop - pseduo-instruction, same as addi x0, x0, 0
+  PO_SLLI = 0x00001013, // slli rd, rs1, shamt
+  PO_JALR = 0x00000067, // jalr rd, rs1
+  PO_LW = 0x00002003,   // lw rd, imm(rs1)
+  PO_LD = 0x00003003,   // ld rd, imm(rs1)
+  PO_J = 0x0000006f,    // jal imm
+  PO_NOP = PO_ADDI,     // addi x0, x0, 0
 };
 
 enum RegNum : uint32_t {
-  RN_R0 = 0x0,
-  RN_RA = 0x1,
-  RN_SP = 0x2,
-  RN_T0 = 0x5,
-  RN_T1 = 0x6,
-  RN_T2 = 0x7,
-  RN_A0 = 0xa,
+  RN_X0 = 0,
+  RN_RA = 1,
+  RN_SP = 2,
+  RN_T1 = 6,
+  RN_A0 = 10,
 };
 
 static inline uint32_t encodeRTypeInstruction(uint32_t Opcode, uint32_t Rs1,
@@ -58,9 +54,9 @@ static inline uint32_t encodeITypeInstruction(uint32_t Opcode, uint32_t Rs1,
 
 static inline uint32_t encodeSTypeInstruction(uint32_t Opcode, uint32_t Rs1,
                                               uint32_t Rs2, uint32_t Imm) {
-  uint32_t imm_msbs = (Imm & 0xfe0) << 25;
-  uint32_t imm_lsbs = (Imm & 0x01f) << 7;
-  return imm_msbs | Rs2 << 20 | Rs1 << 15 | imm_lsbs | Opcode;
+  uint32_t ImmMSB = (Imm & 0xfe0) << 20; // imm[11:5] -> inst[31:25]
+  uint32_t ImmLSB = (Imm & 0x01f) << 7;  // imm[4:0]  -> inst[11:7]
+  return ImmMSB | Rs2 << 20 | Rs1 << 15 | ImmLSB | Opcode;
 }
 
 static inline uint32_t encodeUTypeInstruction(uint32_t Opcode, uint32_t Rd,
@@ -70,20 +66,15 @@ static inline uint32_t encodeUTypeInstruction(uint32_t Opcode, uint32_t Rd,
 
 static inline uint32_t encodeJTypeInstruction(uint32_t Opcode, uint32_t Rd,
                                               uint32_t Imm) {
-  uint32_t imm_msb = (Imm & 0x80000) << 31;
-  uint32_t imm_lsbs = (Imm & 0x003ff) << 21;
-  uint32_t imm_11 = (Imm & 0x00400) << 20;
-  uint32_t imm_1912 = (Imm & 0x7f800) << 12;
-  return imm_msb | imm_lsbs | imm_11 | imm_1912 | Rd << 7 | Opcode;
+  uint32_t ImmMSB = (Imm & 0x100000) << 11; // imm[20]    -> inst[31]
+  uint32_t ImmLSB = (Imm & 0x7fe) << 20;    // imm[10:1]  -> inst[30:21]
+  uint32_t Imm11 = (Imm & 0x800) << 9;      // imm[11]    -> inst[20]
+  uint32_t Imm1912 = (Imm & 0xff000);       // imm[19:12] -> inst[19:12]
+  return ImmMSB | ImmLSB | Imm11 | Imm1912 | Rd << 7 | Opcode;
 }
 
-#if SANITIZER_RISCV64
-static uint32_t hi20(uint64_t val) { return (val + 0x800) >> 12; }
-static uint32_t lo12(uint64_t val) { return val & 0xfff; }
-#elif defined(__riscv) && (__riscv_xlen == 32)
 static uint32_t hi20(uint32_t val) { return (val + 0x800) >> 12; }
 static uint32_t lo12(uint32_t val) { return val & 0xfff; }
-#endif
 
 static inline bool patchSled(const bool Enable, const uint32_t FuncId,
                              const XRaySledEntry &Sled,
@@ -93,7 +84,7 @@ static inline bool patchSled(const bool Enable, const uint32_t FuncId,
   //
   // xray_sled_n:
   //	J .tmpN
-  //	29 or 37 C.NOPs (58 or 74 bytes)
+  //	25 or 33 C.NOPs (50 or 66 bytes)
   //	.tmpN
   //
   // With one of the following runtime patches:
@@ -101,38 +92,34 @@ static inline bool patchSled(const bool Enable, const uint32_t FuncId,
   // xray_sled_n (32-bit):
   //    addi sp, sp, -16                                ;create stack frame
   //    sw ra, 12(sp)                                   ;save return address
-  //    sw t2, 8(sp)                                    ;save register t2
-  //    sw t1, 4(sp)                                    ;save register t1
-  //    sw a0, 0(sp)                                    ;save register a0
-  //    lui t1, %hi(__xray_FunctionEntry/Exit)
-  //    addi t1, t1, %lo(__xray_FunctionEntry/Exit)
+  //    sw t1, 8(sp)                                    ;save register t1
+  //    sw a0, 4(sp)                                    ;save register a0
+  //    lui ra, %hi(__xray_FunctionEntry/Exit)
+  //    addi ra, ra, %lo(__xray_FunctionEntry/Exit)
   //    lui a0, %hi(function_id)
   //    addi a0, a0, %lo(function_id)                   ;pass function id
-  //    jalr t1                                         ;call Tracing hook
-  //    lw a0, 0(sp)                                    ;restore register a0
-  //    lw t1, 4(sp)                                    ;restore register t1
-  //    lw t2, 8(sp)                                    ;restore register t2
+  //    jalr ra                                         ;call Tracing hook
+  //    lw a0, 4(sp)                                    ;restore register a0
+  //    lw t1, 8(sp)                                    ;restore register t1
   //    lw ra, 12(sp)                                   ;restore return address
   //    addi sp, sp, 16                                 ;delete stack frame
   //
   // xray_sled_n (64-bit):
   //    addi sp, sp, -32                                ;create stack frame
   //    sd ra, 24(sp)                                   ;save return address
-  //    sd t2, 16(sp)                                   ;save register t2
-  //    sd t1, 8(sp)                                    ;save register t1
-  //    sd a0, 0(sp)                                    ;save register a0
-  //    lui t2, %highest(__xray_FunctionEntry/Exit)
-  //    addi t2, t2, %higher(__xray_FunctionEntry/Exit)
-  //    slli t2, t2, 32
-  //    lui t1, t1, %hi(__xray_FunctionEntry/Exit)
-  //    addi t1, t1, %lo(__xray_FunctionEntry/Exit)
-  //    add t1, t2, t1
+  //    sd t1, 16(sp)                                   ;save register t1
+  //    sd a0, 8(sp)                                    ;save register a0
+  //    lui t1, %highest(__xray_FunctionEntry/Exit)
+  //    addi t1, t1, %higher(__xray_FunctionEntry/Exit)
+  //    slli t1, t1, 32
+  //    lui ra, %hi(__xray_FunctionEntry/Exit)
+  //    addi ra, ra, %lo(__xray_FunctionEntry/Exit)
+  //    add ra, t1, ra
   //    lui a0, %hi(function_id)
   //    addi a0, a0, %lo(function_id)                   ;pass function id
-  //    jalr t1                                         ;call Tracing hook
-  //    ld a0, 0(sp)                                    ;restore register a0
-  //    ld t1, 8(sp)                                    ;restore register t1
-  //    ld t2, 16(sp)                                   ;restore register t2
+  //    jalr ra                                         ;call Tracing hook
+  //    ld a0, 8(sp)                                    ;restore register a0
+  //    ld t1, 16(sp)                                   ;restore register t1
   //    ld ra, 24(sp)                                   ;restore return address
   //    addi sp, sp, 32                                 ;delete stack frame
   //
@@ -142,104 +129,87 @@ static inline bool patchSled(const bool Enable, const uint32_t FuncId,
   // latter is ready.
   //
   // When |Enable|==false, we set back the first instruction in the sled to be
-  //   J 60 bytes (rv32)
-  //   J 76 bytes (rv64)
+  //   J 52 bytes (rv32)
+  //   J 68 bytes (rv64)
 
   uint32_t *Address = reinterpret_cast<uint32_t *>(Sled.address());
   if (Enable) {
-    // If the ISA is RISCV 64, the Tracing Hook needs to be typecast to a 64 bit
-    // value
 #if SANITIZER_RISCV64
+    // If the ISA is RV64, the Tracing Hook needs to be typecast to a 64 bit
+    // value.
     uint32_t LoTracingHookAddr = lo12(reinterpret_cast<uint64_t>(TracingHook));
     uint32_t HiTracingHookAddr = hi20(reinterpret_cast<uint64_t>(TracingHook));
     uint32_t HigherTracingHookAddr =
         lo12((reinterpret_cast<uint64_t>(TracingHook) + 0x80000000) >> 32);
     uint32_t HighestTracingHookAddr =
         hi20((reinterpret_cast<uint64_t>(TracingHook) + 0x80000000) >> 32);
-    // We typecast the Tracing Hook to a 32 bit value for RISCV32
 #elif defined(__riscv) && (__riscv_xlen == 32)
+    // We typecast the Tracing Hook to a 32 bit value for RV32
     uint32_t LoTracingHookAddr = lo12(reinterpret_cast<uint32_t>(TracingHook));
     uint32_t HiTracingHookAddr = hi20((reinterpret_cast<uint32_t>(TracingHook));
 #endif
     uint32_t LoFunctionID = lo12(FuncId);
     uint32_t HiFunctionID = hi20(FuncId);
+
     // The sled that is patched in for RISCV64 defined below. We need the entire
     // sleds corresponding to both ISAs to be protected by defines because the
     // first few instructions are all different, because we store doubles in
     // case of RV64 and store words for RV32. Subsequently, we have LUI - and in
     // case of RV64, we need extra instructions from this point on, so we see
     // differences in addresses to which instructions are stored.
+    size_t Idx = 1U;
+    const uint32_t XLenBytes = __riscv_xlen / 8;
 #if SANITIZER_RISCV64
-    Address[1] = encodeSTypeInstruction(PatchOpcodes::PO_SD, RegNum::RN_SP,
-                                        RegNum::RN_RA, 0x18);
-    Address[2] = encodeSTypeInstruction(PatchOpcodes::PO_SD, RegNum::RN_SP,
-                                        RegNum::RN_T2, 0x10);
-    Address[3] = encodeSTypeInstruction(PatchOpcodes::PO_SD, RegNum::RN_SP,
-                                        RegNum::RN_T1, 0x8);
-    Address[4] = encodeSTypeInstruction(PatchOpcodes::PO_SD, RegNum::RN_SP,
-                                        RegNum::RN_A0, 0x0);
-    Address[5] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_T2,
-                                        HighestTracingHookAddr);
-    Address[6] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_T2,
-                                        RegNum::RN_T2, HigherTracingHookAddr);
-    Address[7] = encodeITypeInstruction(PatchOpcodes::PO_SLLI, RegNum::RN_T2,
-                                        RegNum::RN_T2, 0x20);
-    Address[8] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_T1,
-                                        HiTracingHookAddr);
-    Address[9] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_T1,
-                                        RegNum::RN_T1, LoTracingHookAddr);
-    Address[10] = encodeRTypeInstruction(PatchOpcodes::PO_ADD, RegNum::RN_T1,
-                                         RegNum::RN_T2, RegNum::RN_T1);
-    Address[11] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_A0,
-                                         HiFunctionID);
-    Address[12] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_A0,
-                                         RegNum::RN_A0, LoFunctionID);
-    Address[13] = encodeITypeInstruction(PatchOpcodes::PO_JALR, RegNum::RN_T1,
-                                         RegNum::RN_RA, 0x0);
-    Address[14] = encodeITypeInstruction(PatchOpcodes::PO_LD, RegNum::RN_SP,
-                                         RegNum::RN_A0, 0x0);
-    Address[15] = encodeITypeInstruction(PatchOpcodes::PO_LD, RegNum::RN_SP,
-                                         RegNum::RN_T1, 0x8);
-    Address[16] = encodeITypeInstruction(PatchOpcodes::PO_LD, RegNum::RN_SP,
-                                         RegNum::RN_T2, 0x10);
-    Address[17] = encodeITypeInstruction(PatchOpcodes::PO_LD, RegNum::RN_SP,
-                                         RegNum::RN_RA, 0x18);
-    Address[18] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_SP,
-                                         RegNum::RN_SP, 0x20);
-    uint32_t CreateStackSpace = encodeITypeInstruction(
-        PatchOpcodes::PO_ADDI, RegNum::RN_SP, RegNum::RN_SP, 0xffe0);
+    const unsigned LoadOp = PatchOpcodes::PO_LD;
+    const unsigned StoreOp = PatchOpcodes::PO_SD;
 #elif defined(__riscv) && (__riscv_xlen == 32)
-    Address[1] = encodeSTypeInstruction(PatchOpcodes::PO_SW, RegNum::RN_SP,
-                                        RegNum::RN_RA, 0x0c);
-    Address[2] = encodeSTypeInstruction(PatchOpcodes::PO_SW, RegNum::RN_SP,
-                                        RegNum::RN_T2, 0x08);
-    Address[3] = encodeSTypeInstruction(PatchOpcodes::PO_SW, RegNum::RN_SP,
-                                        RegNum::RN_T1, 0x4);
-    Address[4] = encodeSTypeInstruction(PatchOpcodes::PO_SW, RegNum::RN_SP,
-                                        RegNum::RN_A0, 0x0);
-    Address[5] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_T1,
-                                        HiTracingHookAddr);
-    Address[6] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_T1,
-                                        RegNum::RN_T1, LoTracingHookAddr);
-    Address[7] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_A0,
-                                        HiFunctionID);
-    Address[8] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_A0,
-                                        RegNum::RN_A0, LoFunctionID);
-    Address[9] = encodeITypeInstruction(PatchOpcodes::PO_JALR, RegNum::RN_T1,
-                                        RegNum::RN_RA, 0x0);
-    Address[10] = encodeITypeInstruction(PatchOpcodes::PO_LW, RegNum::RN_SP,
-                                         RegNum::RN_A0, 0x0);
-    Address[11] = encodeITypeInstruction(PatchOpcodes::PO_LW, RegNum::RN_SP,
-                                         RegNum::RN_T1, 0x4);
-    Address[12] = encodeITypeInstruction(PatchOpcodes::PO_LW, RegNum::RN_SP,
-                                         RegNum::RN_T2, 0x08);
-    Address[13] = encodeITypeInstruction(PatchOpcodes::PO_LW, RegNum::RN_SP,
-                                         RegNum::RN_RA, 0x0c);
-    Address[14] = encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_SP,
-                                         RegNum::RN_SP, 0x10);
-    uint32_t CreateStackSpace = encodeITypeInstruction(
-        PatchOpcodes::PO_ADDI, RegNum::RN_SP, RegNum::RN_SP, 0xfff0);
+    const unsigned LoadOp = PatchOpcodes::PO_LW;
+    const unsigned StoreOp = PatchOpcodes::PO_SW;
+#endif
+
+    Address[Idx++] = encodeSTypeInstruction(StoreOp, RegNum::RN_SP,
+                                            RegNum::RN_RA, 3 * XLenBytes);
+    Address[Idx++] = encodeSTypeInstruction(StoreOp, RegNum::RN_SP,
+                                            RegNum::RN_T1, 2 * XLenBytes);
+    Address[Idx++] = encodeSTypeInstruction(StoreOp, RegNum::RN_SP,
+                                            RegNum::RN_A0, XLenBytes);
+
+#if SANITIZER_RISCV64
+    Address[Idx++] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_T1,
+                                            HighestTracingHookAddr);
+    Address[Idx++] =
+        encodeITypeInstruction(PatchOpcodes::PO_ADDI, RegNum::RN_T1,
+                               RegNum::RN_T1, HigherTracingHookAddr);
+    Address[Idx++] = encodeITypeInstruction(PatchOpcodes::PO_SLLI,
+                                            RegNum::RN_T1, RegNum::RN_T1, 32);
+#endif
+    Address[Idx++] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_RA,
+                                            HiTracingHookAddr);
+    Address[Idx++] = encodeITypeInstruction(
+        PatchOpcodes::PO_ADDI, RegNum::RN_RA, RegNum::RN_RA, LoTracingHookAddr);
+#if SANITIZER_RISCV64
+    Address[Idx++] = encodeRTypeInstruction(PatchOpcodes::PO_ADD, RegNum::RN_RA,
+                                            RegNum::RN_T1, RegNum::RN_RA);
 #endif
+    Address[Idx++] = encodeUTypeInstruction(PatchOpcodes::PO_LUI, RegNum::RN_A0,
+                                            HiFunctionID);
+    Address[Idx++] = encodeITypeInstruction(
+        PatchOpcodes::PO_ADDI, RegNum::RN_A0, RegNum::RN_A0, LoFunctionID);
+    Address[Idx++] = encodeITypeInstruction(PatchOpcodes::PO_JALR,
+                                            RegNum::RN_RA, RegNum::RN_RA, 0);
+
+    Address[Idx++] =
+        encodeITypeInstruction(LoadOp, RegNum::RN_SP, RegNum::RN_A0, XLenBytes);
+    Address[Idx++] = encodeITypeInstruction(LoadOp, RegNum::RN_SP,
+                                            RegNum::RN_T1, 2 * XLenBytes);
+    Address[Idx++] = encodeITypeInstruction(LoadOp, RegNum::RN_SP,
+                                            RegNum::RN_RA, 3 * XLenBytes);
+    Address[Idx++] = encodeITypeInstruction(
+        PatchOpcodes::PO_ADDI, RegNum::RN_SP, RegNum::RN_SP, 4 * XLenBytes);
+
+    uint32_t CreateStackSpace = encodeITypeInstruction(
+        PatchOpcodes::PO_ADDI, RegNum::RN_SP, RegNum::RN_SP, -4 * XLenBytes);
+
     std::atomic_store_explicit(
         reinterpret_cast<std::atomic<uint32_t> *>(Address), CreateStackSpace,
         std::memory_order_release);
@@ -248,11 +218,11 @@ static inline bool patchSled(const bool Enable, const uint32_t FuncId,
     // Jump distance is different in both ISAs due to difference in size of
     // sleds
 #if SANITIZER_RISCV64
-        PatchOpcodes::PO_J, RegNum::RN_R0,
-        0x026); // jump encodes an offset in multiples of 2 bytes. 38*2 = 76
+        PatchOpcodes::PO_J, RegNum::RN_X0,
+        68); // jump encodes an offset of 68
 #elif defined(__riscv) && (__riscv_xlen == 32)
-        PatchOpcodes::PO_J, RegNum::RN_R0,
-        0x01e); // jump encodes an offset in multiples of 2 bytes. 30*2 = 60
+        PatchOpcodes::PO_J, RegNum::RN_X0,
+        52); // jump encodes an offset of 52
 #endif
     std::atomic_store_explicit(
         reinterpret_cast<std::atomic<uint32_t> *>(Address), CreateBranch,
@@ -265,7 +235,7 @@ bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
                         const XRaySledEntry &Sled,
                         const XRayTrampolines &Trampolines,
                         bool LogArgs) XRAY_NEVER_INSTRUMENT {
-  // We don't support Logging argument at this moment, so we always
+  // We don't support logging arguments at this moment, so we always
   // use EntryTrampoline.
   return patchSled(Enable, FuncId, Sled, Trampolines.EntryTrampoline);
 }
diff --git a/compiler-rt/lib/xray/xray_trampoline_riscv32.S b/compiler-rt/lib/xray/xray_trampoline_riscv32.S
index 9916e0321d24fd..f23457bb8c2822 100644
--- a/compiler-rt/lib/xray/xray_trampoline_riscv32.S
+++ b/compiler-rt/lib/xray/xray_trampoline_riscv32.S
@@ -14,58 +14,58 @@
 
 .macro SAVE_ARG_REGISTERS
 	// Push argument registers to stack
-	addi	sp, sp, -100
-	.cfi_def_cfa_offset 100
-	sw	ra, 96(sp)
+	addi	sp, sp, -112
+	.cfi_def_cfa_offset 112
+	sw	ra, 108(sp)
 	.cfi_offset ra, -4
-	sw	a7, 92(sp)
-	sw	a6, 88(sp)
-	sw	a5, 84(sp)
-	sw	a4, 80(sp)
-	sw	a3, 76(sp)
-	sw	a2, 72(sp)
-	sw	a1, 68(sp)
-	sw	a0, 64(sp)
-	fsd	fa7, 56(sp)
-	fsd	fa6, 48(sp)
-	fsd	fa5, 40(sp)
-	fsd	fa4, 32(sp)
-	fsd	fa3, 24(sp)
-	fsd	fa2, 16(sp)
-	fsd	fa1, 8(sp)
-	fsd	fa0, 0(sp)
+	sw	a7, 104(sp)
+	sw	a6, 100(sp)
+	sw	a5, 96(sp)
+	sw	a4, 92(sp)
+	sw	a3, 88(sp)
+	sw	a2, 84(sp)
+	sw	a1, 80(sp)
+	sw	a0, 76(sp)
+	fsd	fa7, 64(sp)
+	fsd	fa6, 56(sp)
+	fsd	fa5, 48(sp)
+	fsd	fa4, 40(sp)
+	fsd	fa3, 32(sp)
+	fsd	fa2, 24(sp)
+	fsd	fa1, 16(sp)
+	fsd	fa0, 8(sp)
 .endm
 
 .macro RESTORE_ARG_REGISTERS
 	// Restore argument registers
-	fld	fa0, 0(sp)
-	fld	fa1, 8(sp)
-	fld	fa2, 16(sp)
-	fld	fa3, 24(sp)
-	fld	fa4, 32(sp)
-	fld	fa5, 40(sp)
-	fld	fa6, 48(sp)
-	fld	fa7, 56(sp)
-	lw	a0, 64(sp)
-	lw	a1, 68(sp)
-	lw	a2, 72(sp)
-	lw	a3, 76(sp)
-	lw	a4, 80(sp)
-	lw	a5, 84(sp)
-	lw	a6, 88(sp)
-	lw	a7, 92(sp)
-	lw	ra, 96(sp)
-	addi	sp, sp, 100
+	fld	fa0, 8(sp)
+	fld	fa1, 16(sp)
+	fld	fa2, 24(sp)
+	fld	fa3, 32(sp)
+	fld	fa4, 40(sp)
+	fld	fa5, 48(sp)
+	fld	fa6, 56(sp)
+	fld	fa7, 64(sp)
+	lw	a0, 76(sp)
+	lw	a1, 80(sp)
+	lw	a2, 84(sp)
+	lw	a3, 88(sp)
+	lw	a4, 92(sp)
+	lw	a5, 96(sp)
+	lw	a6, 100(sp)
+	lw	a7, 104(sp)
+	lw	ra, 108(sp)
+	addi	sp, sp, 112
 .endm
 
 .macro SAVE_RET_REGISTERS
 	// Push return registers to stack
-	addi	sp, sp, -28
-	.cfi_def_cfa_offset 28
-	sw	ra, 24(sp)
+	addi	sp, sp, -32
+	.cfi_def_cfa_offset 32
+	sw	ra, 28(sp)
 	.cfi_offset ra, -4
-	sw	a1, 20(sp)
-	sw	a0, 16(sp)
+	sw	a1, 24(sp)
+	sw	a0, 20(sp)
 	fsd	fa1, 8(sp)
 	fsd	fa0, 0(sp)
 .endm
@@ -74,10 +74,14 @@
 	// Restore return registers
 	fld	fa0, 0(sp)
 	fld	fa1, 8(sp)
-	lw	a0, 16(sp)
-	lw	a1, 20(sp)
-	lw	ra, 24(sp)
-	addi	sp, sp, 28
+	lw	a0, 20(sp)
+	lw	a1, 24(sp)
+	lw	ra, 28(sp)
+	addi	sp, sp, 32
+.endm
+
+.macro LOAD_XLEN, rd, src
+        lw      \rd, \src
 .endm
 
 #include "xray_trampoline_riscv_common.S"
diff --git a/compiler-rt/lib/xray/xray_trampoline_riscv64.S b/compiler-rt/lib/xray/xray_trampoline_riscv64.S
index 102b9881567d9a..90c47a953eb4c0 100644
--- a/compiler-rt/lib/xray/xray_trampoline_riscv64.S
+++ b/compiler-rt/lib/xray/xray_trampoline_riscv64.S
@@ -14,70 +14,74 @@
 
 .macro SAVE_ARG_REGISTERS
 	// Push return registers to stack
-	addi	sp, sp, -136
-	.cfi_def_cfa_offset 136
-        sd	ra, 128(sp)
+	addi	sp, sp, -144
+	.cfi_def_cfa_offset 144
+        sd	ra, 136(sp)
         .cfi_offset ra, -8
-        sd	a7, 120(sp)
-        sd	a6, 112(sp)
-        sd	a5, 104(sp)
-        sd	a4, 96(sp)
-        sd	a3, 88(sp)
-        sd	a2, 80(sp)
-        sd	a1, 72(sp)
-        sd	a0, 64(sp)
-        fsd	fa7, 56(sp)
-        fsd	fa6, 48(sp)
-        fsd	fa5, 40(sp)
-        fsd	fa4, 32(sp)
-        fsd	fa3, 24(sp)
-        fsd	fa2, 16(sp)
-        fsd	fa1, 8(sp)
-        fsd	fa0, 0(sp)
+        sd	a7, 128(sp)
+        sd	a6, 120(sp)
+        sd	a5, 112(sp)
+        sd	a4, 104(sp)
+        sd	a3, 96(sp)
+        sd	a2, 88(sp)
+        sd	a1, 80(sp)
+        sd	a0, 72(sp)
+        fsd	fa7, 64(sp)
+        fsd	fa6, 56(sp)
+        fsd	fa5, 48(sp)
+        fsd	fa4, 40(sp)
+        fsd	fa3, 32(sp)
+        fsd	fa2, 24(sp)
+        fsd	fa1, 16(sp)
+        fsd	fa0, 8(sp)
 .endm
 
 .macro SAVE_RET_REGISTERS
 	// Push return registers to stack
-        addi    sp, sp, -40
-        .cfi_def_cfa_offset 40
-        sd      ra, 32(sp)
+        addi    sp, sp, -48
+        .cfi_def_cfa_offset 48
+        sd      ra, 40(sp)
         .cfi_offset ra, -8
-        sd      a1, 24(sp)
-        sd      a0, 16(sp)
-        fsd     fa1, 8(sp)
-        fsd     fa0, 0(sp)
+        sd      a1, 32(sp)
+        sd      a0, 24(sp)
+        fsd     fa1, 16(sp)
+        fsd     fa0, 8(sp)
 .endm
 
 .macro RESTORE_RET_REGISTERS
 	// Restore return registers
-        fld     fa0, 0(sp)
-        fld     fa1, 8(sp)
-        ld      a0, 16(sp)
-        ld      a1, 24(sp)
-        ld      ra, 32(sp)
-        addi    sp, sp, 40
+        fld     fa0, 8(sp)
+        fld     fa1, 16(sp)
+        ld      a0, 24(sp)
+        ld      a1, 32(sp)
+        ld      ra, 40(sp)
+        addi    sp, sp, 48
 .endm
 
 .macro RESTORE_ARG_REGISTERS
         // Restore argument registers
-	fld	fa0, 0(sp)
-	fld	fa1, 8(sp)
-	fld	fa2, 16(sp)
-	fld	fa3, 24(sp)
-	fld	fa4, 32(sp)
-	fld	fa5, 40(sp)
-	fld	fa6, 48(sp)
-	fld	fa7, 56(sp)
-	ld	a0, 64(sp)
-	ld	a1, 72(sp)
-	ld	a2, 80(sp)
-	ld	a3, 88(sp)
-	ld	a4, 96(sp)
-	ld	a5, 104(sp)
-	ld	a6, 112(sp)
-	ld	a7, 120(sp)
-	ld	ra, 128(sp)
-	addi	sp, sp, 136
+	fld	fa0, 8(sp)
+	fld	fa1, 16(sp)
+	fld	fa2, 24(sp)
+	fld	fa3, 32(sp)
+	fld	fa4, 40(sp)
+	fld	fa5, 48(sp)
+	fld	fa6, 56(sp)
+	fld	fa7, 64(sp)
+	ld	a0, 72(sp)
+	ld	a1, 80(sp)
+	ld	a2, 88(sp)
+	ld	a3, 96(sp)
+	ld	a4, 104(sp)
+	ld	a5, 112(sp)
+	ld	a6, 120(sp)
+	ld	a7, 128(sp)
+	ld	ra, 136(sp)
+	addi	sp, sp, 144
+.endm
+
+.macro LOAD_XLEN, rd, src
+        ld      \rd, \src
 .endm
 
 #include "xray_trampoline_riscv_common.S"
diff --git a/compiler-rt/lib/xray/xray_trampoline_riscv_common.S b/compiler-rt/lib/xray/xray_trampoline_riscv_common.S
index 7ce6fa0dcda31b..95f5a9b1189a88 100644
--- a/compiler-rt/lib/xray/xray_trampoline_riscv_common.S
+++ b/compiler-rt/lib/xray/xray_trampoline_riscv_common.S
@@ -25,7 +25,7 @@ ASM_SYMBOL(__xray_FunctionEntry):
 
 	// Load the handler function pointer into a2
 	la	a2, ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)
-	ld	a2, 0(a2)
+        LOAD_XLEN   a2, 0(a2)
 
 	// Handler address will be null if it is not set
 	beq	a2, x0, 1f
@@ -52,7 +52,7 @@ ASM_SYMBOL(__xray_FunctionExit):
 
 	// Load the handler function pointer into a2
 	la	a2, ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)
-	ld	a2, 0(a2)
+        LOAD_XLEN   a2, 0(a2)
 
 	// Handler address will be null if it is not set
 	beq	a2, x0, 1f
@@ -79,7 +79,7 @@ ASM_SYMBOL(__xray_FunctionTailExit):
 
 	// Load the handler function pointer into a2
 	la	a2, ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)
-	ld	a2, 0(a2)
+        LOAD_XLEN   a2, 0(a2)
 
 	// Handler address will be null if it is not set
 	beq	a2, x0, 1f
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 5382751b0d8502..0d1047aba5546d 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -501,19 +501,19 @@ void RISCVAsmPrinter::emitSled(const MachineInstr *MI, SledKind Kind) {
   // .Lxray_sled_N
   //   ALIGN
   //   J .tmpN
-  //   29 or 37 C.NOP instructions
+  //   25 or 33 C.NOP instructions
   // .tmpN
 
   // The following variable holds the count of the number of NOPs to be patched
   // in for XRay instrumentation during compilation.
-  // Note that RV64 and RV32 each has a sled of 76 and 60 bytes, respectively.
+  // Note that RV64 and RV32 each has a sled of 68 and 52 bytes, respectively.
   // Assuming we're using JAL to jump to .tmpN, then we only need
-  // (76 - 4)/2 = 36 NOPs for RV64 and (60 - 4)/2 = 28 for RV32. However, there
+  // (68 - 4)/2 = 32 NOPs for RV64 and (52 - 4)/2 = 24 for RV32. However, there
   // is a chance that we'll use C.JAL instead, so an additional NOP is needed.
   const uint8_t NoopsInSledCount =
       MI->getParent()->getParent()->getSubtarget<RISCVSubtarget>().is64Bit()
-          ? 37
-          : 29;
+          ? 33
+          : 25;
 
   OutStreamer->emitCodeAlignment(Align(4), &getSubtargetInfo());
   auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 933e776da47404..a1d68bce4a8213 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1575,6 +1575,26 @@ unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     // No patch bytes means at most a PseudoCall is emitted
     return std::max(NumBytes, 8U);
   }
+  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
+  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
+  case TargetOpcode::PATCHABLE_TAIL_CALL: {
+    const MachineFunction &MF = *MI.getParent()->getParent();
+    const Function &F = MF.getFunction();
+    if (Opcode == TargetOpcode::PATCHABLE_FUNCTION_ENTER &&
+        F.hasFnAttribute("patchable-function-entry")) {
+      unsigned Num;
+      if (F.getFnAttribute("patchable-function-entry")
+              .getValueAsString()
+              .getAsInteger(10, Num))
+        return get(Opcode).getSize();
+
+      // Number of C.NOP or NOP
+      return (STI.hasStdExtCOrZca() ? 2 : 4) * Num;
+    }
+    // XRay uses C.JAL + 25 or 33 C.NOP for each sled in RV32 and RV64,
+    // respectively.
+    return STI.is64Bit() ? 68 : 52;
+  }
   default:
     return get(Opcode).getSize();
   }
diff --git a/llvm/test/CodeGen/RISCV/xray-attribute-instrumentation.ll b/llvm/test/CodeGen/RISCV/xray-attribute-instrumentation.ll
index a8fc3bff0b0f5b..185dab38b33f87 100644
--- a/llvm/test/CodeGen/RISCV/xray-attribute-instrumentation.ll
+++ b/llvm/test/CodeGen/RISCV/xray-attribute-instrumentation.ll
@@ -5,14 +5,14 @@ define i32 @foo() nounwind "function-instrument"="xray-always" {
 ; CHECK:                        .p2align 2
 ; CHECK-LABEL:                  .Lxray_sled_0:
 ; CHECK-NEXT:                   j .Ltmp0
-; CHECK-COUNT-29:               nop
+; CHECK-COUNT-25:               nop
 ; CHECK-RISCV64-COUNT-8:        nop
 ; CHECK-LABEL:                  .Ltmp0:
   ret i32 0
 ; CHECK:                        .p2align 2
 ; CHECK-LABEL:                  .Lxray_sled_1:
 ; CHECK-NEXT:                   j .Ltmp1
-; CHECK-COUNT-29:               nop
+; CHECK-COUNT-25:               nop
 ; CHECK-RISCV64-COUNT-8:        nop
 ; CHECK-LABEL:                  .Ltmp1:
 ; CHECK-NEXT:                   ret