[clang] 8a39505 - [RISCV] Support scalar/fix-length vector NTLH intrinsic with different domain

Piyou Chen via cfe-commits cfe-commits at lists.llvm.org
Mon Apr 24 20:20:08 PDT 2023


Author: Piyou Chen
Date: 2023-04-24T20:15:14-07:00
New Revision: 8a3950510f819308f7ead16c339484147c69c84a

URL: https://github.com/llvm/llvm-project/commit/8a3950510f819308f7ead16c339484147c69c84a
DIFF: https://github.com/llvm/llvm-project/commit/8a3950510f819308f7ead16c339484147c69c84a.diff

LOG: [RISCV] Support scalar/fixed-length vector NTLH intrinsic with different domain

This commit implements the two NTLH intrinsic functions.

```
type __riscv_ntl_load (type *ptr, int domain);
void __riscv_ntl_store (type *ptr, type val, int domain);

```

```
enum {
  __RISCV_NTLH_INNERMOST_PRIVATE = 2,
  __RISCV_NTLH_ALL_PRIVATE,
  __RISCV_NTLH_INNERMOST_SHARED,
  __RISCV_NTLH_ALL
};
```

We encode the non-temporal domain into MachineMemOperand flags.

1. Create the RISC-V built-in function with custom semantic checking.
2. Assume the domain argument is a compile-time constant,
and emit it as LLVM IR metadata (the nontemporal node).
3. Encode the domain value as a two-bit MachineMemOperand TargetMMOflag.
4. According to the MachineMemOperand TargetMMOflag, select the corresponding ntlh instruction.

Currently, it supports scalar type and fixed-length vector type.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D143364

Added: 
    clang/lib/Headers/riscv_ntlh.h
    clang/test/CodeGen/RISCV/ntlh-intrinsics/riscv32-zihintntl.c

Modified: 
    clang/include/clang/Basic/BuiltinsRISCV.def
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/Headers/CMakeLists.txt
    clang/lib/Sema/SemaChecking.cpp
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
    llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
    llvm/lib/Target/RISCV/RISCVInstrInfo.h
    llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
    llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
    llvm/test/CodeGen/RISCV/nontemporal.ll

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/BuiltinsRISCV.def b/clang/include/clang/Basic/BuiltinsRISCV.def
index 3ca7654a32adc..370ef0af8f9a5 100644
--- a/clang/include/clang/Basic/BuiltinsRISCV.def
+++ b/clang/include/clang/Basic/BuiltinsRISCV.def
@@ -79,5 +79,9 @@ TARGET_BUILTIN(__builtin_riscv_sm4ks, "LiLiLiIUc", "nc", "zksed")
 TARGET_BUILTIN(__builtin_riscv_sm3p0, "LiLi", "nc", "zksh")
 TARGET_BUILTIN(__builtin_riscv_sm3p1, "LiLi", "nc", "zksh")
 
+// Zihintntl extension
+TARGET_BUILTIN(__builtin_riscv_ntl_load, "v.", "t", "experimental-zihintntl")
+TARGET_BUILTIN(__builtin_riscv_ntl_store, "v.", "t", "experimental-zihintntl")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 803ad398c449e..ca11127440fa9 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19760,6 +19760,11 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
     assert(Error == ASTContext::GE_None && "Unexpected error");
   }
 
+  if (BuiltinID == RISCV::BI__builtin_riscv_ntl_load)
+    ICEArguments |= (1 << 1);
+  if (BuiltinID == RISCV::BI__builtin_riscv_ntl_store)
+    ICEArguments |= (1 << 2);
+
   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
     // If this is a normal argument, just emit it as a scalar.
     if ((ICEArguments & (1 << i)) == 0) {
@@ -19962,6 +19967,56 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
     IntrinsicTypes = {ResultType};
     break;
 
+  // Zihintntl
+  case RISCV::BI__builtin_riscv_ntl_load: {
+    llvm::Type *ResTy = ConvertType(E->getType());
+    ConstantInt *Mode = cast<ConstantInt>(Ops[1]);
+
+    llvm::MDNode *RISCVDomainNode = llvm::MDNode::get(
+        getLLVMContext(),
+        llvm::ConstantAsMetadata::get(Builder.getInt32(Mode->getZExtValue())));
+    llvm::MDNode *NontemporalNode = llvm::MDNode::get(
+        getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
+
+    int Width;
+    if(ResTy->isScalableTy()) {
+      const ScalableVectorType *SVTy = cast<ScalableVectorType>(ResTy);
+      llvm::Type *ScalarTy = ResTy->getScalarType();
+      Width = ScalarTy->getPrimitiveSizeInBits() *
+              SVTy->getElementCount().getKnownMinValue();
+    } else
+      Width = ResTy->getPrimitiveSizeInBits();
+    LoadInst *Load = Builder.CreateLoad(
+        Address(Ops[0], ResTy, CharUnits::fromQuantity(Width / 8)));
+
+    Load->setMetadata(CGM.getModule().getMDKindID("nontemporal"),
+                      NontemporalNode);
+    Load->setMetadata(CGM.getModule().getMDKindID("riscv-nontemporal-domain"),
+                      RISCVDomainNode);
+
+    return Load;
+  }
+  case RISCV::BI__builtin_riscv_ntl_store: {
+    ConstantInt *Mode = cast<ConstantInt>(Ops[2]);
+
+    llvm::MDNode *RISCVDomainNode = llvm::MDNode::get(
+        getLLVMContext(),
+        llvm::ConstantAsMetadata::get(Builder.getInt32(Mode->getZExtValue())));
+    llvm::MDNode *NontemporalNode = llvm::MDNode::get(
+        getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
+
+    Value *BC = Builder.CreateBitCast(
+        Ops[0], llvm::PointerType::getUnqual(Ops[1]->getType()), "cast");
+
+    StoreInst *Store = Builder.CreateDefaultAlignedStore(Ops[1], BC);
+    Store->setMetadata(CGM.getModule().getMDKindID("nontemporal"),
+                       NontemporalNode);
+    Store->setMetadata(CGM.getModule().getMDKindID("riscv-nontemporal-domain"),
+                       RISCVDomainNode);
+
+    return Store;
+  }
+
   // Vector builtins are handled from here.
 #include "clang/Basic/riscv_vector_builtin_cg.inc"
   }

diff  --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 52f0e23a265cb..7c2f323652bb5 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -98,6 +98,10 @@ set(ppc_htm_files
   htmxlintrin.h
   )
 
+set(riscv_files
+  riscv_ntlh.h
+  )
+
 set(systemz_files
   s390intrin.h
   vecintrin.h
@@ -244,6 +248,7 @@ set(files
   ${opencl_files}
   ${ppc_files}
   ${ppc_htm_files}
+  ${riscv_files}
   ${systemz_files}
   ${ve_files}
   ${x86_files}
@@ -425,7 +430,7 @@ add_header_target("loongarch-resource-headers" "${loongarch_files}")
 add_header_target("mips-resource-headers" "${mips_msa_files}")
 add_header_target("ppc-resource-headers" "${ppc_files};${ppc_wrapper_files}")
 add_header_target("ppc-htm-resource-headers" "${ppc_htm_files}")
-add_header_target("riscv-resource-headers" "${riscv_generated_files}")
+add_header_target("riscv-resource-headers" "${riscv_files};${riscv_generated_files}")
 add_header_target("systemz-resource-headers" "${systemz_files}")
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
@@ -548,6 +553,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT riscv-resource-headers)
 
+install(
+  FILES ${riscv_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT riscv-resource-headers)
+  
 install(
   FILES ${systemz_files}
   DESTINATION ${header_install_dir}

diff  --git a/clang/lib/Headers/riscv_ntlh.h b/clang/lib/Headers/riscv_ntlh.h
new file mode 100644
index 0000000000000..9ce1709205835
--- /dev/null
+++ b/clang/lib/Headers/riscv_ntlh.h
@@ -0,0 +1,28 @@
+/*===---- riscv_ntlh.h - RISC-V NTLH intrinsics ----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __RISCV_NTLH_H
+#define __RISCV_NTLH_H
+
+#ifndef __riscv_zihintntl
+#error "NTLH intrinsics require the NTLH extension."
+#endif
+
+enum {
+  __RISCV_NTLH_INNERMOST_PRIVATE = 2,
+  __RISCV_NTLH_ALL_PRIVATE,
+  __RISCV_NTLH_INNERMOST_SHARED,
+  __RISCV_NTLH_ALL
+};
+
+#define __riscv_ntl_load(PTR, DOMAIN) __builtin_riscv_ntl_load((PTR), (DOMAIN))
+#define __riscv_ntl_store(PTR, VAL, DOMAIN)                                    \
+  __builtin_riscv_ntl_store((PTR), (VAL), (DOMAIN))
+
+#endif
\ No newline at end of file

diff  --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index f66eb9fcf13dc..eca106fa0a185 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -4652,6 +4652,65 @@ bool Sema::CheckRISCVBuiltinFunctionCall(const TargetInfo &TI,
   // Check if rnum is in [0, 10]
   case RISCV::BI__builtin_riscv_aes64ks1i_64:
     return SemaBuiltinConstantArgRange(TheCall, 1, 0, 10);
+  case RISCV::BI__builtin_riscv_ntl_load:
+  case RISCV::BI__builtin_riscv_ntl_store:
+    DeclRefExpr *DRE =
+        cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
+    assert((BuiltinID == RISCV::BI__builtin_riscv_ntl_store ||
+            BuiltinID == RISCV::BI__builtin_riscv_ntl_load) &&
+           "Unexpected RISC-V nontemporal load/store builtin!");
+    bool IsStore = BuiltinID == RISCV::BI__builtin_riscv_ntl_store;
+    unsigned NumArgs = IsStore ? 3 : 2;
+
+    if (checkArgCount(*this, TheCall, NumArgs))
+      return true;
+
+    // Domain value should be compile-time constant.
+    // 2 <= domain <= 5
+    if (SemaBuiltinConstantArgRange(TheCall, NumArgs - 1, 2, 5))
+      return true;
+
+    Expr *PointerArg = TheCall->getArg(0);
+    ExprResult PointerArgResult =
+        DefaultFunctionArrayLvalueConversion(PointerArg);
+
+    if (PointerArgResult.isInvalid())
+      return true;
+    PointerArg = PointerArgResult.get();
+
+    const PointerType *PtrType = PointerArg->getType()->getAs<PointerType>();
+    if (!PtrType) {
+      Diag(DRE->getBeginLoc(), diag::err_nontemporal_builtin_must_be_pointer)
+          << PointerArg->getType() << PointerArg->getSourceRange();
+      return true;
+    }
+
+    QualType ValType = PtrType->getPointeeType();
+    ValType = ValType.getUnqualifiedType();
+    if (!ValType->isIntegerType() && !ValType->isAnyPointerType() &&
+        !ValType->isBlockPointerType() && !ValType->isFloatingType() &&
+        !ValType->isVectorType() && !ValType->isRVVType()) {
+      Diag(DRE->getBeginLoc(),
+           diag::err_nontemporal_builtin_must_be_pointer_intfltptr_or_vector)
+          << PointerArg->getType() << PointerArg->getSourceRange();
+      return true;
+    }
+
+    if (!IsStore) {
+      TheCall->setType(ValType);
+      return false;
+    }
+
+    ExprResult ValArg = TheCall->getArg(1);
+    InitializedEntity Entity = InitializedEntity::InitializeParameter(
+        Context, ValType, /*consume*/ false);
+    ValArg = PerformCopyInitialization(Entity, SourceLocation(), ValArg);
+    if (ValArg.isInvalid())
+      return true;
+
+    TheCall->setArg(1, ValArg.get());
+    TheCall->setType(Context.VoidTy);
+    return false;
   }
 
   return false;

diff  --git a/clang/test/CodeGen/RISCV/ntlh-intrinsics/riscv32-zihintntl.c b/clang/test/CodeGen/RISCV/ntlh-intrinsics/riscv32-zihintntl.c
new file mode 100644
index 0000000000000..9126e686c4202
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/ntlh-intrinsics/riscv32-zihintntl.c
@@ -0,0 +1,181 @@
+// RUN: %clang_cc1  -triple riscv32 -target-feature +v -target-feature +experimental-zihintntl -emit-llvm %s -o - \
+// RUN:     | FileCheck %s
+
+#include <riscv_ntlh.h>
+#include <riscv_vector.h>
+
+signed char sc;
+unsigned char uc;
+signed short ss;
+unsigned short us;
+signed int si;
+unsigned int ui;
+signed long long sll;
+unsigned long long ull;
+_Float16 h1, h2;
+float f1, f2;
+double d1, d2;
+typedef int v4si __attribute__((vector_size(16)));
+typedef signed short v8ss __attribute__((vector_size(16)));
+typedef signed char v16sc __attribute__((vector_size(16)));
+v4si v4si1, v4si2;
+v8ss v8ss1, v8ss2;
+v16sc v16sc1, v16sc2;
+vint32m1_t *scvi1, *scvi2;
+vint16m1_t *scvs1, *scvs2;
+vint8m1_t *scvc1, *scvc2;
+
+// clang-format off
+void ntl_all_sizes() {                                       // CHECK-LABEL: ntl_all_sizes
+  uc = __riscv_ntl_load(&sc, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !5
+  sc = __riscv_ntl_load(&uc, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !5
+  us = __riscv_ntl_load(&ss, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+  ss = __riscv_ntl_load(&us, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+  ui = __riscv_ntl_load(&si, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+  si = __riscv_ntl_load(&ui, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+  ull = __riscv_ntl_load(&sll, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+  sll = __riscv_ntl_load(&ull, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+  h1 = __riscv_ntl_load(&h2, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+  f1 = __riscv_ntl_load(&f2, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+  d1 = __riscv_ntl_load(&d2, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+  v4si1 = __riscv_ntl_load(&v4si2, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5
+  v8ss1 = __riscv_ntl_load(&v8ss2, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5
+  v16sc1 = __riscv_ntl_load(&v16sc2, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5
+  *scvi1 = __riscv_ntl_load(scvi2, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+  *scvs1 = __riscv_ntl_load(scvs2, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+  *scvc1 = __riscv_ntl_load(scvc2, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: load <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+
+  uc = __riscv_ntl_load(&sc, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !6
+  sc = __riscv_ntl_load(&uc, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !6
+  us = __riscv_ntl_load(&ss, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6
+  ss = __riscv_ntl_load(&us, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6
+  ui = __riscv_ntl_load(&si, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6
+  si = __riscv_ntl_load(&ui, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6
+  ull = __riscv_ntl_load(&sll, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+  sll = __riscv_ntl_load(&ull, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+  h1 = __riscv_ntl_load(&h2, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6
+  f1 = __riscv_ntl_load(&f2, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6
+  d1 = __riscv_ntl_load(&d2, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+  v4si1 = __riscv_ntl_load(&v4si2, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6
+  v8ss1 = __riscv_ntl_load(&v8ss2, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6
+  v16sc1 = __riscv_ntl_load(&v16sc2, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6
+  *scvi1 = __riscv_ntl_load(scvi2, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+  *scvs1 = __riscv_ntl_load(scvs2, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+  *scvc1 = __riscv_ntl_load(scvc2, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: load <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+
+  uc = __riscv_ntl_load(&sc, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !7
+  sc = __riscv_ntl_load(&uc, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !7
+  us = __riscv_ntl_load(&ss, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7
+  ss = __riscv_ntl_load(&us, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7
+  ui = __riscv_ntl_load(&si, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7
+  si = __riscv_ntl_load(&ui, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7
+  ull = __riscv_ntl_load(&sll, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+  sll = __riscv_ntl_load(&ull, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+  h1 = __riscv_ntl_load(&h2, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7
+  f1 = __riscv_ntl_load(&f2, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7
+  d1 = __riscv_ntl_load(&d2, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+  v4si1 = __riscv_ntl_load(&v4si2, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7
+  v8ss1 = __riscv_ntl_load(&v8ss2, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7
+  v16sc1 = __riscv_ntl_load(&v16sc2, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7
+  *scvi1 = __riscv_ntl_load(scvi2, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+  *scvs1 = __riscv_ntl_load(scvs2, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+  *scvc1 = __riscv_ntl_load(scvc2, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: load <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+
+  uc = __riscv_ntl_load(&sc, __RISCV_NTLH_ALL);   // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !8
+  sc = __riscv_ntl_load(&uc, __RISCV_NTLH_ALL);   // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !8
+  us = __riscv_ntl_load(&ss, __RISCV_NTLH_ALL);   // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8
+  ss = __riscv_ntl_load(&us, __RISCV_NTLH_ALL);   // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8
+  ui = __riscv_ntl_load(&si, __RISCV_NTLH_ALL);   // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8
+  si = __riscv_ntl_load(&ui, __RISCV_NTLH_ALL);   // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8
+  ull = __riscv_ntl_load(&sll, __RISCV_NTLH_ALL); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+  sll = __riscv_ntl_load(&ull, __RISCV_NTLH_ALL); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+  h1 = __riscv_ntl_load(&h2, __RISCV_NTLH_ALL);   // CHECK: load half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8
+  f1 = __riscv_ntl_load(&f2, __RISCV_NTLH_ALL);   // CHECK: load float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8
+  d1 = __riscv_ntl_load(&d2, __RISCV_NTLH_ALL);   // CHECK: load double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+  v4si1 = __riscv_ntl_load(&v4si2, __RISCV_NTLH_ALL);   // CHECK: load <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+  v8ss1 = __riscv_ntl_load(&v8ss2, __RISCV_NTLH_ALL);   // CHECK: load <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+  v16sc1 = __riscv_ntl_load(&v16sc2, __RISCV_NTLH_ALL);   // CHECK: load <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+  *scvi1 = __riscv_ntl_load(scvi2, __RISCV_NTLH_ALL);   // CHECK: load <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+  *scvs1 = __riscv_ntl_load(scvs2, __RISCV_NTLH_ALL);   // CHECK: load <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+  *scvc1 = __riscv_ntl_load(scvc2, __RISCV_NTLH_ALL);   // CHECK: load <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+
+  __riscv_ntl_store(&uc, 1, __RISCV_NTLH_INNERMOST_PRIVATE);    // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&sc, 1, __RISCV_NTLH_INNERMOST_PRIVATE);    // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&us, 1, __RISCV_NTLH_INNERMOST_PRIVATE);    // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&ss, 1, __RISCV_NTLH_INNERMOST_PRIVATE);    // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&ui, 1, __RISCV_NTLH_INNERMOST_PRIVATE);    // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&si, 1, __RISCV_NTLH_INNERMOST_PRIVATE);    // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&ull, 1, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&sll, 1, __RISCV_NTLH_INNERMOST_PRIVATE);   // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&h1, 1.0, __RISCV_NTLH_INNERMOST_PRIVATE);  // CHECK: store half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&f1, 1.0, __RISCV_NTLH_INNERMOST_PRIVATE);  // CHECK: store float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&d1, 1.0, __RISCV_NTLH_INNERMOST_PRIVATE);  // CHECK: store double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&v4si1, v4si2, __RISCV_NTLH_INNERMOST_PRIVATE);  // CHECK: store <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&v8ss1, v8ss2, __RISCV_NTLH_INNERMOST_PRIVATE);  // CHECK: store <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(&v16sc1, v16sc2, __RISCV_NTLH_INNERMOST_PRIVATE);  // CHECK: store <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(scvi2, *scvi1, __RISCV_NTLH_INNERMOST_PRIVATE);  // CHECK: store <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(scvs2, *scvs1, __RISCV_NTLH_INNERMOST_PRIVATE);  // CHECK: store <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+  __riscv_ntl_store(scvc2, *scvc1, __RISCV_NTLH_INNERMOST_PRIVATE);  // CHECK: store <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+
+  __riscv_ntl_store(&uc, 1, __RISCV_NTLH_ALL_PRIVATE);    // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&sc, 1, __RISCV_NTLH_ALL_PRIVATE);    // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&us, 1, __RISCV_NTLH_ALL_PRIVATE);    // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&ss, 1, __RISCV_NTLH_ALL_PRIVATE);    // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&ui, 1, __RISCV_NTLH_ALL_PRIVATE);    // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&si, 1, __RISCV_NTLH_ALL_PRIVATE);    // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&ull, 1, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&sll, 1, __RISCV_NTLH_ALL_PRIVATE);   // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&h1, 1.0, __RISCV_NTLH_ALL_PRIVATE);  // CHECK: store half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&f1, 1.0, __RISCV_NTLH_ALL_PRIVATE);  // CHECK: store float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&d1, 1.0, __RISCV_NTLH_ALL_PRIVATE);  // CHECK: store double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&v4si1, v4si2, __RISCV_NTLH_ALL_PRIVATE);  // CHECK: store <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&v8ss1, v8ss2, __RISCV_NTLH_ALL_PRIVATE);  // CHECK: store <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(&v16sc1, v16sc2, __RISCV_NTLH_ALL_PRIVATE);  // CHECK: store <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(scvi2, *scvi1, __RISCV_NTLH_ALL_PRIVATE);  // CHECK: store <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(scvs2, *scvs1, __RISCV_NTLH_ALL_PRIVATE);  // CHECK: store <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+  __riscv_ntl_store(scvc2, *scvc1, __RISCV_NTLH_ALL_PRIVATE);  // CHECK: store <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+
+  __riscv_ntl_store(&uc, 1, __RISCV_NTLH_INNERMOST_SHARED);    // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&sc, 1, __RISCV_NTLH_INNERMOST_SHARED);    // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&us, 1, __RISCV_NTLH_INNERMOST_SHARED);    // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&ss, 1, __RISCV_NTLH_INNERMOST_SHARED);    // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&ui, 1, __RISCV_NTLH_INNERMOST_SHARED);    // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&si, 1, __RISCV_NTLH_INNERMOST_SHARED);    // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&ull, 1, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&sll, 1, __RISCV_NTLH_INNERMOST_SHARED);   // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&h1, 1.0, __RISCV_NTLH_INNERMOST_SHARED);  // CHECK: store half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&f1, 1.0, __RISCV_NTLH_INNERMOST_SHARED);  // CHECK: store float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&d1, 1.0, __RISCV_NTLH_INNERMOST_SHARED);  // CHECK: store double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&v4si1, v4si2, __RISCV_NTLH_INNERMOST_SHARED);  // CHECK: store <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&v8ss1, v8ss2, __RISCV_NTLH_INNERMOST_SHARED);  // CHECK: store <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(&v16sc1, v16sc2, __RISCV_NTLH_INNERMOST_SHARED);  // CHECK: store <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(scvi2, *scvi1, __RISCV_NTLH_INNERMOST_SHARED);  // CHECK: store <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(scvs2, *scvs1, __RISCV_NTLH_INNERMOST_SHARED);  // CHECK: store <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+  __riscv_ntl_store(scvc2, *scvc1, __RISCV_NTLH_INNERMOST_SHARED);  // CHECK: store <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+
+  __riscv_ntl_store(&uc, 1, __RISCV_NTLH_ALL);    // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&sc, 1, __RISCV_NTLH_ALL);    // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&us, 1, __RISCV_NTLH_ALL);    // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&ss, 1, __RISCV_NTLH_ALL);    // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&ui, 1, __RISCV_NTLH_ALL);    // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&si, 1, __RISCV_NTLH_ALL);    // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&ull, 1, __RISCV_NTLH_ALL);   // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&sll, 1, __RISCV_NTLH_ALL);   // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&h1, 1.0, __RISCV_NTLH_ALL);  // CHECK: store half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&f1, 1.0, __RISCV_NTLH_ALL);  // CHECK: store float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&d1, 1.0, __RISCV_NTLH_ALL);  // CHECK: store double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&v4si1, v4si2, __RISCV_NTLH_ALL);  // CHECK: store <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&v8ss1, v8ss2, __RISCV_NTLH_ALL);  // CHECK: store <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(&v16sc1, v16sc2, __RISCV_NTLH_ALL);  // CHECK: store <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(scvi2, *scvi1, __RISCV_NTLH_ALL);  // CHECK: store <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(scvs2, *scvs1, __RISCV_NTLH_ALL);  // CHECK: store <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+  __riscv_ntl_store(scvc2, *scvc1, __RISCV_NTLH_ALL);  // CHECK: store <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+
+}
+// clang-format on
+
+// CHECK: !4 = !{i32 1}
+// CHECK: !5 = !{i32 2}
+// CHECK: !6 = !{i32 3}
+// CHECK: !7 = !{i32 4}
+// CHECK: !8 = !{i32 5}
\ No newline at end of file

diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 747f40ff38f4c..be90bdf970d19 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -433,6 +433,13 @@ class TargetLoweringBase {
     return MachineMemOperand::MONone;
   }
 
+  /// This callback is used to inspect load/store SDNode.
+  /// The default implementation does nothing.
+  virtual MachineMemOperand::Flags
+  getTargetMMOFlags(const MemSDNode &Node) const {
+    return MachineMemOperand::MONone;
+  }
+
   MachineMemOperand::Flags
   getLoadMemOperandFlags(const LoadInst &LI, const DataLayout &DL,
                          AssumptionCache *AC = nullptr,
@@ -672,6 +679,13 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// Return true if it is valid to merge the TargetMMOFlags in two SDNodes.
+  virtual bool
+  areTwoSDNodeTargetMMOFlagsMergeable(const MemSDNode &NodeX,
+                                      const MemSDNode &NodeY) const {
+    return true;
+  }
+
   /// Use bitwise logic to make pairs of compares more efficient. For example:
   /// and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
   /// This should be true when it takes more than one instruction to lower

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ade34f70c9768..842ecc751d12e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19412,6 +19412,8 @@ void DAGCombiner::getStoreMergeCandidates(
     // Don't mix temporal stores with non-temporal stores.
     if (St->isNonTemporal() != Other->isNonTemporal())
       return false;
+    if (!TLI.areTwoSDNodeTargetMMOFlagsMergeable(*St, *Other))
+      return false;
     SDValue OtherBC = peekThroughBitcasts(Other->getValue());
    // Allow merging constants of different types as integers.
     bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
@@ -19437,6 +19439,9 @@ void DAGCombiner::getStoreMergeCandidates(
       // Don't mix temporal loads with non-temporal loads.
       if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
         return false;
+      if (!TLI.areTwoSDNodeTargetMMOFlagsMergeable(*cast<LoadSDNode>(Val),
+                                                   *OtherLd))
+        return false;
       if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
         return false;
       break;
@@ -20061,10 +20066,14 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
     if (IsNonTemporalLoad)
       LdMMOFlags |= MachineMemOperand::MONonTemporal;
 
+    LdMMOFlags |= TLI.getTargetMMOFlags(*FirstLoad);
+
     MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore
                                               ? MachineMemOperand::MONonTemporal
                                               : MachineMemOperand::MONone;
 
+    StMMOFlags |= TLI.getTargetMMOFlags(*StoreNodes[0].MemNode);
+
     SDValue NewLoad, NewStore;
     if (UseVectorTy || !DoIntegerTruncate) {
       NewLoad = DAG.getLoad(

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 677c4befbff76..1f6c59309a43c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -15736,6 +15736,57 @@ RISCVTargetLowering::getRegisterByName(const char *RegName, LLT VT,
   return Reg;
 }
 
+MachineMemOperand::Flags
+RISCVTargetLowering::getTargetMMOFlags(const Instruction &I) const {
+  const MDNode *NontemporalInfo = I.getMetadata(LLVMContext::MD_nontemporal);
+
+  if (NontemporalInfo == nullptr)
+    return MachineMemOperand::MONone;
+
+  // 1 for default value work as __RISCV_NTLH_ALL
+  // 2 -> __RISCV_NTLH_INNERMOST_PRIVATE
+  // 3 -> __RISCV_NTLH_ALL_PRIVATE
+  // 4 -> __RISCV_NTLH_INNERMOST_SHARED
+  // 5 -> __RISCV_NTLH_ALL
+  int NontemporalLevel = 5;
+  const MDNode *RISCVNontemporalInfo =
+      I.getMetadata("riscv-nontemporal-domain");
+  if (RISCVNontemporalInfo != nullptr)
+    NontemporalLevel =
+        cast<ConstantInt>(
+            cast<ConstantAsMetadata>(RISCVNontemporalInfo->getOperand(0))
+                ->getValue())
+            ->getZExtValue();
+
+  assert((1 <= NontemporalLevel && NontemporalLevel <= 5) &&
+         "RISC-V target doesn't support this non-temporal domain.");
+
+  NontemporalLevel -= 2;
+  MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
+  if (NontemporalLevel & 0b1)
+    Flags |= MONontemporalBit0;
+  if (NontemporalLevel & 0b10)
+    Flags |= MONontemporalBit1;
+
+  return Flags;
+}
+
+MachineMemOperand::Flags
+RISCVTargetLowering::getTargetMMOFlags(const MemSDNode &Node) const {
+
+  MachineMemOperand::Flags NodeFlags = Node.getMemOperand()->getFlags();
+  MachineMemOperand::Flags TargetFlags = MachineMemOperand::MONone;
+  TargetFlags |= (NodeFlags & MONontemporalBit0);
+  TargetFlags |= (NodeFlags & MONontemporalBit1);
+
+  return TargetFlags;
+}
+
+bool RISCVTargetLowering::areTwoSDNodeTargetMMOFlagsMergeable(
+    const MemSDNode &NodeX, const MemSDNode &NodeY) const {
+  return getTargetMMOFlags(NodeX) == getTargetMMOFlags(NodeY);
+}
+
 namespace llvm::RISCVVIntrinsicsTable {
 
 #define GET_RISCVVIntrinsicsTable_IMPL

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index de0e970108b81..3780986407601 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -482,6 +482,16 @@ class RISCVTargetLowering : public TargetLowering {
   // This method returns the name of a target specific DAG node.
   const char *getTargetNodeName(unsigned Opcode) const override;
 
+  MachineMemOperand::Flags
+  getTargetMMOFlags(const Instruction &I) const override;
+
+  MachineMemOperand::Flags
+  getTargetMMOFlags(const MemSDNode &Node) const override;
+
+  bool
+  areTwoSDNodeTargetMMOFlagsMergeable(const MemSDNode &NodeX,
+                                      const MemSDNode &NodeY) const override;
+
   ConstraintType getConstraintType(StringRef Constraint) const override;
 
   unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override;

diff --git a/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp b/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
index 209438a6165bb..30f74b6995d7d 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
@@ -67,11 +67,27 @@ bool RISCVInsertNTLHInsts::runOnMachineFunction(MachineFunction &MF) {
         continue;
       MachineMemOperand *MMO = *(MBBI.memoperands_begin());
       if (MMO->isNonTemporal()) {
+        uint64_t NontemporalMode = 0;
+        if (MMO->getFlags() & MONontemporalBit0)
+          NontemporalMode += 0b1;
+        if (MMO->getFlags() & MONontemporalBit1)
+          NontemporalMode += 0b10;
+
+        static const uint16_t NTLOpc[] = {
+            RISCV::PseudoNTLP1, RISCV::PseudoNTLPALL, RISCV::PseudoNTLS1,
+            RISCV::PseudoNTLALL};
+        static const uint16_t CNTLOpc[] = {
+            RISCV::PseudoCNTLP1, RISCV::PseudoCNTLPALL, RISCV::PseudoCNTLS1,
+            RISCV::PseudoCNTLALL};
+
+        unsigned CurrNTLOpc;
         DebugLoc DL = MBBI.getDebugLoc();
         if (ST.hasStdExtCOrZca() && ST.enableRVCHintInstrs())
-          BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoCNTLALL));
+          CurrNTLOpc = CNTLOpc[NontemporalMode];
         else
-          BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoNTLALL));
+          CurrNTLOpc = NTLOpc[NontemporalMode];
+
+        BuildMI(MBB, MBBI, DL, TII->get(CurrNTLOpc));
         Changed = true;
       }
     }

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 0f96d3c19da4e..4a8c5966d4b0d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2628,6 +2628,14 @@ void RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF,
   }
 }
 
+ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
+RISCVInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
+  static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
+      {{MONontemporalBit0, "riscv-nontemporal-domain-bit-0"},
+       {MONontemporalBit1, "riscv-nontemporal-domain-bit-1"}};
+  return makeArrayRef(TargetFlags);
+}
+
 // Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
 bool RISCV::isSEXT_W(const MachineInstr &MI) {
   return MI.getOpcode() == RISCV::ADDIW && MI.getOperand(1).isReg() &&

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 01f112a386d08..9810f73930ca6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -25,6 +25,11 @@ namespace llvm {
 
 class RISCVSubtarget;
 
+static const MachineMemOperand::Flags MONontemporalBit0 =
+    MachineMemOperand::MOTargetFlag1;
+static const MachineMemOperand::Flags MONontemporalBit1 =
+    MachineMemOperand::MOTargetFlag2;
+
 namespace RISCVCC {
 
 enum CondCode {
@@ -227,6 +232,9 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
 
   std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
 
+  ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
+  getSerializableMachineMemOperandTargetFlags() const override;
+
 protected:
   const RISCVSubtarget &STI;
 };

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
index ecc5ddedee007..b8adaf4d84831 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
@@ -12,11 +12,23 @@
 //===----------------------------------------------------------------------===//
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 4 in {
-  def PseudoNTLALL :  Pseudo<(outs), (ins), [], "ntl.all">, 
+  def PseudoNTLP1   :  Pseudo<(outs), (ins), [], "ntl.p1">, 
+                             PseudoInstExpansion<(ADD X0, X0, X2)>;
+  def PseudoNTLPALL :  Pseudo<(outs), (ins), [], "ntl.pall">, 
+                             PseudoInstExpansion<(ADD X0, X0, X3)>;
+  def PseudoNTLS1   :  Pseudo<(outs), (ins), [], "ntl.s1">, 
+                             PseudoInstExpansion<(ADD X0, X0, X4)>;
+  def PseudoNTLALL  :  Pseudo<(outs), (ins), [], "ntl.all">, 
                              PseudoInstExpansion<(ADD X0, X0, X5)>;
 }
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 2 in {
-  def PseudoCNTLALL :  Pseudo<(outs), (ins), [], "c.ntl.all">,
+  def PseudoCNTLP1   :  Pseudo<(outs), (ins), [], "c.ntl.p1">,
+                              PseudoInstExpansion<(C_ADD_HINT X0, X0, X2)>;
+  def PseudoCNTLPALL :  Pseudo<(outs), (ins), [], "c.ntl.pall">,
+                              PseudoInstExpansion<(C_ADD_HINT X0, X0, X3)>;
+  def PseudoCNTLS1   :  Pseudo<(outs), (ins), [], "c.ntl.s1">,
+                              PseudoInstExpansion<(C_ADD_HINT X0, X0, X4)>;
+  def PseudoCNTLALL  :  Pseudo<(outs), (ins), [], "c.ntl.all">,
                               PseudoInstExpansion<(C_ADD_HINT X0, X0, X5)>;
 }

diff --git a/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll b/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
index 6a4ecf38427ec..66dad442dc087 100644
--- a/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
+++ b/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
@@ -130,4 +130,520 @@ define void @test_nontemporal_store_nxv16i8(ptr %p, <vscale x 16 x i8> %v) {
   ret void
 }
 
+define <vscale x 2 x i64> @test_nontemporal_P1_load_nxv2i64(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_nxv2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vl2re64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_nxv2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vl2re64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 4 x i32> @test_nontemporal_P1_load_nxv4i32(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_nxv4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vl2re32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_nxv4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vl2re32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 8 x i16> @test_nontemporal_P1_load_nxv8i16(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_nxv8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vl2re16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_nxv8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vl2re16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 16 x i8> @test_nontemporal_P1_load_nxv16i8(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_nxv16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vl2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_nxv16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vl2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret <vscale x 16 x i8> %1
+}
+
+define void @test_nontemporal_P1_store_nxv2i64(ptr %p, <vscale x 2 x i64> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_nxv2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_nxv2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+define void @test_nontemporal_P1_store_nxv4i32(ptr %p, <vscale x 4 x i32> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_nxv4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_nxv4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+define void @test_nontemporal_P1_store_nxv8i16(ptr %p, <vscale x 8 x i16> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_nxv8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_nxv8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+define void @test_nontemporal_P1_store_nxv16i8(ptr %p, <vscale x 16 x i8> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_nxv16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_nxv16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+define <vscale x 2 x i64> @test_nontemporal_PALL_load_nxv2i64(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_nxv2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vl2re64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_nxv2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vl2re64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 4 x i32> @test_nontemporal_PALL_load_nxv4i32(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_nxv4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vl2re32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_nxv4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vl2re32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 8 x i16> @test_nontemporal_PALL_load_nxv8i16(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_nxv8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vl2re16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_nxv8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vl2re16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 16 x i8> @test_nontemporal_PALL_load_nxv16i8(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_nxv16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vl2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_nxv16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vl2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret <vscale x 16 x i8> %1
+}
+
+define void @test_nontemporal_PALL_store_nxv2i64(ptr %p, <vscale x 2 x i64> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_nxv2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_nxv2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_nxv4i32(ptr %p, <vscale x 4 x i32> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_nxv4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_nxv4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_nxv8i16(ptr %p, <vscale x 8 x i16> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_nxv8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_nxv8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_nxv16i8(ptr %p, <vscale x 16 x i8> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_nxv16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_nxv16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define <vscale x 2 x i64> @test_nontemporal_S1_load_nxv2i64(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_nxv2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vl2re64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_nxv2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vl2re64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 4 x i32> @test_nontemporal_S1_load_nxv4i32(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_nxv4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vl2re32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_nxv4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vl2re32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 8 x i16> @test_nontemporal_S1_load_nxv8i16(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_nxv8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vl2re16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_nxv8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vl2re16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 16 x i8> @test_nontemporal_S1_load_nxv16i8(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_nxv16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vl2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_nxv16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vl2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret <vscale x 16 x i8> %1
+}
+
+define void @test_nontemporal_S1_store_nxv2i64(ptr %p, <vscale x 2 x i64> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_nxv2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_nxv2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_nxv4i32(ptr %p, <vscale x 4 x i32> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_nxv4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_nxv4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_nxv8i16(ptr %p, <vscale x 8 x i16> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_nxv8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_nxv8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_nxv16i8(ptr %p, <vscale x 16 x i8> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_nxv16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_nxv16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define <vscale x 2 x i64> @test_nontemporal_ALL_load_nxv2i64(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_nxv2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vl2re64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_nxv2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vl2re64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 4 x i32> @test_nontemporal_ALL_load_nxv4i32(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_nxv4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vl2re32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_nxv4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vl2re32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 8 x i16> @test_nontemporal_ALL_load_nxv8i16(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_nxv8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vl2re16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_nxv8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vl2re16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 16 x i8> @test_nontemporal_ALL_load_nxv16i8(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_nxv16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vl2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_nxv16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vl2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret <vscale x 16 x i8> %1
+}
+
+define void @test_nontemporal_ALL_store_nxv2i64(ptr %p, <vscale x 2 x i64> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_nxv2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_nxv2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+define void @test_nontemporal_ALL_store_nxv4i32(ptr %p, <vscale x 4 x i32> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_nxv4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_nxv4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+define void @test_nontemporal_ALL_store_nxv8i16(ptr %p, <vscale x 8 x i16> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_nxv8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_nxv8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+define void @test_nontemporal_ALL_store_nxv16i8(ptr %p, <vscale x 16 x i8> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_nxv16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_nxv16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
 !0 = !{i32 1}
+!1 = !{i32 2}
+!2 = !{i32 3}
+!3 = !{i32 4}
+!4 = !{i32 5}

diff --git a/llvm/test/CodeGen/RISCV/nontemporal.ll b/llvm/test/CodeGen/RISCV/nontemporal.ll
index 190b896f486c6..e1b19a9c33835 100644
--- a/llvm/test/CodeGen/RISCV/nontemporal.ll
+++ b/llvm/test/CodeGen/RISCV/nontemporal.ll
@@ -1438,4 +1438,5665 @@ define void @test_nontemporal_store_v2i64(ptr %p, <2 x i64> %v) {
   ret void
 }
 
+define i64 @test_nontemporal_P1_load_i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a1, 4(a0)
+; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    ld a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a2, 0(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a1, 4(a0)
+; CHECK-RV32C-NEXT:    mv a0, a2
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    ld a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    lw a2, 0(a0)
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    lw a1, 4(a0)
+; CHECK-RV32V-NEXT:    mv a0, a2
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret i64 %1
+}
+
+define i32 @test_nontemporal_P1_load_i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    lw a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    lw a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    lw a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    lw a0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i32, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret i32 %1
+}
+
+define i16 @test_nontemporal_P1_load_i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    lh a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lh a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    lh a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lh a0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    lh a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    lh a0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i16, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret i16 %1
+}
+
+define i8 @test_nontemporal_P1_load_i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i8, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret i8 %1
+}
+
+define half @test_nontemporal_P1_load_half(ptr %p) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    flh fa5, 0(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    flh fa4, 6(a0)
+; CHECK-RV64-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    flh fa5, 0(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    flh fa4, 6(a0)
+; CHECK-RV32-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_half:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    flh fa5, 0(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    flh fa4, 6(a0)
+; CHECK-RV64C-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_half:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    flh fa5, 0(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    flh fa4, 6(a0)
+; CHECK-RV32C-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_half:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    flh fa5, 0(a0)
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    flh fa4, 6(a0)
+; CHECK-RV64V-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_half:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    flh fa5, 0(a0)
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    flh fa4, 6(a0)
+; CHECK-RV32V-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV32V-NEXT:    ret
+  %1 = load half, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  %2 = getelementptr half, ptr %p, i32 3
+  %3 = load half, ptr %2, !nontemporal !0, !riscv-nontemporal-domain !1
+  %4 = fadd half %1, %3
+  ret half %4
+}
+
+define float @test_nontemporal_P1_load_float(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_float:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_float:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_float:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_float:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load float, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret float %1
+}
+
+define double @test_nontemporal_P1_load_double(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_double:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_double:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_double:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_double:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load double, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret double %1
+}
+
+define <16 x i8> @test_nontemporal_P1_load_v16i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_v16i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_v16i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v16i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    ld a2, 8(a1)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    ld a1, 0(a1)
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v16i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_v16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vle8.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_v16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vle8.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret <16 x i8> %1
+}
+
+define <8 x i16> @test_nontemporal_P1_load_v8i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_v8i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_v8i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v8i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    ld a2, 8(a1)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    ld a1, 0(a1)
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v8i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_v8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vle16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_v8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vle16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @test_nontemporal_P1_load_v4i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_v4i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_v4i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v4i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    ld a2, 8(a1)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    ld a1, 0(a1)
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v4i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_v4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vle32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_v4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vle32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @test_nontemporal_P1_load_v2i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    ld a2, 0(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    ld a1, 8(a0)
+; CHECK-RV64-NEXT:    mv a0, a2
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v2i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    ld a2, 0(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    ld a1, 8(a0)
+; CHECK-RV64C-NEXT:    mv a0, a2
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v2i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_v2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vle64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_v2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vle64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret <2 x i64> %1
+}
+
+define void @test_nontemporal_P1_store_i64(ptr %p, i64 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sw a2, 4(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sw a2, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    sd a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    sw a2, 4(a0)
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    sw a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i64 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+define void @test_nontemporal_P1_store_i32(ptr %p, i32 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sw a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    sw a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    sw a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i32 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+define void @test_nontemporal_P1_store_i16(ptr %p, i16 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sh a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sh a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sh a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sh a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    sh a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    sh a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i16 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+define void @test_nontemporal_P1_store_i8(ptr %p, i8 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    sb a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    sb a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i8 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+define void @test_nontemporal_P1_store_half(ptr %p, half %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_half:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_half:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_half:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_half:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store half %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+define void @test_nontemporal_P1_store_float(ptr %p, float %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_float:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_float:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_float:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_float:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store float %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+define void @test_nontemporal_P1_store_double(ptr %p, double %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_double:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_double:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_double:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_double:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store double %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_v16i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    addi sp, sp, -16
+; CHECK-RV64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT:    .cfi_offset s0, -8
+; CHECK-RV64-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64-NEXT:    lbu a2, 0(a1)
+; CHECK-RV64-NEXT:    lbu a3, 8(a1)
+; CHECK-RV64-NEXT:    lbu a4, 16(a1)
+; CHECK-RV64-NEXT:    lbu a5, 24(a1)
+; CHECK-RV64-NEXT:    lbu a6, 32(a1)
+; CHECK-RV64-NEXT:    lbu a7, 40(a1)
+; CHECK-RV64-NEXT:    lbu t0, 48(a1)
+; CHECK-RV64-NEXT:    lbu t1, 56(a1)
+; CHECK-RV64-NEXT:    lbu t2, 64(a1)
+; CHECK-RV64-NEXT:    lbu t3, 72(a1)
+; CHECK-RV64-NEXT:    lbu t4, 80(a1)
+; CHECK-RV64-NEXT:    lbu t5, 88(a1)
+; CHECK-RV64-NEXT:    lbu t6, 120(a1)
+; CHECK-RV64-NEXT:    lbu s0, 112(a1)
+; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a1, 96(a1)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb a1, 12(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT:    addi sp, sp, 16
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_v16i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    addi sp, sp, -16
+; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    .cfi_offset s0, -4
+; CHECK-RV32-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32-NEXT:    lbu a2, 0(a1)
+; CHECK-RV32-NEXT:    lbu a3, 4(a1)
+; CHECK-RV32-NEXT:    lbu a4, 8(a1)
+; CHECK-RV32-NEXT:    lbu a5, 12(a1)
+; CHECK-RV32-NEXT:    lbu a6, 16(a1)
+; CHECK-RV32-NEXT:    lbu a7, 20(a1)
+; CHECK-RV32-NEXT:    lbu t0, 24(a1)
+; CHECK-RV32-NEXT:    lbu t1, 28(a1)
+; CHECK-RV32-NEXT:    lbu t2, 32(a1)
+; CHECK-RV32-NEXT:    lbu t3, 36(a1)
+; CHECK-RV32-NEXT:    lbu t4, 40(a1)
+; CHECK-RV32-NEXT:    lbu t5, 44(a1)
+; CHECK-RV32-NEXT:    lbu t6, 60(a1)
+; CHECK-RV32-NEXT:    lbu s0, 56(a1)
+; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a1, 48(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb a1, 12(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    addi sp, sp, 16
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v16i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    addi sp, sp, -16
+; CHECK-RV64C-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV64C-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
+; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
+; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
+; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
+; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
+; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
+; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb a1, 12(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb a3, 9(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb t2, 4(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb t1, 3(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb t0, 2(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb a7, 1(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sb a6, 0(a0)
+; CHECK-RV64C-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT:    addi sp, sp, 16
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v16i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    addi sp, sp, -16
+; CHECK-RV32C-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV32C-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
+; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
+; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
+; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
+; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
+; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
+; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb a1, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb a3, 9(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb t2, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb t1, 3(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb t0, 2(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb a7, 1(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sb a6, 0(a0)
+; CHECK-RV32C-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT:    addi sp, sp, 16
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_v16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vse8.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_v16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vse8.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+; Nontemporal <8 x i16> store with domain metadata !1 (P1 per the function name;
+; !0/!1 are defined elsewhere in this file — confirm there). Scalar targets
+; scalarize the store and emit an ntl.p1 hint (c.ntl.p1 on the C-enabled runs)
+; immediately before every sh; the V runs emit a single hinted vse16.v.
+define void @test_nontemporal_P1_store_v8i16(ptr %p, <8 x i16> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_v8i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lh a2, 0(a1)
+; CHECK-RV64-NEXT:    lh a3, 8(a1)
+; CHECK-RV64-NEXT:    lh a4, 16(a1)
+; CHECK-RV64-NEXT:    lh a5, 24(a1)
+; CHECK-RV64-NEXT:    lh a6, 56(a1)
+; CHECK-RV64-NEXT:    lh a7, 48(a1)
+; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a1, 32(a1)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sh a1, 8(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_v8i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lh a2, 0(a1)
+; CHECK-RV32-NEXT:    lh a3, 4(a1)
+; CHECK-RV32-NEXT:    lh a4, 8(a1)
+; CHECK-RV32-NEXT:    lh a5, 12(a1)
+; CHECK-RV32-NEXT:    lh a6, 28(a1)
+; CHECK-RV32-NEXT:    lh a7, 24(a1)
+; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a1, 16(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sh a1, 8(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v8i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a6, 0(a1)
+; CHECK-RV64C-NEXT:    lh a7, 8(a1)
+; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh a5, 24(a1)
+; CHECK-RV64C-NEXT:    lh a2, 56(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 40(a1)
+; CHECK-RV64C-NEXT:    lh a1, 32(a1)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sh a3, 12(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sh a1, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sh a5, 6(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sh a6, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v8i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a6, 0(a1)
+; CHECK-RV32C-NEXT:    lh a7, 4(a1)
+; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh a5, 12(a1)
+; CHECK-RV32C-NEXT:    lh a2, 28(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 20(a1)
+; CHECK-RV32C-NEXT:    lh a1, 16(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sh a3, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sh a1, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sh a5, 6(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sh a6, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_v8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vse16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_v8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vse16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+; Nontemporal <4 x i32> store, domain !1 (P1 per the function name). The vector
+; fits in four word stores, each prefixed by ntl.p1 / c.ntl.p1 on scalar runs;
+; V runs use one hinted vse32.v.
+define void @test_nontemporal_P1_store_v4i32(ptr %p, <4 x i32> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_v4i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 24(a1)
+; CHECK-RV64-NEXT:    lw a3, 16(a1)
+; CHECK-RV64-NEXT:    lw a4, 8(a1)
+; CHECK-RV64-NEXT:    lw a1, 0(a1)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sw a2, 12(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sw a3, 8(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sw a4, 4(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_v4i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v4i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lw a2, 24(a1)
+; CHECK-RV64C-NEXT:    lw a3, 16(a1)
+; CHECK-RV64C-NEXT:    lw a4, 8(a1)
+; CHECK-RV64C-NEXT:    lw a1, 0(a1)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sw a2, 12(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sw a3, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sw a4, 4(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sw a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v4i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_v4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vse32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_v4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vse32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+; Nontemporal <2 x i64> store, domain !1. RV64 passes the two elements in a1/a2
+; and emits two hinted sd; RV32 splits each i64 into word stores, each hinted;
+; V runs emit a single hinted vse64.v.
+define void @test_nontemporal_P1_store_v2i64(ptr %p, <2 x i64> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    ntl.p1
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    ntl.p1
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v2i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.p1
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v2i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.p1
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_v2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.p1
+; CHECK-RV64V-NEXT:    vse64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_v2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.p1
+; CHECK-RV32V-NEXT:    vse64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+  ret void
+}
+
+; Nontemporal i64 load with domain metadata !2 (PALL per the function name;
+; !2 is defined elsewhere in this file — confirm there). RV64 uses one hinted
+; ld; RV32 splits the i64 into two hinted lw.
+define i64 @test_nontemporal_PALL_load_i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a1, 4(a0)
+; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    ld a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a2, 0(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a1, 4(a0)
+; CHECK-RV32C-NEXT:    mv a0, a2
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    ld a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    lw a2, 0(a0)
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    lw a1, 4(a0)
+; CHECK-RV32V-NEXT:    mv a0, a2
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret i64 %1
+}
+
+; Nontemporal i32 load, domain !2 (PALL): every run emits a single lw preceded
+; by ntl.pall (c.ntl.pall on compressed runs).
+define i32 @test_nontemporal_PALL_load_i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    lw a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    lw a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    lw a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    lw a0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i32, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret i32 %1
+}
+
+; Nontemporal i16 load, domain !2 (PALL): one hinted lh on every run.
+define i16 @test_nontemporal_PALL_load_i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    lh a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lh a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    lh a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lh a0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    lh a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    lh a0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i16, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret i16 %1
+}
+
+; Nontemporal i8 load, domain !2 (PALL): one hinted lbu on every run.
+define i8 @test_nontemporal_PALL_load_i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i8, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret i8 %1
+}
+
+; Nontemporal half loads, domain !2 (PALL). Loads element 0 and element 3
+; (offset 6 = 3 * sizeof(half)) and adds them, so the test checks that BOTH flh
+; instructions keep their ntl.pall hint while the fadd.h stays unhinted.
+define half @test_nontemporal_PALL_load_half(ptr %p) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    flh fa5, 0(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    flh fa4, 6(a0)
+; CHECK-RV64-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    flh fa5, 0(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    flh fa4, 6(a0)
+; CHECK-RV32-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_half:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    flh fa5, 0(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    flh fa4, 6(a0)
+; CHECK-RV64C-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_half:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    flh fa5, 0(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    flh fa4, 6(a0)
+; CHECK-RV32C-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_half:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    flh fa5, 0(a0)
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    flh fa4, 6(a0)
+; CHECK-RV64V-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_half:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    flh fa5, 0(a0)
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    flh fa4, 6(a0)
+; CHECK-RV32V-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV32V-NEXT:    ret
+  %1 = load half, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  %2 = getelementptr half, ptr %p, i32 3
+  %3 = load half, ptr %2, !nontemporal !0, !riscv-nontemporal-domain !2
+  %4 = fadd half %1, %3
+  ret half %4
+}
+
+; Nontemporal float load, domain !2 (PALL): one hinted flw on every run.
+define float @test_nontemporal_PALL_load_float(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_float:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_float:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_float:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_float:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load float, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret float %1
+}
+
+; Nontemporal double load, domain !2 (PALL): one hinted fld on every run
+; (RV32 keeps a single fld since the value stays in an FP register).
+define double @test_nontemporal_PALL_load_double(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_double:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_double:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_double:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_double:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load double, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret double %1
+}
+
+; Nontemporal <16 x i8> load, domain !2 (PALL). Scalar targets return the
+; vector indirectly (sret pointer in a0, source in a1) via XLEN-sized chunks:
+; only the loads carry ntl.pall hints, the stores to the return slot do not.
+; V runs emit one hinted vle8.v.
+define <16 x i8> @test_nontemporal_PALL_load_v16i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v16i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v16i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v16i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    ld a2, 8(a1)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    ld a1, 0(a1)
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v16i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_v16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vle8.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_v16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vle8.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret <16 x i8> %1
+}
+
+; Nontemporal <8 x i16> load, domain !2 (PALL). Same shape as the v16i8 case:
+; scalar targets load XLEN chunks from a1 with ntl.pall hints and store them
+; unhinted to the sret slot at a0; V runs emit one hinted vle16.v.
+define <8 x i16> @test_nontemporal_PALL_load_v8i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v8i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v8i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v8i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    ld a2, 8(a1)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    ld a1, 0(a1)
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v8i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_v8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vle16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_v8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vle16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret <8 x i16> %1
+}
+
+; Nontemporal <4 x i32> load, domain !2 (PALL). Scalar targets load XLEN
+; chunks with ntl.pall / c.ntl.pall hints and write the sret slot unhinted;
+; V runs emit one hinted vle32.v.
+define <4 x i32> @test_nontemporal_PALL_load_v4i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v4i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v4i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v4i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    ld a2, 8(a1)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    ld a1, 0(a1)
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v4i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_v4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vle32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_v4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vle32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @test_nontemporal_PALL_load_v2i64(ptr %p) {
+; Nontemporal <2 x i64> load: RV64 returns the pair in a0/a1 with hinted ld; RV32 lowers via memory (sret-style, a0 = out) with hinted lw; V configs use one hinted vle64.v. !2 presumably selects the PALL domain (defined elsewhere in this file).
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    ld a2, 0(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    ld a1, 8(a0)
+; CHECK-RV64-NEXT:    mv a0, a2
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v2i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    ld a2, 0(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    ld a1, 8(a0)
+; CHECK-RV64C-NEXT:    mv a0, a2
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v2i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_v2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vle64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_v2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vle64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret <2 x i64> %1
+}
+
+define void @test_nontemporal_PALL_store_i64(ptr %p, i64 %v) {
+; Nontemporal i64 store: one hinted sd on RV64; split into two hinted sw on RV32 (each half gets its own ntl.pall/c.ntl.pall). !2 presumably selects the PALL domain (defined elsewhere in this file).
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sw a2, 4(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sw a2, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    sd a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    sw a2, 4(a0)
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    sw a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i64 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_i32(ptr %p, i32 %v) {
+; Nontemporal i32 store: all configs emit exactly one ntl.pall/c.ntl.pall before the sw. !2 presumably selects the PALL domain (defined elsewhere in this file).
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sw a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    sw a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    sw a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i32 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_i16(ptr %p, i16 %v) {
+; Nontemporal i16 store: one ntl.pall/c.ntl.pall hint before the sh in every config. !2 presumably selects the PALL domain (defined elsewhere in this file).
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sh a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sh a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sh a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sh a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    sh a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    sh a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i16 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_i8(ptr %p, i8 %v) {
+; Nontemporal i8 store: one ntl.pall/c.ntl.pall hint before the sb in every config. !2 presumably selects the PALL domain (defined elsewhere in this file).
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    sb a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    sb a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i8 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_half(ptr %p, half %v) {
+; Nontemporal FP16 store: the hint also applies to FP stores — one ntl.pall/c.ntl.pall before the fsh in every config. !2 presumably selects the PALL domain (defined elsewhere in this file).
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_half:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_half:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_half:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_half:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store half %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_float(ptr %p, float %v) {
+; Nontemporal float store: one ntl.pall/c.ntl.pall before the fsw in every config. !2 presumably selects the PALL domain (defined elsewhere in this file).
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_float:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_float:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_float:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_float:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store float %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_double(ptr %p, double %v) {
+; Nontemporal double store: one ntl.pall/c.ntl.pall before the fsd in every config (fsd available even on RV32 here — D extension presumably enabled by the RUN lines). !2 presumably selects the PALL domain (defined elsewhere in this file).
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_double:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_double:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_double:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_double:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store double %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
+; Nontemporal <16 x i8> store: scalar configs pass the vector indirectly (a1 = elements, byte-extended in memory), spill s0/s1 to get 16 scratch registers, and hint every sb; the loads of the incoming elements are NOT hinted — only the stores carry the metadata. V configs use one hinted vse8.v. !2 presumably selects the PALL domain (defined elsewhere in this file).
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v16i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    addi sp, sp, -16
+; CHECK-RV64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT:    .cfi_offset s0, -8
+; CHECK-RV64-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64-NEXT:    lbu a2, 0(a1)
+; CHECK-RV64-NEXT:    lbu a3, 8(a1)
+; CHECK-RV64-NEXT:    lbu a4, 16(a1)
+; CHECK-RV64-NEXT:    lbu a5, 24(a1)
+; CHECK-RV64-NEXT:    lbu a6, 32(a1)
+; CHECK-RV64-NEXT:    lbu a7, 40(a1)
+; CHECK-RV64-NEXT:    lbu t0, 48(a1)
+; CHECK-RV64-NEXT:    lbu t1, 56(a1)
+; CHECK-RV64-NEXT:    lbu t2, 64(a1)
+; CHECK-RV64-NEXT:    lbu t3, 72(a1)
+; CHECK-RV64-NEXT:    lbu t4, 80(a1)
+; CHECK-RV64-NEXT:    lbu t5, 88(a1)
+; CHECK-RV64-NEXT:    lbu t6, 120(a1)
+; CHECK-RV64-NEXT:    lbu s0, 112(a1)
+; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a1, 96(a1)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb a1, 12(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT:    addi sp, sp, 16
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v16i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    addi sp, sp, -16
+; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    .cfi_offset s0, -4
+; CHECK-RV32-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32-NEXT:    lbu a2, 0(a1)
+; CHECK-RV32-NEXT:    lbu a3, 4(a1)
+; CHECK-RV32-NEXT:    lbu a4, 8(a1)
+; CHECK-RV32-NEXT:    lbu a5, 12(a1)
+; CHECK-RV32-NEXT:    lbu a6, 16(a1)
+; CHECK-RV32-NEXT:    lbu a7, 20(a1)
+; CHECK-RV32-NEXT:    lbu t0, 24(a1)
+; CHECK-RV32-NEXT:    lbu t1, 28(a1)
+; CHECK-RV32-NEXT:    lbu t2, 32(a1)
+; CHECK-RV32-NEXT:    lbu t3, 36(a1)
+; CHECK-RV32-NEXT:    lbu t4, 40(a1)
+; CHECK-RV32-NEXT:    lbu t5, 44(a1)
+; CHECK-RV32-NEXT:    lbu t6, 60(a1)
+; CHECK-RV32-NEXT:    lbu s0, 56(a1)
+; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a1, 48(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb a1, 12(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    addi sp, sp, 16
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v16i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    addi sp, sp, -16
+; CHECK-RV64C-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV64C-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
+; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
+; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
+; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
+; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
+; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
+; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb a1, 12(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb a3, 9(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb t2, 4(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb t1, 3(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb t0, 2(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb a7, 1(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sb a6, 0(a0)
+; CHECK-RV64C-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT:    addi sp, sp, 16
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v16i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    addi sp, sp, -16
+; CHECK-RV32C-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV32C-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
+; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
+; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
+; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
+; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
+; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
+; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb a1, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb a3, 9(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb t2, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb t1, 3(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb t0, 2(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb a7, 1(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sb a6, 0(a0)
+; CHECK-RV32C-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT:    addi sp, sp, 16
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_v16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vse8.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_v16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vse8.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_v8i16(ptr %p, <8 x i16> %v) {
+; Nontemporal <8 x i16> store: scalar configs load the in-memory elements unhinted, then hint every sh with ntl.pall/c.ntl.pall; V configs use one hinted vse16.v. !2 presumably selects the PALL domain (defined elsewhere in this file).
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v8i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lh a2, 0(a1)
+; CHECK-RV64-NEXT:    lh a3, 8(a1)
+; CHECK-RV64-NEXT:    lh a4, 16(a1)
+; CHECK-RV64-NEXT:    lh a5, 24(a1)
+; CHECK-RV64-NEXT:    lh a6, 56(a1)
+; CHECK-RV64-NEXT:    lh a7, 48(a1)
+; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a1, 32(a1)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sh a1, 8(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v8i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lh a2, 0(a1)
+; CHECK-RV32-NEXT:    lh a3, 4(a1)
+; CHECK-RV32-NEXT:    lh a4, 8(a1)
+; CHECK-RV32-NEXT:    lh a5, 12(a1)
+; CHECK-RV32-NEXT:    lh a6, 28(a1)
+; CHECK-RV32-NEXT:    lh a7, 24(a1)
+; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a1, 16(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sh a1, 8(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v8i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a6, 0(a1)
+; CHECK-RV64C-NEXT:    lh a7, 8(a1)
+; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh a5, 24(a1)
+; CHECK-RV64C-NEXT:    lh a2, 56(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 40(a1)
+; CHECK-RV64C-NEXT:    lh a1, 32(a1)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sh a3, 12(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sh a1, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sh a5, 6(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sh a6, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v8i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a6, 0(a1)
+; CHECK-RV32C-NEXT:    lh a7, 4(a1)
+; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh a5, 12(a1)
+; CHECK-RV32C-NEXT:    lh a2, 28(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 20(a1)
+; CHECK-RV32C-NEXT:    lh a1, 16(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sh a3, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sh a1, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sh a5, 6(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sh a6, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_v8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vse16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_v8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vse16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_v4i32(ptr %p, <4 x i32> %v) {
+; Nontemporal <4 x i32> store: scalar configs load elements unhinted then hint each of the four sw; V configs use one hinted vse32.v. !2 presumably selects the PALL domain (defined elsewhere in this file).
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v4i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 24(a1)
+; CHECK-RV64-NEXT:    lw a3, 16(a1)
+; CHECK-RV64-NEXT:    lw a4, 8(a1)
+; CHECK-RV64-NEXT:    lw a1, 0(a1)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sw a2, 12(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sw a3, 8(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sw a4, 4(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v4i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v4i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lw a2, 24(a1)
+; CHECK-RV64C-NEXT:    lw a3, 16(a1)
+; CHECK-RV64C-NEXT:    lw a4, 8(a1)
+; CHECK-RV64C-NEXT:    lw a1, 0(a1)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sw a2, 12(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sw a3, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sw a4, 4(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sw a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v4i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_v4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vse32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_v4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vse32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define void @test_nontemporal_PALL_store_v2i64(ptr %p, <2 x i64> %v) {
+; Nontemporal <2 x i64> store: on RV64 the vector arrives in a1/a2 and each sd is hinted; on RV32 it arrives indirectly via a1 and each sw is hinted; V configs use one hinted vse64.v. !2 presumably selects the PALL domain (defined elsewhere in this file).
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    ntl.pall
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    ntl.pall
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v2i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.pall
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v2i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.pall
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_v2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.pall
+; CHECK-RV64V-NEXT:    vse64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_v2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.pall
+; CHECK-RV32V-NEXT:    vse64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+  ret void
+}
+
+define i64 @test_nontemporal_S1_load_i64(ptr %p) {
+; First test of the S1 (INNERMOST_SHARED) domain, via metadata !3: expects ntl.s1/c.ntl.s1 instead of ntl.pall before each load (one ld on RV64, two lw on RV32). !3's exact value is defined elsewhere in this file.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a1, 4(a0)
+; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    ld a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a2, 0(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a1, 4(a0)
+; CHECK-RV32C-NEXT:    mv a0, a2
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    ld a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    lw a2, 0(a0)
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    lw a1, 4(a0)
+; CHECK-RV32V-NEXT:    mv a0, a2
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret i64 %1
+}
+
+define i32 @test_nontemporal_S1_load_i32(ptr %p) {
+; The !riscv-nontemporal-domain !3 metadata lowers to an ntl.s1 (or c.ntl.s1) hint before each generated memory instruction.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    lw a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    lw a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    lw a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    lw a0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i32, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret i32 %1
+}
+
+define i16 @test_nontemporal_S1_load_i16(ptr %p) {
+; The !riscv-nontemporal-domain !3 metadata lowers to an ntl.s1 (or c.ntl.s1) hint before each generated memory instruction.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    lh a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lh a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    lh a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lh a0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    lh a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    lh a0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i16, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret i16 %1
+}
+
+define i8 @test_nontemporal_S1_load_i8(ptr %p) {
+; The !riscv-nontemporal-domain !3 metadata lowers to an ntl.s1 (or c.ntl.s1) hint before each generated memory instruction.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i8, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret i8 %1
+}
+
+define half @test_nontemporal_S1_load_half(ptr %p) nounwind {
+; Two S1-domain half loads (offset 0 and 6) each get their own ntl.s1 / c.ntl.s1 hint; the fadd keeps both live.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    flh fa5, 0(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    flh fa4, 6(a0)
+; CHECK-RV64-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    flh fa5, 0(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    flh fa4, 6(a0)
+; CHECK-RV32-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_half:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    flh fa5, 0(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    flh fa4, 6(a0)
+; CHECK-RV64C-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_half:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    flh fa5, 0(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    flh fa4, 6(a0)
+; CHECK-RV32C-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_half:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    flh fa5, 0(a0)
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    flh fa4, 6(a0)
+; CHECK-RV64V-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_half:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    flh fa5, 0(a0)
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    flh fa4, 6(a0)
+; CHECK-RV32V-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV32V-NEXT:    ret
+  %1 = load half, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  %2 = getelementptr half, ptr %p, i32 3
+  %3 = load half, ptr %2, !nontemporal !0, !riscv-nontemporal-domain !3
+  %4 = fadd half %1, %3
+  ret half %4
+}
+
+define float @test_nontemporal_S1_load_float(ptr %p) {
+; The !riscv-nontemporal-domain !3 metadata lowers to an ntl.s1 (or c.ntl.s1) hint before each generated memory instruction.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_float:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_float:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_float:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_float:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load float, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret float %1
+}
+
+define double @test_nontemporal_S1_load_double(ptr %p) {
+; The !riscv-nontemporal-domain !3 metadata lowers to an ntl.s1 (or c.ntl.s1) hint before each generated memory instruction.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_double:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_double:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_double:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_double:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load double, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret double %1
+}
+
+define <16 x i8> @test_nontemporal_S1_load_v16i8(ptr %p) {
+; Without vector support the v16i8 load is split into scalar loads, each with its own ntl.s1 hint; with +V a single hinted vle8.v is used.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_v16i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_v16i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v16i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    ld a2, 8(a1)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    ld a1, 0(a1)
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v16i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_v16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vle8.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_v16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vle8.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret <16 x i8> %1
+}
+
+define <8 x i16> @test_nontemporal_S1_load_v8i16(ptr %p) {
+; Without vector support the v8i16 load is split into scalar loads, each with its own ntl.s1 hint; with +V a single hinted vle16.v is used.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_v8i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_v8i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v8i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    ld a2, 8(a1)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    ld a1, 0(a1)
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v8i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_v8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vle16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_v8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vle16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @test_nontemporal_S1_load_v4i32(ptr %p) {
+; Without vector support the v4i32 load is split into scalar loads, each with its own ntl.s1 hint; with +V a single hinted vle32.v is used.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_v4i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_v4i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v4i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    ld a2, 8(a1)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    ld a1, 0(a1)
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v4i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_v4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vle32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_v4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vle32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @test_nontemporal_S1_load_v2i64(ptr %p) {
+; Without vector support the v2i64 load is split into scalar loads, each with its own ntl.s1 hint; with +V a single hinted vle64.v is used.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    ld a2, 0(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    ld a1, 8(a0)
+; CHECK-RV64-NEXT:    mv a0, a2
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v2i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    ld a2, 0(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    ld a1, 8(a0)
+; CHECK-RV64C-NEXT:    mv a0, a2
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v2i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_v2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vle64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_v2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vle64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret <2 x i64> %1
+}
+
+define void @test_nontemporal_S1_store_i64(ptr %p, i64 %v) {
+; The !riscv-nontemporal-domain !3 metadata lowers to an ntl.s1 (or c.ntl.s1) hint before each generated store (two on RV32, where i64 is split).
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sw a2, 4(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sw a2, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    sd a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    sw a2, 4(a0)
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    sw a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i64 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_i32(ptr %p, i32 %v) {
+; The !riscv-nontemporal-domain !3 metadata lowers to an ntl.s1 (or c.ntl.s1) hint before the generated store.
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sw a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    sw a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    sw a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i32 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_i16(ptr %p, i16 %v) {
+; The !riscv-nontemporal-domain !3 metadata lowers to an ntl.s1 (or c.ntl.s1) hint before the generated store.
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sh a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sh a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sh a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sh a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    sh a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    sh a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i16 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_i8(ptr %p, i8 %v) {
+; The !riscv-nontemporal-domain !3 metadata lowers to an ntl.s1 (or c.ntl.s1) hint before the generated store.
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    sb a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    sb a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i8 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_half(ptr %p, half %v) {
+; The !riscv-nontemporal-domain !3 metadata lowers to an ntl.s1 (or c.ntl.s1) hint before the generated FP store.
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_half:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_half:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_half:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_half:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store half %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_float(ptr %p, float %v) {
+; The !riscv-nontemporal-domain !3 metadata lowers to an ntl.s1 (or c.ntl.s1) hint before the generated FP store.
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_float:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_float:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_float:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_float:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store float %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_double(ptr %p, double %v) {
+; The !riscv-nontemporal-domain !3 metadata lowers to an ntl.s1 (or c.ntl.s1) hint before the generated FP store.
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_double:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_double:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_double:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_double:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store double %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_v16i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    addi sp, sp, -16
+; CHECK-RV64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT:    .cfi_offset s0, -8
+; CHECK-RV64-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64-NEXT:    lbu a2, 0(a1)
+; CHECK-RV64-NEXT:    lbu a3, 8(a1)
+; CHECK-RV64-NEXT:    lbu a4, 16(a1)
+; CHECK-RV64-NEXT:    lbu a5, 24(a1)
+; CHECK-RV64-NEXT:    lbu a6, 32(a1)
+; CHECK-RV64-NEXT:    lbu a7, 40(a1)
+; CHECK-RV64-NEXT:    lbu t0, 48(a1)
+; CHECK-RV64-NEXT:    lbu t1, 56(a1)
+; CHECK-RV64-NEXT:    lbu t2, 64(a1)
+; CHECK-RV64-NEXT:    lbu t3, 72(a1)
+; CHECK-RV64-NEXT:    lbu t4, 80(a1)
+; CHECK-RV64-NEXT:    lbu t5, 88(a1)
+; CHECK-RV64-NEXT:    lbu t6, 120(a1)
+; CHECK-RV64-NEXT:    lbu s0, 112(a1)
+; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a1, 96(a1)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb a1, 12(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT:    addi sp, sp, 16
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_v16i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    addi sp, sp, -16
+; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    .cfi_offset s0, -4
+; CHECK-RV32-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32-NEXT:    lbu a2, 0(a1)
+; CHECK-RV32-NEXT:    lbu a3, 4(a1)
+; CHECK-RV32-NEXT:    lbu a4, 8(a1)
+; CHECK-RV32-NEXT:    lbu a5, 12(a1)
+; CHECK-RV32-NEXT:    lbu a6, 16(a1)
+; CHECK-RV32-NEXT:    lbu a7, 20(a1)
+; CHECK-RV32-NEXT:    lbu t0, 24(a1)
+; CHECK-RV32-NEXT:    lbu t1, 28(a1)
+; CHECK-RV32-NEXT:    lbu t2, 32(a1)
+; CHECK-RV32-NEXT:    lbu t3, 36(a1)
+; CHECK-RV32-NEXT:    lbu t4, 40(a1)
+; CHECK-RV32-NEXT:    lbu t5, 44(a1)
+; CHECK-RV32-NEXT:    lbu t6, 60(a1)
+; CHECK-RV32-NEXT:    lbu s0, 56(a1)
+; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a1, 48(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb a1, 12(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    addi sp, sp, 16
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v16i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    addi sp, sp, -16
+; CHECK-RV64C-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV64C-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
+; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
+; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
+; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
+; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
+; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
+; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb a1, 12(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb a3, 9(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb t2, 4(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb t1, 3(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb t0, 2(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb a7, 1(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sb a6, 0(a0)
+; CHECK-RV64C-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT:    addi sp, sp, 16
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v16i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    addi sp, sp, -16
+; CHECK-RV32C-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV32C-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
+; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
+; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
+; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
+; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
+; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
+; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb a1, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb a3, 9(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb t2, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb t1, 3(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb t0, 2(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb a7, 1(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sb a6, 0(a0)
+; CHECK-RV32C-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT:    addi sp, sp, 16
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_v16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vse8.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_v16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vse8.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_v8i16(ptr %p, <8 x i16> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_v8i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lh a2, 0(a1)
+; CHECK-RV64-NEXT:    lh a3, 8(a1)
+; CHECK-RV64-NEXT:    lh a4, 16(a1)
+; CHECK-RV64-NEXT:    lh a5, 24(a1)
+; CHECK-RV64-NEXT:    lh a6, 56(a1)
+; CHECK-RV64-NEXT:    lh a7, 48(a1)
+; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a1, 32(a1)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sh a1, 8(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_v8i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lh a2, 0(a1)
+; CHECK-RV32-NEXT:    lh a3, 4(a1)
+; CHECK-RV32-NEXT:    lh a4, 8(a1)
+; CHECK-RV32-NEXT:    lh a5, 12(a1)
+; CHECK-RV32-NEXT:    lh a6, 28(a1)
+; CHECK-RV32-NEXT:    lh a7, 24(a1)
+; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a1, 16(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sh a1, 8(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v8i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a6, 0(a1)
+; CHECK-RV64C-NEXT:    lh a7, 8(a1)
+; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh a5, 24(a1)
+; CHECK-RV64C-NEXT:    lh a2, 56(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 40(a1)
+; CHECK-RV64C-NEXT:    lh a1, 32(a1)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sh a3, 12(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sh a1, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sh a5, 6(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sh a6, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v8i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a6, 0(a1)
+; CHECK-RV32C-NEXT:    lh a7, 4(a1)
+; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh a5, 12(a1)
+; CHECK-RV32C-NEXT:    lh a2, 28(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 20(a1)
+; CHECK-RV32C-NEXT:    lh a1, 16(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sh a3, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sh a1, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sh a5, 6(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sh a6, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_v8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vse16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_v8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vse16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_v4i32(ptr %p, <4 x i32> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_v4i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 24(a1)
+; CHECK-RV64-NEXT:    lw a3, 16(a1)
+; CHECK-RV64-NEXT:    lw a4, 8(a1)
+; CHECK-RV64-NEXT:    lw a1, 0(a1)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sw a2, 12(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sw a3, 8(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sw a4, 4(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_v4i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v4i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lw a2, 24(a1)
+; CHECK-RV64C-NEXT:    lw a3, 16(a1)
+; CHECK-RV64C-NEXT:    lw a4, 8(a1)
+; CHECK-RV64C-NEXT:    lw a1, 0(a1)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sw a2, 12(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sw a3, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sw a4, 4(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sw a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v4i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_v4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vse32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_v4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vse32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define void @test_nontemporal_S1_store_v2i64(ptr %p, <2 x i64> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    ntl.s1
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    ntl.s1
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v2i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.s1
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v2i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.s1
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_v2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.s1
+; CHECK-RV64V-NEXT:    vse64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_v2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.s1
+; CHECK-RV32V-NEXT:    vse64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+  ret void
+}
+
+define i64 @test_nontemporal_ALL_load_i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 4(a0)
+; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    ld a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a2, 0(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a1, 4(a0)
+; CHECK-RV32C-NEXT:    mv a0, a2
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    ld a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    lw a2, 0(a0)
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    lw a1, 4(a0)
+; CHECK-RV32V-NEXT:    mv a0, a2
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret i64 %1
+}
+
+define i32 @test_nontemporal_ALL_load_i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lw a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    lw a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    lw a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    lw a0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i32, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret i32 %1
+}
+
+define i16 @test_nontemporal_ALL_load_i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lh a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lh a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    lh a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lh a0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    lh a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    lh a0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i16, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret i16 %1
+}
+
+define i8 @test_nontemporal_ALL_load_i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load i8, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret i8 %1
+}
+
+define half @test_nontemporal_ALL_load_half(ptr %p) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    flh fa5, 0(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    flh fa4, 6(a0)
+; CHECK-RV64-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    flh fa5, 0(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    flh fa4, 6(a0)
+; CHECK-RV32-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_half:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    flh fa5, 0(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    flh fa4, 6(a0)
+; CHECK-RV64C-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_half:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    flh fa5, 0(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    flh fa4, 6(a0)
+; CHECK-RV32C-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_half:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    flh fa5, 0(a0)
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    flh fa4, 6(a0)
+; CHECK-RV64V-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_half:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    flh fa5, 0(a0)
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    flh fa4, 6(a0)
+; CHECK-RV32V-NEXT:    fadd.h fa0, fa5, fa4
+; CHECK-RV32V-NEXT:    ret
+  %1 = load half, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  %2 = getelementptr half, ptr %p, i32 3
+  %3 = load half, ptr %2, !nontemporal !0, !riscv-nontemporal-domain !4
+  %4 = fadd half %1, %3
+  ret half %4
+}
+
+define float @test_nontemporal_ALL_load_float(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_float:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_float:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_float:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_float:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load float, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret float %1
+}
+
+define double @test_nontemporal_ALL_load_double(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_double:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_double:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_double:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_double:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load double, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret double %1
+}
+
+define <16 x i8> @test_nontemporal_ALL_load_v16i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v16i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v16i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v16i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    ld a2, 8(a1)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    ld a1, 0(a1)
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v16i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_v16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vle8.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_v16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vle8.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret <16 x i8> %1
+}
+
+define <8 x i16> @test_nontemporal_ALL_load_v8i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v8i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v8i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v8i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    ld a2, 8(a1)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    ld a1, 0(a1)
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v8i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_v8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vle16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_v8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vle16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @test_nontemporal_ALL_load_v4i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v4i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v4i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v4i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    ld a2, 8(a1)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    ld a1, 0(a1)
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v4i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_v4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vle32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_v4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vle32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret <4 x i32> %1
+}
+
+; ALL-domain (!riscv-nontemporal-domain !4) non-temporal <2 x i64> load: each
+; scalar load is preceded by ntl.all (c.ntl.all when compressed is enabled),
+; and the vector configs emit a single ntl.all before vle64.v.
+define <2 x i64> @test_nontemporal_ALL_load_v2i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a2, 0(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a1, 8(a0)
+; CHECK-RV64-NEXT:    mv a0, a2
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v2i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    ld a2, 0(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    ld a1, 8(a0)
+; CHECK-RV64C-NEXT:    mv a0, a2
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v2i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_v2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vle64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_v2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vle64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret <2 x i64> %1
+}
+
+; ALL-domain non-temporal i64 store: one ntl.all-prefixed sd on RV64; on RV32
+; the i64 is legalized to two sw's, each carrying its own ntl.all hint.
+define void @test_nontemporal_ALL_store_i64(ptr %p, i64 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a2, 4(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sw a2, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    sd a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    sw a2, 4(a0)
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    sw a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i64 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+; ALL-domain non-temporal i32 store: a single sw prefixed with ntl.all
+; (c.ntl.all on the compressed configs) on every target variant.
+define void @test_nontemporal_ALL_store_i32(ptr %p, i32 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sw a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    sw a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    sw a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i32 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+; ALL-domain non-temporal i16 store: a single sh prefixed with ntl.all
+; (c.ntl.all on the compressed configs) on every target variant.
+define void @test_nontemporal_ALL_store_i16(ptr %p, i16 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sh a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sh a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sh a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sh a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    sh a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    sh a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i16 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+; ALL-domain non-temporal i8 store: a single sb prefixed with ntl.all
+; (c.ntl.all on the compressed configs) on every target variant.
+define void @test_nontemporal_ALL_store_i8(ptr %p, i8 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    sb a1, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    sb a1, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store i8 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+; ALL-domain non-temporal half store: a single fsh prefixed with ntl.all
+; (c.ntl.all on the compressed configs) on every target variant.
+define void @test_nontemporal_ALL_store_half(ptr %p, half %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_half:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_half:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_half:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_half:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store half %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+; ALL-domain non-temporal float store: a single fsw prefixed with ntl.all
+; (c.ntl.all on the compressed configs) on every target variant.
+define void @test_nontemporal_ALL_store_float(ptr %p, float %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_float:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_float:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_float:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_float:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store float %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+; ALL-domain non-temporal double store: a single fsd prefixed with ntl.all
+; (c.ntl.all on the compressed configs) on every target variant.
+define void @test_nontemporal_ALL_store_double(ptr %p, double %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_double:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_double:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_double:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_double:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32V-NEXT:    ret
+  store double %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+; ALL-domain non-temporal <16 x i8> store. Without V the vector is scalarized
+; into 16 byte stores, each prefixed with ntl.all/c.ntl.all (the plain loads of
+; the incoming value carry no hint - only the non-temporal accesses do); the
+; scalarization spills s0/s1 to feed all 16 elements. With V a single hinted
+; vse8.v is emitted.
+define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v16i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    addi sp, sp, -16
+; CHECK-RV64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT:    .cfi_offset s0, -8
+; CHECK-RV64-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64-NEXT:    lbu a2, 0(a1)
+; CHECK-RV64-NEXT:    lbu a3, 8(a1)
+; CHECK-RV64-NEXT:    lbu a4, 16(a1)
+; CHECK-RV64-NEXT:    lbu a5, 24(a1)
+; CHECK-RV64-NEXT:    lbu a6, 32(a1)
+; CHECK-RV64-NEXT:    lbu a7, 40(a1)
+; CHECK-RV64-NEXT:    lbu t0, 48(a1)
+; CHECK-RV64-NEXT:    lbu t1, 56(a1)
+; CHECK-RV64-NEXT:    lbu t2, 64(a1)
+; CHECK-RV64-NEXT:    lbu t3, 72(a1)
+; CHECK-RV64-NEXT:    lbu t4, 80(a1)
+; CHECK-RV64-NEXT:    lbu t5, 88(a1)
+; CHECK-RV64-NEXT:    lbu t6, 120(a1)
+; CHECK-RV64-NEXT:    lbu s0, 112(a1)
+; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a1, 96(a1)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb a1, 12(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT:    addi sp, sp, 16
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v16i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    addi sp, sp, -16
+; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    .cfi_offset s0, -4
+; CHECK-RV32-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32-NEXT:    lbu a2, 0(a1)
+; CHECK-RV32-NEXT:    lbu a3, 4(a1)
+; CHECK-RV32-NEXT:    lbu a4, 8(a1)
+; CHECK-RV32-NEXT:    lbu a5, 12(a1)
+; CHECK-RV32-NEXT:    lbu a6, 16(a1)
+; CHECK-RV32-NEXT:    lbu a7, 20(a1)
+; CHECK-RV32-NEXT:    lbu t0, 24(a1)
+; CHECK-RV32-NEXT:    lbu t1, 28(a1)
+; CHECK-RV32-NEXT:    lbu t2, 32(a1)
+; CHECK-RV32-NEXT:    lbu t3, 36(a1)
+; CHECK-RV32-NEXT:    lbu t4, 40(a1)
+; CHECK-RV32-NEXT:    lbu t5, 44(a1)
+; CHECK-RV32-NEXT:    lbu t6, 60(a1)
+; CHECK-RV32-NEXT:    lbu s0, 56(a1)
+; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a1, 48(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb a1, 12(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    addi sp, sp, 16
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v16i8:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    addi sp, sp, -16
+; CHECK-RV64C-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV64C-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
+; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
+; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
+; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
+; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
+; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
+; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb a1, 12(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb a3, 9(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb t2, 4(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb t1, 3(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb t0, 2(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb a7, 1(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sb a6, 0(a0)
+; CHECK-RV64C-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT:    addi sp, sp, 16
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v16i8:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    addi sp, sp, -16
+; CHECK-RV32C-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-RV32C-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
+; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
+; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
+; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
+; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
+; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
+; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb a1, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb a3, 9(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb t2, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb t1, 3(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb t0, 2(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb a7, 1(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sb a6, 0(a0)
+; CHECK-RV32C-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT:    addi sp, sp, 16
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_v16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vse8.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_v16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vse8.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+; ALL-domain non-temporal <8 x i16> store. Without V the vector is scalarized
+; into 8 halfword stores, each prefixed with ntl.all/c.ntl.all; with V a
+; single hinted vse16.v is emitted.
+define void @test_nontemporal_ALL_store_v8i16(ptr %p, <8 x i16> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v8i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lh a2, 0(a1)
+; CHECK-RV64-NEXT:    lh a3, 8(a1)
+; CHECK-RV64-NEXT:    lh a4, 16(a1)
+; CHECK-RV64-NEXT:    lh a5, 24(a1)
+; CHECK-RV64-NEXT:    lh a6, 56(a1)
+; CHECK-RV64-NEXT:    lh a7, 48(a1)
+; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a1, 32(a1)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sh a1, 8(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v8i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lh a2, 0(a1)
+; CHECK-RV32-NEXT:    lh a3, 4(a1)
+; CHECK-RV32-NEXT:    lh a4, 8(a1)
+; CHECK-RV32-NEXT:    lh a5, 12(a1)
+; CHECK-RV32-NEXT:    lh a6, 28(a1)
+; CHECK-RV32-NEXT:    lh a7, 24(a1)
+; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a1, 16(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sh a1, 8(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v8i16:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a6, 0(a1)
+; CHECK-RV64C-NEXT:    lh a7, 8(a1)
+; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh a5, 24(a1)
+; CHECK-RV64C-NEXT:    lh a2, 56(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 40(a1)
+; CHECK-RV64C-NEXT:    lh a1, 32(a1)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sh a3, 12(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sh a1, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sh a5, 6(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sh a6, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v8i16:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a6, 0(a1)
+; CHECK-RV32C-NEXT:    lh a7, 4(a1)
+; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh a5, 12(a1)
+; CHECK-RV32C-NEXT:    lh a2, 28(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 20(a1)
+; CHECK-RV32C-NEXT:    lh a1, 16(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sh a3, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sh a1, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sh a5, 6(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sh a6, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_v8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vse16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_v8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vse16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+; ALL-domain non-temporal <4 x i32> store. Without V the vector is scalarized
+; into 4 word stores, each prefixed with ntl.all/c.ntl.all; with V a single
+; hinted vse32.v is emitted.
+define void @test_nontemporal_ALL_store_v4i32(ptr %p, <4 x i32> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v4i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 24(a1)
+; CHECK-RV64-NEXT:    lw a3, 16(a1)
+; CHECK-RV64-NEXT:    lw a4, 8(a1)
+; CHECK-RV64-NEXT:    lw a1, 0(a1)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sw a2, 12(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sw a3, 8(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sw a4, 4(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v4i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v4i32:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lw a2, 24(a1)
+; CHECK-RV64C-NEXT:    lw a3, 16(a1)
+; CHECK-RV64C-NEXT:    lw a4, 8(a1)
+; CHECK-RV64C-NEXT:    lw a1, 0(a1)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sw a2, 12(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sw a3, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sw a4, 4(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sw a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v4i32:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_v4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vse32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_v4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vse32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+; ALL-domain non-temporal <2 x i64> store. On RV64 two hinted sd's; on RV32
+; four hinted sw's (i64 elements split); with V a single hinted vse64.v.
+define void @test_nontemporal_ALL_store_v2i64(ptr %p, <2 x i64> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v2i64:
+; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sd a2, 8(a0)
+; CHECK-RV64C-NEXT:    c.ntl.all
+; CHECK-RV64C-NEXT:    sd a1, 0(a0)
+; CHECK-RV64C-NEXT:    ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v2i64:
+; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lw a2, 12(a1)
+; CHECK-RV32C-NEXT:    lw a3, 8(a1)
+; CHECK-RV32C-NEXT:    lw a4, 4(a1)
+; CHECK-RV32C-NEXT:    lw a1, 0(a1)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sw a2, 12(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sw a3, 8(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sw a4, 4(a0)
+; CHECK-RV32C-NEXT:    c.ntl.all
+; CHECK-RV32C-NEXT:    sw a1, 0(a0)
+; CHECK-RV32C-NEXT:    ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_v2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vse64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_v2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vse64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+  ret void
+}
+
+
 !0 = !{i32 1}
+; !riscv-nontemporal-domain encodings; !4 (i32 5) is the ALL domain used by
+; the tests above. !1-!3 presumably back the other domain test groups earlier
+; in this file (not visible in this hunk).
+!1 = !{i32 2}
+!2 = !{i32 3}
+!3 = !{i32 4}
+!4 = !{i32 5}


        


More information about the cfe-commits mailing list