[clang] 8a39505 - [RISCV] Support scalar/fix-length vector NTLH intrinsic with different domain
Piyou Chen via cfe-commits
cfe-commits at lists.llvm.org
Mon Apr 24 20:20:08 PDT 2023
Author: Piyou Chen
Date: 2023-04-24T20:15:14-07:00
New Revision: 8a3950510f819308f7ead16c339484147c69c84a
URL: https://github.com/llvm/llvm-project/commit/8a3950510f819308f7ead16c339484147c69c84a
DIFF: https://github.com/llvm/llvm-project/commit/8a3950510f819308f7ead16c339484147c69c84a.diff
LOG: [RISCV] Support scalar/fix-length vector NTLH intrinsic with different domain
This commit implements the two NTLH intrinsic functions.
```
type __riscv_ntl_load (type *ptr, int domain);
void __riscv_ntl_store (type *ptr, type val, int domain);
```
```
enum {
__RISCV_NTLH_INNERMOST_PRIVATE = 2,
__RISCV_NTLH_ALL_PRIVATE,
__RISCV_NTLH_INNERMOST_SHARED,
__RISCV_NTLH_ALL
};
```
We encode the non-temporal domain into MachineMemOperand flags.
1. Create the RISC-V built-in function with custom semantic checking.
2. Assume the domain argument is a compile-time constant,
and encode it as LLVM IR metadata (the !nontemporal and !riscv-nontemporal-domain nodes).
3. Encode domain value as two bits MachineMemOperand TargetMMOflag.
4. According to the MachineMemOperand TargetMMOflag, select the corresponding ntlh instruction.
Currently, it supports scalar type and fixed-length vector type.
Reviewed By: craig.topper
Differential Revision: https://reviews.llvm.org/D143364
Added:
clang/lib/Headers/riscv_ntlh.h
clang/test/CodeGen/RISCV/ntlh-intrinsics/riscv32-zihintntl.c
Modified:
clang/include/clang/Basic/BuiltinsRISCV.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/Headers/CMakeLists.txt
clang/lib/Sema/SemaChecking.cpp
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.h
llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
llvm/lib/Target/RISCV/RISCVInstrInfo.h
llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
llvm/test/CodeGen/RISCV/nontemporal.ll
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsRISCV.def b/clang/include/clang/Basic/BuiltinsRISCV.def
index 3ca7654a32adc..370ef0af8f9a5 100644
--- a/clang/include/clang/Basic/BuiltinsRISCV.def
+++ b/clang/include/clang/Basic/BuiltinsRISCV.def
@@ -79,5 +79,9 @@ TARGET_BUILTIN(__builtin_riscv_sm4ks, "LiLiLiIUc", "nc", "zksed")
TARGET_BUILTIN(__builtin_riscv_sm3p0, "LiLi", "nc", "zksh")
TARGET_BUILTIN(__builtin_riscv_sm3p1, "LiLi", "nc", "zksh")
+// Zihintntl extension
+TARGET_BUILTIN(__builtin_riscv_ntl_load, "v.", "t", "experimental-zihintntl")
+TARGET_BUILTIN(__builtin_riscv_ntl_store, "v.", "t", "experimental-zihintntl")
+
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 803ad398c449e..ca11127440fa9 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19760,6 +19760,11 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
assert(Error == ASTContext::GE_None && "Unexpected error");
}
+ if (BuiltinID == RISCV::BI__builtin_riscv_ntl_load)
+ ICEArguments |= (1 << 1);
+ if (BuiltinID == RISCV::BI__builtin_riscv_ntl_store)
+ ICEArguments |= (1 << 2);
+
for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
// If this is a normal argument, just emit it as a scalar.
if ((ICEArguments & (1 << i)) == 0) {
@@ -19962,6 +19967,56 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
IntrinsicTypes = {ResultType};
break;
+ // Zihintntl
+ case RISCV::BI__builtin_riscv_ntl_load: {
+ llvm::Type *ResTy = ConvertType(E->getType());
+ ConstantInt *Mode = cast<ConstantInt>(Ops[1]);
+
+ llvm::MDNode *RISCVDomainNode = llvm::MDNode::get(
+ getLLVMContext(),
+ llvm::ConstantAsMetadata::get(Builder.getInt32(Mode->getZExtValue())));
+ llvm::MDNode *NontemporalNode = llvm::MDNode::get(
+ getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
+
+ int Width;
+ if(ResTy->isScalableTy()) {
+ const ScalableVectorType *SVTy = cast<ScalableVectorType>(ResTy);
+ llvm::Type *ScalarTy = ResTy->getScalarType();
+ Width = ScalarTy->getPrimitiveSizeInBits() *
+ SVTy->getElementCount().getKnownMinValue();
+ } else
+ Width = ResTy->getPrimitiveSizeInBits();
+ LoadInst *Load = Builder.CreateLoad(
+ Address(Ops[0], ResTy, CharUnits::fromQuantity(Width / 8)));
+
+ Load->setMetadata(CGM.getModule().getMDKindID("nontemporal"),
+ NontemporalNode);
+ Load->setMetadata(CGM.getModule().getMDKindID("riscv-nontemporal-domain"),
+ RISCVDomainNode);
+
+ return Load;
+ }
+ case RISCV::BI__builtin_riscv_ntl_store: {
+ ConstantInt *Mode = cast<ConstantInt>(Ops[2]);
+
+ llvm::MDNode *RISCVDomainNode = llvm::MDNode::get(
+ getLLVMContext(),
+ llvm::ConstantAsMetadata::get(Builder.getInt32(Mode->getZExtValue())));
+ llvm::MDNode *NontemporalNode = llvm::MDNode::get(
+ getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
+
+ Value *BC = Builder.CreateBitCast(
+ Ops[0], llvm::PointerType::getUnqual(Ops[1]->getType()), "cast");
+
+ StoreInst *Store = Builder.CreateDefaultAlignedStore(Ops[1], BC);
+ Store->setMetadata(CGM.getModule().getMDKindID("nontemporal"),
+ NontemporalNode);
+ Store->setMetadata(CGM.getModule().getMDKindID("riscv-nontemporal-domain"),
+ RISCVDomainNode);
+
+ return Store;
+ }
+
// Vector builtins are handled from here.
#include "clang/Basic/riscv_vector_builtin_cg.inc"
}
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 52f0e23a265cb..7c2f323652bb5 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -98,6 +98,10 @@ set(ppc_htm_files
htmxlintrin.h
)
+set(riscv_files
+ riscv_ntlh.h
+ )
+
set(systemz_files
s390intrin.h
vecintrin.h
@@ -244,6 +248,7 @@ set(files
${opencl_files}
${ppc_files}
${ppc_htm_files}
+ ${riscv_files}
${systemz_files}
${ve_files}
${x86_files}
@@ -425,7 +430,7 @@ add_header_target("loongarch-resource-headers" "${loongarch_files}")
add_header_target("mips-resource-headers" "${mips_msa_files}")
add_header_target("ppc-resource-headers" "${ppc_files};${ppc_wrapper_files}")
add_header_target("ppc-htm-resource-headers" "${ppc_htm_files}")
-add_header_target("riscv-resource-headers" "${riscv_generated_files}")
+add_header_target("riscv-resource-headers" "${riscv_files};${riscv_generated_files}")
add_header_target("systemz-resource-headers" "${systemz_files}")
add_header_target("ve-resource-headers" "${ve_files}")
add_header_target("webassembly-resource-headers" "${webassembly_files}")
@@ -548,6 +553,12 @@ install(
EXCLUDE_FROM_ALL
COMPONENT riscv-resource-headers)
+install(
+ FILES ${riscv_files}
+ DESTINATION ${header_install_dir}
+ EXCLUDE_FROM_ALL
+ COMPONENT riscv-resource-headers)
+
install(
FILES ${systemz_files}
DESTINATION ${header_install_dir}
diff --git a/clang/lib/Headers/riscv_ntlh.h b/clang/lib/Headers/riscv_ntlh.h
new file mode 100644
index 0000000000000..9ce1709205835
--- /dev/null
+++ b/clang/lib/Headers/riscv_ntlh.h
@@ -0,0 +1,28 @@
+/*===---- riscv_ntlh.h - RISC-V NTLH intrinsics ----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __RISCV_NTLH_H
+#define __RISCV_NTLH_H
+
+#ifndef __riscv_zihintntl
+#error "NTLH intrinsics require the NTLH extension."
+#endif
+
+enum {
+ __RISCV_NTLH_INNERMOST_PRIVATE = 2,
+ __RISCV_NTLH_ALL_PRIVATE,
+ __RISCV_NTLH_INNERMOST_SHARED,
+ __RISCV_NTLH_ALL
+};
+
+#define __riscv_ntl_load(PTR, DOMAIN) __builtin_riscv_ntl_load((PTR), (DOMAIN))
+#define __riscv_ntl_store(PTR, VAL, DOMAIN) \
+ __builtin_riscv_ntl_store((PTR), (VAL), (DOMAIN))
+
+#endif
\ No newline at end of file
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index f66eb9fcf13dc..eca106fa0a185 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -4652,6 +4652,65 @@ bool Sema::CheckRISCVBuiltinFunctionCall(const TargetInfo &TI,
// Check if rnum is in [0, 10]
case RISCV::BI__builtin_riscv_aes64ks1i_64:
return SemaBuiltinConstantArgRange(TheCall, 1, 0, 10);
+ case RISCV::BI__builtin_riscv_ntl_load:
+ case RISCV::BI__builtin_riscv_ntl_store:
+ DeclRefExpr *DRE =
+ cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
+ assert((BuiltinID == RISCV::BI__builtin_riscv_ntl_store ||
+ BuiltinID == RISCV::BI__builtin_riscv_ntl_load) &&
+ "Unexpected RISC-V nontemporal load/store builtin!");
+ bool IsStore = BuiltinID == RISCV::BI__builtin_riscv_ntl_store;
+ unsigned NumArgs = IsStore ? 3 : 2;
+
+ if (checkArgCount(*this, TheCall, NumArgs))
+ return true;
+
+ // Domain value should be compile-time constant.
+ // 2 <= domain <= 5
+ if (SemaBuiltinConstantArgRange(TheCall, NumArgs - 1, 2, 5))
+ return true;
+
+ Expr *PointerArg = TheCall->getArg(0);
+ ExprResult PointerArgResult =
+ DefaultFunctionArrayLvalueConversion(PointerArg);
+
+ if (PointerArgResult.isInvalid())
+ return true;
+ PointerArg = PointerArgResult.get();
+
+ const PointerType *PtrType = PointerArg->getType()->getAs<PointerType>();
+ if (!PtrType) {
+ Diag(DRE->getBeginLoc(), diag::err_nontemporal_builtin_must_be_pointer)
+ << PointerArg->getType() << PointerArg->getSourceRange();
+ return true;
+ }
+
+ QualType ValType = PtrType->getPointeeType();
+ ValType = ValType.getUnqualifiedType();
+ if (!ValType->isIntegerType() && !ValType->isAnyPointerType() &&
+ !ValType->isBlockPointerType() && !ValType->isFloatingType() &&
+ !ValType->isVectorType() && !ValType->isRVVType()) {
+ Diag(DRE->getBeginLoc(),
+ diag::err_nontemporal_builtin_must_be_pointer_intfltptr_or_vector)
+ << PointerArg->getType() << PointerArg->getSourceRange();
+ return true;
+ }
+
+ if (!IsStore) {
+ TheCall->setType(ValType);
+ return false;
+ }
+
+ ExprResult ValArg = TheCall->getArg(1);
+ InitializedEntity Entity = InitializedEntity::InitializeParameter(
+ Context, ValType, /*consume*/ false);
+ ValArg = PerformCopyInitialization(Entity, SourceLocation(), ValArg);
+ if (ValArg.isInvalid())
+ return true;
+
+ TheCall->setArg(1, ValArg.get());
+ TheCall->setType(Context.VoidTy);
+ return false;
}
return false;
diff --git a/clang/test/CodeGen/RISCV/ntlh-intrinsics/riscv32-zihintntl.c b/clang/test/CodeGen/RISCV/ntlh-intrinsics/riscv32-zihintntl.c
new file mode 100644
index 0000000000000..9126e686c4202
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/ntlh-intrinsics/riscv32-zihintntl.c
@@ -0,0 +1,181 @@
+// RUN: %clang_cc1 -triple riscv32 -target-feature +v -target-feature +experimental-zihintntl -emit-llvm %s -o - \
+// RUN: | FileCheck %s
+
+#include <riscv_ntlh.h>
+#include <riscv_vector.h>
+
+signed char sc;
+unsigned char uc;
+signed short ss;
+unsigned short us;
+signed int si;
+unsigned int ui;
+signed long long sll;
+unsigned long long ull;
+_Float16 h1, h2;
+float f1, f2;
+double d1, d2;
+typedef int v4si __attribute__((vector_size(16)));
+typedef signed short v8ss __attribute__((vector_size(16)));
+typedef signed char v16sc __attribute__((vector_size(16)));
+v4si v4si1, v4si2;
+v8ss v8ss1, v8ss2;
+v16sc v16sc1, v16sc2;
+vint32m1_t *scvi1, *scvi2;
+vint16m1_t *scvs1, *scvs2;
+vint8m1_t *scvc1, *scvc2;
+
+// clang-format off
+void ntl_all_sizes() { // CHECK-LABEL: ntl_all_sizes
+ uc = __riscv_ntl_load(&sc, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !5
+ sc = __riscv_ntl_load(&uc, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !5
+ us = __riscv_ntl_load(&ss, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+ ss = __riscv_ntl_load(&us, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+ ui = __riscv_ntl_load(&si, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+ si = __riscv_ntl_load(&ui, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+ ull = __riscv_ntl_load(&sll, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+ sll = __riscv_ntl_load(&ull, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+ h1 = __riscv_ntl_load(&h2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+ f1 = __riscv_ntl_load(&f2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+ d1 = __riscv_ntl_load(&d2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+ v4si1 = __riscv_ntl_load(&v4si2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5
+ v8ss1 = __riscv_ntl_load(&v8ss2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5
+ v16sc1 = __riscv_ntl_load(&v16sc2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5
+ *scvi1 = __riscv_ntl_load(scvi2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+ *scvs1 = __riscv_ntl_load(scvs2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+ *scvc1 = __riscv_ntl_load(scvc2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: load <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+
+ uc = __riscv_ntl_load(&sc, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !6
+ sc = __riscv_ntl_load(&uc, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !6
+ us = __riscv_ntl_load(&ss, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6
+ ss = __riscv_ntl_load(&us, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6
+ ui = __riscv_ntl_load(&si, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6
+ si = __riscv_ntl_load(&ui, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6
+ ull = __riscv_ntl_load(&sll, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+ sll = __riscv_ntl_load(&ull, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+ h1 = __riscv_ntl_load(&h2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6
+ f1 = __riscv_ntl_load(&f2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6
+ d1 = __riscv_ntl_load(&d2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+ v4si1 = __riscv_ntl_load(&v4si2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6
+ v8ss1 = __riscv_ntl_load(&v8ss2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6
+ v16sc1 = __riscv_ntl_load(&v16sc2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6
+ *scvi1 = __riscv_ntl_load(scvi2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+ *scvs1 = __riscv_ntl_load(scvs2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+ *scvc1 = __riscv_ntl_load(scvc2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: load <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+
+ uc = __riscv_ntl_load(&sc, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !7
+ sc = __riscv_ntl_load(&uc, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !7
+ us = __riscv_ntl_load(&ss, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7
+ ss = __riscv_ntl_load(&us, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7
+ ui = __riscv_ntl_load(&si, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7
+ si = __riscv_ntl_load(&ui, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7
+ ull = __riscv_ntl_load(&sll, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+ sll = __riscv_ntl_load(&ull, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+ h1 = __riscv_ntl_load(&h2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7
+ f1 = __riscv_ntl_load(&f2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7
+ d1 = __riscv_ntl_load(&d2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+ v4si1 = __riscv_ntl_load(&v4si2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7
+ v8ss1 = __riscv_ntl_load(&v8ss2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7
+ v16sc1 = __riscv_ntl_load(&v16sc2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7
+ *scvi1 = __riscv_ntl_load(scvi2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+ *scvs1 = __riscv_ntl_load(scvs2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+ *scvc1 = __riscv_ntl_load(scvc2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: load <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+
+ uc = __riscv_ntl_load(&sc, __RISCV_NTLH_ALL); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !8
+ sc = __riscv_ntl_load(&uc, __RISCV_NTLH_ALL); // CHECK: load i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !8
+ us = __riscv_ntl_load(&ss, __RISCV_NTLH_ALL); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8
+ ss = __riscv_ntl_load(&us, __RISCV_NTLH_ALL); // CHECK: load i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8
+ ui = __riscv_ntl_load(&si, __RISCV_NTLH_ALL); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8
+ si = __riscv_ntl_load(&ui, __RISCV_NTLH_ALL); // CHECK: load i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8
+ ull = __riscv_ntl_load(&sll, __RISCV_NTLH_ALL); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+ sll = __riscv_ntl_load(&ull, __RISCV_NTLH_ALL); // CHECK: load i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+ h1 = __riscv_ntl_load(&h2, __RISCV_NTLH_ALL); // CHECK: load half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8
+ f1 = __riscv_ntl_load(&f2, __RISCV_NTLH_ALL); // CHECK: load float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8
+ d1 = __riscv_ntl_load(&d2, __RISCV_NTLH_ALL); // CHECK: load double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+ v4si1 = __riscv_ntl_load(&v4si2, __RISCV_NTLH_ALL); // CHECK: load <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+ v8ss1 = __riscv_ntl_load(&v8ss2, __RISCV_NTLH_ALL); // CHECK: load <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+ v16sc1 = __riscv_ntl_load(&v16sc2, __RISCV_NTLH_ALL); // CHECK: load <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+ *scvi1 = __riscv_ntl_load(scvi2, __RISCV_NTLH_ALL); // CHECK: load <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+ *scvs1 = __riscv_ntl_load(scvs2, __RISCV_NTLH_ALL); // CHECK: load <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+ *scvc1 = __riscv_ntl_load(scvc2, __RISCV_NTLH_ALL); // CHECK: load <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+
+ __riscv_ntl_store(&uc, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&sc, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&us, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&ss, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&ui, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&si, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&ull, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&sll, 1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&h1, 1.0, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&f1, 1.0, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&d1, 1.0, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&v4si1, v4si2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&v8ss1, v8ss2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(&v16sc1, v16sc2, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(scvi2, *scvi1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(scvs2, *scvs1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+ __riscv_ntl_store(scvc2, *scvc1, __RISCV_NTLH_INNERMOST_PRIVATE); // CHECK: store <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !5
+
+ __riscv_ntl_store(&uc, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&sc, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&us, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&ss, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&ui, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&si, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&ull, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&sll, 1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&h1, 1.0, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&f1, 1.0, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&d1, 1.0, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&v4si1, v4si2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&v8ss1, v8ss2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(&v16sc1, v16sc2, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(scvi2, *scvi1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(scvs2, *scvs1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+ __riscv_ntl_store(scvc2, *scvc1, __RISCV_NTLH_ALL_PRIVATE); // CHECK: store <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !6
+
+ __riscv_ntl_store(&uc, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&sc, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&us, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&ss, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&ui, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&si, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&ull, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&sll, 1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&h1, 1.0, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&f1, 1.0, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&d1, 1.0, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&v4si1, v4si2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&v8ss1, v8ss2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(&v16sc1, v16sc2, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(scvi2, *scvi1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(scvs2, *scvs1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+ __riscv_ntl_store(scvc2, *scvc1, __RISCV_NTLH_INNERMOST_SHARED); // CHECK: store <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !7
+
+ __riscv_ntl_store(&uc, 1, __RISCV_NTLH_ALL); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&sc, 1, __RISCV_NTLH_ALL); // CHECK: store i8{{.*}}align 1, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&us, 1, __RISCV_NTLH_ALL); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&ss, 1, __RISCV_NTLH_ALL); // CHECK: store i16{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&ui, 1, __RISCV_NTLH_ALL); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&si, 1, __RISCV_NTLH_ALL); // CHECK: store i32{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&ull, 1, __RISCV_NTLH_ALL); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&sll, 1, __RISCV_NTLH_ALL); // CHECK: store i64{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&h1, 1.0, __RISCV_NTLH_ALL); // CHECK: store half{{.*}}align 2, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&f1, 1.0, __RISCV_NTLH_ALL); // CHECK: store float{{.*}}align 4, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&d1, 1.0, __RISCV_NTLH_ALL); // CHECK: store double{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&v4si1, v4si2, __RISCV_NTLH_ALL); // CHECK: store <4 x i32>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&v8ss1, v8ss2, __RISCV_NTLH_ALL); // CHECK: store <8 x i16>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(&v16sc1, v16sc2, __RISCV_NTLH_ALL); // CHECK: store <16 x i8>{{.*}}align 16, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(scvi2, *scvi1, __RISCV_NTLH_ALL); // CHECK: store <vscale x 2 x i32>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(scvs2, *scvs1, __RISCV_NTLH_ALL); // CHECK: store <vscale x 4 x i16>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+ __riscv_ntl_store(scvc2, *scvc1, __RISCV_NTLH_ALL); // CHECK: store <vscale x 8 x i8>{{.*}}align 8, !nontemporal !4, !riscv-nontemporal-domain !8
+
+}
+// clang-format on
+
+// CHECK: !4 = !{i32 1}
+// CHECK: !5 = !{i32 2}
+// CHECK: !6 = !{i32 3}
+// CHECK: !7 = !{i32 4}
+// CHECK: !8 = !{i32 5}
\ No newline at end of file
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 747f40ff38f4c..be90bdf970d19 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -433,6 +433,13 @@ class TargetLoweringBase {
return MachineMemOperand::MONone;
}
+ /// This callback is used to inspect load/store SDNode.
+ /// The default implementation does nothing.
+ virtual MachineMemOperand::Flags
+ getTargetMMOFlags(const MemSDNode &Node) const {
+ return MachineMemOperand::MONone;
+ }
+
MachineMemOperand::Flags
getLoadMemOperandFlags(const LoadInst &LI, const DataLayout &DL,
AssumptionCache *AC = nullptr,
@@ -672,6 +679,13 @@ class TargetLoweringBase {
return false;
}
+ /// Return true if it is valid to merge the TargetMMOFlags in two SDNodes.
+ virtual bool
+ areTwoSDNodeTargetMMOFlagsMergeable(const MemSDNode &NodeX,
+ const MemSDNode &NodeY) const {
+ return true;
+ }
+
/// Use bitwise logic to make pairs of compares more efficient. For example:
/// and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
/// This should be true when it takes more than one instruction to lower
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ade34f70c9768..842ecc751d12e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19412,6 +19412,8 @@ void DAGCombiner::getStoreMergeCandidates(
// Don't mix temporal stores with non-temporal stores.
if (St->isNonTemporal() != Other->isNonTemporal())
return false;
+ if (!TLI.areTwoSDNodeTargetMMOFlagsMergeable(*St, *Other))
+ return false;
SDValue OtherBC = peekThroughBitcasts(Other->getValue());
// Allow merging constants of different types as integers.
bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
@@ -19437,6 +19439,9 @@ void DAGCombiner::getStoreMergeCandidates(
// Don't mix temporal loads with non-temporal loads.
if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
return false;
+ if (!TLI.areTwoSDNodeTargetMMOFlagsMergeable(*cast<LoadSDNode>(Val),
+ *OtherLd))
+ return false;
if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
return false;
break;
@@ -20061,10 +20066,14 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
if (IsNonTemporalLoad)
LdMMOFlags |= MachineMemOperand::MONonTemporal;
+ LdMMOFlags |= TLI.getTargetMMOFlags(*FirstLoad);
+
MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore
? MachineMemOperand::MONonTemporal
: MachineMemOperand::MONone;
+ StMMOFlags |= TLI.getTargetMMOFlags(*StoreNodes[0].MemNode);
+
SDValue NewLoad, NewStore;
if (UseVectorTy || !DoIntegerTruncate) {
NewLoad = DAG.getLoad(
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 677c4befbff76..1f6c59309a43c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -15736,6 +15736,57 @@ RISCVTargetLowering::getRegisterByName(const char *RegName, LLT VT,
return Reg;
}
+MachineMemOperand::Flags
+RISCVTargetLowering::getTargetMMOFlags(const Instruction &I) const {
+ const MDNode *NontemporalInfo = I.getMetadata(LLVMContext::MD_nontemporal);
+
+ if (NontemporalInfo == nullptr)
+ return MachineMemOperand::MONone;
+
+ // 1 is the default value and behaves the same as __RISCV_NTLH_ALL
+ // 2 -> __RISCV_NTLH_INNERMOST_PRIVATE
+ // 3 -> __RISCV_NTLH_ALL_PRIVATE
+ // 4 -> __RISCV_NTLH_INNERMOST_SHARED
+ // 5 -> __RISCV_NTLH_ALL
+ int NontemporalLevel = 5;
+ const MDNode *RISCVNontemporalInfo =
+ I.getMetadata("riscv-nontemporal-domain");
+ if (RISCVNontemporalInfo != nullptr)
+ NontemporalLevel =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(RISCVNontemporalInfo->getOperand(0))
+ ->getValue())
+ ->getZExtValue();
+
+ assert((1 <= NontemporalLevel && NontemporalLevel <= 5) &&
+ "RISC-V target doesn't support this non-temporal domain.");
+
+ NontemporalLevel -= 2;
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
+ if (NontemporalLevel & 0b1)
+ Flags |= MONontemporalBit0;
+ if (NontemporalLevel & 0b10)
+ Flags |= MONontemporalBit1;
+
+ return Flags;
+}
+
+MachineMemOperand::Flags
+RISCVTargetLowering::getTargetMMOFlags(const MemSDNode &Node) const {
+
+ MachineMemOperand::Flags NodeFlags = Node.getMemOperand()->getFlags();
+ MachineMemOperand::Flags TargetFlags = MachineMemOperand::MONone;
+ TargetFlags |= (NodeFlags & MONontemporalBit0);
+ TargetFlags |= (NodeFlags & MONontemporalBit1);
+
+ return TargetFlags;
+}
+
+bool RISCVTargetLowering::areTwoSDNodeTargetMMOFlagsMergeable(
+ const MemSDNode &NodeX, const MemSDNode &NodeY) const {
+ return getTargetMMOFlags(NodeX) == getTargetMMOFlags(NodeY);
+}
+
namespace llvm::RISCVVIntrinsicsTable {
#define GET_RISCVVIntrinsicsTable_IMPL
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index de0e970108b81..3780986407601 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -482,6 +482,16 @@ class RISCVTargetLowering : public TargetLowering {
// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
+ MachineMemOperand::Flags
+ getTargetMMOFlags(const Instruction &I) const override;
+
+ MachineMemOperand::Flags
+ getTargetMMOFlags(const MemSDNode &Node) const override;
+
+ bool
+ areTwoSDNodeTargetMMOFlagsMergeable(const MemSDNode &NodeX,
+ const MemSDNode &NodeY) const override;
+
ConstraintType getConstraintType(StringRef Constraint) const override;
unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp b/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
index 209438a6165bb..30f74b6995d7d 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
@@ -67,11 +67,27 @@ bool RISCVInsertNTLHInsts::runOnMachineFunction(MachineFunction &MF) {
continue;
MachineMemOperand *MMO = *(MBBI.memoperands_begin());
if (MMO->isNonTemporal()) {
+ uint64_t NontemporalMode = 0;
+ if (MMO->getFlags() & MONontemporalBit0)
+ NontemporalMode += 0b1;
+ if (MMO->getFlags() & MONontemporalBit1)
+ NontemporalMode += 0b10;
+
+ static const uint16_t NTLOpc[] = {
+ RISCV::PseudoNTLP1, RISCV::PseudoNTLPALL, RISCV::PseudoNTLS1,
+ RISCV::PseudoNTLALL};
+ static const uint16_t CNTLOpc[] = {
+ RISCV::PseudoCNTLP1, RISCV::PseudoCNTLPALL, RISCV::PseudoCNTLS1,
+ RISCV::PseudoCNTLALL};
+
+ unsigned CurrNTLOpc;
DebugLoc DL = MBBI.getDebugLoc();
if (ST.hasStdExtCOrZca() && ST.enableRVCHintInstrs())
- BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoCNTLALL));
+ CurrNTLOpc = CNTLOpc[NontemporalMode];
else
- BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoNTLALL));
+ CurrNTLOpc = NTLOpc[NontemporalMode];
+
+ BuildMI(MBB, MBBI, DL, TII->get(CurrNTLOpc));
Changed = true;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 0f96d3c19da4e..4a8c5966d4b0d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2628,6 +2628,14 @@ void RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF,
}
}
+ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
+RISCVInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
+ static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
+ {{MONontemporalBit0, "riscv-nontemporal-domain-bit-0"},
+ {MONontemporalBit1, "riscv-nontemporal-domain-bit-1"}};
+ return makeArrayRef(TargetFlags);
+}
+
// Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
bool RISCV::isSEXT_W(const MachineInstr &MI) {
return MI.getOpcode() == RISCV::ADDIW && MI.getOperand(1).isReg() &&
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 01f112a386d08..9810f73930ca6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -25,6 +25,11 @@ namespace llvm {
class RISCVSubtarget;
+static const MachineMemOperand::Flags MONontemporalBit0 =
+ MachineMemOperand::MOTargetFlag1;
+static const MachineMemOperand::Flags MONontemporalBit1 =
+ MachineMemOperand::MOTargetFlag2;
+
namespace RISCVCC {
enum CondCode {
@@ -227,6 +232,9 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
+ ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
+ getSerializableMachineMemOperandTargetFlags() const override;
+
protected:
const RISCVSubtarget &STI;
};
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
index ecc5ddedee007..b8adaf4d84831 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
@@ -12,11 +12,23 @@
//===----------------------------------------------------------------------===//
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 4 in {
- def PseudoNTLALL : Pseudo<(outs), (ins), [], "ntl.all">,
+ def PseudoNTLP1 : Pseudo<(outs), (ins), [], "ntl.p1">,
+ PseudoInstExpansion<(ADD X0, X0, X2)>;
+ def PseudoNTLPALL : Pseudo<(outs), (ins), [], "ntl.pall">,
+ PseudoInstExpansion<(ADD X0, X0, X3)>;
+ def PseudoNTLS1 : Pseudo<(outs), (ins), [], "ntl.s1">,
+ PseudoInstExpansion<(ADD X0, X0, X4)>;
+ def PseudoNTLALL : Pseudo<(outs), (ins), [], "ntl.all">,
PseudoInstExpansion<(ADD X0, X0, X5)>;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 2 in {
- def PseudoCNTLALL : Pseudo<(outs), (ins), [], "c.ntl.all">,
+ def PseudoCNTLP1 : Pseudo<(outs), (ins), [], "c.ntl.p1">,
+ PseudoInstExpansion<(C_ADD_HINT X0, X0, X2)>;
+ def PseudoCNTLPALL : Pseudo<(outs), (ins), [], "c.ntl.pall">,
+ PseudoInstExpansion<(C_ADD_HINT X0, X0, X3)>;
+ def PseudoCNTLS1 : Pseudo<(outs), (ins), [], "c.ntl.s1">,
+ PseudoInstExpansion<(C_ADD_HINT X0, X0, X4)>;
+ def PseudoCNTLALL : Pseudo<(outs), (ins), [], "c.ntl.all">,
PseudoInstExpansion<(C_ADD_HINT X0, X0, X5)>;
}
diff --git a/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll b/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
index 6a4ecf38427ec..66dad442dc087 100644
--- a/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
+++ b/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
@@ -130,4 +130,520 @@ define void @test_nontemporal_store_nxv16i8(ptr %p, <vscale x 16 x i8> %v) {
ret void
}
+define <vscale x 2 x i64> @test_nontemporal_P1_load_nxv2i64(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_nxv2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vl2re64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_nxv2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vl2re64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 4 x i32> @test_nontemporal_P1_load_nxv4i32(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_nxv4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vl2re32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_nxv4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vl2re32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 8 x i16> @test_nontemporal_P1_load_nxv8i16(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_nxv8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vl2re16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_nxv8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vl2re16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 16 x i8> @test_nontemporal_P1_load_nxv16i8(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_nxv16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vl2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_nxv16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vl2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret <vscale x 16 x i8> %1
+}
+
+define void @test_nontemporal_P1_store_nxv2i64(ptr %p, <vscale x 2 x i64> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_nxv2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_nxv2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_nxv4i32(ptr %p, <vscale x 4 x i32> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_nxv4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_nxv4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_nxv8i16(ptr %p, <vscale x 8 x i16> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_nxv8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_nxv8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_nxv16i8(ptr %p, <vscale x 16 x i8> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_nxv16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_nxv16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define <vscale x 2 x i64> @test_nontemporal_PALL_load_nxv2i64(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_nxv2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vl2re64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_nxv2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vl2re64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 4 x i32> @test_nontemporal_PALL_load_nxv4i32(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_nxv4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vl2re32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_nxv4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vl2re32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 8 x i16> @test_nontemporal_PALL_load_nxv8i16(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_nxv8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vl2re16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_nxv8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vl2re16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 16 x i8> @test_nontemporal_PALL_load_nxv16i8(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_nxv16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vl2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_nxv16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vl2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret <vscale x 16 x i8> %1
+}
+
+define void @test_nontemporal_PALL_store_nxv2i64(ptr %p, <vscale x 2 x i64> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_nxv2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_nxv2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_nxv4i32(ptr %p, <vscale x 4 x i32> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_nxv4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_nxv4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_nxv8i16(ptr %p, <vscale x 8 x i16> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_nxv8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_nxv8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_nxv16i8(ptr %p, <vscale x 16 x i8> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_nxv16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_nxv16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define <vscale x 2 x i64> @test_nontemporal_S1_load_nxv2i64(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_nxv2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vl2re64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_nxv2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vl2re64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 4 x i32> @test_nontemporal_S1_load_nxv4i32(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_nxv4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vl2re32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_nxv4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vl2re32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 8 x i16> @test_nontemporal_S1_load_nxv8i16(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_nxv8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vl2re16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_nxv8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vl2re16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 16 x i8> @test_nontemporal_S1_load_nxv16i8(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_nxv16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vl2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_nxv16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vl2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret <vscale x 16 x i8> %1
+}
+
+define void @test_nontemporal_S1_store_nxv2i64(ptr %p, <vscale x 2 x i64> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_nxv2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_nxv2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_nxv4i32(ptr %p, <vscale x 4 x i32> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_nxv4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_nxv4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_nxv8i16(ptr %p, <vscale x 8 x i16> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_nxv8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_nxv8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_nxv16i8(ptr %p, <vscale x 16 x i8> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_nxv16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_nxv16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define <vscale x 2 x i64> @test_nontemporal_ALL_load_nxv2i64(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_nxv2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vl2re64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_nxv2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vl2re64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 4 x i32> @test_nontemporal_ALL_load_nxv4i32(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_nxv4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vl2re32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_nxv4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vl2re32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 8 x i16> @test_nontemporal_ALL_load_nxv8i16(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_nxv8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vl2re16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_nxv8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vl2re16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 16 x i8> @test_nontemporal_ALL_load_nxv16i8(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_nxv16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vl2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_nxv16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vl2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret <vscale x 16 x i8> %1
+}
+
+define void @test_nontemporal_ALL_store_nxv2i64(ptr %p, <vscale x 2 x i64> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_nxv2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_nxv2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_nxv4i32(ptr %p, <vscale x 4 x i32> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_nxv4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_nxv4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_nxv8i16(ptr %p, <vscale x 8 x i16> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_nxv8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_nxv8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_nxv16i8(ptr %p, <vscale x 16 x i8> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_nxv16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_nxv16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
!0 = !{i32 1}
+!1 = !{i32 2}
+!2 = !{i32 3}
+!3 = !{i32 4}
+!4 = !{i32 5}
diff --git a/llvm/test/CodeGen/RISCV/nontemporal.ll b/llvm/test/CodeGen/RISCV/nontemporal.ll
index 190b896f486c6..e1b19a9c33835 100644
--- a/llvm/test/CodeGen/RISCV/nontemporal.ll
+++ b/llvm/test/CodeGen/RISCV/nontemporal.ll
@@ -1438,4 +1438,5665 @@ define void @test_nontemporal_store_v2i64(ptr %p, <2 x i64> %v) {
ret void
}
+define i64 @test_nontemporal_P1_load_i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a2, 0(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a1, 4(a0)
+; CHECK-RV32-NEXT: mv a0, a2
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: ld a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a2, 0(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a1, 4(a0)
+; CHECK-RV32C-NEXT: mv a0, a2
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: ld a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: lw a2, 0(a0)
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: lw a1, 4(a0)
+; CHECK-RV32V-NEXT: mv a0, a2
+; CHECK-RV32V-NEXT: ret
+ %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret i64 %1
+}
+
+define i32 @test_nontemporal_P1_load_i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: lw a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: lw a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: lw a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: lw a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load i32, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret i32 %1
+}
+
+define i16 @test_nontemporal_P1_load_i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: lh a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lh a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: lh a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lh a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: lh a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: lh a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load i16, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret i16 %1
+}
+
+define i8 @test_nontemporal_P1_load_i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: lbu a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lbu a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: lbu a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lbu a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: lbu a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: lbu a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load i8, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret i8 %1
+}
+
+define half @test_nontemporal_P1_load_half(ptr %p) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_half:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: flh fa5, 0(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: flh fa4, 6(a0)
+; CHECK-RV64-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_half:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: flh fa5, 0(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: flh fa4, 6(a0)
+; CHECK-RV32-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_half:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: flh fa5, 0(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: flh fa4, 6(a0)
+; CHECK-RV64C-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_half:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: flh fa5, 0(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: flh fa4, 6(a0)
+; CHECK-RV32C-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_half:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: flh fa5, 0(a0)
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: flh fa4, 6(a0)
+; CHECK-RV64V-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_half:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: flh fa5, 0(a0)
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: flh fa4, 6(a0)
+; CHECK-RV32V-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32V-NEXT: ret
+ %1 = load half, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ %2 = getelementptr half, ptr %p, i32 3
+ %3 = load half, ptr %2, !nontemporal !0, !riscv-nontemporal-domain !1
+ %4 = fadd half %1, %3
+ ret half %4
+}
+
+define float @test_nontemporal_P1_load_float(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_float:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: flw fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_float:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: flw fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_float:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: flw fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_float:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: flw fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_float:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: flw fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_float:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: flw fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load float, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret float %1
+}
+
+define double @test_nontemporal_P1_load_double(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_double:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: fld fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_double:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: fld fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_double:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: fld fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_double:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: fld fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_double:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: fld fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_double:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: fld fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load double, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret double %1
+}
+
+define <16 x i8> @test_nontemporal_P1_load_v16i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_v16i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_v16i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v16i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v16i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_v16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vle8.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_v16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vle8.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret <16 x i8> %1
+}
+
+define <8 x i16> @test_nontemporal_P1_load_v8i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_v8i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_v8i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v8i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v8i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_v8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vle16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_v8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vle16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret <8 x i16> %1
+}
+
+define <4 x i32> @test_nontemporal_P1_load_v4i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_v4i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_v4i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v4i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v4i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_v4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vle32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_v4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vle32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @test_nontemporal_P1_load_v2i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_load_v2i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: ld a2, 0(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: ld a1, 8(a0)
+; CHECK-RV64-NEXT: mv a0, a2
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_load_v2i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v2i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: ld a2, 0(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: ld a1, 8(a0)
+; CHECK-RV64C-NEXT: mv a0, a2
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v2i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_load_v2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vle64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_load_v2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vle64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret <2 x i64> %1
+}
+
+define void @test_nontemporal_P1_store_i64(ptr %p, i64 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sw a2, 4(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sw a2, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: sd a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: sw a2, 4(a0)
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: sw a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i64 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_i32(ptr %p, i32 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sw a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: sw a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: sw a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i32 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_i16(ptr %p, i16 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sh a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sh a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sh a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sh a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: sh a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: sh a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i16 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_i8(ptr %p, i8 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: sb a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: sb a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i8 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_half(ptr %p, half %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_half:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_half:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_half:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_half:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_half:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_half:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store half %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_float(ptr %p, float %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_float:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_float:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_float:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_float:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_float:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_float:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store float %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_double(ptr %p, double %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_double:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_double:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_double:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_double:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_double:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_double:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store double %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_v16i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: addi sp, sp, -16
+; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT: .cfi_offset s0, -8
+; CHECK-RV64-NEXT: .cfi_offset s1, -16
+; CHECK-RV64-NEXT: lbu a2, 0(a1)
+; CHECK-RV64-NEXT: lbu a3, 8(a1)
+; CHECK-RV64-NEXT: lbu a4, 16(a1)
+; CHECK-RV64-NEXT: lbu a5, 24(a1)
+; CHECK-RV64-NEXT: lbu a6, 32(a1)
+; CHECK-RV64-NEXT: lbu a7, 40(a1)
+; CHECK-RV64-NEXT: lbu t0, 48(a1)
+; CHECK-RV64-NEXT: lbu t1, 56(a1)
+; CHECK-RV64-NEXT: lbu t2, 64(a1)
+; CHECK-RV64-NEXT: lbu t3, 72(a1)
+; CHECK-RV64-NEXT: lbu t4, 80(a1)
+; CHECK-RV64-NEXT: lbu t5, 88(a1)
+; CHECK-RV64-NEXT: lbu t6, 120(a1)
+; CHECK-RV64-NEXT: lbu s0, 112(a1)
+; CHECK-RV64-NEXT: lbu s1, 104(a1)
+; CHECK-RV64-NEXT: lbu a1, 96(a1)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb t6, 15(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb s0, 14(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb s1, 13(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb a1, 12(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb t5, 11(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb t4, 10(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb t3, 9(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb t2, 8(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb t1, 7(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb t0, 6(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb a7, 5(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb a6, 4(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb a5, 3(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb a4, 2(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb a3, 1(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sb a2, 0(a0)
+; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT: addi sp, sp, 16
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_v16i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: addi sp, sp, -16
+; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT: .cfi_offset s0, -4
+; CHECK-RV32-NEXT: .cfi_offset s1, -8
+; CHECK-RV32-NEXT: lbu a2, 0(a1)
+; CHECK-RV32-NEXT: lbu a3, 4(a1)
+; CHECK-RV32-NEXT: lbu a4, 8(a1)
+; CHECK-RV32-NEXT: lbu a5, 12(a1)
+; CHECK-RV32-NEXT: lbu a6, 16(a1)
+; CHECK-RV32-NEXT: lbu a7, 20(a1)
+; CHECK-RV32-NEXT: lbu t0, 24(a1)
+; CHECK-RV32-NEXT: lbu t1, 28(a1)
+; CHECK-RV32-NEXT: lbu t2, 32(a1)
+; CHECK-RV32-NEXT: lbu t3, 36(a1)
+; CHECK-RV32-NEXT: lbu t4, 40(a1)
+; CHECK-RV32-NEXT: lbu t5, 44(a1)
+; CHECK-RV32-NEXT: lbu t6, 60(a1)
+; CHECK-RV32-NEXT: lbu s0, 56(a1)
+; CHECK-RV32-NEXT: lbu s1, 52(a1)
+; CHECK-RV32-NEXT: lbu a1, 48(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb t6, 15(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb s0, 14(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb s1, 13(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb a1, 12(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb t5, 11(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb t4, 10(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb t3, 9(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb t2, 8(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb t1, 7(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb t0, 6(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb a7, 5(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb a6, 4(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb a5, 3(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb a4, 2(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb a3, 1(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sb a2, 0(a0)
+; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT: addi sp, sp, 16
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v16i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: addi sp, sp, -16
+; CHECK-RV64C-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV64C-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT: .cfi_offset s0, -8
+; CHECK-RV64C-NEXT: .cfi_offset s1, -16
+; CHECK-RV64C-NEXT: lbu a6, 0(a1)
+; CHECK-RV64C-NEXT: lbu a7, 8(a1)
+; CHECK-RV64C-NEXT: lbu t0, 16(a1)
+; CHECK-RV64C-NEXT: lbu t1, 24(a1)
+; CHECK-RV64C-NEXT: lbu t2, 32(a1)
+; CHECK-RV64C-NEXT: lbu t3, 40(a1)
+; CHECK-RV64C-NEXT: lbu t4, 48(a1)
+; CHECK-RV64C-NEXT: lbu t5, 56(a1)
+; CHECK-RV64C-NEXT: lbu t6, 64(a1)
+; CHECK-RV64C-NEXT: lbu a3, 72(a1)
+; CHECK-RV64C-NEXT: lbu a4, 80(a1)
+; CHECK-RV64C-NEXT: lbu a5, 88(a1)
+; CHECK-RV64C-NEXT: lbu a2, 120(a1)
+; CHECK-RV64C-NEXT: lbu s0, 112(a1)
+; CHECK-RV64C-NEXT: lbu s1, 104(a1)
+; CHECK-RV64C-NEXT: lbu a1, 96(a1)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb a2, 15(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb s0, 14(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb s1, 13(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb a1, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb a5, 11(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb a4, 10(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb a3, 9(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb t6, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb t5, 7(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb t4, 6(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb t3, 5(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb t2, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb t1, 3(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb t0, 2(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb a7, 1(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sb a6, 0(a0)
+; CHECK-RV64C-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT: ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT: addi sp, sp, 16
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v16i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: addi sp, sp, -16
+; CHECK-RV32C-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32C-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT: .cfi_offset s0, -4
+; CHECK-RV32C-NEXT: .cfi_offset s1, -8
+; CHECK-RV32C-NEXT: lbu a6, 0(a1)
+; CHECK-RV32C-NEXT: lbu a7, 4(a1)
+; CHECK-RV32C-NEXT: lbu t0, 8(a1)
+; CHECK-RV32C-NEXT: lbu t1, 12(a1)
+; CHECK-RV32C-NEXT: lbu t2, 16(a1)
+; CHECK-RV32C-NEXT: lbu t3, 20(a1)
+; CHECK-RV32C-NEXT: lbu t4, 24(a1)
+; CHECK-RV32C-NEXT: lbu t5, 28(a1)
+; CHECK-RV32C-NEXT: lbu t6, 32(a1)
+; CHECK-RV32C-NEXT: lbu a3, 36(a1)
+; CHECK-RV32C-NEXT: lbu a4, 40(a1)
+; CHECK-RV32C-NEXT: lbu a5, 44(a1)
+; CHECK-RV32C-NEXT: lbu a2, 60(a1)
+; CHECK-RV32C-NEXT: lbu s0, 56(a1)
+; CHECK-RV32C-NEXT: lbu s1, 52(a1)
+; CHECK-RV32C-NEXT: lbu a1, 48(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb a2, 15(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb s0, 14(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb s1, 13(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb a1, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb a5, 11(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb a4, 10(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb a3, 9(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb t6, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb t5, 7(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb t4, 6(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb t3, 5(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb t2, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb t1, 3(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb t0, 2(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb a7, 1(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sb a6, 0(a0)
+; CHECK-RV32C-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT: addi sp, sp, 16
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_v16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vse8.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_v16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vse8.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_v8i16(ptr %p, <8 x i16> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_v8i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lh a2, 0(a1)
+; CHECK-RV64-NEXT: lh a3, 8(a1)
+; CHECK-RV64-NEXT: lh a4, 16(a1)
+; CHECK-RV64-NEXT: lh a5, 24(a1)
+; CHECK-RV64-NEXT: lh a6, 56(a1)
+; CHECK-RV64-NEXT: lh a7, 48(a1)
+; CHECK-RV64-NEXT: lh t0, 40(a1)
+; CHECK-RV64-NEXT: lh a1, 32(a1)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sh a6, 14(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sh a7, 12(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sh t0, 10(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sh a1, 8(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sh a5, 6(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sh a4, 4(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sh a3, 2(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sh a2, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_v8i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lh a2, 0(a1)
+; CHECK-RV32-NEXT: lh a3, 4(a1)
+; CHECK-RV32-NEXT: lh a4, 8(a1)
+; CHECK-RV32-NEXT: lh a5, 12(a1)
+; CHECK-RV32-NEXT: lh a6, 28(a1)
+; CHECK-RV32-NEXT: lh a7, 24(a1)
+; CHECK-RV32-NEXT: lh t0, 20(a1)
+; CHECK-RV32-NEXT: lh a1, 16(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sh a6, 14(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sh a7, 12(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sh t0, 10(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sh a1, 8(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sh a5, 6(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sh a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sh a3, 2(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sh a2, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v8i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: lh a6, 0(a1)
+; CHECK-RV64C-NEXT: lh a7, 8(a1)
+; CHECK-RV64C-NEXT: lh t0, 16(a1)
+; CHECK-RV64C-NEXT: lh a5, 24(a1)
+; CHECK-RV64C-NEXT: lh a2, 56(a1)
+; CHECK-RV64C-NEXT: lh a3, 48(a1)
+; CHECK-RV64C-NEXT: lh a4, 40(a1)
+; CHECK-RV64C-NEXT: lh a1, 32(a1)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sh a2, 14(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sh a3, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sh a4, 10(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sh a1, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sh a5, 6(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sh t0, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sh a7, 2(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sh a6, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v8i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lh a6, 0(a1)
+; CHECK-RV32C-NEXT: lh a7, 4(a1)
+; CHECK-RV32C-NEXT: lh t0, 8(a1)
+; CHECK-RV32C-NEXT: lh a5, 12(a1)
+; CHECK-RV32C-NEXT: lh a2, 28(a1)
+; CHECK-RV32C-NEXT: lh a3, 24(a1)
+; CHECK-RV32C-NEXT: lh a4, 20(a1)
+; CHECK-RV32C-NEXT: lh a1, 16(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sh a2, 14(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sh a3, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sh a4, 10(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sh a1, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sh a5, 6(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sh t0, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sh a7, 2(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sh a6, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_v8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vse16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_v8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vse16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_v4i32(ptr %p, <4 x i32> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_v4i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lw a2, 24(a1)
+; CHECK-RV64-NEXT: lw a3, 16(a1)
+; CHECK-RV64-NEXT: lw a4, 8(a1)
+; CHECK-RV64-NEXT: lw a1, 0(a1)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sw a2, 12(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sw a3, 8(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sw a4, 4(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_v4i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v4i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: lw a2, 24(a1)
+; CHECK-RV64C-NEXT: lw a3, 16(a1)
+; CHECK-RV64C-NEXT: lw a4, 8(a1)
+; CHECK-RV64C-NEXT: lw a1, 0(a1)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sw a2, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sw a3, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sw a4, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sw a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v4i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_v4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vse32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_v4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vse32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define void @test_nontemporal_P1_store_v2i64(ptr %p, <2 x i64> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_P1_store_v2i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_P1_store_v2i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v2i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v2i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_P1_store_v2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.p1
+; CHECK-RV64V-NEXT: vse64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_P1_store_v2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: vse64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
+ ret void
+}
+
+define i64 @test_nontemporal_PALL_load_i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a2, 0(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a1, 4(a0)
+; CHECK-RV32-NEXT: mv a0, a2
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: ld a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a2, 0(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a1, 4(a0)
+; CHECK-RV32C-NEXT: mv a0, a2
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: ld a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: lw a2, 0(a0)
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: lw a1, 4(a0)
+; CHECK-RV32V-NEXT: mv a0, a2
+; CHECK-RV32V-NEXT: ret
+ %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret i64 %1
+}
+
+define i32 @test_nontemporal_PALL_load_i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: lw a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: lw a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: lw a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: lw a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load i32, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret i32 %1
+}
+
+define i16 @test_nontemporal_PALL_load_i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: lh a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lh a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: lh a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lh a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: lh a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: lh a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load i16, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret i16 %1
+}
+
+define i8 @test_nontemporal_PALL_load_i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: lbu a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lbu a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: lbu a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lbu a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: lbu a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: lbu a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load i8, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret i8 %1
+}
+
+define half @test_nontemporal_PALL_load_half(ptr %p) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_half:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: flh fa5, 0(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: flh fa4, 6(a0)
+; CHECK-RV64-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_half:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: flh fa5, 0(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: flh fa4, 6(a0)
+; CHECK-RV32-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_half:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: flh fa5, 0(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: flh fa4, 6(a0)
+; CHECK-RV64C-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_half:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: flh fa5, 0(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: flh fa4, 6(a0)
+; CHECK-RV32C-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_half:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: flh fa5, 0(a0)
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: flh fa4, 6(a0)
+; CHECK-RV64V-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_half:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: flh fa5, 0(a0)
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: flh fa4, 6(a0)
+; CHECK-RV32V-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32V-NEXT: ret
+ %1 = load half, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ %2 = getelementptr half, ptr %p, i32 3
+ %3 = load half, ptr %2, !nontemporal !0, !riscv-nontemporal-domain !2
+ %4 = fadd half %1, %3
+ ret half %4
+}
+
+define float @test_nontemporal_PALL_load_float(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_float:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: flw fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_float:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: flw fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_float:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: flw fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_float:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: flw fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_float:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: flw fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_float:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: flw fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load float, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret float %1
+}
+
+define double @test_nontemporal_PALL_load_double(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_double:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: fld fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_double:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: fld fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_double:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: fld fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_double:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: fld fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_double:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: fld fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_double:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: fld fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load double, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret double %1
+}
+
+define <16 x i8> @test_nontemporal_PALL_load_v16i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v16i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v16i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v16i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v16i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_v16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vle8.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_v16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vle8.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret <16 x i8> %1
+}
+
+define <8 x i16> @test_nontemporal_PALL_load_v8i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v8i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v8i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v8i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v8i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_v8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vle16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_v8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vle16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret <8 x i16> %1
+}
+
+define <4 x i32> @test_nontemporal_PALL_load_v4i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v4i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v4i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v4i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v4i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_v4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vle32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_v4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vle32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @test_nontemporal_PALL_load_v2i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v2i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: ld a2, 0(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: ld a1, 8(a0)
+; CHECK-RV64-NEXT: mv a0, a2
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v2i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v2i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: ld a2, 0(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: ld a1, 8(a0)
+; CHECK-RV64C-NEXT: mv a0, a2
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v2i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_v2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vle64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_v2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vle64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret <2 x i64> %1
+}
+
+define void @test_nontemporal_PALL_store_i64(ptr %p, i64 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sw a2, 4(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sw a2, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: sd a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: sw a2, 4(a0)
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: sw a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i64 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_i32(ptr %p, i32 %v) {
+; Nontemporal i32 store, PALL domain (metadata !2, defined elsewhere in this
+; file): a single sw preceded by ntl.pall / c.ntl.pall on every config.
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sw a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: sw a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: sw a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i32 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_i16(ptr %p, i16 %v) {
+; Nontemporal i16 store, PALL domain (metadata !2, defined elsewhere in this
+; file): a single sh preceded by ntl.pall / c.ntl.pall on every config.
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sh a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sh a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sh a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sh a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: sh a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: sh a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i16 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_i8(ptr %p, i8 %v) {
+; Nontemporal i8 store, PALL domain (metadata !2, defined elsewhere in this
+; file): a single sb preceded by ntl.pall / c.ntl.pall on every config.
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: sb a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: sb a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i8 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_half(ptr %p, half %v) {
+; Nontemporal half (f16) store, PALL domain (metadata !2, defined elsewhere in
+; this file): the FP store fsh must also carry the ntl.pall / c.ntl.pall hint.
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_half:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_half:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_half:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_half:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_half:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_half:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store half %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_float(ptr %p, float %v) {
+; Nontemporal float store, PALL domain (metadata !2, defined elsewhere in this
+; file): fsw must carry the ntl.pall / c.ntl.pall hint on every config.
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_float:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_float:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_float:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_float:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_float:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_float:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store float %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_double(ptr %p, double %v) {
+; Nontemporal double store, PALL domain (metadata !2, defined elsewhere in
+; this file): fsd must carry the ntl.pall / c.ntl.pall hint on every config.
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_double:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_double:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_double:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_double:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_double:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_double:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store double %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
+; Nontemporal <16 x i8> store, PALL domain (metadata !2, defined elsewhere in
+; this file). Without vector support the store is scalarized into 16 sb's,
+; each preceded by its own ntl.pall / c.ntl.pall hint (s0/s1 are spilled to
+; hold the extra lanes); with V the whole store is one vse8.v with one hint.
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v16i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: addi sp, sp, -16
+; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT: .cfi_offset s0, -8
+; CHECK-RV64-NEXT: .cfi_offset s1, -16
+; CHECK-RV64-NEXT: lbu a2, 0(a1)
+; CHECK-RV64-NEXT: lbu a3, 8(a1)
+; CHECK-RV64-NEXT: lbu a4, 16(a1)
+; CHECK-RV64-NEXT: lbu a5, 24(a1)
+; CHECK-RV64-NEXT: lbu a6, 32(a1)
+; CHECK-RV64-NEXT: lbu a7, 40(a1)
+; CHECK-RV64-NEXT: lbu t0, 48(a1)
+; CHECK-RV64-NEXT: lbu t1, 56(a1)
+; CHECK-RV64-NEXT: lbu t2, 64(a1)
+; CHECK-RV64-NEXT: lbu t3, 72(a1)
+; CHECK-RV64-NEXT: lbu t4, 80(a1)
+; CHECK-RV64-NEXT: lbu t5, 88(a1)
+; CHECK-RV64-NEXT: lbu t6, 120(a1)
+; CHECK-RV64-NEXT: lbu s0, 112(a1)
+; CHECK-RV64-NEXT: lbu s1, 104(a1)
+; CHECK-RV64-NEXT: lbu a1, 96(a1)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb t6, 15(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb s0, 14(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb s1, 13(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb a1, 12(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb t5, 11(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb t4, 10(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb t3, 9(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb t2, 8(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb t1, 7(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb t0, 6(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb a7, 5(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb a6, 4(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb a5, 3(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb a4, 2(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb a3, 1(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sb a2, 0(a0)
+; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT: addi sp, sp, 16
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v16i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: addi sp, sp, -16
+; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT: .cfi_offset s0, -4
+; CHECK-RV32-NEXT: .cfi_offset s1, -8
+; CHECK-RV32-NEXT: lbu a2, 0(a1)
+; CHECK-RV32-NEXT: lbu a3, 4(a1)
+; CHECK-RV32-NEXT: lbu a4, 8(a1)
+; CHECK-RV32-NEXT: lbu a5, 12(a1)
+; CHECK-RV32-NEXT: lbu a6, 16(a1)
+; CHECK-RV32-NEXT: lbu a7, 20(a1)
+; CHECK-RV32-NEXT: lbu t0, 24(a1)
+; CHECK-RV32-NEXT: lbu t1, 28(a1)
+; CHECK-RV32-NEXT: lbu t2, 32(a1)
+; CHECK-RV32-NEXT: lbu t3, 36(a1)
+; CHECK-RV32-NEXT: lbu t4, 40(a1)
+; CHECK-RV32-NEXT: lbu t5, 44(a1)
+; CHECK-RV32-NEXT: lbu t6, 60(a1)
+; CHECK-RV32-NEXT: lbu s0, 56(a1)
+; CHECK-RV32-NEXT: lbu s1, 52(a1)
+; CHECK-RV32-NEXT: lbu a1, 48(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb t6, 15(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb s0, 14(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb s1, 13(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb a1, 12(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb t5, 11(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb t4, 10(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb t3, 9(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb t2, 8(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb t1, 7(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb t0, 6(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb a7, 5(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb a6, 4(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb a5, 3(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb a4, 2(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb a3, 1(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sb a2, 0(a0)
+; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT: addi sp, sp, 16
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v16i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: addi sp, sp, -16
+; CHECK-RV64C-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV64C-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT: .cfi_offset s0, -8
+; CHECK-RV64C-NEXT: .cfi_offset s1, -16
+; CHECK-RV64C-NEXT: lbu a6, 0(a1)
+; CHECK-RV64C-NEXT: lbu a7, 8(a1)
+; CHECK-RV64C-NEXT: lbu t0, 16(a1)
+; CHECK-RV64C-NEXT: lbu t1, 24(a1)
+; CHECK-RV64C-NEXT: lbu t2, 32(a1)
+; CHECK-RV64C-NEXT: lbu t3, 40(a1)
+; CHECK-RV64C-NEXT: lbu t4, 48(a1)
+; CHECK-RV64C-NEXT: lbu t5, 56(a1)
+; CHECK-RV64C-NEXT: lbu t6, 64(a1)
+; CHECK-RV64C-NEXT: lbu a3, 72(a1)
+; CHECK-RV64C-NEXT: lbu a4, 80(a1)
+; CHECK-RV64C-NEXT: lbu a5, 88(a1)
+; CHECK-RV64C-NEXT: lbu a2, 120(a1)
+; CHECK-RV64C-NEXT: lbu s0, 112(a1)
+; CHECK-RV64C-NEXT: lbu s1, 104(a1)
+; CHECK-RV64C-NEXT: lbu a1, 96(a1)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb a2, 15(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb s0, 14(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb s1, 13(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb a1, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb a5, 11(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb a4, 10(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb a3, 9(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb t6, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb t5, 7(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb t4, 6(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb t3, 5(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb t2, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb t1, 3(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb t0, 2(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb a7, 1(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sb a6, 0(a0)
+; CHECK-RV64C-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT: ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT: addi sp, sp, 16
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v16i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: addi sp, sp, -16
+; CHECK-RV32C-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32C-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT: .cfi_offset s0, -4
+; CHECK-RV32C-NEXT: .cfi_offset s1, -8
+; CHECK-RV32C-NEXT: lbu a6, 0(a1)
+; CHECK-RV32C-NEXT: lbu a7, 4(a1)
+; CHECK-RV32C-NEXT: lbu t0, 8(a1)
+; CHECK-RV32C-NEXT: lbu t1, 12(a1)
+; CHECK-RV32C-NEXT: lbu t2, 16(a1)
+; CHECK-RV32C-NEXT: lbu t3, 20(a1)
+; CHECK-RV32C-NEXT: lbu t4, 24(a1)
+; CHECK-RV32C-NEXT: lbu t5, 28(a1)
+; CHECK-RV32C-NEXT: lbu t6, 32(a1)
+; CHECK-RV32C-NEXT: lbu a3, 36(a1)
+; CHECK-RV32C-NEXT: lbu a4, 40(a1)
+; CHECK-RV32C-NEXT: lbu a5, 44(a1)
+; CHECK-RV32C-NEXT: lbu a2, 60(a1)
+; CHECK-RV32C-NEXT: lbu s0, 56(a1)
+; CHECK-RV32C-NEXT: lbu s1, 52(a1)
+; CHECK-RV32C-NEXT: lbu a1, 48(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb a2, 15(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb s0, 14(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb s1, 13(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb a1, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb a5, 11(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb a4, 10(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb a3, 9(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb t6, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb t5, 7(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb t4, 6(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb t3, 5(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb t2, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb t1, 3(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb t0, 2(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb a7, 1(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sb a6, 0(a0)
+; CHECK-RV32C-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT: addi sp, sp, 16
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_v16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vse8.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_v16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vse8.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_v8i16(ptr %p, <8 x i16> %v) {
+; Nontemporal <8 x i16> store, PALL domain (metadata !2, defined elsewhere in
+; this file). Scalar configs split it into 8 sh's, each with its own
+; ntl.pall / c.ntl.pall hint; V configs emit one hinted vse16.v.
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v8i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lh a2, 0(a1)
+; CHECK-RV64-NEXT: lh a3, 8(a1)
+; CHECK-RV64-NEXT: lh a4, 16(a1)
+; CHECK-RV64-NEXT: lh a5, 24(a1)
+; CHECK-RV64-NEXT: lh a6, 56(a1)
+; CHECK-RV64-NEXT: lh a7, 48(a1)
+; CHECK-RV64-NEXT: lh t0, 40(a1)
+; CHECK-RV64-NEXT: lh a1, 32(a1)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sh a6, 14(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sh a7, 12(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sh t0, 10(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sh a1, 8(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sh a5, 6(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sh a4, 4(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sh a3, 2(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sh a2, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v8i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lh a2, 0(a1)
+; CHECK-RV32-NEXT: lh a3, 4(a1)
+; CHECK-RV32-NEXT: lh a4, 8(a1)
+; CHECK-RV32-NEXT: lh a5, 12(a1)
+; CHECK-RV32-NEXT: lh a6, 28(a1)
+; CHECK-RV32-NEXT: lh a7, 24(a1)
+; CHECK-RV32-NEXT: lh t0, 20(a1)
+; CHECK-RV32-NEXT: lh a1, 16(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sh a6, 14(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sh a7, 12(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sh t0, 10(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sh a1, 8(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sh a5, 6(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sh a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sh a3, 2(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sh a2, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v8i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: lh a6, 0(a1)
+; CHECK-RV64C-NEXT: lh a7, 8(a1)
+; CHECK-RV64C-NEXT: lh t0, 16(a1)
+; CHECK-RV64C-NEXT: lh a5, 24(a1)
+; CHECK-RV64C-NEXT: lh a2, 56(a1)
+; CHECK-RV64C-NEXT: lh a3, 48(a1)
+; CHECK-RV64C-NEXT: lh a4, 40(a1)
+; CHECK-RV64C-NEXT: lh a1, 32(a1)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sh a2, 14(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sh a3, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sh a4, 10(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sh a1, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sh a5, 6(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sh t0, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sh a7, 2(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sh a6, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v8i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lh a6, 0(a1)
+; CHECK-RV32C-NEXT: lh a7, 4(a1)
+; CHECK-RV32C-NEXT: lh t0, 8(a1)
+; CHECK-RV32C-NEXT: lh a5, 12(a1)
+; CHECK-RV32C-NEXT: lh a2, 28(a1)
+; CHECK-RV32C-NEXT: lh a3, 24(a1)
+; CHECK-RV32C-NEXT: lh a4, 20(a1)
+; CHECK-RV32C-NEXT: lh a1, 16(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sh a2, 14(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sh a3, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sh a4, 10(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sh a1, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sh a5, 6(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sh t0, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sh a7, 2(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sh a6, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_v8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vse16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_v8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vse16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_v4i32(ptr %p, <4 x i32> %v) {
+; Nontemporal <4 x i32> store, PALL domain (metadata !2, defined elsewhere in
+; this file). Scalar configs split it into 4 sw's, each with its own
+; ntl.pall / c.ntl.pall hint; V configs emit one hinted vse32.v.
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v4i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lw a2, 24(a1)
+; CHECK-RV64-NEXT: lw a3, 16(a1)
+; CHECK-RV64-NEXT: lw a4, 8(a1)
+; CHECK-RV64-NEXT: lw a1, 0(a1)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sw a2, 12(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sw a3, 8(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sw a4, 4(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v4i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v4i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: lw a2, 24(a1)
+; CHECK-RV64C-NEXT: lw a3, 16(a1)
+; CHECK-RV64C-NEXT: lw a4, 8(a1)
+; CHECK-RV64C-NEXT: lw a1, 0(a1)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sw a2, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sw a3, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sw a4, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sw a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v4i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_v4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vse32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_v4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vse32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define void @test_nontemporal_PALL_store_v2i64(ptr %p, <2 x i64> %v) {
+; Nontemporal <2 x i64> store, PALL domain (metadata !2, defined elsewhere in
+; this file). RV64 scalar configs use two hinted sd's; RV32 scalar configs
+; split into four hinted sw's; V configs emit one hinted vse64.v.
+; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v2i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v2i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v2i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v2i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_PALL_store_v2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.pall
+; CHECK-RV64V-NEXT: vse64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_PALL_store_v2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: vse64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
+ ret void
+}
+
+define i64 @test_nontemporal_S1_load_i64(ptr %p) {
+; Nontemporal i64 load tagged with domain metadata !3 (node defined elsewhere
+; in this file): every emitted load must be immediately preceded by an ntl.s1
+; hint (c.ntl.s1 on the compressed configs). On RV32 the i64 splits into two
+; lw's, each with its own hint.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a2, 0(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a1, 4(a0)
+; CHECK-RV32-NEXT: mv a0, a2
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: ld a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a2, 0(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a1, 4(a0)
+; CHECK-RV32C-NEXT: mv a0, a2
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: ld a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: lw a2, 0(a0)
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: lw a1, 4(a0)
+; CHECK-RV32V-NEXT: mv a0, a2
+; CHECK-RV32V-NEXT: ret
+ %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret i64 %1
+}
+
+define i32 @test_nontemporal_S1_load_i32(ptr %p) {
+; Nontemporal i32 load, S1 domain (metadata !3, defined elsewhere in this
+; file): a single lw preceded by ntl.s1 / c.ntl.s1 on every config.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: lw a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: lw a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: lw a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: lw a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load i32, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret i32 %1
+}
+
+define i16 @test_nontemporal_S1_load_i16(ptr %p) {
+; Nontemporal i16 load, S1 domain (metadata !3, defined elsewhere in this
+; file): a single lh preceded by ntl.s1 / c.ntl.s1 on every config.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: lh a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lh a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: lh a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lh a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: lh a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: lh a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load i16, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret i16 %1
+}
+
+define i8 @test_nontemporal_S1_load_i8(ptr %p) {
+; Nontemporal i8 load, S1 domain (metadata !3, defined elsewhere in this
+; file): a single lbu preceded by ntl.s1 / c.ntl.s1 on every config.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: lbu a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lbu a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: lbu a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lbu a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: lbu a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: lbu a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load i8, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret i8 %1
+}
+
+define half @test_nontemporal_S1_load_half(ptr %p) nounwind {
+; Two nontemporal half (f16) loads in the S1 domain (metadata !3, defined
+; elsewhere in this file), combined with fadd.h so both survive: each flh
+; must carry its own ntl.s1 / c.ntl.s1 hint.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_half:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: flh fa5, 0(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: flh fa4, 6(a0)
+; CHECK-RV64-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_half:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: flh fa5, 0(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: flh fa4, 6(a0)
+; CHECK-RV32-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_half:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: flh fa5, 0(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: flh fa4, 6(a0)
+; CHECK-RV64C-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_half:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: flh fa5, 0(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: flh fa4, 6(a0)
+; CHECK-RV32C-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_half:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: flh fa5, 0(a0)
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: flh fa4, 6(a0)
+; CHECK-RV64V-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_half:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: flh fa5, 0(a0)
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: flh fa4, 6(a0)
+; CHECK-RV32V-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32V-NEXT: ret
+ %1 = load half, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ %2 = getelementptr half, ptr %p, i32 3
+ %3 = load half, ptr %2, !nontemporal !0, !riscv-nontemporal-domain !3
+ %4 = fadd half %1, %3
+ ret half %4
+}
+
+define float @test_nontemporal_S1_load_float(ptr %p) {
+; Nontemporal float load, S1 domain (metadata !3, defined elsewhere in this
+; file): flw must carry the ntl.s1 / c.ntl.s1 hint on every config.
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_float:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: flw fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_float:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: flw fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_float:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: flw fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_float:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: flw fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_float:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: flw fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_float:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: flw fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load float, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret float %1
+}
+
+define double @test_nontemporal_S1_load_double(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_double:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: fld fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_double:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: fld fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_double:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: fld fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_double:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: fld fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_double:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: fld fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_double:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: fld fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load double, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret double %1
+}
+
+define <16 x i8> @test_nontemporal_S1_load_v16i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_v16i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_v16i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v16i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v16i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_v16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vle8.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_v16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vle8.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret <16 x i8> %1
+}
+
+define <8 x i16> @test_nontemporal_S1_load_v8i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_v8i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_v8i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v8i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v8i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_v8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vle16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_v8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vle16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret <8 x i16> %1
+}
+
+define <4 x i32> @test_nontemporal_S1_load_v4i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_v4i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_v4i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v4i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v4i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_v4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vle32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_v4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vle32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @test_nontemporal_S1_load_v2i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_load_v2i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: ld a2, 0(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: ld a1, 8(a0)
+; CHECK-RV64-NEXT: mv a0, a2
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_load_v2i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v2i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: ld a2, 0(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: ld a1, 8(a0)
+; CHECK-RV64C-NEXT: mv a0, a2
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v2i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_load_v2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vle64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_load_v2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vle64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret <2 x i64> %1
+}
+
+define void @test_nontemporal_S1_store_i64(ptr %p, i64 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sw a2, 4(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sw a2, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: sd a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: sw a2, 4(a0)
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: sw a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i64 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_i32(ptr %p, i32 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sw a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: sw a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: sw a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i32 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_i16(ptr %p, i16 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sh a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sh a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sh a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sh a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: sh a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: sh a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i16 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_i8(ptr %p, i8 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: sb a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: sb a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i8 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_half(ptr %p, half %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_half:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_half:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_half:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_half:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_half:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_half:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store half %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_float(ptr %p, float %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_float:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_float:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_float:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_float:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_float:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_float:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store float %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_double(ptr %p, double %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_double:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_double:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_double:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_double:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_double:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_double:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store double %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_v16i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: addi sp, sp, -16
+; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT: .cfi_offset s0, -8
+; CHECK-RV64-NEXT: .cfi_offset s1, -16
+; CHECK-RV64-NEXT: lbu a2, 0(a1)
+; CHECK-RV64-NEXT: lbu a3, 8(a1)
+; CHECK-RV64-NEXT: lbu a4, 16(a1)
+; CHECK-RV64-NEXT: lbu a5, 24(a1)
+; CHECK-RV64-NEXT: lbu a6, 32(a1)
+; CHECK-RV64-NEXT: lbu a7, 40(a1)
+; CHECK-RV64-NEXT: lbu t0, 48(a1)
+; CHECK-RV64-NEXT: lbu t1, 56(a1)
+; CHECK-RV64-NEXT: lbu t2, 64(a1)
+; CHECK-RV64-NEXT: lbu t3, 72(a1)
+; CHECK-RV64-NEXT: lbu t4, 80(a1)
+; CHECK-RV64-NEXT: lbu t5, 88(a1)
+; CHECK-RV64-NEXT: lbu t6, 120(a1)
+; CHECK-RV64-NEXT: lbu s0, 112(a1)
+; CHECK-RV64-NEXT: lbu s1, 104(a1)
+; CHECK-RV64-NEXT: lbu a1, 96(a1)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb t6, 15(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb s0, 14(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb s1, 13(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb a1, 12(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb t5, 11(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb t4, 10(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb t3, 9(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb t2, 8(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb t1, 7(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb t0, 6(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb a7, 5(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb a6, 4(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb a5, 3(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb a4, 2(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb a3, 1(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sb a2, 0(a0)
+; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT: addi sp, sp, 16
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_v16i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: addi sp, sp, -16
+; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT: .cfi_offset s0, -4
+; CHECK-RV32-NEXT: .cfi_offset s1, -8
+; CHECK-RV32-NEXT: lbu a2, 0(a1)
+; CHECK-RV32-NEXT: lbu a3, 4(a1)
+; CHECK-RV32-NEXT: lbu a4, 8(a1)
+; CHECK-RV32-NEXT: lbu a5, 12(a1)
+; CHECK-RV32-NEXT: lbu a6, 16(a1)
+; CHECK-RV32-NEXT: lbu a7, 20(a1)
+; CHECK-RV32-NEXT: lbu t0, 24(a1)
+; CHECK-RV32-NEXT: lbu t1, 28(a1)
+; CHECK-RV32-NEXT: lbu t2, 32(a1)
+; CHECK-RV32-NEXT: lbu t3, 36(a1)
+; CHECK-RV32-NEXT: lbu t4, 40(a1)
+; CHECK-RV32-NEXT: lbu t5, 44(a1)
+; CHECK-RV32-NEXT: lbu t6, 60(a1)
+; CHECK-RV32-NEXT: lbu s0, 56(a1)
+; CHECK-RV32-NEXT: lbu s1, 52(a1)
+; CHECK-RV32-NEXT: lbu a1, 48(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb t6, 15(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb s0, 14(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb s1, 13(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb a1, 12(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb t5, 11(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb t4, 10(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb t3, 9(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb t2, 8(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb t1, 7(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb t0, 6(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb a7, 5(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb a6, 4(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb a5, 3(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb a4, 2(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb a3, 1(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sb a2, 0(a0)
+; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT: addi sp, sp, 16
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v16i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: addi sp, sp, -16
+; CHECK-RV64C-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV64C-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT: .cfi_offset s0, -8
+; CHECK-RV64C-NEXT: .cfi_offset s1, -16
+; CHECK-RV64C-NEXT: lbu a6, 0(a1)
+; CHECK-RV64C-NEXT: lbu a7, 8(a1)
+; CHECK-RV64C-NEXT: lbu t0, 16(a1)
+; CHECK-RV64C-NEXT: lbu t1, 24(a1)
+; CHECK-RV64C-NEXT: lbu t2, 32(a1)
+; CHECK-RV64C-NEXT: lbu t3, 40(a1)
+; CHECK-RV64C-NEXT: lbu t4, 48(a1)
+; CHECK-RV64C-NEXT: lbu t5, 56(a1)
+; CHECK-RV64C-NEXT: lbu t6, 64(a1)
+; CHECK-RV64C-NEXT: lbu a3, 72(a1)
+; CHECK-RV64C-NEXT: lbu a4, 80(a1)
+; CHECK-RV64C-NEXT: lbu a5, 88(a1)
+; CHECK-RV64C-NEXT: lbu a2, 120(a1)
+; CHECK-RV64C-NEXT: lbu s0, 112(a1)
+; CHECK-RV64C-NEXT: lbu s1, 104(a1)
+; CHECK-RV64C-NEXT: lbu a1, 96(a1)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb a2, 15(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb s0, 14(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb s1, 13(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb a1, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb a5, 11(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb a4, 10(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb a3, 9(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb t6, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb t5, 7(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb t4, 6(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb t3, 5(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb t2, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb t1, 3(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb t0, 2(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb a7, 1(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sb a6, 0(a0)
+; CHECK-RV64C-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT: ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT: addi sp, sp, 16
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v16i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: addi sp, sp, -16
+; CHECK-RV32C-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32C-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT: .cfi_offset s0, -4
+; CHECK-RV32C-NEXT: .cfi_offset s1, -8
+; CHECK-RV32C-NEXT: lbu a6, 0(a1)
+; CHECK-RV32C-NEXT: lbu a7, 4(a1)
+; CHECK-RV32C-NEXT: lbu t0, 8(a1)
+; CHECK-RV32C-NEXT: lbu t1, 12(a1)
+; CHECK-RV32C-NEXT: lbu t2, 16(a1)
+; CHECK-RV32C-NEXT: lbu t3, 20(a1)
+; CHECK-RV32C-NEXT: lbu t4, 24(a1)
+; CHECK-RV32C-NEXT: lbu t5, 28(a1)
+; CHECK-RV32C-NEXT: lbu t6, 32(a1)
+; CHECK-RV32C-NEXT: lbu a3, 36(a1)
+; CHECK-RV32C-NEXT: lbu a4, 40(a1)
+; CHECK-RV32C-NEXT: lbu a5, 44(a1)
+; CHECK-RV32C-NEXT: lbu a2, 60(a1)
+; CHECK-RV32C-NEXT: lbu s0, 56(a1)
+; CHECK-RV32C-NEXT: lbu s1, 52(a1)
+; CHECK-RV32C-NEXT: lbu a1, 48(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb a2, 15(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb s0, 14(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb s1, 13(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb a1, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb a5, 11(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb a4, 10(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb a3, 9(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb t6, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb t5, 7(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb t4, 6(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb t3, 5(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb t2, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb t1, 3(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb t0, 2(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb a7, 1(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sb a6, 0(a0)
+; CHECK-RV32C-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT: addi sp, sp, 16
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_v16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vse8.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_v16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vse8.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_v8i16(ptr %p, <8 x i16> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_v8i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lh a2, 0(a1)
+; CHECK-RV64-NEXT: lh a3, 8(a1)
+; CHECK-RV64-NEXT: lh a4, 16(a1)
+; CHECK-RV64-NEXT: lh a5, 24(a1)
+; CHECK-RV64-NEXT: lh a6, 56(a1)
+; CHECK-RV64-NEXT: lh a7, 48(a1)
+; CHECK-RV64-NEXT: lh t0, 40(a1)
+; CHECK-RV64-NEXT: lh a1, 32(a1)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sh a6, 14(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sh a7, 12(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sh t0, 10(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sh a1, 8(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sh a5, 6(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sh a4, 4(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sh a3, 2(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sh a2, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_v8i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lh a2, 0(a1)
+; CHECK-RV32-NEXT: lh a3, 4(a1)
+; CHECK-RV32-NEXT: lh a4, 8(a1)
+; CHECK-RV32-NEXT: lh a5, 12(a1)
+; CHECK-RV32-NEXT: lh a6, 28(a1)
+; CHECK-RV32-NEXT: lh a7, 24(a1)
+; CHECK-RV32-NEXT: lh t0, 20(a1)
+; CHECK-RV32-NEXT: lh a1, 16(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sh a6, 14(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sh a7, 12(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sh t0, 10(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sh a1, 8(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sh a5, 6(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sh a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sh a3, 2(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sh a2, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v8i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: lh a6, 0(a1)
+; CHECK-RV64C-NEXT: lh a7, 8(a1)
+; CHECK-RV64C-NEXT: lh t0, 16(a1)
+; CHECK-RV64C-NEXT: lh a5, 24(a1)
+; CHECK-RV64C-NEXT: lh a2, 56(a1)
+; CHECK-RV64C-NEXT: lh a3, 48(a1)
+; CHECK-RV64C-NEXT: lh a4, 40(a1)
+; CHECK-RV64C-NEXT: lh a1, 32(a1)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sh a2, 14(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sh a3, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sh a4, 10(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sh a1, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sh a5, 6(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sh t0, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sh a7, 2(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sh a6, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v8i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lh a6, 0(a1)
+; CHECK-RV32C-NEXT: lh a7, 4(a1)
+; CHECK-RV32C-NEXT: lh t0, 8(a1)
+; CHECK-RV32C-NEXT: lh a5, 12(a1)
+; CHECK-RV32C-NEXT: lh a2, 28(a1)
+; CHECK-RV32C-NEXT: lh a3, 24(a1)
+; CHECK-RV32C-NEXT: lh a4, 20(a1)
+; CHECK-RV32C-NEXT: lh a1, 16(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sh a2, 14(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sh a3, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sh a4, 10(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sh a1, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sh a5, 6(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sh t0, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sh a7, 2(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sh a6, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_v8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vse16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_v8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vse16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_v4i32(ptr %p, <4 x i32> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_v4i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lw a2, 24(a1)
+; CHECK-RV64-NEXT: lw a3, 16(a1)
+; CHECK-RV64-NEXT: lw a4, 8(a1)
+; CHECK-RV64-NEXT: lw a1, 0(a1)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sw a2, 12(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sw a3, 8(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sw a4, 4(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_v4i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v4i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: lw a2, 24(a1)
+; CHECK-RV64C-NEXT: lw a3, 16(a1)
+; CHECK-RV64C-NEXT: lw a4, 8(a1)
+; CHECK-RV64C-NEXT: lw a1, 0(a1)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sw a2, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sw a3, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sw a4, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sw a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v4i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_v4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vse32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_v4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vse32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define void @test_nontemporal_S1_store_v2i64(ptr %p, <2 x i64> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_S1_store_v2i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_S1_store_v2i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v2i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v2i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_S1_store_v2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.s1
+; CHECK-RV64V-NEXT: vse64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_S1_store_v2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: vse64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
+ ret void
+}
+
+define i64 @test_nontemporal_ALL_load_i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a2, 0(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a1, 4(a0)
+; CHECK-RV32-NEXT: mv a0, a2
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a2, 0(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a1, 4(a0)
+; CHECK-RV32C-NEXT: mv a0, a2
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: ld a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: lw a2, 0(a0)
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: lw a1, 4(a0)
+; CHECK-RV32V-NEXT: mv a0, a2
+; CHECK-RV32V-NEXT: ret
+ %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret i64 %1
+}
+
+define i32 @test_nontemporal_ALL_load_i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: lw a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: lw a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: lw a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: lw a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load i32, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret i32 %1
+}
+
+define i16 @test_nontemporal_ALL_load_i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: lh a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lh a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: lh a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lh a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: lh a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: lh a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load i16, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret i16 %1
+}
+
+define i8 @test_nontemporal_ALL_load_i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: lbu a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lbu a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: lbu a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lbu a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: lbu a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: lbu a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load i8, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret i8 %1
+}
+
+define half @test_nontemporal_ALL_load_half(ptr %p) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_half:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: flh fa5, 0(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: flh fa4, 6(a0)
+; CHECK-RV64-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_half:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: flh fa5, 0(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: flh fa4, 6(a0)
+; CHECK-RV32-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_half:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: flh fa5, 0(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: flh fa4, 6(a0)
+; CHECK-RV64C-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_half:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: flh fa5, 0(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: flh fa4, 6(a0)
+; CHECK-RV32C-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_half:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: flh fa5, 0(a0)
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: flh fa4, 6(a0)
+; CHECK-RV64V-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_half:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: flh fa5, 0(a0)
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: flh fa4, 6(a0)
+; CHECK-RV32V-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32V-NEXT: ret
+ %1 = load half, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ %2 = getelementptr half, ptr %p, i32 3
+ %3 = load half, ptr %2, !nontemporal !0, !riscv-nontemporal-domain !4
+ %4 = fadd half %1, %3
+ ret half %4
+}
+
+define float @test_nontemporal_ALL_load_float(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_float:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: flw fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_float:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: flw fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_float:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: flw fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_float:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: flw fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_float:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: flw fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_float:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: flw fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load float, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret float %1
+}
+
+define double @test_nontemporal_ALL_load_double(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_double:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: fld fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_double:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: fld fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_double:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: fld fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_double:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: fld fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_double:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: fld fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_double:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: fld fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load double, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret double %1
+}
+
+define <16 x i8> @test_nontemporal_ALL_load_v16i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v16i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v16i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v16i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v16i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_v16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vle8.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_v16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vle8.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <16 x i8>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret <16 x i8> %1
+}
+
+define <8 x i16> @test_nontemporal_ALL_load_v8i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v8i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v8i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v8i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v8i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_v8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vle16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_v8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vle16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <8 x i16>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret <8 x i16> %1
+}
+
+define <4 x i32> @test_nontemporal_ALL_load_v4i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v4i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v4i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v4i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v4i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_v4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vle32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_v4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vle32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <4 x i32>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @test_nontemporal_ALL_load_v2i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v2i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a2, 0(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a1, 8(a0)
+; CHECK-RV64-NEXT: mv a0, a2
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v2i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v2i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a2, 0(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a1, 8(a0)
+; CHECK-RV64C-NEXT: mv a0, a2
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v2i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_v2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vle64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_v2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vle64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <2 x i64>, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret <2 x i64> %1
+}
+
+define void @test_nontemporal_ALL_store_i64(ptr %p, i64 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a2, 4(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a2, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: sd a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: sw a2, 4(a0)
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: sw a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i64 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_i32(ptr %p, i32 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sw a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: sw a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: sw a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i32 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_i16(ptr %p, i16 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: sh a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: sh a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i16 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_i8(ptr %p, i8 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: sb a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: sb a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store i8 %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_half(ptr %p, half %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_half:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_half:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_half:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_half:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_half:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_half:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store half %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_float(ptr %p, float %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_float:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_float:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_float:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_float:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_float:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_float:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store float %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_double(ptr %p, double %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_double:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_double:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_double:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_double:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_double:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_double:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+ store double %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v16i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: addi sp, sp, -16
+; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT: .cfi_offset s0, -8
+; CHECK-RV64-NEXT: .cfi_offset s1, -16
+; CHECK-RV64-NEXT: lbu a2, 0(a1)
+; CHECK-RV64-NEXT: lbu a3, 8(a1)
+; CHECK-RV64-NEXT: lbu a4, 16(a1)
+; CHECK-RV64-NEXT: lbu a5, 24(a1)
+; CHECK-RV64-NEXT: lbu a6, 32(a1)
+; CHECK-RV64-NEXT: lbu a7, 40(a1)
+; CHECK-RV64-NEXT: lbu t0, 48(a1)
+; CHECK-RV64-NEXT: lbu t1, 56(a1)
+; CHECK-RV64-NEXT: lbu t2, 64(a1)
+; CHECK-RV64-NEXT: lbu t3, 72(a1)
+; CHECK-RV64-NEXT: lbu t4, 80(a1)
+; CHECK-RV64-NEXT: lbu t5, 88(a1)
+; CHECK-RV64-NEXT: lbu t6, 120(a1)
+; CHECK-RV64-NEXT: lbu s0, 112(a1)
+; CHECK-RV64-NEXT: lbu s1, 104(a1)
+; CHECK-RV64-NEXT: lbu a1, 96(a1)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t6, 15(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb s0, 14(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb s1, 13(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a1, 12(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t5, 11(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t4, 10(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t3, 9(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t2, 8(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t1, 7(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t0, 6(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a7, 5(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a6, 4(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a5, 3(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a4, 2(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a3, 1(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a2, 0(a0)
+; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT: addi sp, sp, 16
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v16i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: addi sp, sp, -16
+; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT: .cfi_offset s0, -4
+; CHECK-RV32-NEXT: .cfi_offset s1, -8
+; CHECK-RV32-NEXT: lbu a2, 0(a1)
+; CHECK-RV32-NEXT: lbu a3, 4(a1)
+; CHECK-RV32-NEXT: lbu a4, 8(a1)
+; CHECK-RV32-NEXT: lbu a5, 12(a1)
+; CHECK-RV32-NEXT: lbu a6, 16(a1)
+; CHECK-RV32-NEXT: lbu a7, 20(a1)
+; CHECK-RV32-NEXT: lbu t0, 24(a1)
+; CHECK-RV32-NEXT: lbu t1, 28(a1)
+; CHECK-RV32-NEXT: lbu t2, 32(a1)
+; CHECK-RV32-NEXT: lbu t3, 36(a1)
+; CHECK-RV32-NEXT: lbu t4, 40(a1)
+; CHECK-RV32-NEXT: lbu t5, 44(a1)
+; CHECK-RV32-NEXT: lbu t6, 60(a1)
+; CHECK-RV32-NEXT: lbu s0, 56(a1)
+; CHECK-RV32-NEXT: lbu s1, 52(a1)
+; CHECK-RV32-NEXT: lbu a1, 48(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t6, 15(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb s0, 14(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb s1, 13(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a1, 12(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t5, 11(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t4, 10(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t3, 9(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t2, 8(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t1, 7(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t0, 6(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a7, 5(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a6, 4(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a5, 3(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a4, 2(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a3, 1(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a2, 0(a0)
+; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT: addi sp, sp, 16
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v16i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: addi sp, sp, -16
+; CHECK-RV64C-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV64C-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT: .cfi_offset s0, -8
+; CHECK-RV64C-NEXT: .cfi_offset s1, -16
+; CHECK-RV64C-NEXT: lbu a6, 0(a1)
+; CHECK-RV64C-NEXT: lbu a7, 8(a1)
+; CHECK-RV64C-NEXT: lbu t0, 16(a1)
+; CHECK-RV64C-NEXT: lbu t1, 24(a1)
+; CHECK-RV64C-NEXT: lbu t2, 32(a1)
+; CHECK-RV64C-NEXT: lbu t3, 40(a1)
+; CHECK-RV64C-NEXT: lbu t4, 48(a1)
+; CHECK-RV64C-NEXT: lbu t5, 56(a1)
+; CHECK-RV64C-NEXT: lbu t6, 64(a1)
+; CHECK-RV64C-NEXT: lbu a3, 72(a1)
+; CHECK-RV64C-NEXT: lbu a4, 80(a1)
+; CHECK-RV64C-NEXT: lbu a5, 88(a1)
+; CHECK-RV64C-NEXT: lbu a2, 120(a1)
+; CHECK-RV64C-NEXT: lbu s0, 112(a1)
+; CHECK-RV64C-NEXT: lbu s1, 104(a1)
+; CHECK-RV64C-NEXT: lbu a1, 96(a1)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a2, 15(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb s0, 14(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb s1, 13(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a1, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a5, 11(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a4, 10(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a3, 9(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t6, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t5, 7(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t4, 6(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t3, 5(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t2, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t1, 3(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t0, 2(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a7, 1(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a6, 0(a0)
+; CHECK-RV64C-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT: ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT: addi sp, sp, 16
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v16i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: addi sp, sp, -16
+; CHECK-RV32C-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32C-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT: .cfi_offset s0, -4
+; CHECK-RV32C-NEXT: .cfi_offset s1, -8
+; CHECK-RV32C-NEXT: lbu a6, 0(a1)
+; CHECK-RV32C-NEXT: lbu a7, 4(a1)
+; CHECK-RV32C-NEXT: lbu t0, 8(a1)
+; CHECK-RV32C-NEXT: lbu t1, 12(a1)
+; CHECK-RV32C-NEXT: lbu t2, 16(a1)
+; CHECK-RV32C-NEXT: lbu t3, 20(a1)
+; CHECK-RV32C-NEXT: lbu t4, 24(a1)
+; CHECK-RV32C-NEXT: lbu t5, 28(a1)
+; CHECK-RV32C-NEXT: lbu t6, 32(a1)
+; CHECK-RV32C-NEXT: lbu a3, 36(a1)
+; CHECK-RV32C-NEXT: lbu a4, 40(a1)
+; CHECK-RV32C-NEXT: lbu a5, 44(a1)
+; CHECK-RV32C-NEXT: lbu a2, 60(a1)
+; CHECK-RV32C-NEXT: lbu s0, 56(a1)
+; CHECK-RV32C-NEXT: lbu s1, 52(a1)
+; CHECK-RV32C-NEXT: lbu a1, 48(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a2, 15(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb s0, 14(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb s1, 13(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a1, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a5, 11(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a4, 10(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a3, 9(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t6, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t5, 7(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t4, 6(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t3, 5(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t2, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t1, 3(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t0, 2(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a7, 1(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a6, 0(a0)
+; CHECK-RV32C-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT: addi sp, sp, 16
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_v16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vse8.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_v16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vse8.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <16 x i8> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_v8i16(ptr %p, <8 x i16> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v8i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lh a2, 0(a1)
+; CHECK-RV64-NEXT: lh a3, 8(a1)
+; CHECK-RV64-NEXT: lh a4, 16(a1)
+; CHECK-RV64-NEXT: lh a5, 24(a1)
+; CHECK-RV64-NEXT: lh a6, 56(a1)
+; CHECK-RV64-NEXT: lh a7, 48(a1)
+; CHECK-RV64-NEXT: lh t0, 40(a1)
+; CHECK-RV64-NEXT: lh a1, 32(a1)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a6, 14(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a7, 12(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh t0, 10(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a1, 8(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a5, 6(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a4, 4(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a3, 2(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a2, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v8i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lh a2, 0(a1)
+; CHECK-RV32-NEXT: lh a3, 4(a1)
+; CHECK-RV32-NEXT: lh a4, 8(a1)
+; CHECK-RV32-NEXT: lh a5, 12(a1)
+; CHECK-RV32-NEXT: lh a6, 28(a1)
+; CHECK-RV32-NEXT: lh a7, 24(a1)
+; CHECK-RV32-NEXT: lh t0, 20(a1)
+; CHECK-RV32-NEXT: lh a1, 16(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a6, 14(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a7, 12(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh t0, 10(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a1, 8(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a5, 6(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a3, 2(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a2, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v8i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: lh a6, 0(a1)
+; CHECK-RV64C-NEXT: lh a7, 8(a1)
+; CHECK-RV64C-NEXT: lh t0, 16(a1)
+; CHECK-RV64C-NEXT: lh a5, 24(a1)
+; CHECK-RV64C-NEXT: lh a2, 56(a1)
+; CHECK-RV64C-NEXT: lh a3, 48(a1)
+; CHECK-RV64C-NEXT: lh a4, 40(a1)
+; CHECK-RV64C-NEXT: lh a1, 32(a1)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a2, 14(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a3, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a4, 10(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a1, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a5, 6(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh t0, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a7, 2(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a6, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v8i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lh a6, 0(a1)
+; CHECK-RV32C-NEXT: lh a7, 4(a1)
+; CHECK-RV32C-NEXT: lh t0, 8(a1)
+; CHECK-RV32C-NEXT: lh a5, 12(a1)
+; CHECK-RV32C-NEXT: lh a2, 28(a1)
+; CHECK-RV32C-NEXT: lh a3, 24(a1)
+; CHECK-RV32C-NEXT: lh a4, 20(a1)
+; CHECK-RV32C-NEXT: lh a1, 16(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a2, 14(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a3, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a4, 10(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a1, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a5, 6(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh t0, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a7, 2(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a6, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_v8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vse16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_v8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vse16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <8 x i16> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_v4i32(ptr %p, <4 x i32> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v4i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lw a2, 24(a1)
+; CHECK-RV64-NEXT: lw a3, 16(a1)
+; CHECK-RV64-NEXT: lw a4, 8(a1)
+; CHECK-RV64-NEXT: lw a1, 0(a1)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sw a2, 12(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sw a3, 8(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sw a4, 4(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v4i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v4i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: lw a2, 24(a1)
+; CHECK-RV64C-NEXT: lw a3, 16(a1)
+; CHECK-RV64C-NEXT: lw a4, 8(a1)
+; CHECK-RV64C-NEXT: lw a1, 0(a1)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sw a2, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sw a3, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sw a4, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sw a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v4i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_v4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vse32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_v4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vse32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <4 x i32> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+define void @test_nontemporal_ALL_store_v2i64(ptr %p, <2 x i64> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v2i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v2i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v2i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v2i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_ALL_store_v2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vse64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_ALL_store_v2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vse64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <2 x i64> %v, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
+ ret void
+}
+
+
!0 = !{i32 1}
+!1 = !{i32 2}
+!2 = !{i32 3}
+!3 = !{i32 4}
+!4 = !{i32 5}
More information about the cfe-commits
mailing list