[clang] [CIR][X86] Add support for `aes` and `aeswide` builtins (PR #175892)
Haocong Lu via cfe-commits
cfe-commits at lists.llvm.org
Thu Jan 15 00:45:55 PST 2026
https://github.com/Luhaocong updated https://github.com/llvm/llvm-project/pull/175892
>From c1fdb9cf6c28e0373df5016ce54b8d1ea87acdba Mon Sep 17 00:00:00 2001
From: Haocong Lu <haocong.lu at witintech.com>
Date: Thu, 15 Jan 2026 16:50:31 +0800
Subject: [PATCH] [CIR][X86] Add support for `aes` and `aeswide` builtins
- Support CIR codegen for the following builtins: `aesenc`, `aesdec`,
`aesencwide` and `aesdecwide`
---
clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 166 ++-
.../test/CIR/CodeGenBuiltins/X86/keylocker.c | 1176 +++++++++++++++++
2 files changed, 1339 insertions(+), 3 deletions(-)
create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/keylocker.c
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 44af8d6cc0ef4..497462a465145 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -597,6 +597,119 @@ static mlir::Value emitX86Fpclass(CIRGenBuilderTy &builder, mlir::Location loc,
return emitX86MaskedCompareResult(builder, fpclass, numElts, maskIn, loc);
}
+static mlir::Value emitX86Aes(CIRGenBuilderTy &builder, mlir::Location loc,
+ llvm::StringRef intrinsicName, mlir::Type retType,
+ llvm::ArrayRef<mlir::Value> ops) {
+ // Call the intrinsic with ops[1], ops[2]; result is {retType, vecType}.
+ mlir::Type vecType =
+ mlir::cast<cir::PointerType>(ops[0].getType()).getPointee();
+ cir::RecordType rstRecTy = builder.getAnonRecordTy({retType, vecType});
+ mlir::Value rstValueRec = emitIntrinsicCallOp(
+ builder, loc, intrinsicName, rstRecTy, mlir::ValueRange{ops[1], ops[2]});
+
+ // Member 0 is the flag: truncate it to 1 bit, then cast that bit to the
+ // bool used as the cir.if condition.
+ mlir::Value flag =
+ cir::ExtractMemberOp::create(builder, loc, rstValueRec, /*index=*/0);
+ mlir::Value flagBit0 = builder.createCast(loc, cir::CastKind::integral, flag,
+ builder.getUIntNTy(1));
+ mlir::Value succ = builder.createCast(loc, cir::CastKind::int_to_bool,
+ flagBit0, builder.getBoolTy());
+
+ // Store member 1 to ops[0] if the flag bit is set, else store zero there.
+ mlir::Value out =
+ cir::ExtractMemberOp::create(builder, loc, rstValueRec, /*index=*/1);
+ Address outAddr(ops[0], /*align=*/CharUnits::fromQuantity(16));
+ cir::IfOp::create(
+ builder, loc, succ, /*withElseRegion=*/true,
+ /*thenBuilder=*/
+ [&](mlir::OpBuilder &b, mlir::Location) {
+ builder.createStore(loc, out, outAddr);
+ builder.createYield(loc);
+ },
+ /*elseBuilder=*/
+ [&](mlir::OpBuilder &b, mlir::Location) {
+ mlir::Value zero = builder.getNullValue(vecType, loc);
+ builder.createStore(loc, zero, outAddr);
+ builder.createYield(loc);
+ });
+
+ return cir::ExtractMemberOp::create(builder, loc, rstValueRec, /*index=*/0);
+}
+
+static mlir::Value emitX86Aeswide(CIRGenBuilderTy &builder, mlir::Location loc,
+ llvm::StringRef intrinsicName,
+ mlir::Type retType,
+ llvm::ArrayRef<mlir::Value> ops) {
+ mlir::Type vecType =
+ mlir::cast<cir::PointerType>(ops[1].getType()).getPointee();
+
+ // Intrinsic operands are ops[2] followed by eight vectors loaded from
+ // ops[1]; the result record is retType plus eight vecType members.
+ mlir::Type recTypes[9] = {retType, vecType, vecType, vecType, vecType,
+ vecType, vecType, vecType, vecType};
+ mlir::Value arguments[9];
+ arguments[0] = ops[2];
+ for (int i = 0; i < 8; i++) {
+ // Load the i-th input vector from ops[1] with 16-byte alignment.
+ cir::ConstantOp idx = builder.getUInt32(i, loc);
+ mlir::Value nextInElePtr =
+ builder.getArrayElement(loc, loc, ops[1], vecType, idx,
+ /*shouldDecay=*/false);
+ arguments[i + 1] =
+ builder.createAlignedLoad(loc, vecType, nextInElePtr,
+ /*align=*/CharUnits::fromQuantity(16));
+ }
+ cir::RecordType rstRecTy = builder.getAnonRecordTy(recTypes);
+ mlir::Value rstValueRec =
+ emitIntrinsicCallOp(builder, loc, intrinsicName, rstRecTy, arguments);
+
+ // Member 0 is the flag: truncate it to 1 bit, then cast that bit to the
+ // bool used as the cir.if condition.
+ mlir::Value flag =
+ cir::ExtractMemberOp::create(builder, loc, rstValueRec, /*index=*/0);
+ mlir::Value flagBit0 = builder.createCast(loc, cir::CastKind::integral, flag,
+ builder.getUIntNTy(1));
+ mlir::Value succ = builder.createCast(loc, cir::CastKind::int_to_bool,
+ flagBit0, builder.getBoolTy());
+
+ // Store members 1..8 to ops[0][0..7] if the flag bit is set, else zeros.
+ cir::IfOp::create(
+ builder, loc, succ, /*withElseRegion=*/true,
+ /*thenBuilder=*/
+ [&](mlir::OpBuilder &b, mlir::Location) {
+ for (int i = 0; i < 8; i++) {
+ mlir::Value out =
+ cir::ExtractMemberOp::create(builder, loc, rstValueRec,
+ /*index=*/i + 1);
+ cir::ConstantOp idx = builder.getUInt32(i, loc);
+ mlir::Value nextOutEleAddr =
+ builder.getArrayElement(loc, loc, ops[0], vecType, idx,
+ /*shouldDecay=*/false);
+ Address outAddr(nextOutEleAddr,
+ /*align=*/CharUnits::fromQuantity(16));
+ builder.createStore(loc, out, outAddr);
+ }
+ builder.createYield(loc);
+ },
+ /*elseBuilder=*/
+ [&](mlir::OpBuilder &b, mlir::Location) {
+ mlir::Value zero = builder.getNullValue(vecType, loc);
+ for (int i = 0; i < 8; i++) {
+ cir::ConstantOp idx = builder.getUInt32(i, loc);
+ mlir::Value nextOutEleAddr =
+ builder.getArrayElement(loc, loc, ops[0], vecType, idx,
+ /*shouldDecay=*/false);
+ Address outAddr(nextOutEleAddr,
+ /*align=*/CharUnits::fromQuantity(16));
+ builder.createStore(loc, zero, outAddr);
+ }
+ builder.createYield(loc);
+ });
+
+ return cir::ExtractMemberOp::create(builder, loc, rstValueRec, /*index=*/0);
+}
+
std::optional<mlir::Value>
CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -2147,15 +2260,62 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__readgsdword:
case X86::BI__readgsqword:
case X86::BI__builtin_ia32_encodekey128_u32:
- case X86::BI__builtin_ia32_encodekey256_u32:
+ case X86::BI__builtin_ia32_encodekey256_u32: {
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
+ }
case X86::BI__builtin_ia32_aesenc128kl_u8:
case X86::BI__builtin_ia32_aesdec128kl_u8:
case X86::BI__builtin_ia32_aesenc256kl_u8:
- case X86::BI__builtin_ia32_aesdec256kl_u8:
+ case X86::BI__builtin_ia32_aesdec256kl_u8: {
+ llvm::StringRef intrinsicName;
+ switch (builtinID) {
+ default:
+ llvm_unreachable("Unexpected builtin");
+ case X86::BI__builtin_ia32_aesenc128kl_u8:
+ intrinsicName = "x86.aesenc128kl";
+ break;
+ case X86::BI__builtin_ia32_aesdec128kl_u8:
+ intrinsicName = "x86.aesdec128kl";
+ break;
+ case X86::BI__builtin_ia32_aesenc256kl_u8:
+ intrinsicName = "x86.aesenc256kl";
+ break;
+ case X86::BI__builtin_ia32_aesdec256kl_u8:
+ intrinsicName = "x86.aesdec256kl";
+ break;
+ }
+
+ return emitX86Aes(builder, getLoc(expr->getExprLoc()), intrinsicName,
+ convertType(expr->getType()), ops);
+ }
case X86::BI__builtin_ia32_aesencwide128kl_u8:
case X86::BI__builtin_ia32_aesdecwide128kl_u8:
case X86::BI__builtin_ia32_aesencwide256kl_u8:
- case X86::BI__builtin_ia32_aesdecwide256kl_u8:
+ case X86::BI__builtin_ia32_aesdecwide256kl_u8: {
+ llvm::StringRef intrinsicName;
+ switch (builtinID) {
+ default:
+ llvm_unreachable("Unexpected builtin");
+ case X86::BI__builtin_ia32_aesencwide128kl_u8:
+ intrinsicName = "x86.aesencwide128kl";
+ break;
+ case X86::BI__builtin_ia32_aesdecwide128kl_u8:
+ intrinsicName = "x86.aesdecwide128kl";
+ break;
+ case X86::BI__builtin_ia32_aesencwide256kl_u8:
+ intrinsicName = "x86.aesencwide256kl";
+ break;
+ case X86::BI__builtin_ia32_aesdecwide256kl_u8:
+ intrinsicName = "x86.aesdecwide256kl";
+ break;
+ }
+
+ return emitX86Aeswide(builder, getLoc(expr->getExprLoc()), intrinsicName,
+ convertType(expr->getType()), ops);
+ }
case X86::BI__builtin_ia32_vfcmaddcph512_mask:
case X86::BI__builtin_ia32_vfmaddcph512_mask:
case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/keylocker.c b/clang/test/CIR/CodeGenBuiltins/X86/keylocker.c
new file mode 100644
index 0000000000000..a4995a108c8c8
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/keylocker.c
@@ -0,0 +1,1176 @@
+// RUN: %clang_cc1 -x c -ffreestanding -triple x86_64-unknown-linux -target-feature +kl -target-feature +widekl -Wno-implicit-function-declaration -fclangir -emit-cir -o %t.cir %s
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -ffreestanding -triple x86_64-unknown-linux -target-feature +kl -target-feature +widekl -Wno-implicit-function-declaration -fclangir -emit-cir -o %t.cir %s
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -ffreestanding -triple x86_64-unknown-linux -target-feature +kl -target-feature +widekl -Wno-implicit-function-declaration -fclangir -emit-llvm -o %t.ll %s
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -ffreestanding -triple x86_64-unknown-linux -target-feature +kl -target-feature +widekl -Wno-implicit-function-declaration -fclangir -emit-llvm -o %t.ll %s
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -ffreestanding -triple=x86_64-unknown-linux -target-feature +kl -target-feature +widekl -emit-llvm -Wall -Werror %s -o - | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -ffreestanding -triple=x86_64-unknown-linux -target-feature +kl -target-feature +widekl -emit-llvm -Wall -Werror %s -o - | FileCheck %s -check-prefix=OGCG
+
+// This test mimics clang/test/CodeGen/X86/keylocker.c, which CIR should
+// eventually be able to support fully.
+
+#include <x86intrin.h>
+
+// CIR: !rec_anon_struct = !cir.record<struct {!u8i, !cir.vector<2 x !s64i>}>
+// CIR: !rec_anon_struct1 = !cir.record<struct {!u8i, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>}>
+
+unsigned char test_mm_aesenc256kl_u8(__m128i *odata, __m128i idata, const void *h) {
+ // CIR-LABEL: _mm_aesenc256kl_u8
+ // CIR: %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesenc256kl" %[[IDATA:.+]], %[[H:.+]] : (!cir.vector<2 x !s64i>, !cir.ptr<!void>) -> !rec_anon_struct
+ // CIR: %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+ // CIR: %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+ // CIR: %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+ // CIR: %[[OUT:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct -> !cir.vector<2 x !s64i>
+ // CIR: cir.if %[[SUCC]] {
+ // CIR: cir.store align(16) %[[OUT]], %[[ODATA_PTR:.+]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: } else {
+ // CIR: %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+ // CIR: cir.store align(16) %[[NULL]], %[[ODATA_PTR]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: }
+ // CIR: %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+ // CIR: cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+ // CIR: %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+ // CIR: cir.return %[[RET]] : !u8i
+
+ // LLVM-LABEL: _mm_aesenc256kl_u8
+ // LLVM: %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+ // LLVM: %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // LLVM: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // LLVM: %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+ // LLVM: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // LLVM: [[NO_ERROR]]:
+ // LLVM: store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+ // LLVM: br label %[[EXIT:.+]]
+ // LLVM: [[ERROR]]:
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // LLVM: br label %[[EXIT]]
+ // LLVM: [[EXIT]]:
+ // LLVM: %[[FLAG1:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // LLVM: store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+ // LLVM: %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+ // LLVM: store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+ // LLVM: %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+ // LLVM: ret i8 %[[RET]]
+
+ // OGCG-LABEL: test_mm_aesenc256kl_u8
+ // OGCG: %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+ // OGCG: %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // OGCG: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // OGCG: %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+ // OGCG: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // OGCG: [[NO_ERROR]]:
+ // OGCG: store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+ // OGCG: br label %[[EXIT:.+]]
+ // OGCG: [[ERROR]]:
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // OGCG: br label %[[EXIT]]
+ // OGCG: [[EXIT]]:
+ // OGCG: %[[RET:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // OGCG: ret i8 %[[RET]]
+ return _mm_aesenc256kl_u8(odata, idata, h);
+}
+
+unsigned char test_mm_aesdec256kl_u8(__m128i *odata, __m128i idata, const void *h) {
+ // CIR-LABEL: _mm_aesdec256kl_u8
+ // CIR: %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesdec256kl" %[[IDATA:.+]], %[[H:.+]] : (!cir.vector<2 x !s64i>, !cir.ptr<!void>) -> !rec_anon_struct
+ // CIR: %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+ // CIR: %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+ // CIR: %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+ // CIR: %[[OUT:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct -> !cir.vector<2 x !s64i>
+ // CIR: cir.if %[[SUCC]] {
+ // CIR: cir.store align(16) %[[OUT]], %[[ODATA_PTR:.+]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: } else {
+ // CIR: %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+ // CIR: cir.store align(16) %[[NULL]], %[[ODATA_PTR]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: }
+ // CIR: %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+ // CIR: cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+ // CIR: %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+ // CIR: cir.return %[[RET]] : !u8i
+
+ // LLVM-LABEL: _mm_aesdec256kl_u8
+ // LLVM: %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+ // LLVM: %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // LLVM: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // LLVM: %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+ // LLVM: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // LLVM: [[NO_ERROR]]:
+ // LLVM: store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+ // LLVM: br label %[[EXIT:.+]]
+ // LLVM: [[ERROR]]:
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // LLVM: br label %[[EXIT]]
+ // LLVM: [[EXIT]]:
+ // LLVM: %[[FLAG1:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // LLVM: store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+ // LLVM: %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+ // LLVM: store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+ // LLVM: %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+ // LLVM: ret i8 %[[RET]]
+
+ // OGCG-LABEL: test_mm_aesdec256kl_u8
+ // OGCG: %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+ // OGCG: %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // OGCG: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // OGCG: %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+ // OGCG: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // OGCG: [[NO_ERROR]]:
+ // OGCG: store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+ // OGCG: br label %[[EXIT:.+]]
+ // OGCG: [[ERROR]]:
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // OGCG: br label %[[EXIT]]
+ // OGCG: [[EXIT]]:
+ // OGCG: %[[RET:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // OGCG: ret i8 %[[RET]]
+ return _mm_aesdec256kl_u8(odata, idata, h);
+}
+
+unsigned char test_mm_aesenc128kl_u8(__m128i *odata, __m128i idata, const void *h) {
+ // CIR-LABEL: _mm_aesenc128kl_u8
+ // CIR: %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesenc128kl" %[[IDATA:.+]], %[[H:.+]] : (!cir.vector<2 x !s64i>, !cir.ptr<!void>) -> !rec_anon_struct
+ // CIR: %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+ // CIR: %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+ // CIR: %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+ // CIR: %[[OUT:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct -> !cir.vector<2 x !s64i>
+ // CIR: cir.if %[[SUCC]] {
+ // CIR: cir.store align(16) %[[OUT]], %[[ODATA_PTR:.+]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: } else {
+ // CIR: %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+ // CIR: cir.store align(16) %[[NULL]], %[[ODATA_PTR]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: }
+ // CIR: %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+ // CIR: cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+ // CIR: %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+ // CIR: cir.return %[[RET]] : !u8i
+
+ // LLVM-LABEL: _mm_aesenc128kl_u8
+ // LLVM: %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+ // LLVM: %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // LLVM: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // LLVM: %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+ // LLVM: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // LLVM: [[NO_ERROR]]:
+ // LLVM: store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+ // LLVM: br label %[[EXIT:.+]]
+ // LLVM: [[ERROR]]:
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // LLVM: br label %[[EXIT]]
+ // LLVM: [[EXIT]]:
+ // LLVM: %[[FLAG1:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // LLVM: store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+ // LLVM: %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+ // LLVM: store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+ // LLVM: %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+ // LLVM: ret i8 %[[RET]]
+
+ // OGCG-LABEL: test_mm_aesenc128kl_u8
+ // OGCG: %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+ // OGCG: %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // OGCG: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // OGCG: %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+ // OGCG: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // OGCG: [[NO_ERROR]]:
+ // OGCG: store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+ // OGCG: br label %[[EXIT:.+]]
+ // OGCG: [[ERROR]]:
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // OGCG: br label %[[EXIT]]
+ // OGCG: [[EXIT]]:
+ // OGCG: %[[RET:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // OGCG: ret i8 %[[RET]]
+ return _mm_aesenc128kl_u8(odata, idata, h);
+}
+
+unsigned char test_mm_aesdec128kl_u8(__m128i *odata, __m128i idata, const void *h) {
+ // CIR-LABEL: _mm_aesdec128kl_u8
+ // CIR: %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesdec128kl" %[[IDATA:.+]], %[[H:.+]] : (!cir.vector<2 x !s64i>, !cir.ptr<!void>) -> !rec_anon_struct
+ // CIR: %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+ // CIR: %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+ // CIR: %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+ // CIR: %[[OUT:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct -> !cir.vector<2 x !s64i>
+ // CIR: cir.if %[[SUCC]] {
+ // CIR: cir.store align(16) %[[OUT]], %[[ODATA_PTR:.+]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: } else {
+ // CIR: %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+ // CIR: cir.store align(16) %[[NULL]], %[[ODATA_PTR]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: }
+ // CIR: %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct -> !u8i
+ // CIR: cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+ // CIR: %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+ // CIR: cir.return %[[RET]] : !u8i
+
+ // LLVM-LABEL: _mm_aesdec128kl_u8
+ // LLVM: %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+ // LLVM: %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // LLVM: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // LLVM: %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+ // LLVM: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // LLVM: [[NO_ERROR]]:
+ // LLVM: store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+ // LLVM: br label %[[EXIT:.+]]
+ // LLVM: [[ERROR]]:
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // LLVM: br label %[[EXIT]]
+ // LLVM: [[EXIT]]:
+ // LLVM: %[[FLAG1:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // LLVM: store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+ // LLVM: %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+ // LLVM: store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+ // LLVM: %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+ // LLVM: ret i8 %[[RET]]
+
+ // OGCG-LABEL: test_mm_aesdec128kl_u8
+ // OGCG: %[[RESULT:.+]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %[[IDATA:.+]], ptr %[[H:.+]])
+ // OGCG: %[[FLAG:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // OGCG: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // OGCG: %[[OUT:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 1
+ // OGCG: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // OGCG: [[NO_ERROR]]:
+ // OGCG: store <2 x i64> %[[OUT]], ptr %[[ODATA_PTR:.+]], align 16
+ // OGCG: br label %[[EXIT:.+]]
+ // OGCG: [[ERROR]]:
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // OGCG: br label %[[EXIT]]
+ // OGCG: [[EXIT]]:
+ // OGCG: %[[RET:.+]] = extractvalue { i8, <2 x i64> } %[[RESULT]], 0
+ // OGCG: ret i8 %[[RET]]
+ return _mm_aesdec128kl_u8(odata, idata, h);
+}
+
+unsigned char test__mm_aesencwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
+ // CIR-LABEL: _mm_aesencwide256kl_u8
+ // CIR: %[[ZERO:.+]] = cir.const #cir.int<0> : !u32i
+ // CIR: %[[IN_ADDR0:.+]] = cir.ptr_stride %[[IDATA_ADDR:.+]], %[[ZERO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA0:.+]] = cir.load align(16) %[[IN_ADDR0]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[ONE:.+]] = cir.const #cir.int<1> : !u32i
+ // CIR: %[[IN_ADDR1:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[ONE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA1:.+]] = cir.load align(16) %[[IN_ADDR1]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[TWO:.+]] = cir.const #cir.int<2> : !u32i
+ // CIR: %[[IN_ADDR2:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[TWO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA2:.+]] = cir.load align(16) %[[IN_ADDR2]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[THREE:.+]] = cir.const #cir.int<3> : !u32i
+ // CIR: %[[IN_ADDR3:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[THREE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA3:.+]] = cir.load align(16) %[[IN_ADDR3]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[FOUR:.+]] = cir.const #cir.int<4> : !u32i
+ // CIR: %[[IN_ADDR4:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FOUR]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA4:.+]] = cir.load align(16) %[[IN_ADDR4]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[FIVE:.+]] = cir.const #cir.int<5> : !u32i
+ // CIR: %[[IN_ADDR5:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FIVE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA5:.+]] = cir.load align(16) %[[IN_ADDR5]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[SIX:.+]] = cir.const #cir.int<6> : !u32i
+ // CIR: %[[IN_ADDR6:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SIX]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA6:.+]] = cir.load align(16) %[[IN_ADDR6]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[SEVEN:.+]] = cir.const #cir.int<7> : !u32i
+ // CIR: %[[IN_ADDR7:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SEVEN]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA7:.+]] = cir.load align(16) %[[IN_ADDR7]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesencwide256kl" %[[H_ADDR:.+]], %[[IN_DATA0]], %[[IN_DATA1]], %[[IN_DATA2]], %[[IN_DATA3]], %[[IN_DATA4]], %[[IN_DATA5]], %[[IN_DATA6]], %[[IN_DATA7]] : (!cir.ptr<!void>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>) -> !rec_anon_struct1
+ // CIR: %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+ // CIR: %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+ // CIR: %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+ // CIR: cir.if %[[SUCC]] {
+ // CIR: %[[OUT_DATA0:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[ZERO_1:.+]] = cir.const #cir.int<0> : !u32i
+ // CIR: %[[OUT_ADDR0:.+]] = cir.ptr_stride %[[ODATA_ADDR:.+]], %[[ZERO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA0]], %[[OUT_ADDR0]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA1:.+]] = cir.extract_member %[[RESULT]][2] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[ONE_1:.+]] = cir.const #cir.int<1> : !u32i
+ // CIR: %[[OUT_ADDR1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA1]], %[[OUT_ADDR1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA2:.+]] = cir.extract_member %[[RESULT]][3] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[TWO_1:.+]] = cir.const #cir.int<2> : !u32i
+ // CIR: %[[OUT_ADDR2:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA2]], %[[OUT_ADDR2]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA3:.+]] = cir.extract_member %[[RESULT]][4] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[THREE_1:.+]] = cir.const #cir.int<3> : !u32i
+ // CIR: %[[OUT_ADDR3:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA3]], %[[OUT_ADDR3]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA4:.+]] = cir.extract_member %[[RESULT]][5] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[FOUR_1:.+]] = cir.const #cir.int<4> : !u32i
+ // CIR: %[[OUT_ADDR4:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA4]], %[[OUT_ADDR4]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA5:.+]] = cir.extract_member %[[RESULT]][6] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[FIVE_1:.+]] = cir.const #cir.int<5> : !u32i
+ // CIR: %[[OUT_ADDR5:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA5]], %[[OUT_ADDR5]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA6:.+]] = cir.extract_member %[[RESULT]][7] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[SIX_1:.+]] = cir.const #cir.int<6> : !u32i
+ // CIR: %[[OUT_ADDR6:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA6]], %[[OUT_ADDR6]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA7:.+]] = cir.extract_member %[[RESULT]][8] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[SEVEN_1:.+]] = cir.const #cir.int<7> : !u32i
+ // CIR: %[[OUT_ADDR7:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA7]], %[[OUT_ADDR7]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: } else {
+ // CIR: %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+ // CIR: %[[ZERO_2:.+]] = cir.const #cir.int<0> : !u32i
+ // CIR: %[[OUT_ADDR0_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ZERO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR0_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[ONE_2:.+]] = cir.const #cir.int<1> : !u32i
+ // CIR: %[[OUT_ADDR1_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR1_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[TWO_2:.+]] = cir.const #cir.int<2> : !u32i
+ // CIR: %[[OUT_ADDR2_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR2_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[THREE_2:.+]] = cir.const #cir.int<3> : !u32i
+ // CIR: %[[OUT_ADDR3_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR3_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[FOUR_2:.+]] = cir.const #cir.int<4> : !u32i
+ // CIR: %[[OUT_ADDR4_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR4_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[FIVE_2:.+]] = cir.const #cir.int<5> : !u32i
+ // CIR: %[[OUT_ADDR5_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR5_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[SIX_2:.+]] = cir.const #cir.int<6> : !u32i
+ // CIR: %[[OUT_ADDR6_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR6_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[SEVEN_2:.+]] = cir.const #cir.int<7> : !u32i
+ // CIR: %[[OUT_ADDR7_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR7_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: }
+ // CIR: %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+ // CIR: cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+ // CIR: %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+ // CIR: cir.return %[[RET]] : !u8i
+
+ // LLVM-LABEL: _mm_aesencwide256kl_u8
+ // LLVM: %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+ // LLVM: %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 1
+ // LLVM: %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+ // LLVM: %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 2
+ // LLVM: %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+ // LLVM: %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 3
+ // LLVM: %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+ // LLVM: %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 4
+ // LLVM: %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+ // LLVM: %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 5
+ // LLVM: %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+ // LLVM: %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 6
+ // LLVM: %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+ // LLVM: %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 7
+ // LLVM: %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+ // LLVM: %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(ptr %[[H_ADDR:.+]], <2 x i64> %[[IN_DATA0]], <2 x i64> %[[IN_DATA1]], <2 x i64> %[[IN_DATA2]], <2 x i64> %[[IN_DATA3]], <2 x i64> %[[IN_DATA4]], <2 x i64> %[[IN_DATA5]], <2 x i64> %[[IN_DATA6]], <2 x i64> %[[IN_DATA7]])
+ // LLVM: %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // LLVM: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // LLVM: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // LLVM: [[NO_ERROR]]:
+ // LLVM: %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+ // LLVM: store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+ // LLVM: %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+ // LLVM: %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+ // LLVM: store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+ // LLVM: %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+ // LLVM: %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+ // LLVM: store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+ // LLVM: %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+ // LLVM: %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+ // LLVM: store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+ // LLVM: %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+ // LLVM: %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+ // LLVM: store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+ // LLVM: %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+ // LLVM: %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+ // LLVM: store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+ // LLVM: %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+ // LLVM: %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+ // LLVM: store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+ // LLVM: %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+ // LLVM: %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+ // LLVM: store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+ // LLVM: br label %[[EXIT:.+]]
+ // LLVM: [[ERROR]]:
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // LLVM: %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+ // LLVM: %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+ // LLVM: %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+ // LLVM: %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+ // LLVM: %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+ // LLVM: %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+ // LLVM: %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+ // LLVM: br label %[[EXIT]]
+ // LLVM: [[EXIT]]:
+ // LLVM: %[[FLAG1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // LLVM: store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+ // LLVM: %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+ // LLVM: store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+ // LLVM: %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+ // LLVM: ret i8 %[[RET]]
+
+ // OGCG-LABEL: _mm_aesencwide256kl_u8
+ // OGCG: %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+ // OGCG: %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 1
+ // OGCG: %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+ // OGCG: %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 2
+ // OGCG: %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+ // OGCG: %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 3
+ // OGCG: %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+ // OGCG: %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 4
+ // OGCG: %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+ // OGCG: %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 5
+ // OGCG: %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+ // OGCG: %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 6
+ // OGCG: %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+ // OGCG: %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 7
+ // OGCG: %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+ // OGCG: %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(ptr %[[H_ADDR:.+]], <2 x i64> %[[IN_DATA0]], <2 x i64> %[[IN_DATA1]], <2 x i64> %[[IN_DATA2]], <2 x i64> %[[IN_DATA3]], <2 x i64> %[[IN_DATA4]], <2 x i64> %[[IN_DATA5]], <2 x i64> %[[IN_DATA6]], <2 x i64> %[[IN_DATA7]])
+ // OGCG: %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // OGCG: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // OGCG: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // OGCG: [[NO_ERROR]]:
+ // OGCG: %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+ // OGCG: store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+ // OGCG: %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+ // OGCG: %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+ // OGCG: store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+ // OGCG: %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+ // OGCG: %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+ // OGCG: store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+ // OGCG: %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+ // OGCG: %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+ // OGCG: store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+ // OGCG: %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+ // OGCG: %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+ // OGCG: store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+ // OGCG: %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+ // OGCG: %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+ // OGCG: store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+ // OGCG: %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+ // OGCG: %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+ // OGCG: store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+ // OGCG: %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+ // OGCG: %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+ // OGCG: store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+ // OGCG: br label %[[EXIT:.+]]
+ // OGCG: [[ERROR]]:
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // OGCG: %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+ // OGCG: %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+ // OGCG: %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+ // OGCG: %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+ // OGCG: %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+ // OGCG: %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+ // OGCG: %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+ // OGCG: br label %[[EXIT]]
+ // OGCG: [[EXIT]]:
+ // OGCG: %[[RET:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // OGCG: ret i8 %[[RET]]
+ return _mm_aesencwide256kl_u8(odata, idata, h);
+}
+
+unsigned char test__mm_aesdecwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
+ // CIR-LABEL: _mm_aesdecwide256kl_u8
+ // CIR: %[[ZERO:.+]] = cir.const #cir.int<0> : !u32i
+ // CIR: %[[IN_ADDR0:.+]] = cir.ptr_stride %[[IDATA_ADDR:.+]], %[[ZERO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA0:.+]] = cir.load align(16) %[[IN_ADDR0]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[ONE:.+]] = cir.const #cir.int<1> : !u32i
+ // CIR: %[[IN_ADDR1:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[ONE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA1:.+]] = cir.load align(16) %[[IN_ADDR1]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[TWO:.+]] = cir.const #cir.int<2> : !u32i
+ // CIR: %[[IN_ADDR2:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[TWO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA2:.+]] = cir.load align(16) %[[IN_ADDR2]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[THREE:.+]] = cir.const #cir.int<3> : !u32i
+ // CIR: %[[IN_ADDR3:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[THREE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA3:.+]] = cir.load align(16) %[[IN_ADDR3]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[FOUR:.+]] = cir.const #cir.int<4> : !u32i
+ // CIR: %[[IN_ADDR4:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FOUR]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA4:.+]] = cir.load align(16) %[[IN_ADDR4]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[FIVE:.+]] = cir.const #cir.int<5> : !u32i
+ // CIR: %[[IN_ADDR5:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FIVE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA5:.+]] = cir.load align(16) %[[IN_ADDR5]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[SIX:.+]] = cir.const #cir.int<6> : !u32i
+ // CIR: %[[IN_ADDR6:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SIX]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA6:.+]] = cir.load align(16) %[[IN_ADDR6]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[SEVEN:.+]] = cir.const #cir.int<7> : !u32i
+ // CIR: %[[IN_ADDR7:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SEVEN]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA7:.+]] = cir.load align(16) %[[IN_ADDR7]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesdecwide256kl" %[[H_ADDR:.+]], %[[IN_DATA0]], %[[IN_DATA1]], %[[IN_DATA2]], %[[IN_DATA3]], %[[IN_DATA4]], %[[IN_DATA5]], %[[IN_DATA6]], %[[IN_DATA7]] : (!cir.ptr<!void>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>) -> !rec_anon_struct1
+ // CIR: %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+ // CIR: %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+ // CIR: %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+ // CIR: cir.if %[[SUCC]] {
+ // CIR: %[[OUT_DATA0:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[ZERO_1:.+]] = cir.const #cir.int<0> : !u32i
+ // CIR: %[[OUT_ADDR0:.+]] = cir.ptr_stride %[[ODATA_ADDR:.+]], %[[ZERO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA0]], %[[OUT_ADDR0]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA1:.+]] = cir.extract_member %[[RESULT]][2] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[ONE_1:.+]] = cir.const #cir.int<1> : !u32i
+ // CIR: %[[OUT_ADDR1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA1]], %[[OUT_ADDR1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA2:.+]] = cir.extract_member %[[RESULT]][3] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[TWO_1:.+]] = cir.const #cir.int<2> : !u32i
+ // CIR: %[[OUT_ADDR2:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA2]], %[[OUT_ADDR2]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA3:.+]] = cir.extract_member %[[RESULT]][4] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[THREE_1:.+]] = cir.const #cir.int<3> : !u32i
+ // CIR: %[[OUT_ADDR3:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA3]], %[[OUT_ADDR3]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA4:.+]] = cir.extract_member %[[RESULT]][5] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[FOUR_1:.+]] = cir.const #cir.int<4> : !u32i
+ // CIR: %[[OUT_ADDR4:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA4]], %[[OUT_ADDR4]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA5:.+]] = cir.extract_member %[[RESULT]][6] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[FIVE_1:.+]] = cir.const #cir.int<5> : !u32i
+ // CIR: %[[OUT_ADDR5:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA5]], %[[OUT_ADDR5]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA6:.+]] = cir.extract_member %[[RESULT]][7] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[SIX_1:.+]] = cir.const #cir.int<6> : !u32i
+ // CIR: %[[OUT_ADDR6:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA6]], %[[OUT_ADDR6]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA7:.+]] = cir.extract_member %[[RESULT]][8] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[SEVEN_1:.+]] = cir.const #cir.int<7> : !u32i
+ // CIR: %[[OUT_ADDR7:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA7]], %[[OUT_ADDR7]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: } else {
+ // CIR: %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+ // CIR: %[[ZERO_2:.+]] = cir.const #cir.int<0> : !u32i
+ // CIR: %[[OUT_ADDR0_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ZERO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR0_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[ONE_2:.+]] = cir.const #cir.int<1> : !u32i
+ // CIR: %[[OUT_ADDR1_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR1_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[TWO_2:.+]] = cir.const #cir.int<2> : !u32i
+ // CIR: %[[OUT_ADDR2_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR2_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[THREE_2:.+]] = cir.const #cir.int<3> : !u32i
+ // CIR: %[[OUT_ADDR3_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR3_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[FOUR_2:.+]] = cir.const #cir.int<4> : !u32i
+ // CIR: %[[OUT_ADDR4_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR4_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[FIVE_2:.+]] = cir.const #cir.int<5> : !u32i
+ // CIR: %[[OUT_ADDR5_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR5_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[SIX_2:.+]] = cir.const #cir.int<6> : !u32i
+ // CIR: %[[OUT_ADDR6_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR6_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[SEVEN_2:.+]] = cir.const #cir.int<7> : !u32i
+ // CIR: %[[OUT_ADDR7_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR7_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: }
+ // CIR: %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+ // CIR: cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+ // CIR: %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+ // CIR: cir.return %[[RET]] : !u8i
+
+ // LLVM-LABEL: _mm_aesdecwide256kl_u8
+ // LLVM: %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+ // LLVM: %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 1
+ // LLVM: %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+ // LLVM: %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 2
+ // LLVM: %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+ // LLVM: %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 3
+ // LLVM: %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+ // LLVM: %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 4
+ // LLVM: %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+ // LLVM: %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 5
+ // LLVM: %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+ // LLVM: %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 6
+ // LLVM: %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+ // LLVM: %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 7
+ // LLVM: %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+ // LLVM: %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(ptr %[[H_ADDR:.+]], <2 x i64> %[[IN_DATA0]], <2 x i64> %[[IN_DATA1]], <2 x i64> %[[IN_DATA2]], <2 x i64> %[[IN_DATA3]], <2 x i64> %[[IN_DATA4]], <2 x i64> %[[IN_DATA5]], <2 x i64> %[[IN_DATA6]], <2 x i64> %[[IN_DATA7]])
+ // LLVM: %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // LLVM: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // LLVM: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // LLVM: [[NO_ERROR]]:
+ // LLVM: %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+ // LLVM: store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+ // LLVM: %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+ // LLVM: %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+ // LLVM: store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+ // LLVM: %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+ // LLVM: %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+ // LLVM: store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+ // LLVM: %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+ // LLVM: %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+ // LLVM: store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+ // LLVM: %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+ // LLVM: %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+ // LLVM: store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+ // LLVM: %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+ // LLVM: %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+ // LLVM: store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+ // LLVM: %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+ // LLVM: %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+ // LLVM: store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+ // LLVM: %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+ // LLVM: %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+ // LLVM: store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+ // LLVM: br label %[[EXIT:.+]]
+ // LLVM: [[ERROR]]:
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // LLVM: %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+ // LLVM: %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+ // LLVM: %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+ // LLVM: %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+ // LLVM: %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+ // LLVM: %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+ // LLVM: %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+ // LLVM: br label %[[EXIT]]
+ // LLVM: [[EXIT]]:
+ // LLVM: %[[FLAG1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // LLVM: store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+ // LLVM: %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+ // LLVM: store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+ // LLVM: %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+ // LLVM: ret i8 %[[RET]]
+
+ // OGCG-LABEL: _mm_aesdecwide256kl_u8
+ // OGCG: %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+ // OGCG: %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 1
+ // OGCG: %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+ // OGCG: %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 2
+ // OGCG: %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+ // OGCG: %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 3
+ // OGCG: %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+ // OGCG: %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 4
+ // OGCG: %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+ // OGCG: %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 5
+ // OGCG: %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+ // OGCG: %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 6
+ // OGCG: %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+ // OGCG: %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 7
+ // OGCG: %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+ // OGCG: %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(ptr %[[H_ADDR:.+]], <2 x i64> %[[IN_DATA0]], <2 x i64> %[[IN_DATA1]], <2 x i64> %[[IN_DATA2]], <2 x i64> %[[IN_DATA3]], <2 x i64> %[[IN_DATA4]], <2 x i64> %[[IN_DATA5]], <2 x i64> %[[IN_DATA6]], <2 x i64> %[[IN_DATA7]])
+ // OGCG: %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // OGCG: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // OGCG: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // OGCG: [[NO_ERROR]]:
+ // OGCG: %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+ // OGCG: store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+ // OGCG: %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+ // OGCG: %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+ // OGCG: store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+ // OGCG: %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+ // OGCG: %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+ // OGCG: store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+ // OGCG: %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+ // OGCG: %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+ // OGCG: store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+ // OGCG: %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+ // OGCG: %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+ // OGCG: store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+ // OGCG: %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+ // OGCG: %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+ // OGCG: store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+ // OGCG: %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+ // OGCG: %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+ // OGCG: store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+ // OGCG: %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+ // OGCG: %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+ // OGCG: store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+ // OGCG: br label %[[EXIT:.+]]
+ // OGCG: [[ERROR]]:
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // OGCG: %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+ // OGCG: %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+ // OGCG: %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+ // OGCG: %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+ // OGCG: %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+ // OGCG: %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+ // OGCG: %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+ // OGCG: br label %[[EXIT]]
+ // OGCG: [[EXIT]]:
+ // OGCG: %[[RET:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // OGCG: ret i8 %[[RET]]
+ return _mm_aesdecwide256kl_u8(odata, idata, h);
+}
+
+unsigned char test__mm_aesencwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
+ // CIR-LABEL: _mm_aesencwide128kl_u8
+ // CIR: %[[ZERO:.+]] = cir.const #cir.int<0> : !u32i
+ // CIR: %[[IN_ADDR0:.+]] = cir.ptr_stride %[[IDATA_ADDR:.+]], %[[ZERO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA0:.+]] = cir.load align(16) %[[IN_ADDR0]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[ONE:.+]] = cir.const #cir.int<1> : !u32i
+ // CIR: %[[IN_ADDR1:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[ONE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA1:.+]] = cir.load align(16) %[[IN_ADDR1]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[TWO:.+]] = cir.const #cir.int<2> : !u32i
+ // CIR: %[[IN_ADDR2:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[TWO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA2:.+]] = cir.load align(16) %[[IN_ADDR2]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[THREE:.+]] = cir.const #cir.int<3> : !u32i
+ // CIR: %[[IN_ADDR3:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[THREE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA3:.+]] = cir.load align(16) %[[IN_ADDR3]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[FOUR:.+]] = cir.const #cir.int<4> : !u32i
+ // CIR: %[[IN_ADDR4:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FOUR]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA4:.+]] = cir.load align(16) %[[IN_ADDR4]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[FIVE:.+]] = cir.const #cir.int<5> : !u32i
+ // CIR: %[[IN_ADDR5:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FIVE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA5:.+]] = cir.load align(16) %[[IN_ADDR5]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[SIX:.+]] = cir.const #cir.int<6> : !u32i
+ // CIR: %[[IN_ADDR6:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SIX]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA6:.+]] = cir.load align(16) %[[IN_ADDR6]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[SEVEN:.+]] = cir.const #cir.int<7> : !u32i
+ // CIR: %[[IN_ADDR7:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SEVEN]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA7:.+]] = cir.load align(16) %[[IN_ADDR7]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesencwide128kl" %[[H_ADDR:.+]], %[[IN_DATA0]], %[[IN_DATA1]], %[[IN_DATA2]], %[[IN_DATA3]], %[[IN_DATA4]], %[[IN_DATA5]], %[[IN_DATA6]], %[[IN_DATA7]] : (!cir.ptr<!void>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>) -> !rec_anon_struct1
+ // CIR: %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+ // CIR: %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+ // CIR: %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+ // CIR: cir.if %[[SUCC]] {
+ // CIR: %[[OUT_DATA0:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[ZERO_1:.+]] = cir.const #cir.int<0> : !u32i
+ // CIR: %[[OUT_ADDR0:.+]] = cir.ptr_stride %[[ODATA_ADDR:.+]], %[[ZERO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA0]], %[[OUT_ADDR0]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA1:.+]] = cir.extract_member %[[RESULT]][2] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[ONE_1:.+]] = cir.const #cir.int<1> : !u32i
+ // CIR: %[[OUT_ADDR1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA1]], %[[OUT_ADDR1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA2:.+]] = cir.extract_member %[[RESULT]][3] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[TWO_1:.+]] = cir.const #cir.int<2> : !u32i
+ // CIR: %[[OUT_ADDR2:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA2]], %[[OUT_ADDR2]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA3:.+]] = cir.extract_member %[[RESULT]][4] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[THREE_1:.+]] = cir.const #cir.int<3> : !u32i
+ // CIR: %[[OUT_ADDR3:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA3]], %[[OUT_ADDR3]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA4:.+]] = cir.extract_member %[[RESULT]][5] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[FOUR_1:.+]] = cir.const #cir.int<4> : !u32i
+ // CIR: %[[OUT_ADDR4:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA4]], %[[OUT_ADDR4]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA5:.+]] = cir.extract_member %[[RESULT]][6] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[FIVE_1:.+]] = cir.const #cir.int<5> : !u32i
+ // CIR: %[[OUT_ADDR5:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA5]], %[[OUT_ADDR5]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA6:.+]] = cir.extract_member %[[RESULT]][7] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[SIX_1:.+]] = cir.const #cir.int<6> : !u32i
+ // CIR: %[[OUT_ADDR6:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA6]], %[[OUT_ADDR6]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA7:.+]] = cir.extract_member %[[RESULT]][8] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[SEVEN_1:.+]] = cir.const #cir.int<7> : !u32i
+ // CIR: %[[OUT_ADDR7:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA7]], %[[OUT_ADDR7]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: } else {
+ // CIR: %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+ // CIR: %[[ZERO_2:.+]] = cir.const #cir.int<0> : !u32i
+ // CIR: %[[OUT_ADDR0_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ZERO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR0_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[ONE_2:.+]] = cir.const #cir.int<1> : !u32i
+ // CIR: %[[OUT_ADDR1_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR1_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[TWO_2:.+]] = cir.const #cir.int<2> : !u32i
+ // CIR: %[[OUT_ADDR2_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR2_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[THREE_2:.+]] = cir.const #cir.int<3> : !u32i
+ // CIR: %[[OUT_ADDR3_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR3_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[FOUR_2:.+]] = cir.const #cir.int<4> : !u32i
+ // CIR: %[[OUT_ADDR4_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR4_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[FIVE_2:.+]] = cir.const #cir.int<5> : !u32i
+ // CIR: %[[OUT_ADDR5_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR5_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[SIX_2:.+]] = cir.const #cir.int<6> : !u32i
+ // CIR: %[[OUT_ADDR6_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR6_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[SEVEN_2:.+]] = cir.const #cir.int<7> : !u32i
+ // CIR: %[[OUT_ADDR7_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR7_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: }
+ // CIR: %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+ // CIR: cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+ // CIR: %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+ // CIR: cir.return %[[RET]] : !u8i
+
+ // LLVM-LABEL: _mm_aesencwide128kl_u8
+ // LLVM: %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+ // LLVM: %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 1
+ // LLVM: %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+ // LLVM: %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 2
+ // LLVM: %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+ // LLVM: %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 3
+ // LLVM: %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+ // LLVM: %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 4
+ // LLVM: %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+ // LLVM: %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 5
+ // LLVM: %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+ // LLVM: %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 6
+ // LLVM: %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+ // LLVM: %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 7
+ // LLVM: %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+ // LLVM: %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(ptr %[[H_ADDR:.+]], <2 x i64> %[[IN_DATA0]], <2 x i64> %[[IN_DATA1]], <2 x i64> %[[IN_DATA2]], <2 x i64> %[[IN_DATA3]], <2 x i64> %[[IN_DATA4]], <2 x i64> %[[IN_DATA5]], <2 x i64> %[[IN_DATA6]], <2 x i64> %[[IN_DATA7]])
+ // LLVM: %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // LLVM: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // LLVM: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // LLVM: [[NO_ERROR]]:
+ // LLVM: %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+ // LLVM: store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+ // LLVM: %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+ // LLVM: %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+ // LLVM: store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+ // LLVM: %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+ // LLVM: %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+ // LLVM: store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+ // LLVM: %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+ // LLVM: %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+ // LLVM: store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+ // LLVM: %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+ // LLVM: %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+ // LLVM: store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+ // LLVM: %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+ // LLVM: %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+ // LLVM: store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+ // LLVM: %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+ // LLVM: %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+ // LLVM: store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+ // LLVM: %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+ // LLVM: %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+ // LLVM: store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+ // LLVM: br label %[[EXIT:.+]]
+ // LLVM: [[ERROR]]:
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // LLVM: %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+ // LLVM: %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+ // LLVM: %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+ // LLVM: %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+ // LLVM: %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+ // LLVM: %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+ // LLVM: %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+ // LLVM: br label %[[EXIT]]
+ // LLVM: [[EXIT]]:
+ // LLVM: %[[FLAG1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // LLVM: store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+ // LLVM: %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+ // LLVM: store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+ // LLVM: %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+ // LLVM: ret i8 %[[RET]]
+
+ // OGCG-LABEL: _mm_aesencwide128kl_u8
+ // OGCG: %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+ // OGCG: %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 1
+ // OGCG: %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+ // OGCG: %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 2
+ // OGCG: %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+ // OGCG: %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 3
+ // OGCG: %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+ // OGCG: %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 4
+ // OGCG: %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+ // OGCG: %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 5
+ // OGCG: %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+ // OGCG: %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 6
+ // OGCG: %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+ // OGCG: %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 7
+ // OGCG: %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+ // OGCG: %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(ptr %[[H_ADDR:.+]], <2 x i64> %[[IN_DATA0]], <2 x i64> %[[IN_DATA1]], <2 x i64> %[[IN_DATA2]], <2 x i64> %[[IN_DATA3]], <2 x i64> %[[IN_DATA4]], <2 x i64> %[[IN_DATA5]], <2 x i64> %[[IN_DATA6]], <2 x i64> %[[IN_DATA7]])
+ // OGCG: %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // OGCG: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // OGCG: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // OGCG: [[NO_ERROR]]:
+ // OGCG: %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+ // OGCG: store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+ // OGCG: %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+ // OGCG: %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+ // OGCG: store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+ // OGCG: %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+ // OGCG: %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+ // OGCG: store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+ // OGCG: %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+ // OGCG: %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+ // OGCG: store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+ // OGCG: %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+ // OGCG: %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+ // OGCG: store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+ // OGCG: %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+ // OGCG: %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+ // OGCG: store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+ // OGCG: %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+ // OGCG: %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+ // OGCG: store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+ // OGCG: %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+ // OGCG: %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+ // OGCG: store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+ // OGCG: br label %[[EXIT:.+]]
+ // OGCG: [[ERROR]]:
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // OGCG: %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+ // OGCG: %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+ // OGCG: %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+ // OGCG: %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+ // OGCG: %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+ // OGCG: %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+ // OGCG: %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+ // OGCG: br label %[[EXIT]]
+ // OGCG: [[EXIT]]:
+ // OGCG: %[[RET:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // OGCG: ret i8 %[[RET]]
+ return _mm_aesencwide128kl_u8(odata, idata, h);
+}
+
+unsigned char test__mm_aesdecwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) {
+ // CIR-LABEL: _mm_aesdecwide128kl_u8
+ // CIR: %[[ZERO:.+]] = cir.const #cir.int<0> : !u32i
+ // CIR: %[[IN_ADDR0:.+]] = cir.ptr_stride %[[IDATA_ADDR:.+]], %[[ZERO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA0:.+]] = cir.load align(16) %[[IN_ADDR0]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[ONE:.+]] = cir.const #cir.int<1> : !u32i
+ // CIR: %[[IN_ADDR1:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[ONE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA1:.+]] = cir.load align(16) %[[IN_ADDR1]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[TWO:.+]] = cir.const #cir.int<2> : !u32i
+ // CIR: %[[IN_ADDR2:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[TWO]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA2:.+]] = cir.load align(16) %[[IN_ADDR2]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[THREE:.+]] = cir.const #cir.int<3> : !u32i
+ // CIR: %[[IN_ADDR3:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[THREE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA3:.+]] = cir.load align(16) %[[IN_ADDR3]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[FOUR:.+]] = cir.const #cir.int<4> : !u32i
+ // CIR: %[[IN_ADDR4:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FOUR]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA4:.+]] = cir.load align(16) %[[IN_ADDR4]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[FIVE:.+]] = cir.const #cir.int<5> : !u32i
+ // CIR: %[[IN_ADDR5:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[FIVE]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA5:.+]] = cir.load align(16) %[[IN_ADDR5]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[SIX:.+]] = cir.const #cir.int<6> : !u32i
+ // CIR: %[[IN_ADDR6:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SIX]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA6:.+]] = cir.load align(16) %[[IN_ADDR6]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[SEVEN:.+]] = cir.const #cir.int<7> : !u32i
+ // CIR: %[[IN_ADDR7:.+]] = cir.ptr_stride %[[IDATA_ADDR]], %[[SEVEN]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[IN_DATA7:.+]] = cir.load align(16) %[[IN_ADDR7]] : !cir.ptr<!cir.vector<2 x !s64i>>, !cir.vector<2 x !s64i>
+ // CIR: %[[RESULT:.+]] = cir.call_llvm_intrinsic "x86.aesdecwide128kl" %[[H_ADDR:.+]], %[[IN_DATA0]], %[[IN_DATA1]], %[[IN_DATA2]], %[[IN_DATA3]], %[[IN_DATA4]], %[[IN_DATA5]], %[[IN_DATA6]], %[[IN_DATA7]] : (!cir.ptr<!void>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>) -> !rec_anon_struct1
+ // CIR: %[[FLAG:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+ // CIR: %[[FLAG_BIT0:.+]] = cir.cast integral %[[FLAG]] : !u8i -> !cir.int<u, 1>
+ // CIR: %[[SUCC:.+]] = cir.cast int_to_bool %[[FLAG_BIT0]] : !cir.int<u, 1> -> !cir.bool
+ // CIR: cir.if %[[SUCC]] {
+ // CIR: %[[OUT_DATA0:.+]] = cir.extract_member %[[RESULT]][1] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[ZERO_1:.+]] = cir.const #cir.int<0> : !u32i
+ // CIR: %[[OUT_ADDR0:.+]] = cir.ptr_stride %[[ODATA_ADDR:.+]], %[[ZERO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA0]], %[[OUT_ADDR0]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA1:.+]] = cir.extract_member %[[RESULT]][2] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[ONE_1:.+]] = cir.const #cir.int<1> : !u32i
+ // CIR: %[[OUT_ADDR1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA1]], %[[OUT_ADDR1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA2:.+]] = cir.extract_member %[[RESULT]][3] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[TWO_1:.+]] = cir.const #cir.int<2> : !u32i
+ // CIR: %[[OUT_ADDR2:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA2]], %[[OUT_ADDR2]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA3:.+]] = cir.extract_member %[[RESULT]][4] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[THREE_1:.+]] = cir.const #cir.int<3> : !u32i
+ // CIR: %[[OUT_ADDR3:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA3]], %[[OUT_ADDR3]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA4:.+]] = cir.extract_member %[[RESULT]][5] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[FOUR_1:.+]] = cir.const #cir.int<4> : !u32i
+ // CIR: %[[OUT_ADDR4:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA4]], %[[OUT_ADDR4]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA5:.+]] = cir.extract_member %[[RESULT]][6] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[FIVE_1:.+]] = cir.const #cir.int<5> : !u32i
+ // CIR: %[[OUT_ADDR5:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA5]], %[[OUT_ADDR5]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA6:.+]] = cir.extract_member %[[RESULT]][7] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[SIX_1:.+]] = cir.const #cir.int<6> : !u32i
+ // CIR: %[[OUT_ADDR6:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA6]], %[[OUT_ADDR6]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[OUT_DATA7:.+]] = cir.extract_member %[[RESULT]][8] : !rec_anon_struct1 -> !cir.vector<2 x !s64i>
+ // CIR: %[[SEVEN_1:.+]] = cir.const #cir.int<7> : !u32i
+ // CIR: %[[OUT_ADDR7:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_1]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[OUT_DATA7]], %[[OUT_ADDR7]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: } else {
+ // CIR: %[[NULL:.+]] = cir.const #cir.zero : !cir.vector<2 x !s64i>
+ // CIR: %[[ZERO_2:.+]] = cir.const #cir.int<0> : !u32i
+ // CIR: %[[OUT_ADDR0_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ZERO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR0_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[ONE_2:.+]] = cir.const #cir.int<1> : !u32i
+ // CIR: %[[OUT_ADDR1_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[ONE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR1_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[TWO_2:.+]] = cir.const #cir.int<2> : !u32i
+ // CIR: %[[OUT_ADDR2_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[TWO_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR2_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[THREE_2:.+]] = cir.const #cir.int<3> : !u32i
+ // CIR: %[[OUT_ADDR3_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[THREE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR3_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[FOUR_2:.+]] = cir.const #cir.int<4> : !u32i
+ // CIR: %[[OUT_ADDR4_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FOUR_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR4_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[FIVE_2:.+]] = cir.const #cir.int<5> : !u32i
+ // CIR: %[[OUT_ADDR5_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[FIVE_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR5_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[SIX_2:.+]] = cir.const #cir.int<6> : !u32i
+ // CIR: %[[OUT_ADDR6_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SIX_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR6_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: %[[SEVEN_2:.+]] = cir.const #cir.int<7> : !u32i
+ // CIR: %[[OUT_ADDR7_1:.+]] = cir.ptr_stride %[[ODATA_ADDR]], %[[SEVEN_2]] : (!cir.ptr<!cir.vector<2 x !s64i>>, !u32i) -> !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: cir.store align(16) %[[NULL]], %[[OUT_ADDR7_1]] : !cir.vector<2 x !s64i>, !cir.ptr<!cir.vector<2 x !s64i>>
+ // CIR: }
+ // CIR: %[[FLAG1:.+]] = cir.extract_member %[[RESULT]][0] : !rec_anon_struct1 -> !u8i
+ // CIR: cir.store %[[FLAG1]], %[[RET_ADDR:.+]] : !u8i, !cir.ptr<!u8i>
+ // CIR: %[[RET:.+]] = cir.load %[[RET_ADDR]] : !cir.ptr<!u8i>, !u8i
+ // CIR: cir.return %[[RET]] : !u8i
+
+ // LLVM-LABEL: _mm_aesdecwide128kl_u8
+ // LLVM: %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+ // LLVM: %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 1
+ // LLVM: %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+ // LLVM: %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 2
+ // LLVM: %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+ // LLVM: %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 3
+ // LLVM: %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+ // LLVM: %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 4
+ // LLVM: %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+ // LLVM: %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 5
+ // LLVM: %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+ // LLVM: %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 6
+ // LLVM: %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+ // LLVM: %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i64 7
+ // LLVM: %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+ // LLVM: %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(ptr %[[H_ADDR:.+]], <2 x i64> %[[IN_DATA0]], <2 x i64> %[[IN_DATA1]], <2 x i64> %[[IN_DATA2]], <2 x i64> %[[IN_DATA3]], <2 x i64> %[[IN_DATA4]], <2 x i64> %[[IN_DATA5]], <2 x i64> %[[IN_DATA6]], <2 x i64> %[[IN_DATA7]])
+ // LLVM: %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // LLVM: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // LLVM: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // LLVM: [[NO_ERROR]]:
+ // LLVM: %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+ // LLVM: store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+ // LLVM: %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+ // LLVM: %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+ // LLVM: store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+ // LLVM: %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+ // LLVM: %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+ // LLVM: store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+ // LLVM: %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+ // LLVM: %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+ // LLVM: store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+ // LLVM: %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+ // LLVM: %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+ // LLVM: store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+ // LLVM: %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+ // LLVM: %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+ // LLVM: store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+ // LLVM: %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+ // LLVM: %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+ // LLVM: store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+ // LLVM: %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+ // LLVM: %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+ // LLVM: store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+ // LLVM: br label %[[EXIT:.+]]
+ // LLVM: [[ERROR]]:
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // LLVM: %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 1
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+ // LLVM: %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 2
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+ // LLVM: %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 3
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+ // LLVM: %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 4
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+ // LLVM: %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 5
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+ // LLVM: %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 6
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+ // LLVM: %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i64 7
+ // LLVM: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+ // LLVM: br label %[[EXIT]]
+ // LLVM: [[EXIT]]:
+ // LLVM: %[[FLAG1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // LLVM: store i8 %[[FLAG1]], ptr %[[FLAG1_ADDR:.+]], align 1
+ // LLVM: %[[FLAG2:.+]] = load i8, ptr %[[FLAG1_ADDR]], align 1
+ // LLVM: store i8 %[[FLAG2]], ptr %[[FLAG2_ADDR:.+]], align 1
+ // LLVM: %[[RET:.+]] = load i8, ptr %[[FLAG2_ADDR]], align 1
+ // LLVM: ret i8 %[[RET]]
+
+ // OGCG-LABEL: _mm_aesdecwide128kl_u8
+ // OGCG: %[[IN_DATA0:.+]] = load <2 x i64>, ptr %[[IDATA_ADDR:.+]], align 16
+ // OGCG: %[[IN_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 1
+ // OGCG: %[[IN_DATA1:.+]] = load <2 x i64>, ptr %[[IN_ADDR1]], align 16
+ // OGCG: %[[IN_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 2
+ // OGCG: %[[IN_DATA2:.+]] = load <2 x i64>, ptr %[[IN_ADDR2]], align 16
+ // OGCG: %[[IN_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 3
+ // OGCG: %[[IN_DATA3:.+]] = load <2 x i64>, ptr %[[IN_ADDR3]], align 16
+ // OGCG: %[[IN_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 4
+ // OGCG: %[[IN_DATA4:.+]] = load <2 x i64>, ptr %[[IN_ADDR4]], align 16
+ // OGCG: %[[IN_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 5
+ // OGCG: %[[IN_DATA5:.+]] = load <2 x i64>, ptr %[[IN_ADDR5]], align 16
+ // OGCG: %[[IN_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 6
+ // OGCG: %[[IN_DATA6:.+]] = load <2 x i64>, ptr %[[IN_ADDR6]], align 16
+ // OGCG: %[[IN_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[IDATA_ADDR]], i32 7
+ // OGCG: %[[IN_DATA7:.+]] = load <2 x i64>, ptr %[[IN_ADDR7]], align 16
+ // OGCG: %[[RESULT:.+]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(ptr %[[H_ADDR:.+]], <2 x i64> %[[IN_DATA0]], <2 x i64> %[[IN_DATA1]], <2 x i64> %[[IN_DATA2]], <2 x i64> %[[IN_DATA3]], <2 x i64> %[[IN_DATA4]], <2 x i64> %[[IN_DATA5]], <2 x i64> %[[IN_DATA6]], <2 x i64> %[[IN_DATA7]])
+ // OGCG: %[[FLAG:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // OGCG: %[[SUCC:.+]] = trunc i8 %[[FLAG]] to i1
+ // OGCG: br i1 %[[SUCC]], label %[[NO_ERROR:.+]], label %[[ERROR:.+]]
+ // OGCG: [[NO_ERROR]]:
+ // OGCG: %[[OUT_DATA0:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 1
+ // OGCG: store <2 x i64> %[[OUT_DATA0]], ptr %[[ODATA_PTR:.+]], align 16
+ // OGCG: %[[OUT_DATA1:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 2
+ // OGCG: %[[OUT_ADDR1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+ // OGCG: store <2 x i64> %[[OUT_DATA1]], ptr %[[OUT_ADDR1]], align 16
+ // OGCG: %[[OUT_DATA2:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 3
+ // OGCG: %[[OUT_ADDR2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+ // OGCG: store <2 x i64> %[[OUT_DATA2]], ptr %[[OUT_ADDR2]], align 16
+ // OGCG: %[[OUT_DATA3:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 4
+ // OGCG: %[[OUT_ADDR3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+ // OGCG: store <2 x i64> %[[OUT_DATA3]], ptr %[[OUT_ADDR3]], align 16
+ // OGCG: %[[OUT_DATA4:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 5
+ // OGCG: %[[OUT_ADDR4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+ // OGCG: store <2 x i64> %[[OUT_DATA4]], ptr %[[OUT_ADDR4]], align 16
+ // OGCG: %[[OUT_DATA5:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 6
+ // OGCG: %[[OUT_ADDR5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+ // OGCG: store <2 x i64> %[[OUT_DATA5]], ptr %[[OUT_ADDR5]], align 16
+ // OGCG: %[[OUT_DATA6:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 7
+ // OGCG: %[[OUT_ADDR6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+ // OGCG: store <2 x i64> %[[OUT_DATA6]], ptr %[[OUT_ADDR6]], align 16
+ // OGCG: %[[OUT_DATA7:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 8
+ // OGCG: %[[OUT_ADDR7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+ // OGCG: store <2 x i64> %[[OUT_DATA7]], ptr %[[OUT_ADDR7]], align 16
+ // OGCG: br label %[[EXIT:.+]]
+ // OGCG: [[ERROR]]:
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[ODATA_PTR]], align 16
+ // OGCG: %[[OUT_ADDR0_1:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 1
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_1]], align 16
+ // OGCG: %[[OUT_ADDR0_2:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 2
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_2]], align 16
+ // OGCG: %[[OUT_ADDR0_3:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 3
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_3]], align 16
+ // OGCG: %[[OUT_ADDR0_4:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 4
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_4]], align 16
+ // OGCG: %[[OUT_ADDR0_5:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 5
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_5]], align 16
+ // OGCG: %[[OUT_ADDR0_6:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 6
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_6]], align 16
+ // OGCG: %[[OUT_ADDR0_7:.+]] = getelementptr <2 x i64>, ptr %[[ODATA_PTR]], i32 7
+ // OGCG: store <2 x i64> zeroinitializer, ptr %[[OUT_ADDR0_7]], align 16
+ // OGCG: br label %[[EXIT]]
+ // OGCG: [[EXIT]]:
+ // OGCG: %[[RET:.+]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %[[RESULT]], 0
+ // OGCG: ret i8 %[[RET]]
+ return _mm_aesdecwide128kl_u8(odata, idata, h);
+}
More information about the cfe-commits
mailing list