Skip to content

Commit

Permalink
Implement SIMD load/store between different record dimensions
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Jan 3, 2024
1 parent 61848c4 commit bd95c53
Show file tree
Hide file tree
Showing 3 changed files with 207 additions and 43 deletions.
20 changes: 11 additions & 9 deletions include/llama/RecordRef.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -328,11 +328,11 @@ namespace llama
inline constexpr auto isDirectListInitializableFromTuple<T, Tuple<Args...>>
= isDirectListInitializable<T, Args...>;

template<typename T, typename Simd, typename RecordCoord>
LLAMA_FN_HOST_ACC_INLINE void loadSimdFromField(const T& srcRef, Simd& dstSimd, RecordCoord rc);
template<typename T, typename Simd, typename SrcRC, typename DstRC>
LLAMA_FN_HOST_ACC_INLINE void loadSimdFromField(const T& srcRef, Simd& dstSimd, SrcRC srcRC, DstRC dstRC);

template<typename Simd, typename T, typename RecordCoord>
LLAMA_FN_HOST_ACC_INLINE void storeSimdToField(const Simd& srcSimd, T&& dstRef, RecordCoord rc);
template<typename Simd, typename TFwd, typename SrcRC, typename DstRC>
LLAMA_FN_HOST_ACC_INLINE void storeSimdToField(const Simd& srcSimd, TFwd&& dstRef, SrcRC srcRC, DstRC dstRC);
} // namespace internal

/// Record reference type returned by \ref View after resolving an array dimensions coordinate or partially
Expand Down Expand Up @@ -756,16 +756,18 @@ namespace llama
// FIXME(bgruber): the SIMD load/store functions need to navigate back from a record ref to the contained view
// to find subsequent elements. This is not a great design for now and the SIMD load/store functions should
// probably take iterators to records.
template<typename T, typename Simd, typename RecordCoord>
template<typename T, typename Simd, typename SrcRC, typename DstRC>
friend LLAMA_FN_HOST_ACC_INLINE void internal::loadSimdFromField(
const T& srcRef,
Simd& dstSimd,
RecordCoord rc);
template<typename Simd, typename T, typename RecordCoord>
SrcRC srcRC,
DstRC dstRC);
template<typename Simd, typename TFwd, typename SrcRC, typename DstRC>
friend LLAMA_FN_HOST_ACC_INLINE void internal::storeSimdToField(
const Simd& srcSimd,
T&& dstRef,
RecordCoord rc);
TFwd&& dstRef,
SrcRC srcRC,
DstRC dstRC);
};

// swap for heterogeneous RecordRef
Expand Down
99 changes: 73 additions & 26 deletions include/llama/Simd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,28 +203,27 @@ namespace llama
return indices;
}();

template<typename T, typename Simd, typename RecordCoord>
LLAMA_FN_HOST_ACC_INLINE void loadSimdFromField(const T& srcRef, Simd& dstSimd, RecordCoord rc)
template<typename T, typename Simd, typename SrcRC, typename DstRC>
LLAMA_FN_HOST_ACC_INLINE void loadSimdFromField(const T& srcRef, Simd& dstSimd, SrcRC srcRC, DstRC dstRC)
{
using RecordDim = typename T::AccessibleRecordDim;
using FieldType = GetType<RecordDim, decltype(rc)>;
using ElementSimd = std::decay_t<decltype(dstSimd(rc))>;
using FieldType = GetType<typename T::AccessibleRecordDim, SrcRC>;
using ElementSimd = std::decay_t<decltype(dstSimd(dstRC))>;
using Traits = SimdTraits<ElementSimd>;

auto loadElementWise = [&]
{
auto b = ArrayIndexIterator{srcRef.view.extents(), srcRef.arrayIndex()};
for(std::size_t i = 0; i < Traits::lanes; i++)
reinterpret_cast<FieldType*>(&dstSimd(rc))[i]
= srcRef.view(*b++)(cat(typename T::BoundRecordCoord{}, rc));
reinterpret_cast<FieldType*>(&dstSimd(dstRC))[i]
= srcRef.view(*b++)(cat(typename T::BoundRecordCoord{}, srcRC));
};

// TODO(bgruber): can we generalize the logic whether we can load a dstSimd from that mapping?
using Mapping = typename T::View::Mapping;
if constexpr(mapping::isSoA<Mapping>)
{
LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
dstSimd(rc) = Traits::loadUnaligned(&srcRef(rc));
dstSimd(dstRC) = Traits::loadUnaligned(&srcRef(srcRC));
LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
}
else if constexpr(mapping::isAoSoA<typename T::View::Mapping>)
Expand All @@ -234,7 +233,7 @@ namespace llama
&& T::View::Mapping::lanes >= Traits::lanes)
{
LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
dstSimd(rc) = Traits::loadUnaligned(&srcRef(rc));
dstSimd(dstRC) = Traits::loadUnaligned(&srcRef(srcRC));
LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
}
else
Expand All @@ -243,20 +242,19 @@ namespace llama
else if constexpr(mapping::isAoS<Mapping>)
{
LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
dstSimd(rc) = Traits::gather(&srcRef(rc), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
dstSimd(dstRC) = Traits::gather(&srcRef(srcRC), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
}
else
loadElementWise();
}

template<typename Simd, typename TFwd, typename RecordCoord>
LLAMA_FN_HOST_ACC_INLINE void storeSimdToField(const Simd& srcSimd, TFwd&& dstRef, RecordCoord rc)
template<typename Simd, typename TFwd, typename SrcRC, typename DstRC>
LLAMA_FN_HOST_ACC_INLINE void storeSimdToField(const Simd& srcSimd, TFwd&& dstRef, SrcRC srcRC, DstRC dstRC)
{
using T = std::remove_reference_t<TFwd>;
using RecordDim = typename T::AccessibleRecordDim;
using FieldType = GetType<RecordDim, decltype(rc)>;
using ElementSimd = std::decay_t<decltype(srcSimd(rc))>;
using FieldType = GetType<typename T::AccessibleRecordDim, DstRC>;
using ElementSimd = std::decay_t<decltype(srcSimd(srcRC))>;
using Traits = SimdTraits<ElementSimd>;

auto storeElementWise = [&]
Expand All @@ -265,16 +263,16 @@ namespace llama
// direction should we collect SIMD values?
auto b = ArrayIndexIterator{dstRef.view.extents(), dstRef.arrayIndex()};
for(std::size_t i = 0; i < Traits::lanes; i++)
dstRef.view (*b++)(cat(typename T::BoundRecordCoord{}, rc))
= reinterpret_cast<const FieldType*>(&srcSimd(rc))[i];
dstRef.view (*b++)(cat(typename T::BoundRecordCoord{}, dstRC))
= reinterpret_cast<const FieldType*>(&srcSimd(srcRC))[i];
};

// TODO(bgruber): can we generalize the logic whether we can store a srcSimd to that mapping?
using Mapping = typename std::remove_reference_t<T>::View::Mapping;
if constexpr(mapping::isSoA<Mapping>)
{
LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
Traits::storeUnaligned(srcSimd(rc), &dstRef(rc));
Traits::storeUnaligned(srcSimd(srcRC), &dstRef(dstRC));
LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
}
else if constexpr(mapping::isAoSoA<typename T::View::Mapping>)
Expand All @@ -284,7 +282,7 @@ namespace llama
&& T::View::Mapping::lanes >= Traits::lanes)
{
LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
Traits::storeUnaligned(srcSimd(rc), &dstRef(rc));
Traits::storeUnaligned(srcSimd(srcRC), &dstRef(dstRC));
LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
}
else
Expand All @@ -293,7 +291,7 @@ namespace llama
else if constexpr(mapping::isAoS<Mapping>)
{
LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
Traits::scatter(srcSimd(rc), &dstRef(rc), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
Traits::scatter(srcSimd(srcRC), &dstRef(dstRC), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
}
else
Expand All @@ -315,8 +313,32 @@ namespace llama
if constexpr(simdLanes<Simd> == simdLanes<T>) // fast path mainly for scalar SimdN<T, 1, ...>
dstSimd = srcRef;
else
forEachLeafCoord<typename Simd::AccessibleRecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE { internal::loadSimdFromField(srcRef, dstSimd, rc); });
{
using SrcARD = typename T::AccessibleRecordDim;
using DstArd = typename Simd::AccessibleRecordDim;
if constexpr(std::is_same_v<SrcARD, DstArd>)
{
forEachLeafCoord<SrcARD>([&](auto rc) LLAMA_LAMBDA_INLINE
{ internal::loadSimdFromField(srcRef, dstSimd, rc, rc); });
}
else
{
forEachLeafCoord<SrcARD>(
[&](auto srcRC) LLAMA_LAMBDA_INLINE
{
using SrcInnerCoord = decltype(srcRC);
forEachLeafCoord<DstArd>(
[&](auto dstRC) LLAMA_LAMBDA_INLINE
{
using DstInnerCoord = decltype(dstRC);
if constexpr(hasSameTags<SrcARD, SrcInnerCoord, DstArd, DstInnerCoord>)
{
internal::loadSimdFromField(srcRef, dstSimd, srcRC, dstRC);
}
});
});
}
}
}
// unstructured dstSimd and reference type
else if constexpr(!isRecordRef<Simd> && !isRecordRef<T>)
Expand All @@ -337,17 +359,42 @@ namespace llama
/// SIMD vector will be stored for each of the fields. The number of elements stored per SIMD vector depends on the
/// SIMD width of the vector. Simd is allowed to have different vector lengths per element.
LLAMA_EXPORT
template<typename Simd, typename T>
LLAMA_FN_HOST_ACC_INLINE void storeSimd(const Simd& srcSimd, T&& dstRef)
template<typename Simd, typename TFwd>
LLAMA_FN_HOST_ACC_INLINE void storeSimd(const Simd& srcSimd, TFwd&& dstRef)
{
using T = std::decay_t<TFwd>;
// structured Simd type and record reference
if constexpr(isRecordRef<Simd> && isRecordRef<T>)
{
if constexpr(simdLanes<Simd> == simdLanes<T>) // fast path mainly for scalar SimdN<T, 1, ...>
dstRef = srcSimd;
else
forEachLeafCoord<typename T::AccessibleRecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE { internal::storeSimdToField(srcSimd, dstRef, rc); });
{
using SrcARD = typename Simd::AccessibleRecordDim;
using DstArd = typename T::AccessibleRecordDim;
if constexpr(std::is_same_v<SrcARD, DstArd>)
{
forEachLeafCoord<SrcARD>([&](auto rc) LLAMA_LAMBDA_INLINE
{ internal::storeSimdToField(srcSimd, dstRef, rc, rc); });
}
else
{
forEachLeafCoord<SrcARD>(
[&](auto srcRC) LLAMA_LAMBDA_INLINE
{
using SrcInnerCoord = decltype(srcRC);
forEachLeafCoord<DstArd>(
[&](auto dstRC) LLAMA_LAMBDA_INLINE
{
using DstInnerCoord = decltype(dstRC);
if constexpr(hasSameTags<SrcARD, SrcInnerCoord, DstArd, DstInnerCoord>)
{
internal::storeSimdToField(srcSimd, dstRef, srcRC, dstRC);
}
});
});
}
}
}
// unstructured srcSimd and reference type
else if constexpr(!isRecordRef<Simd> && !isRecordRef<T>)
Expand Down
131 changes: 123 additions & 8 deletions tests/simd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,13 @@ TEMPLATE_TEST_CASE(
CHECK(p(tag::Flags{}, llama::RecordCoord<1>{}) == 8);
CHECK(p(tag::Flags{}, llama::RecordCoord<2>{}) == 9);
CHECK(p(tag::Flags{}, llama::RecordCoord<3>{}) == 10);

llama::SimdN<Vec3D, 1, stdx::fixed_size_simd> v;
llama::loadSimd(view(0)(tag::Pos{}), v);

CHECK(v(tag::X{}) == 0);
CHECK(v(tag::Y{}) == 1);
CHECK(v(tag::Z{}) == 2);
}

TEMPLATE_TEST_CASE(
Expand Down Expand Up @@ -292,6 +299,74 @@ TEMPLATE_TEST_CASE(
CHECK(
SimdRange{p(tag::Flags{}, llama::RecordCoord<3>{})}
== SimdRange{stdx::fixed_size_simd<std::uint8_t, 4>{[](auto ic) -> std::uint8_t { return 10 + ic * 11; }}});

llama::SimdN<Vec3D, 4, stdx::fixed_size_simd> v;
llama::loadSimd(view(0)(tag::Pos{}), v);

CHECK(SimdRange{v(tag::X{})} == SimdRange{stdx::fixed_size_simd<double, 4>{[](auto ic) {
return 0.0 + ic * 11.0;
}}});
CHECK(SimdRange{v(tag::Y{})} == SimdRange{stdx::fixed_size_simd<double, 4>{[](auto ic) {
return 1.0 + ic * 11.0;
}}});
CHECK(SimdRange{v(tag::Z{})} == SimdRange{stdx::fixed_size_simd<double, 4>{[](auto ic) {
return 2.0 + ic * 11.0;
}}});
}

// Record dimensions used by the heterogeneous load/store test below.
// Vec2I (fields X and Y) is the record dimension of the view; Vec1I (field Y only) is a strict subset of it.
using Vec2I = llama::Record<llama::Field<tag::X, int>, llama::Field<tag::Y, int>>;
using Vec1I = llama::Record<llama::Field<tag::Y, int>>;

// Exercises loadSimd/storeSimd when the SIMD record's dimension differs from the view's record dimension.
// The test is instantiated once per listed mapping (AoS, SoA, and AoSoA with 2 and 32 lanes).
// Fields are expected to be transferred only where the tags of source and destination match;
// fields without a counterpart on the other side are expected to stay untouched.
TEMPLATE_TEST_CASE(
"simd.heterogeneousLoadStore.stdsimd",
"",
llama::mapping::BindAoS<>,
llama::mapping::BindSoA<>,
llama::mapping::BindAoSoA<2>,
llama::mapping::BindAoSoA<32>)
{
// One-dimensional view with two Vec2I records.
using ArrayExtents = llama::ArrayExtentsDynamic<int, 1>;
const auto mapping = typename TestType::template fn<ArrayExtents, Vec2I>(ArrayExtents{2});
auto view = llama::allocViewUninitialized(mapping);
// NOTE(review): iotaFillView is defined elsewhere; the expectations below imply it assigns consecutive
// integers field by field, record by record (record 0: X=0, Y=1; record 1: X=2, Y=3) - confirm.
iotaFillView(view);

// The SIMD record has more fields than the view's records.
// NOTE(review): Vec3I is defined elsewhere; presumably X, Y, Z of type int, since all three tags are used.
SECTION("BiggerSimdRecord")
{
// Value-initialized so that fields not written by loadSimd have a known (zero) value.
llama::SimdN<Vec3I, 2, stdx::fixed_size_simd> v{};
llama::loadSimd(view(0), v);
CHECK(SimdRange{v(tag::X{})} == SimdRange{stdx::fixed_size_simd<int, 2>{[](auto ic) {
return 0 + static_cast<int>(ic) * 2;
}}});
CHECK(SimdRange{v(tag::Y{})} == SimdRange{stdx::fixed_size_simd<int, 2>{[](auto ic) {
return 1 + static_cast<int>(ic) * 2;
}}});
// Z has no matching field in Vec2I, so it must still hold its value-initialized state.
CHECK(SimdRange{v(tag::Z{})} == SimdRange{stdx::fixed_size_simd<int, 2>{}});

// Overwrite all three SIMD fields with distinct values, then store back into the view.
v(tag::X{}) = stdx::fixed_size_simd<int, 2>{[](auto ic) { return static_cast<int>(ic) + 100; }};
v(tag::Y{}) = stdx::fixed_size_simd<int, 2>{[](auto ic) { return static_cast<int>(ic) + 200; }};
v(tag::Z{}) = stdx::fixed_size_simd<int, 2>{[](auto ic) { return static_cast<int>(ic) + 300; }};
llama::storeSimd(v, view(0));
// Only X and Y exist in the view; lane i of each SIMD field lands in record i.
CHECK(view(0)(tag::X{}) == 100);
CHECK(view(1)(tag::X{}) == 101);
CHECK(view(0)(tag::Y{}) == 200);
CHECK(view(1)(tag::Y{}) == 201);
}

// The SIMD record has fewer fields than the view's records.
SECTION("SmallerSimdRecord")
{
llama::SimdN<Vec1I, 2, stdx::fixed_size_simd> v{};
llama::loadSimd(view(0), v);
// Only Y is loaded: the Y values of records 0 and 1.
CHECK(SimdRange{v(tag::Y{})} == SimdRange{stdx::fixed_size_simd<int, 2>{[](auto ic) {
return 1 + static_cast<int>(ic) * 2;
}}});

v(tag::Y{}) = stdx::fixed_size_simd<int, 2>{[](auto ic) { return static_cast<int>(ic) + 1000; }};
llama::storeSimd(v, view(0));
// X is absent from the SIMD record, so the view's X values must keep their initial fill values.
CHECK(view(0)(tag::X{}) == 0);
CHECK(view(1)(tag::X{}) == 2);
CHECK(view(0)(tag::Y{}) == 1000);
CHECK(view(1)(tag::Y{}) == 1001);
}
}

TEST_CASE("simd.storeSimd.scalar")
Expand Down Expand Up @@ -352,6 +427,24 @@ TEMPLATE_TEST_CASE(
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<1>{}) == 8);
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<2>{}) == 9);
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<3>{}) == 10);

llama::SimdN<Vec3D, 1, stdx::fixed_size_simd> v;
v(tag::X{}) = 100;
v(tag::Y{}) = 101;
v(tag::Z{}) = 102;
llama::storeSimd(v, view(0)(tag::Vel{}));

CHECK(view(0)(tag::Pos{}, tag::X{}) == 0);
CHECK(view(0)(tag::Pos{}, tag::Y{}) == 1);
CHECK(view(0)(tag::Pos{}, tag::Z{}) == 2);
CHECK(view(0)(tag::Mass{}) == 3);
CHECK(view(0)(tag::Vel{}, tag::X{}) == 100);
CHECK(view(0)(tag::Vel{}, tag::Y{}) == 101);
CHECK(view(0)(tag::Vel{}, tag::Z{}) == 102);
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<0>{}) == 7);
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<1>{}) == 8);
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<2>{}) == 9);
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<3>{}) == 10);
}

TEMPLATE_TEST_CASE(
Expand All @@ -367,14 +460,16 @@ TEMPLATE_TEST_CASE(
auto view = llama::allocViewUninitialized(mapping);

llama::SimdN<ParticleSimd, 3, stdx::fixed_size_simd> p;
auto& x = p(tag::Pos{}, tag::X{});
auto& y = p(tag::Pos{}, tag::Y{});
auto& z = p(tag::Pos{}, tag::Z{});
auto& m = p(tag::Mass{});
x[0] = 1, x[1] = 2, x[2] = 3;
y[0] = 4, y[1] = 5, y[2] = 6;
z[0] = 7, z[1] = 8, z[2] = 9;
m[0] = 80, m[1] = 81, m[2] = 82;
{
auto& x = p(tag::Pos{}, tag::X{});
auto& y = p(tag::Pos{}, tag::Y{});
auto& z = p(tag::Pos{}, tag::Z{});
auto& m = p(tag::Mass{});
x[0] = 1, x[1] = 2, x[2] = 3;
y[0] = 4, y[1] = 5, y[2] = 6;
z[0] = 7, z[1] = 8, z[2] = 9;
m[0] = 80, m[1] = 81, m[2] = 82;
}
llama::storeSimd(p, view(0));

CHECK(view(0)(tag::Pos{}, tag::X{}) == 1);
Expand All @@ -393,6 +488,26 @@ TEMPLATE_TEST_CASE(
CHECK(view(1)(tag::Mass{}) == 81);
CHECK(view(2)(tag::Mass{}) == 82);
CHECK(view(3)(tag::Mass{}) == 0);

llama::SimdN<Vec3D, 3, stdx::fixed_size_simd> v;
{
auto& x = v(tag::X{});
auto& y = v(tag::Y{});
auto& z = v(tag::Z{});
x[0] = 101, x[1] = 102, x[2] = 103;
y[0] = 104, y[1] = 105, y[2] = 106;
z[0] = 107, z[1] = 108, z[2] = 109;
}
llama::storeSimd(v, view(0)(tag::Pos{}));
CHECK(view(0)(tag::Pos{}, tag::X{}) == 101);
CHECK(view(1)(tag::Pos{}, tag::X{}) == 102);
CHECK(view(2)(tag::Pos{}, tag::X{}) == 103);
CHECK(view(0)(tag::Pos{}, tag::Y{}) == 104);
CHECK(view(1)(tag::Pos{}, tag::Y{}) == 105);
CHECK(view(2)(tag::Pos{}, tag::Y{}) == 106);
CHECK(view(0)(tag::Pos{}, tag::Z{}) == 107);
CHECK(view(1)(tag::Pos{}, tag::Z{}) == 108);
CHECK(view(2)(tag::Pos{}, tag::Z{}) == 109);
}

TEMPLATE_TEST_CASE(
Expand Down

0 comments on commit bd95c53

Please sign in to comment.