Skip to content

Commit c0c3ac0

Browse files
committed
fold parallelism decision into parallel_set_op
Signed-off-by: Dan Hoeflinger <dan.hoeflinger@intel.com>
1 parent e8beda2 commit c0c3ac0

2 files changed

Lines changed: 60 additions & 77 deletions

File tree

include/oneapi/dpl/pstl/algorithm_impl.h

Lines changed: 48 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -3303,10 +3303,10 @@ inline constexpr auto __set_algo_cut_off = 1000;
33033303
template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _RandomAccessIterator2,
33043304
class _OutputIterator, class _SizeFunction, class _SetOP, class _Compare, class _Proj1, class _Proj2>
33053305
_OutputIterator
3306-
__parallel_set_op(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1,
3307-
_RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2,
3308-
_OutputIterator __result, _SizeFunction __size_func, _SetOP __set_op, _Compare __comp, _Proj1 __proj1,
3309-
_Proj2 __proj2)
3306+
__parallel_set_op_impl(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1,
3307+
_RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2,
3308+
_OutputIterator __result, _SizeFunction __size_func, _SetOP __set_op, _Compare __comp,
3309+
_Proj1 __proj1, _Proj2 __proj2)
33103310
{
33113311
using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag;
33123312

@@ -3399,6 +3399,38 @@ __parallel_set_op(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomA
33993399
});
34003400
}
34013401

3402+
// Thin wrapper over __parallel_set_op_impl that always partitions the larger range.
3403+
// When range2 is larger, it swaps the ranges and projections and wraps __set_op / __size_func
3404+
// so that the leaf operation still sees the caller's original range order. This is
3405+
// required because, when elements are equivalent, the output must take them from the
3406+
// first sequence.
3407+
template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _RandomAccessIterator2,
3408+
class _OutputIterator, class _SizeFunction, class _SetOP, class _Compare, class _Proj1, class _Proj2>
3409+
_OutputIterator
3410+
__parallel_set_op(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1,
3411+
_RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2,
3412+
_OutputIterator __result, _SizeFunction __size_func, _SetOP __set_op, _Compare __comp, _Proj1 __proj1,
3413+
_Proj2 __proj2)
3414+
{
3415+
const auto __n1 = __last1 - __first1;
3416+
const auto __n2 = __last2 - __first2;
3417+
3418+
if (__n1 >= __n2)
3419+
{
3420+
return __parallel_set_op_impl(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2,
3421+
__last2, __result, __size_func, __set_op, __comp, __proj1, __proj2);
3422+
}
3423+
// Partition the larger range2, wrapping callbacks to preserve
3424+
// the caller's original range order for the leaf operation.
3425+
return __parallel_set_op_impl(
3426+
__tag, std::forward<_ExecutionPolicy>(__exec), __first2, __last2, __first1, __last1, __result,
3427+
[&__size_func](auto __n, auto __m) { return __size_func(__m, __n); },
3428+
[&__set_op](auto __f2, auto __l2, auto __f1, auto __l1, auto* __res, auto __comp, auto __p2, auto __p1) {
3429+
return __set_op(__f1, __l1, __f2, __l2, __res, __comp, __p1, __p2);
3430+
},
3431+
__comp, __proj2, __proj1);
3432+
}
3433+
34023434
//a shared parallel pattern for '__pattern_set_union' and '__pattern_set_symmetric_difference'
34033435
template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _RandomAccessIterator2,
34043436
class _OutputIterator, class _SetUnionOp, class _Compare, class _Proj1, class _Proj2>
@@ -3681,41 +3713,18 @@ __pattern_set_intersection(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& _
36813713
const _DifferenceType __total_work = __n1 + __n2;
36823714
if (__total_work > __set_algo_cut_off)
36833715
{
3684-
return __internal::__except_handler([&]() {
3685-
// Decide which range to partition based on size
3686-
if (__n1 >= __n2)
3687-
{
3688-
return __internal::__parallel_set_op(
3689-
__tag, std::forward<_ExecutionPolicy>(__exec), __begin1, __last1, __begin2, __last2, __result,
3690-
[](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
3691-
[](_RandomAccessIterator1 __lmda_first1, _RandomAccessIterator1 __lmda_last1,
3692-
_RandomAccessIterator2 __lmda_first2, _RandomAccessIterator2 __lmda_last2, _T* __result,
3693-
_Compare __comp, oneapi::dpl::identity, oneapi::dpl::identity) {
3694-
return oneapi::dpl::__utils::__set_intersection_construct(
3695-
__lmda_first1, __lmda_last1, __lmda_first2, __lmda_last2, __result,
3696-
oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp,
3697-
oneapi::dpl::identity{}, oneapi::dpl::identity{});
3698-
},
3699-
__comp, oneapi::dpl::identity{}, oneapi::dpl::identity{});
3700-
}
3701-
else
3702-
{
3703-
return __internal::__parallel_set_op(
3704-
__tag, std::forward<_ExecutionPolicy>(__exec), __begin2, __last2, __begin1, __last1, __result,
3705-
[](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
3706-
[](_RandomAccessIterator2 __lmda_first2, _RandomAccessIterator2 __lmda_last2,
3707-
_RandomAccessIterator1 __lmda_first1, _RandomAccessIterator1 __lmda_last1, _T* __result,
3708-
_Compare __comp, oneapi::dpl::identity, oneapi::dpl::identity) {
3709-
// Lambda params: __lmda_first1 = iter of range2, __lmda_first2 = iter of range1
3710-
// Swap to pass logical range1 first for semantic correctness (must copy from first range)
3711-
return oneapi::dpl::__utils::__set_intersection_construct(
3712-
__lmda_first1, __lmda_last1, __lmda_first2, __lmda_last2, __result,
3713-
oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp,
3714-
oneapi::dpl::identity{}, oneapi::dpl::identity{});
3715-
},
3716-
__comp, oneapi::dpl::identity{}, oneapi::dpl::identity{});
3717-
}
3718-
});
3716+
return __internal::__parallel_set_op(
3717+
__tag, std::forward<_ExecutionPolicy>(__exec), __begin1, __last1, __begin2, __last2, __result,
3718+
[](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
3719+
[](_RandomAccessIterator1 __lmda_first1, _RandomAccessIterator1 __lmda_last1,
3720+
_RandomAccessIterator2 __lmda_first2, _RandomAccessIterator2 __lmda_last2, _T* __result, _Compare __comp,
3721+
oneapi::dpl::identity, oneapi::dpl::identity) {
3722+
return oneapi::dpl::__utils::__set_intersection_construct(
3723+
__lmda_first1, __lmda_last1, __lmda_first2, __lmda_last2, __result,
3724+
oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp,
3725+
oneapi::dpl::identity{}, oneapi::dpl::identity{});
3726+
},
3727+
__comp, oneapi::dpl::identity{}, oneapi::dpl::identity{});
37193728
}
37203729

37213730
// Work too small for parallelization - use serial algorithm

include/oneapi/dpl/pstl/algorithm_ranges_impl.h

Lines changed: 12 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -957,44 +957,18 @@ __pattern_set_intersection(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& _
957957
const _DifferenceType __total_work = __n1 + __n2;
958958
if (__total_work > oneapi::dpl::__internal::__set_algo_cut_off)
959959
{
960-
return __internal::__except_handler([&]() {
961-
// Decide which range to partition based on size
962-
if (__n1 >= __n2)
963-
{
964-
auto __out_last = __internal::__parallel_set_op(
965-
__tag, std::forward<_ExecutionPolicy>(__exec), __begin1, __last1, __begin2, __last2, __result,
966-
[](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
967-
[](_RandomAccessIterator1 __lmda_first1, _RandomAccessIterator1 __lmda_last1,
968-
_RandomAccessIterator2 __lmda_first2, _RandomAccessIterator2 __lmda_last2, _T* __result,
969-
_Comp __comp, _Proj1 __proj1, _Proj2 __proj2) {
970-
return oneapi::dpl::__utils::__set_intersection_construct(
971-
__lmda_first1, __lmda_last1, __lmda_first2, __lmda_last2, __result,
972-
oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp, __proj1,
973-
__proj2);
974-
},
975-
__comp, __proj1, __proj2);
976-
return __set_intersection_return_t<_R1, _R2, _OutRange>{__last1, __last2, __out_last};
977-
}
978-
else
979-
{
980-
// Partition the larger full_range2, search into trimmed_range1
981-
auto __out_last = __internal::__parallel_set_op(
982-
__tag, std::forward<_ExecutionPolicy>(__exec), __begin2, __last2, __begin1, __last1, __result,
983-
[](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
984-
[](_RandomAccessIterator2 __lmda_first2, _RandomAccessIterator2 __lmda_last2,
985-
_RandomAccessIterator1 __lmda_first1, _RandomAccessIterator1 __lmda_last1, _T* __result,
986-
_Comp __comp, _Proj2 __proj2, _Proj1 __proj1) {
987-
// Lambda params: __lmda_first2 = iter of range2, __lmda_first1 = iter of range1
988-
// Swap to pass logical range1 first for semantic correctness (must copy from first set)
989-
return oneapi::dpl::__utils::__set_intersection_construct(
990-
__lmda_first1, __lmda_last1, __lmda_first2, __lmda_last2, __result,
991-
oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp, __proj1,
992-
__proj2);
993-
},
994-
__comp, __proj2, __proj1);
995-
return __set_intersection_return_t<_R1, _R2, _OutRange>{__last1, __last2, __out_last};
996-
}
997-
});
960+
auto __out_last = __internal::__parallel_set_op(
961+
__tag, std::forward<_ExecutionPolicy>(__exec), __begin1, __last1, __begin2, __last2, __result,
962+
[](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); },
963+
[](_RandomAccessIterator1 __lmda_first1, _RandomAccessIterator1 __lmda_last1,
964+
_RandomAccessIterator2 __lmda_first2, _RandomAccessIterator2 __lmda_last2, _T* __result, _Comp __comp,
965+
_Proj1 __proj1, _Proj2 __proj2) {
966+
return oneapi::dpl::__utils::__set_intersection_construct(
967+
__lmda_first1, __lmda_last1, __lmda_first2, __lmda_last2, __result,
968+
oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp, __proj1, __proj2);
969+
},
970+
__comp, __proj1, __proj2);
971+
return __set_intersection_return_t<_R1, _R2, _OutRange>{__last1, __last2, __out_last};
998972
}
999973

1000974
// Work too small for parallelization - use serial algorithm

0 commit comments

Comments
 (0)