diff --git a/CMakeLists.txt b/CMakeLists.txt index 2acfd08051c..5e4f4bfedb6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) option(ONEDPL_ENABLE_SIMD "Enable SIMD vectorization by passing an OpenMP SIMD flag to the compiler if supported" ON) option(ONEDPL_CMAKE_QUIET_CHECKS "Silence output from compiler/header checks during configuration" ON) +option(ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT "Compatibility mode for result of oneapi::dpl::ranges::set_difference with C++23" OFF) cmake_dependent_option(ONEDPL_TEST_WIN_ICX_FIXES "Enable icx workarounds for Windows" ON "CMAKE_HOST_WIN32;NOT _onedpl_is_subproject" OFF) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/include/oneapi/dpl/internal/version_impl.h @@ -151,6 +152,9 @@ endif() add_library(oneDPL INTERFACE) target_compile_features(oneDPL INTERFACE cxx_std_17) target_compile_definitions(oneDPL INTERFACE $<$:PSTL_USE_DEBUG=1>) +target_compile_definitions(oneDPL INTERFACE + $<$:ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT=1> +) if (CMAKE_BUILD_TYPE) message(STATUS "Build type is ${CMAKE_BUILD_TYPE}") diff --git a/include/oneapi/dpl/internal/common_config.h b/include/oneapi/dpl/internal/common_config.h index 0863e815972..43b22d5a6e3 100644 --- a/include/oneapi/dpl/internal/common_config.h +++ b/include/oneapi/dpl/internal/common_config.h @@ -68,4 +68,8 @@ # endif #endif // __cplusplus >= 201703L +#ifndef ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT +# define ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT 0 +#endif + #endif // _ONEDPL_COMMON_CONFIG_H diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 44482f0f49c..8062b9fc634 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -23,6 +23,8 @@ #include #include #include +#include // for std::array +#include // for std::optional #include "algorithm_fwd.h" @@ -3281,136 +3283,828 @@ 
// Describes one chunk of set-operation output: where the chunk is stored in the temporary
// (windowed) buffer and where it has to be placed in the result range.
//
//                                  [__buf_pos ........ __buf_pos + __len)
// Temporary windowed buffer:   TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
//                                  ^                   ^
//                                  +<-(__buf_pos)      +<-(__buf_pos + __len)
//                                  |                   |
//                               +--+                +--+
//                               |                   |
//                               |<-(__pos)          |<-(__pos + __len)
//                               V                   V
// Result buffer:        OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
template <typename _DifferenceType>
struct _DataPart
{
    _DifferenceType __pos{};     // Offset in output range w/o limitation to output data size
    _DifferenceType __len{};     // The length of data pack: the same for windowed and result buffers
    _DifferenceType __buf_pos{}; // Offset in temporary buffer w/o limitation to output data size

    // True when the part carries no data (__len == 0).
    constexpr bool
    empty() const
    {
        return __len == 0;
    }

    // True when __b starts further in the temporary buffer than __a, or starts at the same
    // buffer offset and is not empty (an empty part never wins a tie).
    static constexpr bool
    is_left(const _DataPart& __a, const _DataPart& __b)
    {
        return __b.__buf_pos > __a.__buf_pos || (__b.__buf_pos == __a.__buf_pos && !__b.empty());
    }

    // Combines two parts into one scan value.
    // The result keeps __len and __buf_pos of the part selected by is_left(__a, __b) (__b when it
    // returns true, __a otherwise); its __pos is the other part's (__pos + __len) plus the
    // selected part's own __pos.
    static constexpr _DataPart
    combine_with(const _DataPart& __a, const _DataPart& __b)
    {
        return is_left(__a, __b) ? _DataPart{__a.__pos + __a.__len + __b.__pos, __b.__len, __b.__buf_pos}
                                 : _DataPart{__b.__pos + __b.__len + __a.__pos, __a.__len, __a.__buf_pos};
    }

    // Checks whether the last valid result index for an output of size __n_out falls inside
    // this part, i.e. inside [__pos, __pos + __len).
    //
    // Result buffer:  [.......................)...............[......X......)...............
    //                 ^                       ^               ^      ^      ^
    //                 |                       |               |      |      |
    //             (a).__pos        (a).__pos + (a).__len  (b).__pos  |  (b).__pos + (b).__len
    //                                                                |
    //                                                           __n_out_idx
    //
    // In the picture part (b) reports true and part (a) reports false.
    constexpr bool
    is_output_size_reached(_DifferenceType __n_out) const
    {
        // max(..., 1) - 1 maps both __n_out == 0 and __n_out == 1 to index 0 (zero output size case)
        const _DifferenceType __n_out_idx = std::max(__n_out, _DifferenceType{1}) - 1;

        return __pos <= __n_out_idx && __n_out_idx < __pos + __len;
    }
};
bounded set operations"); + return std::get<_SrcOffsetsIndex>(__data); + } + + const _SrcProcessedDataAmount<_DifferenceType1, _DifferenceType2>& + get_src_processed_data_amount_part() const + { + static_assert(__Bounded, "Source data processed amount part is available only for bounded set operations"); + return std::get<_SrcProcessedDataIndex>(__data); + } + + static _SetRangeImpl + combine_with(const _SetRangeImpl& __a, const _SetRangeImpl& __b) + { + auto __new_data_part = _DataPart<_DifferenceType>::combine_with(__a.get_data_part(), __b.get_data_part()); + + if constexpr (!__Bounded) + { + return _SetRangeImpl{__new_data_part}; + } + else + { + typename _SetRangeImpl::_DataStorage __ds{ + __new_data_part, + _DataPart<_DifferenceType>::is_left(__a.get_data_part(), __b.get_data_part()) + ? __b.get_src_offsets_part() + : __a.get_src_offsets_part(), + _SrcProcessedDataAmount<_DifferenceType1, _DifferenceType2>::combine_with( + __a.get_src_processed_data_amount_part(), __b.get_src_processed_data_amount_part())}; + return _SetRangeImpl{__ds}; + } + } +}; + +struct _ParallelSetOpCombinePred +{ + template + _SetRangeImpl<__Bounded, _DifferenceType1, _DifferenceType2, _DifferenceTypeOut, _DifferenceTypeMask> + operator()(const _SetRangeImpl<__Bounded, _DifferenceType1, _DifferenceType2, _DifferenceTypeOut, + _DifferenceTypeMask>& __a, + const _SetRangeImpl<__Bounded, _DifferenceType1, _DifferenceType2, _DifferenceTypeOut, + _DifferenceTypeMask>& __b) const + { + return _SetRangeImpl<__Bounded, _DifferenceType1, _DifferenceType2, _DifferenceTypeOut, + _DifferenceTypeMask>::combine_with(__a, __b); + } +}; + +template +using __parallel_set_op_return_t = + oneapi::dpl::__utils::__set_operations_result<_RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator>; + +template +struct _SetOpReachedPosEvaluator +{ using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; using _DifferenceType2 = typename 
std::iterator_traits<_RandomAccessIterator2>::difference_type; - using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2>; - using _T = typename std::iterator_traits<_OutputIterator>::value_type; + using _DifferenceTypeOut = typename std::iterator_traits<_OutputIterator>::difference_type; + + using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2, _DifferenceTypeOut>; + + using _SetOpReachedPosEvaluatorData = std::tuple<_DifferenceType1, _DifferenceType2, _DifferenceTypeOut>; + + _SetOpReachedPosEvaluator(__parallel_tag<_IsVector> __tag, _ExecutionPolicy& __exec, + _RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, + _OutputIterator __result1, _OutputIterator __result2, _Compare __comp, _Proj1 __proj1, + _Proj2 __proj2, _SetUnionOp __set_union_op, _SizeFunction __size_func, + _MaskSizeFunction __mask_size_func) + : __tag(__tag), __exec(__exec), __first1(__first1), __last1(__last1), __first2(__first2), __last2(__last2), + __result1(__result1), __result2(__result2), __comp(__comp), __proj1(__proj1), __proj2(__proj2), + __set_union_op(__set_union_op), __size_func(__size_func), __mask_size_func(__mask_size_func), + __n_out(__result2 - __result1) + { + } + + void + __on_output_size_reached(std::size_t __offset_from_n_out, const _DataPart<_DifferenceType>& __data_part, + const _SrcDataProcessingOffsets<_DifferenceType1, _DifferenceType2>& __source_data_offsets) + { + assert(__offset_from_n_out < 2); + + __output_size_reached_info_opt[__offset_from_n_out] = OutputSizeReachedInfo{__data_part, __source_data_offsets}; + + // Reset reached positions in the output and input ranges due to they will be evaluated based on the information about output size reached point + __res_data_opt.reset(); + } + + void + __on_apex(const _SetRange& __total) + { + __apex_total = __total; + + // Reset reached positions in the output and input ranges due to they will be 
evaluated based on the information about output size reached point + __res_data_opt.reset(); + } + + // Get evaluated reached positions for the source ranges and output range + _SetOpReachedPosEvaluatorData + __get_reached_positions() + { + if (!__res_data_opt.has_value()) + { + const _DataPart<_DifferenceType>& __apex_total_data_part = __apex_total.get_data_part(); + + const std::pair<_DifferenceType1, _DifferenceType2> __input_reached_positions = + __eval_reached_input_positions(); + + __res_data_opt.emplace(__input_reached_positions.first, __input_reached_positions.second, + std::min(__apex_total_data_part.__pos + __apex_total_data_part.__len, __n_out)); + } - struct _SetRange + return __res_data_opt.value(); + } + + protected: + struct _NoopConstruct { - _DifferenceType __pos, __len, __buf_pos; - bool - empty() const + template + std::nullptr_t + operator()(_ForwardIterator, _ForwardIterator, std::nullptr_t) { - return __len == 0; + return nullptr; } }; - const _DifferenceType1 __n1 = __last1 - __first1; - const _DifferenceType2 __n2 = __last2 - __first2; + template + _DifferenceTypeArg + __eval_reached_pos(oneapi::dpl::__utils::__parallel_set_op_mask* __mask_buffer_begin, + oneapi::dpl::__utils::__parallel_set_op_mask* __mask_buffer_end, + oneapi::dpl::__utils::__parallel_set_op_mask __dest_data_mask_state, _DifferenceTypeOut __pos_no, + _DifferenceTypeArg __reached_pos) const + { + assert(__dest_data_mask_state == oneapi::dpl::__utils::__parallel_set_op_mask::eData1 || + __dest_data_mask_state == oneapi::dpl::__utils::__parallel_set_op_mask::eData2); - __par_backend::__buffer<_T> __buf(__size_func(__n1, __n2)); - - return __internal::__except_handler([&__exec, __n1, __first1, __last1, __first2, __last2, __result, __size_func, - __set_op, &__buf, __comp, __proj1, __proj2]() { - auto __tmp_memory = __buf.get(); - _DifferenceType1 __m{}; - auto __scan = [=](_DifferenceType1, _DifferenceType1, const _SetRange& __s) { // Scan - if (!__s.empty()) - 
__brick_move_destroy<__parallel_tag<_IsVector>>{}(__tmp_memory + __s.__buf_pos, - __tmp_memory + (__s.__buf_pos + __s.__len), - __result + __s.__pos, _IsVector{}); - }; - __par_backend::__parallel_strict_scan( - __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n1, _SetRange{0, 0, 0}, - [=](_DifferenceType1 __i, _DifferenceType1 __len) { // Reduce - //[__b; __e) - a subrange of the first sequence, to reduce - _RandomAccessIterator1 __b = __first1 + __i; - _RandomAccessIterator1 __e = __first1 + (__i + __len); - - //try searching for the first element which not equal to *__b - if (__b != __first1) - __b += __internal::__pstl_upper_bound(__b, _DifferenceType1{0}, __last1 - __b, __b, __comp, __proj1, __proj1); - - //try searching for the first element which not equal to *__e - if (__e != __last1) - __e += __internal::__pstl_upper_bound(__e, _DifferenceType1{0}, __last1 - __e, __e, __comp, __proj1, __proj1); - - //check is [__b; __e) empty - if (__e - __b < 1) - { - _RandomAccessIterator2 __bb = __last2; - if (__b != __last1) - __bb = __first2 + __internal::__pstl_lower_bound(__first2, _DifferenceType2{0}, __last2 - __first2, - __b, __comp, __proj2, __proj1); + auto __mask_buffer_it = __mask_buffer_begin; - const _DifferenceType __buf_pos = __size_func((__b - __first1), (__bb - __first2)); - return _SetRange{0, 0, __buf_pos}; - } + for (; __mask_buffer_it != __mask_buffer_end && __pos_no < __n_out; ++__mask_buffer_it) + { + auto __state = *__mask_buffer_it; - //try searching for "corresponding" subrange [__bb; __ee) in the second sequence - _RandomAccessIterator2 __bb = __first2; - if (__b != __first1) - __bb = __first2 + __internal::__pstl_lower_bound(__first2, _DifferenceType2{0}, __last2 - __first2, - __b, __comp, __proj2, __proj1); + __pos_no += __test_mask(oneapi::dpl::__utils::__parallel_set_op_mask::eDataOut, __state) ? 1 : 0; + __reached_pos += __test_mask(__dest_data_mask_state, __state) ? 
1 : 0; + } - _RandomAccessIterator2 __ee = __last2; - if (__e != __last1) - __ee = __bb + __internal::__pstl_lower_bound(__bb, _DifferenceType2{0}, __last2 - __bb, __e, __comp, - __proj2, __proj1); + // 2. Pass positions which not generates output + for (; __mask_buffer_it != __mask_buffer_end; ++__mask_buffer_it) + { + auto __state = *__mask_buffer_it; - const _DifferenceType __buf_pos = __size_func((__b - __first1), (__bb - __first2)); - auto __buffer_b = __tmp_memory + __buf_pos; - auto __res = __set_op(__b, __e, __bb, __ee, __buffer_b, __comp, __proj1, __proj2); + // Breaks if we detected mask which describes output data generation from specified data set + if (__test_mask(oneapi::dpl::__utils::__parallel_set_op_mask::eDataOut, __state)) + break; - return _SetRange{0, __res - __buffer_b, __buf_pos}; - }, - [](const _SetRange& __a, const _SetRange& __b) { // Combine - if (__b.__buf_pos > __a.__buf_pos || ((__b.__buf_pos == __a.__buf_pos) && !__b.empty())) - return _SetRange{__a.__pos + __a.__len + __b.__pos, __b.__len, __b.__buf_pos}; - return _SetRange{__b.__pos + __b.__len + __a.__pos, __a.__len, __a.__buf_pos}; - }, - __scan, // Scan - [&__m, &__scan](const _SetRange& __total) { // Apex - //final scan - __scan(0, 0, __total); - __m = __total.__pos + __total.__len; - }); - return __result + __m; + __reached_pos += __test_mask(__dest_data_mask_state, __state) ? 
1 : 0; + } + + return __reached_pos; + } + + template + const _SrcDataProcessingOffset>& + __get_source_data_offset_part( + const _SrcDataProcessingOffsets<_DifferenceType1, _DifferenceType2>& __src_offsets_part) const + { + if constexpr (_IsFirstRange) + return __src_offsets_part.__in1; + else + return __src_offsets_part.__in2; + } + + template ::difference_type> + std::pair<_DifferenceType, _DifferenceType> + __eval_offset_and_size(_RandomAccessIterator __first, _RandomAccessIterator __last) const + { + _DifferenceType __offset = 0; + _DifferenceType __length = __last - __first; + + assert(__output_size_reached_info_opt[0].has_value()); + + const auto& __offset_part_n0 = + __get_source_data_offset_part<_IsFirstRange>(__output_size_reached_info_opt[0].value().__src_offsets_part); + __offset = __offset_part_n0.__offset; + __length = __offset_part_n0.__length; + assert(__offset + __length <= __last - __first); + + if (__output_size_reached_info_opt[1].has_value()) + { + const auto& __offset_part_n1 = __get_source_data_offset_part<_IsFirstRange>( + __output_size_reached_info_opt[1].value().__src_offsets_part); + _DifferenceType __offset_n1 = __offset_part_n1.__offset; + _DifferenceType __length_n1 = __offset_part_n1.__length; + + if (__offset_n1 + __length_n1 > __offset + __length) + { + // Process till the end of the second data part + __length = __offset_n1 + __length_n1 - __offset; + } + } + + return {__offset, __length}; + } + + std::pair<_DifferenceType1, _DifferenceType2> + __eval_reached_input_positions() const + { + if constexpr (!__Bounded) + { + // In not bounded set operation we don't have real output size reached point, + // so just return the amounts of processed data in input ranges which are equal to input ranges sizes + return {__last1 - __first1, __last2 - __first2}; + } + else + { + // In bounded set operation when we don't reached output size limit, we can process all data in input ranges, + // so return the amounts of processed data in input 
ranges which are equal to input ranges sizes + if (!__output_size_reached_info_opt[0].has_value()) + { + const _SrcProcessedDataAmount<_DifferenceType1, _DifferenceType2>& __src_processed = + __apex_total.get_src_processed_data_amount_part(); + return {__src_processed.__length1, __src_processed.__length2}; + } + + // Create & fill buffer with mask + const auto [__offset1, __size1] = __eval_offset_and_size(__first1, __last1); + const auto [__offset2, __size2] = __eval_offset_and_size(__first2, __last2); + + const auto __mask_buf_size = __mask_size_func(__size1, __size2); + + // We need to have initialized memory under mask buffer + std::vector __mask_bufs( + __mask_buf_size, oneapi::dpl::__utils::__parallel_set_op_mask::eNone); + + auto [__first1_tmp_reached, __first2_tmp_reached, __output_discard_it_reached, __mask_buffer_reached] = + __set_union_op( + __first1 + __offset1, __first1 + __offset1 + __size1, // First input range bounds + __first2 + __offset2, __first2 + __offset2 + __size2, // Second input range bounds + oneapi::dpl::__utils::_SetOpDiscardIterator{}, // No real output buffer, so using discard iterator + __comp, __proj1, __proj2, __mask_bufs.data()); + assert(__mask_buffer_reached - __mask_bufs.data() <= static_cast(__mask_bufs.size())); + + //////////////////////////////////////////////////////////// + // Process data based on buffer with mask + + assert(__output_size_reached_info_opt[0].has_value()); + const OutputSizeReachedInfo& __ri_n0 = __output_size_reached_info_opt[0].value(); + + using __backend_tag = typename decltype(__tag)::__backend_tag; + + // Calculate reached positions based on mask buffer + _DifferenceType1 __res_reachedPos1 = {}; + _DifferenceType2 __res_reachedPos2 = {}; + __par_backend::__parallel_invoke( + __backend_tag{}, __exec, + [&]() { + __res_reachedPos1 = __eval_reached_pos( + __mask_bufs.data(), __mask_buffer_reached, oneapi::dpl::__utils::__parallel_set_op_mask::eData1, + __ri_n0.__data_part.__pos, 
__ri_n0.__src_offsets_part.__in1.__offset); + }, + [&]() { + __res_reachedPos2 = __eval_reached_pos( + __mask_bufs.data(), __mask_buffer_reached, oneapi::dpl::__utils::__parallel_set_op_mask::eData2, + __ri_n0.__data_part.__pos, __ri_n0.__src_offsets_part.__in2.__offset); + }); + + return {__res_reachedPos1, __res_reachedPos2}; + } + } + + bool + __test_mask(oneapi::dpl::__utils::__parallel_set_op_mask __checking_mask_state, + oneapi::dpl::__utils::__parallel_set_op_mask __real_mask_state) const noexcept + { + using _UT = std::underlying_type_t; + + const _UT __state_value = static_cast<_UT>(__real_mask_state); + + // The zero state is incorrect mask state! + assert(__state_value != 0); + + // Check correct memory state + [[maybe_unused]] constexpr _UT __valid_bits = + static_cast<_UT>(oneapi::dpl::__utils::__parallel_set_op_mask::eBothOut); + assert((__state_value & (~__valid_bits)) == 0); + + return (__state_value & static_cast<_UT>(__checking_mask_state)) == static_cast<_UT>(__checking_mask_state); + } + + protected: + __parallel_tag<_IsVector> __tag; + _ExecutionPolicy& __exec; + + _RandomAccessIterator1 __first1, __last1; + _RandomAccessIterator2 __first2, __last2; + _OutputIterator __result1, __result2; + _Compare __comp; + _Proj1 __proj1; + _Proj2 __proj2; + _SetUnionOp __set_union_op; + _SizeFunction __size_func; + _MaskSizeFunction __mask_size_func; + + const _DifferenceTypeOut __n_out = {}; // Size of output range + + _SetRange __apex_total; + + struct OutputSizeReachedInfo + { + _DataPart<_DifferenceType> __data_part; + _SrcDataProcessingOffsets<_DifferenceType1, _DifferenceType2> __src_offsets_part; + }; + + // Information about two data parts which can generate output data when output size will be reached: + // - element 0: the part which reached output size (__n) + // - element 1: the part which reached output size (__n + 1) + std::optional __output_size_reached_info_opt[2]; + + // Reached positions in the input and output ranges + 
std::optional<_SetOpReachedPosEvaluatorData> __res_data_opt; +}; + +template +struct _ParallelSetOpScanPred +{ + __parallel_tag<_IsVector> __tag; + _ExecutionPolicy& __exec; + ProcessingDataPointer __buf_pos_begin, __buf_pos_end; // Temporary data buffer (windowed) + _OutputIterator __result_buf_pos_begin, __result_buf_pos_end; // Result data buffer + _SetOpReachedPosEvaluator& __source_final_pos_evaluator; // Evaluator of the final position in the source ranges + + template + void + operator()(_DifferenceType, _DifferenceType, const _SetRange& __s) const + { + const _DataPart<_DifferenceType>& __data_part = __s.get_data_part(); + + if constexpr (!__Bounded) + { + // 1. Copy source data (unbounded) + __copy_data_to_result_buf(__data_part); + } + else + { + // Copy source data (bounded) + ProcessingDataPointer __buf_pos_start_of_not_copied = __buf_pos_begin; + const auto __remaining_data_size = __eval_remaining_data_size(__data_part); + if (__remaining_data_size > 0) + __buf_pos_start_of_not_copied = __copy_data_to_result_buf_bounded(__data_part, __remaining_data_size); + + // Destroy not copied data + if (__remaining_data_size < __data_part.__len) + __brick_destroy(__buf_pos_start_of_not_copied, __buf_pos_end, _IsVector{}); + + const _DifferenceType __n_out = __result_buf_pos_end - __result_buf_pos_begin; + + // Save subrange info if we reached final/after final positions at this subrange + for (_DifferenceType __n_offset : {0, 1}) + { + if (__data_part.is_output_size_reached(__n_out + __n_offset)) + __source_final_pos_evaluator.__on_output_size_reached(__n_offset, __data_part, + __s.get_src_offsets_part()); + } + } + } + + void + __on_apex(const _SetRange& __total) + { + __source_final_pos_evaluator.__on_apex(__total); + } + + protected: + template + void + __copy_data_to_result_buf(const _DataPart<_DifferenceType>& __data_part) const + { + // Processed data + __brick_move_destroy{}(__buf_pos_begin + __data_part.__buf_pos, + __buf_pos_begin + __data_part.__buf_pos 
+ __data_part.__len, + __result_buf_pos_begin + __data_part.__pos, _IsVector{}); + } + + template + typename std::iterator_traits<_OutputIterator>::difference_type + __eval_remaining_data_size(const _DataPart<_DifferenceType>& __data_part) const + { + // Evaluate output range boundaries for current data chunk + const auto __result_from = __advance_clamped(__result_buf_pos_begin, __data_part.__pos, __result_buf_pos_end); + const auto __result_to = + __advance_clamped(__result_buf_pos_begin, __data_part.__pos + __data_part.__len, __result_buf_pos_end); + + return __result_to - __result_from; + } + + template + ProcessingDataPointer + __copy_data_to_result_buf_bounded(const _DataPart<_DifferenceType>& __data_part, + _DifferenceType __result_remaining) const + { + // Evaluate output range boundaries for current data chunk + const auto __result_from = __advance_clamped(__result_buf_pos_begin, __data_part.__pos, __result_buf_pos_end); + + assert(__result_remaining <= __data_part.__len); + + // Evaluate pointers to current data chunk in temporary buffer + const auto __buf_pos_from = __advance_clamped(__buf_pos_begin, __data_part.__buf_pos, __buf_pos_end); + const auto __buf_pos_to = + __advance_clamped(__buf_pos_begin, __data_part.__buf_pos + __result_remaining, __buf_pos_end); + + // Copy results data into results range to have final output + __brick_move_destroy{}(__buf_pos_from, __buf_pos_to, __result_from, _IsVector{}); + + return __buf_pos_to; + } + + // Move it1 forward by n, but not beyond it2 + template ::difference_type> + _RandomAccessIterator + __advance_clamped(_RandomAccessIterator it1, Size n, _RandomAccessIterator it2) const + { + assert(it1 <= it2); + return it1 + std::min(it2 - it1, n); + } +}; + +template +struct _ParallelSetOpStrictReducePred +{ + _Tag __tag; + _ExecutionPolicy& __exec; + + _RandomAccessIterator1 __first1, __last1; + _RandomAccessIterator2 __first2, __last2; + _SizeFunction __size_func; + _MaskSizeFunction __mask_size_func; + 
_SetUnionOp __set_union_op; + + _Compare __comp; + _Proj1 __proj1; + _Proj2 __proj2; + + _T* __buf_raw_data_begin = nullptr; + + using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; + using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type; + using _DifferenceTypeOutput = typename std::iterator_traits<_OutputIterator>::difference_type; + using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2, _DifferenceTypeOutput>; + + _SetRange + operator()(_DifferenceType1 __i, _DifferenceType1 __len) const + { + //[__b; __e) - a subrange of the first sequence, to reduce + _RandomAccessIterator1 __b = __first1 + __i; + _RandomAccessIterator1 __e = __first1 + __i + __len; + + //try searching for the first element which not equal to *__b + if (__b != __first1) + __b += + __internal::__pstl_upper_bound(__b, _DifferenceType1{0}, __last1 - __b, __b, __comp, __proj1, __proj1); + + //try searching for the first element which not equal to *__e + if (__e != __last1) + __e += + __internal::__pstl_upper_bound(__e, _DifferenceType1{0}, __last1 - __e, __e, __comp, __proj1, __proj1); + + //check is [__b; __e) empty + if (__e - __b < 1) + { + _RandomAccessIterator2 __bb = __last2; + if (__b != __last1) + __bb = __first2 + __internal::__pstl_lower_bound(__first2, _DifferenceType2{0}, __last2 - __first2, __b, + __comp, __proj2, __proj1); + + const _DifferenceType __buf_pos = __size_func(__b - __first1, __bb - __first2); + + _DataPart<_DifferenceType> __new_processing_data{0, 0, __buf_pos}; + + if constexpr (!__Bounded) + { + return _SetRange{__new_processing_data}; + } + else + { + _SrcDataProcessingOffsets<_DifferenceType1, _DifferenceType2> __new_offsets_to_processing_data{ + {__b - __first1, 0}, {__bb - __first2, 0}}; + + _SrcProcessedDataAmount<_DifferenceType1, _DifferenceType2> __new_processed_data_amount{0, 0}; + + typename _SetRange::_DataStorage _ds{__new_processing_data, 
__new_offsets_to_processing_data, + __new_processed_data_amount}; + + return _SetRange{_ds}; + } + } + + //try searching for "corresponding" subrange [__bb; __ee) in the second sequence + _RandomAccessIterator2 __bb = __first2; + if (__b != __first1) + __bb = __first2 + __internal::__pstl_lower_bound(__first2, _DifferenceType2{0}, __last2 - __first2, __b, + __comp, __proj2, __proj1); + + _RandomAccessIterator2 __ee = __last2; + if (__e != __last1) + __ee = __bb + __internal::__pstl_lower_bound(__bb, _DifferenceType2{0}, __last2 - __bb, __e, __comp, + __proj2, __proj1); + + const _DifferenceType __buf_pos = __size_func(__b - __first1, __bb - __first2); + + _T* __buffer_b = __buf_raw_data_begin + __buf_pos; + + auto [__it1_reached, __it2_reached, __output_reached, __mask_reached] = + __set_union_op(__b, __e, __bb, __ee, __buffer_b, __comp, __proj1, __proj2, nullptr); + + // Prepare processed data info + const _DataPart<_DifferenceType> __new_processing_data{0, __output_reached - __buffer_b, __buf_pos}; + + if constexpr (!__Bounded) + { + return _SetRange{__new_processing_data}; + } + else + { + _SrcDataProcessingOffsets<_DifferenceType1, _DifferenceType2> __new_offsets_to_processing_data{ + {__b - __first1, __it1_reached - __b}, {__bb - __first2, __it2_reached - __bb}}; + + const bool __something_reached = __it1_reached != __b || __it2_reached != __bb; + + _SrcProcessedDataAmount<_DifferenceType1, _DifferenceType2> __new_processed_data_amount{ + __something_reached ? 
__it1_reached - __first1 : 0, __it2_reached - __first2}; + + typename _SetRange::_DataStorage _ds{__new_processing_data, __new_offsets_to_processing_data, + __new_processed_data_amount}; + + return _SetRange{_ds}; + } + } +}; + +template +struct _ParallelSetOpApexPred +{ + _ParallelSetOpScanPred<__Bounded, _IsVector, _ExecutionPolicy, ProcessingDataPointer, _SetRange, _OutputIterator, + _SetOpReachedPosEvaluator>& __scan_pred; + + void + operator()(const _SetRange& __total) const + { + //final scan + __scan_pred(/* 0 */ _DifferenceType{}, /* 0 */ _DifferenceType{}, __total); + + __scan_pred.__on_apex(__total); + } +}; + +template +__parallel_set_op_return_t<_RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator> +__parallel_set_op(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, + _OutputIterator __result1, _OutputIterator __result2, _Compare __comp, _Proj1 __proj1, _Proj2 __proj2, + _SizeFunction __size_func, _MaskSizeFunction __mask_size_func, _SetUnionOp __set_union_op) +{ + using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; + + using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; + using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type; + using _DifferenceTypeOutput = typename std::iterator_traits<_OutputIterator>::difference_type; + using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2, _DifferenceTypeOutput>; + using _T = typename std::iterator_traits<_OutputIterator>::value_type; + + const _DifferenceType1 __n1 = __last1 - __first1; + const _DifferenceType2 __n2 = __last2 - __first2; + + const _DifferenceType __buf_size = __size_func(__n1, __n2); + __par_backend::__buffer<_T> __buf(__buf_size); // Temporary (windowed) buffer for result preparation + + using 
__mask_difference_type_t = + typename std::iterator_traits::difference_type; + + using _SetRange = + _SetRangeImpl<__Bounded, _DifferenceType1, _DifferenceType2, _DifferenceTypeOutput, __mask_difference_type_t>; + + return __internal::__except_handler([__tag, &__exec, __n1, __first1, __last1, __first2, __last2, __result1, + __result2, __comp, __proj1, __proj2, __size_func, __mask_size_func, + __set_union_op, &__buf, __buf_size]() { + // Buffer raw data begin/end pointers + _T* __buf_raw_data_begin = __buf.get(); + _T* __buf_raw_data_end = __buf_raw_data_begin + __buf_size; + + _SetOpReachedPosEvaluator<_IsVector, _ExecutionPolicy, _RandomAccessIterator1, _RandomAccessIterator2, + _OutputIterator, _Compare, _Proj1, _Proj2, _SetUnionOp, _SizeFunction, + _MaskSizeFunction, _SetRange, __Bounded> + __source_final_pos_evaluator(__tag, __exec, __first1, __last1, __first2, __last2, __result1, __result2, + __comp, __proj1, __proj2, __set_union_op, __size_func, __mask_size_func); + + // Scan predicate + _ParallelSetOpScanPred<__Bounded, _IsVector, _ExecutionPolicy, _T*, _SetRange, _OutputIterator, + decltype(__source_final_pos_evaluator)> + __scan_pred{__tag, __exec, __buf_raw_data_begin, __buf_raw_data_end, + __result1, __result2, __source_final_pos_evaluator}; + + _ParallelSetOpStrictReducePred<__Bounded, __parallel_tag<_IsVector>, _ExecutionPolicy, _SetRange, + _RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator, _SizeFunction, + _MaskSizeFunction, _SetUnionOp, _Compare, _Proj1, _Proj2, _T> + __reduce_pred{__tag, + __exec, + __first1, + __last1, + __first2, + __last2, + __size_func, + __mask_size_func, + __set_union_op, + __comp, + __proj1, + __proj2, + __buf_raw_data_begin}; + + _ParallelSetOpCombinePred __combine_pred; + + _ParallelSetOpApexPred<__Bounded, _IsVector, _ExecutionPolicy, _T*, _SetRange, _OutputIterator, + _DifferenceType1, decltype(__source_final_pos_evaluator)> + __apex_pred{__scan_pred}; + + 
__par_backend::__parallel_strict_scan(__backend_tag{}, __exec, __n1, _SetRange(), __reduce_pred, __combine_pred, + __scan_pred, __apex_pred); + + // Get evaluated reached positions for the source ranges and output range + const auto [__res_reachedPos1, __res_reachedPos2, __res_reachedPosOut] = + __source_final_pos_evaluator.__get_reached_positions(); + + return __parallel_set_op_return_t<_RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator>{ + __first1 + __res_reachedPos1, __first2 + __res_reachedPos2, __result1 + __res_reachedPosOut}; }); } //a shared parallel pattern for '__pattern_set_union' and '__pattern_set_symmetric_difference' -template -_OutputIterator +template +oneapi::dpl::__utils::__set_operations_result<_RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator> __parallel_set_union_op(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, - _OutputIterator __result, _SetUnionOp __set_union_op, _Compare __comp, _Proj1 __proj1, - _Proj2 __proj2) + _OutputIterator __result1, _OutputIterator __result2, _Compare __comp, _Proj1 __proj1, + _Proj2 __proj2, _SetUnionOp __set_union_op) { using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type; - using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2>; + using _DifferenceTypeOutput = typename std::iterator_traits<_OutputIterator>::difference_type; + using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2, _DifferenceTypeOutput>; - const auto __n1 = __last1 - __first1; - const auto __n2 = __last2 - __first2; + const _DifferenceType1 __n1 = __last1 - __first1; + const _DifferenceType2 __n2 = __last2 - __first2; + 
const _DifferenceTypeOutput __n_out = __result2 - __result1; __brick_copy<__parallel_tag<_IsVector>> __copy_range{}; // {1} {}: parallel copying just first sequence if (__n2 == 0) - return __internal::__pattern_walk2_brick(__tag, ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, - __result, __copy_range); + { + _RandomAccessIterator1 __last1_tmp = !__Bounded ? __last1 : __first1 + std::min<_DifferenceType>(__n1, __n_out); + + _OutputIterator __result_finish = __internal::__pattern_walk2_brick( + __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1_tmp, __result1, __copy_range); + + return {__last1_tmp, __first2, __result_finish}; + } // {} {2}: parallel copying just second sequence if (__n1 == 0) - return __internal::__pattern_walk2_brick(__tag, ::std::forward<_ExecutionPolicy>(__exec), __first2, __last2, - __result, __copy_range); + { + _RandomAccessIterator2 __last2_tmp = !__Bounded ? __last2 : __first2 + std::min<_DifferenceType>(__n2, __n_out); + + _OutputIterator __result_finish = __internal::__pattern_walk2_brick( + __tag, std::forward<_ExecutionPolicy>(__exec), __first2, __last2_tmp, __result1, __copy_range); + + return {__first1, __last2_tmp, __result_finish}; + } // testing whether the sequences are intersected _RandomAccessIterator1 __left_bound_seq_1 = @@ -3419,16 +4113,26 @@ __parallel_set_union_op(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __ex if (__left_bound_seq_1 == __last1) { + _RandomAccessIterator1 __last1_tmp = !__Bounded ? __last1 : __first1 + std::min<_DifferenceType>(__n1, __n_out); + const _DifferenceType1 __n1_tmp = __last1_tmp - __first1; + + _RandomAccessIterator2 __last2_tmp = + !__Bounded ? __last2 + : __first2 + std::min<_DifferenceType>(__n2, __n_out > __n1_tmp ? 
__n_out - __n1_tmp : 0); + const _DifferenceType2 __n2_tmp = __last2_tmp - __first2; + //{1} < {2}: seq2 is wholly greater than seq1, so, do parallel copying seq1 and seq2 __par_backend::__parallel_invoke( __backend_tag{}, __exec, [=, &__exec] { - __internal::__pattern_walk2_brick(__tag, __exec, __first1, __last1, __result, __copy_range); + __internal::__pattern_walk2_brick(__tag, __exec, __first1, __last1_tmp, __result1, __copy_range); }, [=, &__exec] { - __internal::__pattern_walk2_brick(__tag, __exec, __first2, __last2, __result + __n1, __copy_range); + __internal::__pattern_walk2_brick(__tag, __exec, __first2, __last2_tmp, __result1 + __n1_tmp, + __copy_range); }); - return __result + __n1 + __n2; + + return {__last1_tmp, __last2_tmp, __result1 + __n1_tmp + __n2_tmp}; } // testing whether the sequences are intersected @@ -3438,62 +4142,91 @@ __parallel_set_union_op(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __ex if (__left_bound_seq_2 == __last2) { + _RandomAccessIterator2 __last2_tmp = !__Bounded ? __last2 : __first2 + std::min<_DifferenceType>(__n2, __n_out); + const _DifferenceType2 __n2_tmp = __last2_tmp - __first2; + + _RandomAccessIterator1 __last1_tmp = + !__Bounded ? __last1 + : __first1 + std::min<_DifferenceType>(__n1, __n_out > __n2_tmp ? 
__n_out - __n2_tmp : 0); + const _DifferenceType1 __n1_tmp = __last1_tmp - __first1; + //{2} < {1}: seq2 is wholly greater than seq1, so, do parallel copying seq1 and seq2 __par_backend::__parallel_invoke( __backend_tag{}, __exec, [=, &__exec] { - __internal::__pattern_walk2_brick(__tag, __exec, __first2, __last2, __result, __copy_range); + __internal::__pattern_walk2_brick(__tag, __exec, __first2, __last2_tmp, __result1, __copy_range); }, [=, &__exec] { - __internal::__pattern_walk2_brick(__tag, __exec, __first1, __last1, __result + __n2, __copy_range); + __internal::__pattern_walk2_brick(__tag, __exec, __first1, __last1_tmp, __result1 + __n2_tmp, + __copy_range); }); - return __result + __n1 + __n2; + + return {__last1_tmp, __last2_tmp, __result1 + __n1_tmp + __n2_tmp}; } - const auto __m1 = __left_bound_seq_1 - __first1; - if (__m1 > __set_algo_cut_off) + auto __size_fnc = [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }; + auto __mask_size_fnc = __size_fnc; + + const _DifferenceType1 __m1 = __left_bound_seq_1 - __first1; + if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__m1)) { - auto __res_or = __result; - __result += __m1; //we know proper offset due to [first1; left_bound_seq_1) < [first2; last2) + oneapi::dpl::__utils::__set_operations_result<_RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator> + __finish; + + const _DifferenceType __to_copy = __Bounded ? 
std::min<_DifferenceType>(__m1, __n_out) : __m1; + __par_backend::__parallel_invoke( __backend_tag{}, __exec, //do parallel copying of [first1; left_bound_seq_1) [=, &__exec] { - __internal::__pattern_walk2_brick(__tag, __exec, __first1, __left_bound_seq_1, __res_or, __copy_range); + __internal::__pattern_walk2_brick(__tag, __exec, __first1, __first1 + __to_copy, __result1, + __copy_range); }, - [=, &__exec, &__result] { - __result = __internal::__parallel_set_op( - __tag, __exec, __left_bound_seq_1, __last1, __first2, __last2, __result, - [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }, __set_union_op, __comp, __proj1, - __proj2); + [=, &__exec, &__finish] { + __finish = __internal::__parallel_set_op<__Bounded>( + __tag, __exec, __left_bound_seq_1, __last1, __first2, __last2, __result1 + __to_copy, __result2, + __comp, __proj1, __proj2, __size_fnc, __mask_size_fnc, __set_union_op); }); - return __result; + + if constexpr (__Bounded) + if (__to_copy < __m1) + __finish.__in1 = __first1 + __to_copy; + + return __finish; } - const auto __m2 = __left_bound_seq_2 - __first2; + const _DifferenceType2 __m2 = __left_bound_seq_2 - __first2; assert(__m1 == 0 || __m2 == 0); - if (__m2 > __set_algo_cut_off) + if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__m2)) { - auto __res_or = __result; - __result += __m2; //we know proper offset due to [first2; left_bound_seq_2) < [first1; last1) + oneapi::dpl::__utils::__set_operations_result<_RandomAccessIterator1, _RandomAccessIterator2, _OutputIterator> + __finish; + + const _DifferenceType __to_copy = __Bounded ? 
std::min<_DifferenceType>(__m2, __n_out) : __m2; + __par_backend::__parallel_invoke( __backend_tag{}, __exec, //do parallel copying of [first2; left_bound_seq_2) [=, &__exec] { - __internal::__pattern_walk2_brick(__tag, __exec, __first2, __left_bound_seq_2, __res_or, __copy_range); + __internal::__pattern_walk2_brick(__tag, __exec, __first2, __first2 + __to_copy, __result1, + __copy_range); }, - [=, &__exec, &__result] { - __result = __internal::__parallel_set_op( - __tag, __exec, __first1, __last1, __left_bound_seq_2, __last2, __result, - [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }, __set_union_op, __comp, __proj1, - __proj2); + [=, &__exec, &__finish] { + __finish = __internal::__parallel_set_op<__Bounded>( + __tag, __exec, __first1, __last1, __left_bound_seq_2, __last2, __result1 + __to_copy, __result2, + __comp, __proj1, __proj2, __size_fnc, __mask_size_fnc, __set_union_op); }); - return __result; + + if constexpr (__Bounded) + if (__to_copy < __m2) + __finish.__in2 = __first2 + __to_copy; + + return __finish; } - return __internal::__parallel_set_op( - __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result, - [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }, __set_union_op, __comp, __proj1, __proj2); + return __internal::__parallel_set_op<__Bounded>(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, + __first2, __last2, __result1, __result2, __comp, __proj1, __proj2, + __size_fnc, __mask_size_fnc, __set_union_op); } //------------------------------------------------------------------------ @@ -3550,24 +4283,24 @@ __pattern_set_union(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, _OutputIterator __result, _Compare __comp) { - const auto __n1 = __last1 - __first1; - const auto __n2 = __last2 - __first2; + using _DifferenceType1 = typename 
std::iterator_traits<_RandomAccessIterator1>::difference_type; + using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type; + + const _DifferenceType1 __n1 = __last1 - __first1; + const _DifferenceType2 __n2 = __last2 - __first2; // use serial algorithm - if (__n1 + __n2 <= __set_algo_cut_off) + if (!oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2)) return std::set_union(__first1, __last1, __first2, __last2, __result, __comp); - using _Tp = typename std::iterator_traits<_OutputIterator>::value_type; - return __parallel_set_union_op( - __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _Tp* __result, _Compare __comp, oneapi::dpl::identity, - oneapi::dpl::identity) { - return oneapi::dpl::__utils::__set_union_construct(__first1, __last1, __first2, __last2, __result, - __BrickCopyConstruct<_IsVector>(), __comp, - oneapi::dpl::identity{}, oneapi::dpl::identity{}); - }, - __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{}); + return __parallel_set_union_op( + __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result, + __result + __n1 + __n2, __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{}, + [](auto&&... 
__args) { + return oneapi::dpl::__utils::__set_union_construct<__BrickCopyConstruct<_IsVector>>( + std::forward(__args)...); + }) + .__get_reached_out(); } //------------------------------------------------------------------------ @@ -3614,14 +4347,12 @@ __pattern_set_intersection(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& _ _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, _RandomAccessIterator3 __result, _Compare __comp) { - using _T = typename std::iterator_traits<_RandomAccessIterator3>::value_type; - using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type; using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2>; - const auto __n1 = __last1 - __first1; - const auto __n2 = __last2 - __first2; + const _DifferenceType1 __n1 = __last1 - __first1; + const _DifferenceType2 __n2 = __last2 - __first2; // intersection is empty if (__n1 == 0 || __n2 == 0) @@ -3639,44 +4370,41 @@ __pattern_set_intersection(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& _ if (__left_bound_seq_2 == __last2) return __result; - const auto __m1 = __last1 - __left_bound_seq_1 + __n2; - if (__m1 > __set_algo_cut_off) + const _DifferenceType __m1 = __last1 - __left_bound_seq_1 + __n2; + if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__m1)) { //we know proper offset due to [first1; left_bound_seq_1) < [first2; last2) return __internal::__except_handler([&]() { - return __internal::__parallel_set_op( - __tag, std::forward<_ExecutionPolicy>(__exec), __left_bound_seq_1, __last1, __first2, __last2, __result, - [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); }, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _T* __result, _Compare __comp, oneapi::dpl::identity, 
- oneapi::dpl::identity) { - return oneapi::dpl::__utils::__set_intersection_construct( - __first1, __last1, __first2, __last2, __result, - oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp, - oneapi::dpl::identity{}, oneapi::dpl::identity{}); - }, - __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{}); + return __internal::__parallel_set_op( + __tag, std::forward<_ExecutionPolicy>(__exec), __left_bound_seq_1, __last1, __first2, __last2, + __result, __result + __n1 + __n2, __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{}, + [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); }, + [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }, + [](auto&&... __args) { + return oneapi::dpl::__utils::__set_intersection_construct< + oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>>( + std::forward(__args)...); + }) + .__get_reached_out(); }); } - const auto __m2 = __last2 - __left_bound_seq_2 + __n1; - if (__m2 > __set_algo_cut_off) + const _DifferenceType __m2 = __last2 - __left_bound_seq_2 + __n1; + if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__m2)) { //we know proper offset due to [first2; left_bound_seq_2) < [first1; last1) return __internal::__except_handler([&]() { - __result = __internal::__parallel_set_op( - __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __left_bound_seq_2, __last2, __result, - [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); }, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _T* __result, _Compare __comp, oneapi::dpl::identity, - oneapi::dpl::identity) { - return oneapi::dpl::__utils::__set_intersection_construct( - __first1, __last1, __first2, __last2, __result, - oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp, - oneapi::dpl::identity{}, oneapi::dpl::identity{}); - }, - __comp, 
oneapi::dpl::identity{}, oneapi::dpl::identity{}); - return __result; + return __internal::__parallel_set_op( + __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __left_bound_seq_2, __last2, + __result, __result + __n1 + __n2, __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{}, + [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); }, + [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }, + [](auto&&... __args) { + return oneapi::dpl::__utils::__set_intersection_construct< + oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>>( + std::forward(__args)...); + }) + .__get_reached_out(); }); } @@ -3727,11 +4455,12 @@ __pattern_set_difference(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __e _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, _RandomAccessIterator3 __result, _Compare __comp) { - using _T = typename std::iterator_traits<_RandomAccessIterator3>::value_type; - using _DifferenceType = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; + using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; + using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type; + using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2>; - const auto __n1 = __last1 - __first1; - const auto __n2 = __last2 - __first2; + const _DifferenceType1 __n1 = __last1 - __first1; + const _DifferenceType2 __n2 = __last2 - __first2; // {} \ {2}: the difference is empty if (__n1 == 0) @@ -3756,18 +4485,19 @@ __pattern_set_difference(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __e return __internal::__pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __result, __brick_copy<__parallel_tag<_IsVector>>{}); - if (__n1 + __n2 > __set_algo_cut_off) - return __parallel_set_op( - __tag, 
std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result, - [](_DifferenceType __n, _DifferenceType) { return __n; }, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _T* __result, _Compare __comp, oneapi::dpl::identity, - oneapi::dpl::identity) { - return oneapi::dpl::__utils::__set_difference_construct( - __first1, __last1, __first2, __last2, __result, __BrickCopyConstruct<_IsVector>(), __comp, - oneapi::dpl::identity{}, oneapi::dpl::identity{}); - }, - __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{}); + if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2)) + { + return __parallel_set_op( + __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result, + __result + __n1 + __n2, __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{}, + [](_DifferenceType __n, _DifferenceType) { return __n; }, + [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }, + [](auto&&... 
__args) { + return oneapi::dpl::__utils::__set_difference_construct<__BrickCopyConstruct<_IsVector>>( + std::forward(__args)...); + }) + .__get_reached_out(); + } // use serial algorithm return std::set_difference(__first1, __last1, __first2, __last2, __result, __comp); @@ -3818,25 +4548,25 @@ __pattern_set_symmetric_difference(__parallel_tag<_IsVector> __tag, _ExecutionPo _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, _RandomAccessIterator3 __result, _Compare __comp) { - const auto __n1 = __last1 - __first1; - const auto __n2 = __last2 - __first2; + using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; + using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type; + + const _DifferenceType1 __n1 = __last1 - __first1; + const _DifferenceType2 __n2 = __last2 - __first2; // use serial algorithm - if (__n1 + __n2 <= __set_algo_cut_off) + if (!oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2)) return std::set_symmetric_difference(__first1, __last1, __first2, __last2, __result, __comp); - using _T = typename std::iterator_traits<_RandomAccessIterator3>::value_type; return __internal::__except_handler([&]() { - return __internal::__parallel_set_union_op( - __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _T* __result, _Compare __comp, oneapi::dpl::identity, - oneapi::dpl::identity) { - return oneapi::dpl::__utils::__set_symmetric_difference_construct( - __first1, __last1, __first2, __last2, __result, __BrickCopyConstruct<_IsVector>(), __comp, - oneapi::dpl::identity{}, oneapi::dpl::identity{}); - }, - __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{}); + return __internal::__parallel_set_union_op( + __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, 
__first2, __last2, __result, + __result + __n1 + __n2, __comp, oneapi::dpl::identity{}, oneapi::dpl::identity{}, + [](auto&&... __args) { + return oneapi::dpl::__utils::__set_symmetric_difference_construct< + __BrickCopyConstruct<_IsVector>>(std::forward(__args)...); + }) + .__get_reached_out(); }); } diff --git a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h index 6d7fa7df3bd..f914bc3ce6a 100644 --- a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h @@ -727,7 +727,7 @@ __pattern_includes(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _ const auto __n2 = std::ranges::size(__r2); // use serial algorithm - if (__n1 + __n2 <= oneapi::dpl::__internal::__set_algo_cut_off) + if (!oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2)) return std::ranges::includes(std::forward<_R1>(__r1), std::forward<_R2>(__r2), __comp, __proj1, __proj2); auto __first1 = std::ranges::begin(__r1); @@ -805,13 +805,60 @@ using __set_union_return_t = std::ranges::set_union_result, std::ranges::borrowed_iterator_t<_R2>, std::ranges::borrowed_iterator_t<_OutRange>>; +// Bounded set union: performs set_union with output range capacity checking. +// Truncates result if output range is too small. +template +__set_union_return_t<_R1, _R2, _OutRange> +__serial_set_union(_R1&& __r1, _R2&& __r2, _OutRange&& __r_out, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) +{ + using DifferenceType = oneapi::dpl::__ranges::__common_size_t; + + auto [__it1, __end1] = oneapi::dpl::__ranges::__get_range_bounds(__r1); + auto [__it2, __end2] = oneapi::dpl::__ranges::__get_range_bounds(__r2); + auto [__out_it, __out_end] = oneapi::dpl::__ranges::__get_range_bounds(__r_out); + + // 1. 
Main set_union operation + while (__it1 != __end1 && __it2 != __end2 && __out_it != __out_end) + { + if (std::invoke(__comp, std::invoke(__proj1, *__it1), std::invoke(__proj2, *__it2))) + { + *__out_it = *__it1; + ++__it1; + } + else if (std::invoke(__comp, std::invoke(__proj2, *__it2), std::invoke(__proj1, *__it1))) + { + *__out_it = *__it2; + ++__it2; + } + else + { + *__out_it = *__it1; + ++__it1; + ++__it2; + } + ++__out_it; + } + + // 2. Copying the residual elements if one of the input sequences is exhausted + const DifferenceType __remaining_capacity1 = __out_end - __out_it; + const DifferenceType __copy_n1 = __end1 - __it1; + auto __copy1 = std::ranges::copy(__it1, __it1 + std::min(__copy_n1, __remaining_capacity1), __out_it); + + const DifferenceType __remaining_capacity2 = __out_end - __copy1.out; + const DifferenceType __copy_n2 = __end2 - __it2; + auto __copy2 = std::ranges::copy(__it2, __it2 + std::min(__copy_n2, __remaining_capacity2), __copy1.out); + + return {__copy1.in, __copy2.in, __copy2.out}; +} + template __set_union_return_t<_R1, _R2, _OutRange> __brick_set_union(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2, /*__is_vector=*/std::false_type) noexcept { - return std::ranges::set_union(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r), __comp, - __proj1, __proj2); + return __serial_set_union(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r), + __comp, __proj1, __proj2); } template @@ -820,8 +867,8 @@ __brick_set_union(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Pr /*__is_vector=*/std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); - return std::ranges::set_union(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r), __comp, - __proj1, __proj2); + return __serial_set_union(std::forward<_R1>(__r1), std::forward<_R2>(__r2), 
std::forward<_OutRange>(__out_r), + __comp, __proj1, __proj2); } template __pattern_set_union(_Tag, _ExecutionPolicy&&, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - static_assert(__is_serial_tag_v<_Tag>); + static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>); return __brick_set_union(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2, typename _Tag::__is_vector{}); @@ -842,34 +889,23 @@ __set_union_return_t<_R1, _R2, _OutRange> __pattern_set_union(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - using _RandomAccessIterator1 = std::ranges::iterator_t<_R1>; - using _RandomAccessIterator2 = std::ranges::iterator_t<_R2>; - using _Tp = std::ranges::range_value_t<_OutRange>; - - const auto __n1 = std::ranges::size(__r1); - const auto __n2 = std::ranges::size(__r2); + auto [__first1, __last1, __n1] = oneapi::dpl::__ranges::__get_range_bounds_n(__r1); + auto [__first2, __last2, __n2] = oneapi::dpl::__ranges::__get_range_bounds_n(__r2); + auto [__result1, __result2] = oneapi::dpl::__ranges::__get_range_bounds(__out_r); // use serial algorithm - if (__n1 + __n2 <= oneapi::dpl::__internal::__set_algo_cut_off) - return std::ranges::set_union(__r1, __r2, std::begin(__out_r), __comp, __proj1, __proj2); - - auto __first1 = std::ranges::begin(__r1); - auto __last1 = __first1 + __n1; - auto __first2 = std::ranges::begin(__r2); - auto __last2 = __first2 + __n2; - auto __result = std::ranges::begin(__out_r); + if (!oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2)) + return __serial_set_union(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r), + __comp, __proj1, __proj2); - auto __out_last = oneapi::dpl::__internal::__parallel_set_union_op( - __tag, std::forward<_ExecutionPolicy>(__exec), __first1, 
__last1, __first2, __last2, __result, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _Tp* __result, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - return oneapi::dpl::__utils::__set_union_construct( - __first1, __last1, __first2, __last2, __result, - oneapi::dpl::__internal::__BrickCopyConstruct<_IsVector>(), __comp, __proj1, __proj2); - }, - __comp, __proj1, __proj2); - - return {__first1 + __n1, __first2 + __n2, __result + (__out_last - __result)}; + return oneapi::dpl::__internal::__parallel_set_union_op( + __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result1, + __result2, __comp, __proj1, __proj2, + [](auto&&... __args) { + return oneapi::dpl::__utils::__set_union_construct<__BrickCopyConstruct<_IsVector>>( + std::forward(__args)...); + }) + .template __get_reached_in1_in2_out<__set_union_return_t<_R1, _R2, _OutRange>>(); } //--------------------------------------------------------------------------------------------------------------------- @@ -881,13 +917,49 @@ using __set_intersection_return_t = std::ranges::set_intersection_result, std::ranges::borrowed_iterator_t<_R2>, std::ranges::borrowed_iterator_t<_OutRange>>; +// Bounded set intersection: performs set_intersection with output range capacity checking. +// Truncates result if output range is too small. 
+template +__set_intersection_return_t<_R1, _R2, _OutRange> +__serial_set_intersection(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) +{ + auto [__it1, __end1] = oneapi::dpl::__ranges::__get_range_bounds(__r1); + auto [__it2, __end2] = oneapi::dpl::__ranges::__get_range_bounds(__r2); + auto [__out_it, __out_end] = oneapi::dpl::__ranges::__get_range_bounds(__out_r); + + while (__it1 != __end1 && __it2 != __end2) + { + if (std::invoke(__comp, std::invoke(__proj1, *__it1), std::invoke(__proj2, *__it2))) + { + ++__it1; + } + else if (std::invoke(__comp, std::invoke(__proj2, *__it2), std::invoke(__proj1, *__it1))) + { + ++__it2; + } + else if (__out_it != __out_end) + { + *__out_it = *__it1; + ++__out_it; + ++__it1; + ++__it2; + } + else + { + break; + } + } + + return {__it1, __it2, __out_it}; +} + template __set_intersection_return_t<_R1, _R2, _OutRange> __brick_set_intersection(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2, /*__is_vector=*/std::false_type) noexcept { - return std::ranges::set_intersection(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r), - __comp, __proj1, __proj2); + return __serial_set_intersection(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r), + __comp, __proj1, __proj2); } template @@ -896,8 +968,8 @@ __brick_set_intersection(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __co /*__is_vector=*/std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); - return std::ranges::set_intersection(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r), - __comp, __proj1, __proj2); + return __serial_set_intersection(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r), + __comp, __proj1, __proj2); } template __pattern_set_intersection(_Tag, _ExecutionPolicy&&, _R1&& __r1, _R2&& __r2, _OutRange&& 
__out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - static_assert(__is_serial_tag_v<_Tag>); + static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>); return __brick_set_intersection(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2, typename _Tag::__is_vector{}); @@ -920,24 +992,18 @@ __pattern_set_intersection(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& _ { using _RandomAccessIterator1 = std::ranges::iterator_t<_R1>; using _RandomAccessIterator2 = std::ranges::iterator_t<_R2>; - using _T = std::ranges::range_value_t<_OutRange>; using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type; using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2>; - const auto __n1 = std::ranges::size(__r1); - const auto __n2 = std::ranges::size(__r2); - - auto __first1 = std::ranges::begin(__r1); - auto __last1 = __first1 + __n1; - auto __first2 = std::ranges::begin(__r2); - auto __last2 = __first2 + __n2; - auto __result = std::ranges::begin(__out_r); + auto [__first1, __last1, __n1] = oneapi::dpl::__ranges::__get_range_bounds_n(__r1); + auto [__first2, __last2, __n2] = oneapi::dpl::__ranges::__get_range_bounds_n(__r2); + auto [__result1, __result2] = oneapi::dpl::__ranges::__get_range_bounds(__out_r); // intersection is empty if (__n1 == 0 || __n2 == 0) - return {__last1, __last2, __result}; + return {__first1, __first2, __result1}; // testing whether the sequences are intersected auto __left_bound_seq_1 = @@ -945,7 +1011,7 @@ __pattern_set_intersection(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& _ __first2, __comp, __proj1, __proj2); //{1} < {2}: seq 2 is wholly greater than seq 1, so, the intersection is empty if (__left_bound_seq_1 == __last1) - return {__last1, __last2, __result}; + return {__last1, __first2, __result1}; // 
testing whether the sequences are intersected auto __left_bound_seq_2 = @@ -953,85 +1019,151 @@ __pattern_set_intersection(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& _ __first1, __comp, __proj2, __proj1); //{2} < {1}: seq 1 is wholly greater than seq 2, so, the intersection is empty if (__left_bound_seq_2 == __last2) - return {__last1, __last2, __result}; + return {__first1, __last2, __result1}; const auto __m1 = __last1 - __left_bound_seq_1 + __n2; - if (__m1 > oneapi::dpl::__internal::__set_algo_cut_off) + if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__m1)) { //we know proper offset due to [first1; left_bound_seq_1) < [first2; last2) return __internal::__except_handler([&]() { - auto __out_last = __internal::__parallel_set_op( - __tag, std::forward<_ExecutionPolicy>(__exec), __left_bound_seq_1, __last1, __first2, __last2, __result, - [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); }, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _T* __result, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - return oneapi::dpl::__utils::__set_intersection_construct( - __first1, __last1, __first2, __last2, __result, - oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp, __proj1, __proj2); - }, - __comp, __proj1, __proj2); - return __set_intersection_return_t<_R1, _R2, _OutRange>{__last1, __last2, __out_last}; + return __internal::__parallel_set_op( + __tag, std::forward<_ExecutionPolicy>(__exec), __left_bound_seq_1, __last1, __first2, __last2, + __result1, __result2, __comp, __proj1, __proj2, + [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); }, + [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }, + [](auto&&... 
__args) { + return oneapi::dpl::__utils::__set_intersection_construct< + oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>>( + std::forward(__args)...); + }) + .template __get_reached_in1_in2_out<__set_intersection_return_t<_R1, _R2, _OutRange>>(); }); } const auto __m2 = __last2 - __left_bound_seq_2 + __n1; - if (__m2 > oneapi::dpl::__internal::__set_algo_cut_off) + if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__m2)) { //we know proper offset due to [first2; left_bound_seq_2) < [first1; last1) return __internal::__except_handler([&]() { - auto __out_last = __internal::__parallel_set_op( - __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __left_bound_seq_2, __last2, __result, - [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); }, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _T* __result, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - return oneapi::dpl::__utils::__set_intersection_construct( - __first1, __last1, __first2, __last2, __result, - oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>{}, __comp, __proj1, __proj2); - }, - __comp, __proj1, __proj2); - return __set_intersection_return_t<_R1, _R2, _OutRange>{__last1, __last2, __out_last}; + return __internal::__parallel_set_op( + __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __left_bound_seq_2, __last2, + __result1, __result2, __comp, __proj1, __proj2, + [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); }, + [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }, + [](auto&&... 
__args) { + return oneapi::dpl::__utils::__set_intersection_construct< + oneapi::dpl::__internal::__op_uninitialized_copy<_ExecutionPolicy>>( + std::forward(__args)...); + }) + .template __get_reached_in1_in2_out<__set_intersection_return_t<_R1, _R2, _OutRange>>(); }); } // [left_bound_seq_1; last1) and [left_bound_seq_2; last2) - use serial algorithm - return std::ranges::set_intersection(__left_bound_seq_1, __last1, __left_bound_seq_2, __last2, - std::ranges::begin(__out_r), __comp, __proj1, __proj2); + return __serial_set_intersection(std::ranges::subrange(__left_bound_seq_1, __last1), + std::ranges::subrange(__left_bound_seq_2, __last2), __out_r, __comp, __proj1, + __proj2); } //--------------------------------------------------------------------------------------------------------------------- // set_difference //--------------------------------------------------------------------------------------------------------------------- -template -using __set_difference_return_t = std::ranges::set_difference_result, - std::ranges::borrowed_iterator_t<_OutRange>>; +#if ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT +template +using __set_difference_return_t = + std::ranges::in_out_result, std::ranges::borrowed_iterator_t<_OutRange>>; +#else +template +using __set_difference_return_t = + std::ranges::in_in_out_result, std::ranges::borrowed_iterator_t<_R2>, + std::ranges::borrowed_iterator_t<_OutRange>>; +#endif + +// Helper function to create the appropriate return type for oneapi::dpl::ranges::set_difference based on C++23 compatibility mode. +// In C++23, set_difference returns in_out_result with the second input iterator omitted, as it is not needed for the caller. 
+template +__set_difference_return_t<_R1, _R2, _OutRange> +__create_set_difference_result(_It1 __it1, _It2 __it2, _ItOut __it_out) +{ +#if ONEDPL_RANGES_SET_DIFFERENCE_CPP23_RESULT + return std::ranges::in_out_result<_It1, _ItOut>{__it1, __it_out}; +#else + return std::ranges::in_in_out_result<_It1, _It2, _ItOut>{__it1, __it2, __it_out}; +#endif +} + +// Bounded set difference: performs set_difference with output range capacity checking. +// Truncates result if output range is too small. +template +__set_difference_return_t<_R1, _R2, _OutRange> +__serial_set_difference(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) +{ + using DifferenceType = oneapi::dpl::__ranges::__common_size_t; + + auto [__it1, __end1] = oneapi::dpl::__ranges::__get_range_bounds(__r1); + auto [__it2, __end2] = oneapi::dpl::__ranges::__get_range_bounds(__r2); + auto [__out_it, __out_end] = oneapi::dpl::__ranges::__get_range_bounds(__out_r); + + // 1. Main set_difference operation + while (__it1 != __end1 && __it2 != __end2) + { + if (std::invoke(__comp, std::invoke(__proj1, *__it1), std::invoke(__proj2, *__it2))) + { + if (__out_it != __out_end) + { + *__out_it = *__it1; + ++__it1; + ++__out_it; + } + else + break; + } + else if (std::invoke(__comp, std::invoke(__proj2, *__it2), std::invoke(__proj1, *__it1))) + { + ++__it2; + } + else + { + ++__it1; + ++__it2; + } + } + + // 2. 
Copying the rest of the first sequence + const DifferenceType __remaining_capacity = __out_end - __out_it; + const DifferenceType __copy_n = __end1 - __it1; + auto __copy = std::ranges::copy(__it1, __it1 + std::min(__copy_n, __remaining_capacity), __out_it); + + return __create_set_difference_result<_R1, _R2, _OutRange>(__copy.in, __it2, __copy.out); +} template -__set_difference_return_t<_R1, _OutRange> +__set_difference_return_t<_R1, _R2, _OutRange> __brick_set_difference(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2, /*__is_vector=*/std::false_type) noexcept { - return std::ranges::set_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r), - __comp, __proj1, __proj2); + return __serial_set_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r), + __comp, __proj1, __proj2); } template -__set_difference_return_t<_R1, _OutRange> +__set_difference_return_t<_R1, _R2, _OutRange> __brick_set_difference(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2, /*__is_vector=*/std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); - return std::ranges::set_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r), - __comp, __proj1, __proj2); + return __serial_set_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r), + __comp, __proj1, __proj2); } template -__set_difference_return_t<_R1, _OutRange> +__set_difference_return_t<_R1, _R2, _OutRange> __pattern_set_difference(_Tag, _ExecutionPolicy&&, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - static_assert(__is_serial_tag_v<_Tag>); + static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>); return __brick_set_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), 
std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2, typename _Tag::__is_vector{}); @@ -1039,37 +1171,34 @@ __pattern_set_difference(_Tag, _ExecutionPolicy&&, _R1&& __r1, _R2&& __r2, _OutR template -__set_difference_return_t<_R1, _OutRange> +__set_difference_return_t<_R1, _R2, _OutRange> __pattern_set_difference(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { using _RandomAccessIterator1 = std::ranges::iterator_t<_R1>; using _RandomAccessIterator2 = std::ranges::iterator_t<_R2>; - using _T = std::ranges::range_value_t<_OutRange>; + using _RandomAccessIteratorOut = std::ranges::iterator_t<_OutRange>; using _DifferenceType1 = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; using _DifferenceType2 = typename std::iterator_traits<_RandomAccessIterator2>::difference_type; using _DifferenceType = std::common_type_t<_DifferenceType1, _DifferenceType2>; - const auto __n1 = std::ranges::size(__r1); - const auto __n2 = std::ranges::size(__r2); - - auto __first1 = std::ranges::begin(__r1); - auto __last1 = __first1 + __n1; - auto __first2 = std::ranges::begin(__r2); - auto __last2 = __first2 + __n2; - auto __result = std::ranges::begin(__out_r); + auto [__first1, __last1, __n1] = oneapi::dpl::__ranges::__get_range_bounds_n(__r1); + auto [__first2, __last2, __n2] = oneapi::dpl::__ranges::__get_range_bounds_n(__r2); + auto [__result1, __result2] = oneapi::dpl::__ranges::__get_range_bounds(__out_r); // {} \ {2}: the difference is empty if (__n1 == 0) - return {__first1, __result}; + return __create_set_difference_result<_R1, _R2, _OutRange>(__first1, __first2, __result1); // {1} \ {}: parallel copying just first sequence if (__n2 == 0) { - auto __out_last = __pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, - __result, __internal::__brick_copy<__parallel_tag<_IsVector>>{}); - return {__last1, __out_last}; + 
const _DifferenceType __n = std::min(__last1 - __first1, __result2 - __result1); + auto __out_last = + __internal::__pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __first1 + __n, + __result1, __internal::__brick_copy<__parallel_tag<_IsVector>>{}); + return __create_set_difference_result<_R1, _R2, _OutRange>(__first1 + __n, __first2, __out_last); } // testing whether the sequences are intersected @@ -1079,9 +1208,11 @@ __pattern_set_difference(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __e //{1} < {2}: seq 2 is wholly greater than seq 1, so, parallel copying just first sequence if (__left_bound_seq_1 == __last1) { - auto __out_last = __pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, - __result, __internal::__brick_copy<__parallel_tag<_IsVector>>{}); - return {__last1, __out_last}; + const _DifferenceType __n = std::min(__last1 - __first1, __result2 - __result1); + auto __out_last = + __internal::__pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __first1 + __n, + __result1, __internal::__brick_copy<__parallel_tag<_IsVector>>{}); + return __create_set_difference_result<_R1, _R2, _OutRange>(__first1 + __n, __first2, __out_last); } // testing whether the sequences are intersected @@ -1091,30 +1222,35 @@ __pattern_set_difference(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __e //{2} < {1}: seq 1 is wholly greater than seq 2, so, parallel copying just first sequence if (__left_bound_seq_2 == __last2) { + const _DifferenceType __n = std::min(__last1 - __first1, __result2 - __result1); auto __out_last = - __internal::__pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, - __result, __brick_copy<__parallel_tag<_IsVector>>{}); - return {__last1, __out_last}; + __internal::__pattern_walk2_brick(__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __first1 + __n, + __result1, 
__internal::__brick_copy<__parallel_tag<_IsVector>>{}); + return __create_set_difference_result<_R1, _R2, _OutRange>(__first1 + __n, __last2, __out_last); } - if (__n1 + __n2 > __set_algo_cut_off) + if (oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2)) { - auto __out_last = __parallel_set_op( - __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result, - [](_DifferenceType __n, _DifferenceType) { return __n; }, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _T* __result, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - return oneapi::dpl::__utils::__set_difference_construct(__first1, __last1, __first2, __last2, __result, - __BrickCopyConstruct<_IsVector>(), __comp, - __proj1, __proj2); - }, - __comp, __proj1, __proj2); - return {__last1, __result + (__out_last - __result)}; + //we know proper offset due to [first2; left_bound_seq_2) < [first1; last1) + auto [__it1, __it2, __it_out] = + __internal::__parallel_set_op( + __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __left_bound_seq_2, __last2, + __result1, __result2, __comp, __proj1, __proj2, + [](_DifferenceType __n, _DifferenceType) { return __n; }, + [](_DifferenceType __n, _DifferenceType __m) { return __n + __m; }, + [](auto&&... 
__args) { + return oneapi::dpl::__utils::__set_difference_construct<__BrickCopyConstruct<_IsVector>>( + std::forward(__args)...); + }) + .template __get_reached_in1_in2_out< + std::tuple<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIteratorOut>>(); + + return __create_set_difference_result<_R1, _R2, _OutRange>(__it1, __it2, __it_out); } // use serial algorithm - return std::ranges::set_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r), - __comp, __proj1, __proj2); + return __serial_set_difference(std::forward<_R1>(__r1), std::ranges::subrange(__left_bound_seq_2, __last2), + std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2); } //--------------------------------------------------------------------------------------------------------------------- @@ -1127,14 +1263,71 @@ using __set_symmetric_difference_return_t = std::ranges::borrowed_iterator_t<_R2>, std::ranges::borrowed_iterator_t<_OutRange>>; +// Bounded set symmetric difference: performs set_symmetric_difference with output range capacity checking. +// Truncates result if output range is too small. +template +__set_symmetric_difference_return_t<_R1, _R2, _OutRange> +__serial_set_symmetric_difference(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, + _Proj2 __proj2) +{ + using DifferenceType = oneapi::dpl::__ranges::__common_size_t; + + auto [__it1, __end1] = oneapi::dpl::__ranges::__get_range_bounds(__r1); + auto [__it2, __end2] = oneapi::dpl::__ranges::__get_range_bounds(__r2); + auto [__out_it, __out_end] = oneapi::dpl::__ranges::__get_range_bounds(__out_r); + + // 1. 
Main set_symmetric_difference operation + while (__it1 != __end1 && __it2 != __end2) + { + if (std::invoke(__comp, std::invoke(__proj1, *__it1), std::invoke(__proj2, *__it2))) + { + if (__out_it != __out_end) + { + *__out_it = *__it1; + ++__it1; + ++__out_it; + } + else + break; + } + else if (std::invoke(__comp, std::invoke(__proj2, *__it2), std::invoke(__proj1, *__it1))) + { + if (__out_it != __out_end) + { + *__out_it = *__it2; + ++__it2; + ++__out_it; + } + else + break; + } + else + { + ++__it1; + ++__it2; + } + } + + // 2. Copying the residual elements if one of the input sequences is exhausted + const DifferenceType __remaining_capacity1 = __out_end - __out_it; + const DifferenceType __copy_n1 = __end1 - __it1; + auto __copy1 = std::ranges::copy(__it1, __it1 + std::min(__copy_n1, __remaining_capacity1), __out_it); + + const DifferenceType __remaining_capacity2 = __out_end - __copy1.out; + const DifferenceType __copy_n2 = __end2 - __it2; + auto __copy2 = std::ranges::copy(__it2, __it2 + std::min(__copy_n2, __remaining_capacity2), __copy1.out); + + return {__copy1.in, __copy2.in, __copy2.out}; +} + template __set_symmetric_difference_return_t<_R1, _R2, _OutRange> __brick_set_symmetric_difference(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2, /*__is_vector=*/std::false_type) noexcept { - return std::ranges::set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), - std::ranges::begin(__out_r), __comp, __proj1, __proj2); + return __serial_set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), + std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2); } template @@ -1144,8 +1337,8 @@ __brick_set_symmetric_difference(_R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _C /*__is_vector=*/std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); - return std::ranges::set_symmetric_difference(std::forward<_R1>(__r1), 
std::forward<_R2>(__r2), - std::ranges::begin(__out_r), __comp, __proj1, __proj2); + return __serial_set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), + std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2); } template __pattern_set_symmetric_difference(_Tag, _ExecutionPolicy&&, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - static_assert(__is_serial_tag_v<_Tag>); + static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>); return __brick_set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2, @@ -1167,35 +1360,23 @@ __set_symmetric_difference_return_t<_R1, _R2, _OutRange> __pattern_set_symmetric_difference(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - using _RandomAccessIterator1 = std::ranges::iterator_t<_R1>; - using _RandomAccessIterator2 = std::ranges::iterator_t<_R2>; - using _Tp = std::ranges::range_value_t<_OutRange>; - - const auto __n1 = std::ranges::size(__r1); - const auto __n2 = std::ranges::size(__r2); + auto [__first1, __last1, __n1] = oneapi::dpl::__ranges::__get_range_bounds_n(__r1); + auto [__first2, __last2, __n2] = oneapi::dpl::__ranges::__get_range_bounds_n(__r2); + auto [__result1, __result2] = oneapi::dpl::__ranges::__get_range_bounds(__out_r); // use serial algorithm - if (__n1 + __n2 <= oneapi::dpl::__internal::__set_algo_cut_off) - return std::ranges::set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), - std::ranges::begin(__out_r), __comp, __proj1, __proj2); - - auto __first1 = std::ranges::begin(__r1); - auto __last1 = __first1 + __n1; - auto __first2 = std::ranges::begin(__r2); - auto __last2 = __first2 + __n2; - auto __result = std::ranges::begin(__out_r); - - auto __out_last = oneapi::dpl::__internal::__parallel_set_union_op( - 
__tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _Tp* __result, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - return oneapi::dpl::__utils::__set_symmetric_difference_construct( - __first1, __last1, __first2, __last2, __result, - oneapi::dpl::__internal::__BrickCopyConstruct<_IsVector>(), __comp, __proj1, __proj2); - }, - __comp, __proj1, __proj2); - - return {__last1, __last2, __out_last}; + if (!oneapi::dpl::__internal::__is_set_algo_cutoff_exceeded(__n1 + __n2)) + return __serial_set_symmetric_difference(std::forward<_R1>(__r1), std::forward<_R2>(__r2), + std::forward<_OutRange>(__out_r), __comp, __proj1, __proj2); + + return oneapi::dpl::__internal::__parallel_set_union_op( + __tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result1, + __result2, __comp, __proj1, __proj2, + [](auto&&... 
__args) { + return oneapi::dpl::__utils::__set_symmetric_difference_construct<__BrickCopyConstruct<_IsVector>>( + std::forward(__args)...); + }) + .template __get_reached_in1_in2_out<__set_symmetric_difference_return_t<_R1, _R2, _OutRange>>(); } //--------------------------------------------------------------------------------------------------------------------- diff --git a/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h index 561b80884ca..3176adf750b 100644 --- a/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h +++ b/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h @@ -897,8 +897,7 @@ struct __set_difference_fn std::mergeable, std::ranges::iterator_t<_R2>, std::ranges::iterator_t<_OutRange>, _Comp, _Proj1, _Proj2> - std::ranges::set_difference_result, - std::ranges::borrowed_iterator_t<_OutRange>> + oneapi::dpl::__internal::__ranges::__set_difference_return_t<_R1, _R2, _OutRange> operator()(_ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp = {}, _Proj1 __proj1 = {}, _Proj2 __proj2 = {}) const { diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 9f2e53cdcbd..095ab4be012 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -1102,6 +1102,7 @@ __pattern_set_intersection(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, oneapi::dpl::__ranges::__get_subscription_view(__r1), oneapi::dpl::__ranges::__get_subscription_view(__r2), oneapi::dpl::__ranges::__get_subscription_view(__out_r), __comp, __proj1, __proj2); + // TODO incorrect approach for new rules of stop positions for std::ranges::set_intersection return {__first1 + __n1, __first2 + __n2, __result + __result_size}; } @@ -1111,18 +1112,21 @@ struct __set_difference_copy_case_1; template -std::ranges::set_difference_result, 
std::ranges::borrowed_iterator_t<_OutRange>> +oneapi::dpl::__internal::__ranges::__set_difference_return_t<_R1, _R2, _OutRange> __pattern_set_difference(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { const auto __first1 = std::ranges::begin(__r1); + const auto __first2 = std::ranges::begin(__r2); const auto __result = std::ranges::begin(__out_r); const auto __n1 = oneapi::dpl::__ranges::__size(__r1); + const auto __n2 = oneapi::dpl::__ranges::__size(__r2); // {} \ {2}: the difference is empty if (__n1 == 0) - return {__first1, __result}; + return oneapi::dpl::__internal::__ranges::__create_set_difference_result<_R1, _R2, _OutRange>( + __first1, __first2, __result); // {1} \ {}: the difference is {1} if (oneapi::dpl::__ranges::__empty(__r2)) @@ -1135,16 +1139,18 @@ __pattern_set_difference(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e oneapi::dpl::__ranges::__get_subscription_view(__r1), oneapi::dpl::__ranges::__get_subscription_view(__out_r)); - return {__first1 + __n1, __result + __idx}; + return oneapi::dpl::__internal::__ranges::__create_set_difference_result<_R1, _R2, _OutRange>( + __first1 + __n1, __first2, __result + __idx); } const std::size_t __result_size = __par_backend_hetero::__parallel_set_op( _BackendTag{}, unseq_backend::_DifferenceTag{}, std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::__get_subscription_view(__r1), - oneapi::dpl::__ranges::__get_subscription_view(std::forward<_R2>(__r2)), + oneapi::dpl::__ranges::__get_subscription_view(__r1), oneapi::dpl::__ranges::__get_subscription_view(__r2), oneapi::dpl::__ranges::__get_subscription_view(__out_r), __comp, __proj1, __proj2); - return {__first1 + __n1, __result + __result_size}; + // TODO the second argument isn't correct for now + return oneapi::dpl::__internal::__ranges::__create_set_difference_result<_R1, _R2, _OutRange>( + __first1 + __n1, __first2 + __n2, 
__result + __result_size); } //Dummy names to avoid kernel problems diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 7957e49a56f..003f03293b8 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -478,7 +478,7 @@ class __func_task : public __task execute() { return _M_func(this); - }; + } public: template diff --git a/include/oneapi/dpl/pstl/parallel_backend_utils.h b/include/oneapi/dpl/pstl/parallel_backend_utils.h index af85a4fc5e9..35ef85669cd 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_utils.h +++ b/include/oneapi/dpl/pstl/parallel_backend_utils.h @@ -24,6 +24,7 @@ #include #include #include +#include // for std::uint8_t #include "utils.h" #include "memory_fwd.h" #include "functional_impl.h" // for oneapi::dpl::identity, std::invoke @@ -217,123 +218,323 @@ struct __serial_move_merge } }; -template -_OutputIterator +enum class __parallel_set_op_mask : std::uint8_t +{ + eNone = 0x00, // initial state + eData1 = 0x01, // mask for first input data item usage + eData2 = 0x02, // mask for second input data item usage + eDataOut = 0x04, // mask for output data item usage + + eBoth = 0x03, // eData1 | eData2: mask for both input data items usage + eData1Out = 0x05, // eData1 | eDataOut: mask for copy data item from the first data set into output + eData2Out = 0x06, // eData2 | eDataOut: mask for copy data item from the second data set into output + eBothOut = 0x07 // eBoth | eDataOut: mask for copy data item from the first and the second data set into output +}; + +inline std::nullptr_t +__set_iterator_mask(std::nullptr_t, __parallel_set_op_mask) noexcept +{ + return nullptr; +} + +inline __parallel_set_op_mask* +__set_iterator_mask(__parallel_set_op_mask* __mask, __parallel_set_op_mask __state) noexcept +{ + *__mask = __state; + return ++__mask; +} + +template +inline std::nullptr_t +__set_iterator_mask_n(std::nullptr_t, 
__parallel_set_op_mask, _Size) noexcept +{ + return nullptr; +} + +template +inline __parallel_set_op_mask* +__set_iterator_mask_n(__parallel_set_op_mask* __mask, __parallel_set_op_mask __state, _Size __count) noexcept +{ + for (_Size __i = 0; __i < __count; ++__i) + __mask[__i] = __state; + + return __mask + __count; +} + +struct _SetOpDiscardIterator +{ + using iterator_category = std::output_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = void; + using pointer = void; + using reference = void; + + _SetOpDiscardIterator& + operator*() noexcept + { + return *this; + } + + _SetOpDiscardIterator& + operator++() noexcept + { + return *this; + } + + _SetOpDiscardIterator + operator++(int) noexcept + { + return *this; + } + + template + _SetOpDiscardIterator& + operator=(const T&) noexcept + { + return *this; + } +}; + +template +struct _UninitializedCopyItem +{ + using _OutValueType = typename std::iterator_traits<_OutputIterator>::value_type; + + void + operator()(_InputIterator __it_in, _OutputIterator __it_out) const + { + if constexpr (!std::is_same_v<_OutputIterator, _SetOpDiscardIterator>) + { + // We should use placement new here because this method really works with raw uninitialized memory + new (std::addressof(*__it_out)) _OutValueType(*__it_in); + } + } +}; + +template +struct _CopyConstructRangeOpWrapper +{ + _CopyConstructRange _cc_range; + + template + _SetOpDiscardIterator + operator()(_InputIterator, _InputIterator, _SetOpDiscardIterator) + { + return _SetOpDiscardIterator{}; + } + + template + _OutputIterator + operator()(_InputIterator __first, _InputIterator __last, _OutputIterator __result) + { + return _cc_range(__first, __last, __result); + } +}; + +template +using _union_construct_return_t = std::tuple<_ForwardIterator1, _ForwardIterator2, _OutputIterator, _MaskIterator>; + +template +_union_construct_return_t<_ForwardIterator1, _ForwardIterator2, _OutputIterator, _MaskIterator> 
__set_union_construct(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, - _ForwardIterator2 __last2, _OutputIterator __result, _CopyConstructRange __cc_range, - _Compare __comp, _Proj1 __proj1, _Proj2 __proj2) + _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp, _Proj1 __proj1, + _Proj2 __proj2, _MaskIterator __mask) { - using _Tp = typename ::std::iterator_traits<_OutputIterator>::value_type; + _UninitializedCopyItem<_ForwardIterator1, _OutputIterator> _uninitialized_copy_from1; + _UninitializedCopyItem<_ForwardIterator2, _OutputIterator> _uninitialized_copy_from2; + + _CopyConstructRangeOpWrapper<_CopyConstructRange> __cc_range; for (; __first1 != __last1; ++__result) { if (__first2 == __last2) - return __cc_range(__first1, __last1, __result); + { + __mask = __set_iterator_mask_n(__mask, __parallel_set_op_mask::eData1Out, __last1 - __first1); + __result = __cc_range(__first1, __last1, __result); + + return {__last1, __first2, __result, __mask}; + } + if (std::invoke(__comp, std::invoke(__proj2, *__first2), std::invoke(__proj1, *__first1))) { - ::new (::std::addressof(*__result)) _Tp(*__first2); + _uninitialized_copy_from2(__first2, __result); ++__first2; + __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData2Out); } else { - ::new (::std::addressof(*__result)) _Tp(*__first1); + _uninitialized_copy_from1(__first1, __result); if (!std::invoke(__comp, std::invoke(__proj1, *__first1), std::invoke(__proj2, *__first2))) + { ++__first2; + __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eBothOut); + } + else + { + __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData1Out); + } ++__first1; } } - return __cc_range(__first2, __last2, __result); + + __mask = __set_iterator_mask_n(__mask, __parallel_set_op_mask::eData2Out, __last2 - __first2); + __result = __cc_range(__first2, __last2, __result); + + return {__first1, __last2, __result, __mask}; } -template -_OutputIterator +template 
+struct CopyOpWrapper +{ + _CopyFunc _copy; + + template + void + operator()(_InputIterator, _SetOpDiscardIterator) const + { + } + + template + void + operator()(_InputIterator __it_in, _OutputIterator __it_out) const + { + _copy(*__it_in, *__it_out); + } +}; + +template +_union_construct_return_t<_ForwardIterator1, _ForwardIterator2, _OutputIterator, _MaskIterator> __set_intersection_construct(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, - _ForwardIterator2 __last2, _OutputIterator __result, _CopyFunc _copy, _Compare __comp, - _Proj1 __proj1, _Proj2 __proj2) + _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp, _Proj1 __proj1, + _Proj2 __proj2, _MaskIterator __mask) { + CopyOpWrapper<_CopyFunc> __copy; + while (__first1 != __last1 && __first2 != __last2) { if (std::invoke(__comp, std::invoke(__proj1, *__first1), std::invoke(__proj2, *__first2))) + { ++__first1; + __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData1); + } else if (std::invoke(__comp, std::invoke(__proj2, *__first2), std::invoke(__proj1, *__first1))) + { ++__first2; + __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData2); + } else { - _copy(*__first1, *__result); - + __copy(__first1, __result); ++__first1; ++__first2; ++__result; + __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eBothOut); } } - return __result; + + return {__first1, __first2, __result, __mask}; } -template -_OutputIterator +template +_union_construct_return_t<_ForwardIterator1, _ForwardIterator2, _OutputIterator, _MaskIterator> __set_difference_construct(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, - _ForwardIterator2 __last2, _OutputIterator __result, _CopyConstructRange __cc_range, - _Compare __comp, _Proj1 __proj1, _Proj2 __proj2) + _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp, _Proj1 __proj1, + _Proj2 __proj2, _MaskIterator __mask) { - using _Tp = typename 
::std::iterator_traits<_OutputIterator>::value_type; + _UninitializedCopyItem<_ForwardIterator1, _OutputIterator> _uninitialized_copy_from1; + + _CopyConstructRangeOpWrapper<_CopyConstructRange> __cc_range; - for (; __first1 != __last1;) + while (__first1 != __last1) { if (__first2 == __last2) - return __cc_range(__first1, __last1, __result); + { + __mask = __set_iterator_mask_n(__mask, __parallel_set_op_mask::eData1Out, __last1 - __first1); + __result = __cc_range(__first1, __last1, __result); + + return {__last1, __first2, __result, __mask}; + } if (std::invoke(__comp, std::invoke(__proj1, *__first1), std::invoke(__proj2, *__first2))) { - ::new (::std::addressof(*__result)) _Tp(*__first1); + _uninitialized_copy_from1(__first1, __result); ++__result; ++__first1; + __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData1Out); } else { if (!std::invoke(__comp, std::invoke(__proj2, *__first2), std::invoke(__proj1, *__first1))) + { ++__first1; + __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eBoth); + } + else + { + __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData2); + } ++__first2; } } - return __result; + + return {__first1, __first2, __result, __mask}; } -template -_OutputIterator +template +_union_construct_return_t<_ForwardIterator1, _ForwardIterator2, _OutputIterator, _MaskIterator> __set_symmetric_difference_construct(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, - _ForwardIterator2 __last2, _OutputIterator __result, - _CopyConstructRange __cc_range, _Compare __comp, _Proj1 __proj1, _Proj2 __proj2) + _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp, + _Proj1 __proj1, _Proj2 __proj2, _MaskIterator __mask) { - using _Tp = typename ::std::iterator_traits<_OutputIterator>::value_type; + _UninitializedCopyItem<_ForwardIterator1, _OutputIterator> _uninitialized_copy_from1; + _UninitializedCopyItem<_ForwardIterator2, _OutputIterator> _uninitialized_copy_from2; + + 
_CopyConstructRangeOpWrapper<_CopyConstructRange> __cc_range; - for (; __first1 != __last1;) + while (__first1 != __last1) { if (__first2 == __last2) - return __cc_range(__first1, __last1, __result); + { + __mask = __set_iterator_mask_n(__mask, __parallel_set_op_mask::eData1Out, __last1 - __first1); + __result = __cc_range(__first1, __last1, __result); + + return {__last1, __first2, __result, __mask}; + } if (std::invoke(__comp, std::invoke(__proj1, *__first1), std::invoke(__proj2, *__first2))) { - ::new (::std::addressof(*__result)) _Tp(*__first1); + // We should use placement new here because this method really works with raw uninitialized memory + _uninitialized_copy_from1(__first1, __result); ++__result; ++__first1; + __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData1Out); } else { if (std::invoke(__comp, std::invoke(__proj2, *__first2), std::invoke(__proj1, *__first1))) { - ::new (::std::addressof(*__result)) _Tp(*__first2); + // We should use placement new here because this method really works with raw uninitialized memory + _uninitialized_copy_from2(__first2, __result); ++__result; + __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eData2Out); } else + { ++__first1; + __mask = __set_iterator_mask(__mask, __parallel_set_op_mask::eBoth); + } ++__first2; } } - return __cc_range(__first2, __last2, __result); + + __mask = __set_iterator_mask_n(__mask, __parallel_set_op_mask::eData2Out, __last2 - __first2); + __result = __cc_range(__first2, __last2, __result); + + return {__first1, __last2, __result, __mask}; } template